feat(docker/Metax): add metax dockerfile and its requirements for ms-swift 4.1.x (#1689)

This commit is contained in:
dwd
2026-05-06 20:57:17 +08:00
committed by GitHub
parent 68ab75af24
commit 2f5f52fc3c
7 changed files with 331 additions and 0 deletions

View File

@@ -0,0 +1,164 @@
# Global build arguments. They are declared before the first FROM, so the
# stage below re-declares by name each one it needs to read.
# Base OS image the whole build starts from.
ARG BUILD_BASE_IMAGE=registry.access.redhat.com/ubi9/ubi:9.6
# Python version requested when the uv virtual environment is created.
ARG PYTHON_VERSION=3.12
# Additional package index (MetaX PyPI) and the host name passed as trusted host for it.
ARG UV_EXTRA_INDEX_URL=https://repos.metax-tech.com/r/maca-pypi/simple
ARG UV_TRUSTED_HOST=repos.metax-tech.com
# may need passing a particular vllm version during build
ARG VLLM_VERSION
# MACA release; it has no default here, so it is expected via --build-arg.
ARG MACA_VERSION
# cu-bridge release to download; defaults to MACA_VERSION, so this line must
# stay after the MACA_VERSION declaration.
ARG CU_BRIDGE_VERSION=${MACA_VERSION}
#################### BASE BUILD IMAGE ####################
FROM ${BUILD_BASE_IMAGE} AS base
# Host name passed as --trusted-host for the MetaX package index in later RUN steps.
ARG UV_TRUSTED_HOST
# maca environment variables
ENV MACA_PATH=/opt/maca
ENV MACA_CLANG_PATH=/opt/maca/mxgpu_llvm/bin
ENV CUCC_PATH="${MACA_PATH}/tools/cu-bridge"
ENV CUDA_PATH=/root/cu-bridge/CUDA_DIR
ENV CUCC_CMAKE_ENTRY=2
# The venv and user-local bin directories are added first; the next line then
# puts the driver, MACA and cu-bridge tool directories in front of them.
ENV PATH="/opt/venv/bin:/root/.local/bin:$PATH"
ENV PATH=/opt/mxdriver/bin:${MACA_PATH}/bin:${MACA_PATH}/mxgpu_llvm/bin:${MACA_PATH}/tools/cu-bridge/tools:${MACA_PATH}/tools/cu-bridge/bin:${PATH}
# Append the inherited LD_LIBRARY_PATH only when the base image defines one.
# An unconditional ":${LD_LIBRARY_PATH}" would leave a trailing empty entry,
# which the dynamic loader treats as the current working directory.
ENV LD_LIBRARY_PATH=/opt/mxdriver/lib:${MACA_PATH}/lib:${MACA_PATH}/mxgpu_llvm/lib:${MACA_PATH}/ompi/lib:${MACA_PATH}/ucx/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
# uv environment variables
ENV VIRTUAL_ENV=/opt/venv
ENV UV_INDEX_STRATEGY="unsafe-best-match"
ENV UV_HTTP_TIMEOUT=6000
ENV UV_LINK_MODE=copy
ARG UV_EXTRA_INDEX_URL
ENV UV_EXTRA_INDEX_URL=${UV_EXTRA_INDEX_URL}
# Primary package index. The default is the Aliyun mirror; the value can now be
# overridden with --build-arg UV_INDEX_URL=... (previously the ARG was declared
# but the ENV below ignored it and always used the mirror).
ARG UV_INDEX_URL=https://mirrors.aliyun.com/pypi/simple
ENV UV_INDEX_URL=${UV_INDEX_URL}
# Host passed to `pip --trusted-host` when uv itself is bootstrapped with pip.
# Override it together with UV_INDEX_URL so that both refer to the same server.
ARG UV_TRUSTED_INDEX_HOST=mirrors.aliyun.com
ENV UV_TRUSTED_INDEX_HOST=${UV_TRUSTED_INDEX_HOST}
# Override file for uv; it is copied into /workspace by a COPY instruction in this stage.
ENV UV_OVERRIDE=/workspace/override.txt
# vllm compile option
ENV VLLM_INSTALL_PUNICA_KERNELS=1
# AI version arguments
# Declared inside the stage so that RUN steps can read them. The ones not
# declared before FROM (VLLM_METAX_VERSION, MEGATRON_VERSION, SWIFT_VERSION,
# TE_VERSION) have no default and are expected via --build-arg.
ARG PYTHON_VERSION
ARG VLLM_VERSION
ARG VLLM_METAX_VERSION
ARG MACA_VERSION
ARG MEGATRON_VERSION
ARG SWIFT_VERSION
ARG CU_BRIDGE_VERSION
ARG TE_VERSION
# Relative paths in the following instructions (git clones, `cd`, requirement
# files) resolve under /workspace.
WORKDIR /workspace
# override.txt is the file UV_OVERRIDE points at; requirements_extra.txt is
# installed near the end of this Dockerfile.
COPY override.txt /workspace/override.txt
COPY requirements_extra.txt /workspace/requirements_extra.txt
# Write the yum repository definition for the MetaX driver packages.
# `$(uname -m)` is expanded by the shell at build time, so the repository URL
# follows the architecture of the build machine. `gpgcheck=0` turns off package
# signature checking for this repository.
# The continuation lines are part of the printf format string, so any leading
# whitespace added to them would end up inside the generated .repo file.
RUN printf "[metax-centos]\n\
name=Maca Driver Yum Repository\n\
baseurl=https://repos.metax-tech.com/r/metax-driver-centos-$(uname -m)/\n\
enabled=1\n\
gpgcheck=0" > /etc/yum.repos.d/metax-driver-centos.repo
# Install pip (and the `hostname` tool) from the OS repositories.
RUN dnf -y install hostname python3-pip \
    && dnf clean all
# Use the system pip to install uv from the configured index, then create the
# virtual environment at /opt/venv with the requested Python version.
RUN python3 -m pip install uv \
        --index-url "${UV_INDEX_URL}" \
        --trusted-host "${UV_TRUSTED_INDEX_HOST}" \
    && uv venv /opt/venv --python="${PYTHON_VERSION}"
# Record the interpreter and uv versions in the build log.
RUN python3 --version \
    && uv self version
# System build tools and RDMA libraries (one package per line, sorted).
RUN yum makecache \
    && yum install -y \
        cmake \
        g++ \
        gcc \
        git \
        libibumad \
        libibverbs \
        librdmacm \
        make \
        ninja-build \
        openblas-devel \
        procps-ng \
        unzip \
        vim \
    && yum clean all
# Shallow-clone each source tree at the tag/branch given by its build argument.
# The target directory names are the ones git would pick by default.
RUN git clone --branch "${SWIFT_VERSION}" --depth 1 \
        https://github.com/modelscope/ms-swift.git ms-swift
RUN git clone --branch "${VLLM_METAX_VERSION}" --depth 1 \
        https://github.com/MetaX-MACA/vLLM-metax.git vLLM-metax
RUN git clone --branch "${VLLM_VERSION}" --depth 1 \
        https://github.com/vllm-project/vllm.git vllm
RUN git clone --branch "${MEGATRON_VERSION}" --depth 1 \
        https://github.com/NVIDIA/Megatron-LM.git Megatron-LM
# Step 1: install MACA SDK, Metax-Driver and cu-bridge
# Metax-Driver mainly contains vbios and kmd files, which are not needed in a container.
# Here we keep the mx-smi management tool.
# NOTE(review): the original comment states that kernel version mismatch errors
# are ignored, but this command contains no explicit error suppression.
RUN yum makecache \
    && yum install -y \
        "metax-driver-${MACA_VERSION}*" \
        mxgvm \
    && yum clean all \
    && rm -rf /var/cache/yum /tmp/*
# Write the yum repository definition for the MACA SDK packages.
# `$(uname -m)` is expanded by the shell at build time; `gpgcheck=0` turns off
# package signature checking for this repository.
# The continuation lines are part of the printf format string, so any leading
# whitespace added to them would end up inside the generated .repo file.
RUN printf "[maca-sdk]\n\
name=Maca Sdk Yum Repository\n\
baseurl=https://repos.metax-tech.com/r/maca-sdk-rpm-$(uname -m)/\n\
enabled=1\n\
gpgcheck=0" > /etc/yum.repos.d/maca-sdk-rpm.repo
# Install the SDK packages whose name starts with maca_sdk-<MACA_VERSION>, then
# drop the yum caches and temporary files in the same layer.
RUN yum makecache && \
yum install -y maca_sdk-${MACA_VERSION}* && \
yum clean all && rm -rf /var/cache/yum /tmp/*
# Download the cu-bridge source archive for CU_BRIDGE_VERSION, build it with
# CMake and install it under /opt/maca/tools/cu-bridge.
# The archive and the build tree are removed in this same RUN so that they are
# not stored in the layer; deleting them in a later instruction would not
# shrink the image.
# NOTE(review): the downloaded archive is not checksum-verified.
RUN cd /tmp/ && \
    export MACA_PATH=/opt/maca && \
    curl -o "${CU_BRIDGE_VERSION}.zip" -LsSf "https://gitee.com/metax-maca/cu-bridge/repository/archive/${CU_BRIDGE_VERSION}.zip" && \
    unzip "${CU_BRIDGE_VERSION}.zip" && \
    mv "cu-bridge-${CU_BRIDGE_VERSION}" cu-bridge && \
    chmod 755 cu-bridge -Rf && \
    cd cu-bridge && \
    mkdir build && cd build && \
    cmake -DCMAKE_INSTALL_PREFIX=/opt/maca/tools/cu-bridge ../ && \
    make && make install && \
    cd /tmp && \
    rm -rf /tmp/cu-bridge "/tmp/${CU_BRIDGE_VERSION}.zip"
# Step 2: install build prerequisites
# NOTE(review): the original heading also mentioned trimming unused MACA
# packages, but no package is removed by the commands below.
# Python side: the vLLM-metax build requirements plus the `build` frontend.
RUN cd vLLM-metax \
    && uv pip install -r requirements/build.txt \
    && uv pip install build
# System side: compiler/binutils and libraries (one package per line, sorted),
# followed by cache and /tmp cleanup in the same layer.
RUN yum makecache \
    && yum install -y \
        binutils \
        gcc \
        libibumad \
        libibverbs \
        librdmacm \
        numactl-libs \
        openblas \
        procps-ng \
    && yum clean all \
    && rm -rf /var/cache/yum /tmp/*
# Step 3: install Metax python requirements
# UV_HTTP_TIMEOUT is set to 960 for this command only, replacing the image-wide value.
RUN cd vLLM-metax \
    && UV_HTTP_TIMEOUT=960 uv pip install \
        -r requirements/maca.txt \
        --trusted-host "${UV_TRUSTED_HOST}"
# Step 4: build vLLM with empty device to avoid CUDA dependency
# First run the repository's use_existing_torch.py script and install the
# build requirements.
RUN cd vllm \
    && python3 use_existing_torch.py \
    && uv pip install -r requirements/build.txt
# Then install vLLM from source with VLLM_TARGET_DEVICE=empty and without build isolation.
RUN cd vllm \
    && VLLM_TARGET_DEVICE=empty uv pip install -v . --no-build-isolation
# Step 5: build vLLM-metax
# Build a wheel in the current environment (`-w -n`: wheel only, no isolation)
# and install whatever wheel(s) land in dist/.
RUN cd vLLM-metax \
    && uv pip install -r requirements/build.txt \
    && python3 -m build -w -n \
    && uv pip install dist/*.whl
# Step 6: install Megatron-LM
# Every occurrence of "nvcc" in the legacy fused-kernels module is rewritten to
# "cucc" before the package is installed.
RUN sed -i 's/nvcc/cucc/g' /workspace/Megatron-LM/megatron/legacy/fused_kernels/__init__.py \
    && cd Megatron-LM \
    && uv pip install .
# Step 7: install transformer-engine
# The package is resolved against the MetaX index. The URL comes from
# UV_EXTRA_INDEX_URL (default: https://repos.metax-tech.com/r/maca-pypi/simple)
# instead of a second hard-coded copy, so that overriding the build argument
# also applies to this step.
RUN uv pip install "transformer_engine==${TE_VERSION}" -i "${UV_EXTRA_INDEX_URL}" --trusted-host "${UV_TRUSTED_HOST}"
# Step 8: patch and install ms-swift (the SWIFT_VERSION checkout) with Megatron extra dependencies
# The sed command inserts `import vllm_metax.patch` in front of the first line
# of swift/__init__.py that starts with "from " or "import ".
RUN sed -i '0,/^\(from \|import \)/{s//import vllm_metax.patch\n&/}' ms-swift/swift/__init__.py \
    && cd ms-swift \
    && uv pip install '.[megatron]'
# Step 9: install optional runtime dependencies used by swift 4.1.0
# deepspeed is resolved against the MetaX index. The URL comes from
# UV_EXTRA_INDEX_URL (default: https://repos.metax-tech.com/r/maca-pypi/simple)
# instead of a second hard-coded copy, so that overriding the build argument
# also applies to this step.
RUN uv pip install deepspeed -i "${UV_EXTRA_INDEX_URL}" --trusted-host "${UV_TRUSTED_HOST}"
# Make pip itself available inside the virtual environment.
RUN uv pip install pip
# Extra packages listed in the requirements file copied into /workspace earlier.
RUN uv pip install -r /workspace/requirements_extra.txt
# Create (or replace) ${CUDA_PATH}/bin/cucc as a symlink that points at
# ${CUDA_PATH}/bin/nvcc.
RUN ln -sf "${CUDA_PATH}/bin/nvcc" "${CUDA_PATH}/bin/cucc"
# vllm installation may bring in incompatible CUDA-only wheels. Remove them here.
RUN uv pip uninstall \
        flashinfer-python \
        cupy-cuda12x \
        flash-linear-attention \
        fla-core
#################### FINAL IMAGE ####################

View File

@@ -0,0 +1,73 @@
# Prebuilt Metax release image that this fast-build path starts from.
ARG BUILD_BASE_IMAGE=mx-devops-acr-cn-shanghai.cr.volces.com/opensource/public-ai-release/maca/ms-swift:4.0.4-maca.ai3.5.3.5-torch2.8-py312-ubuntu22.04-amd64
# NOTE(review): PYTHON_VERSION is not referenced by any other instruction
# visible in this file.
ARG PYTHON_VERSION=3.12
FROM ${BUILD_BASE_IMAGE} AS base
# NOTE:
# This fast-build path inherits Python/Torch/TE from a prebuilt Metax release image.
# We keep the verified base image tag here instead of guessing a newer one.
# As a result, this path may lag behind the Megatron-SWIFT Quick Start recommendations.
# may need passing a particular vllm version during build
# These four arguments have no default; the git clone steps below use them as
# branch/tag names, so they are expected via --build-arg.
ARG VLLM_VERSION
ARG VLLM_METAX_VERSION
ARG MEGATRON_VERSION
ARG SWIFT_VERSION
# MACA / cu-bridge locations. MACA_PATH and CUDA_PATH are defined first because
# the CUCC_PATH, PATH and LD_LIBRARY_PATH lines below expand them.
ENV MACA_PATH=/opt/maca
ENV CUCC_CMAKE_ENTRY=2
ENV CUDA_PATH=/root/cu-bridge/CUDA_DIR
ENV CUCC_PATH=${MACA_PATH}/tools/cu-bridge
# Conda, cu-bridge and MACA tool directories are placed in front of the inherited PATH.
ENV PATH=/opt/conda/bin:/opt/conda/condabin:${CUDA_PATH}/bin:${CUCC_PATH}/tools:${CUCC_PATH}/bin:${MACA_PATH}/bin:${PATH}
ENV LD_LIBRARY_PATH=${CUDA_PATH}/lib64:${MACA_PATH}/lib:${MACA_PATH}/mxgpu_llvm/lib:${LD_LIBRARY_PATH}
# Relative paths in the following RUN steps resolve under /workspace.
WORKDIR /workspace
# Extra Python requirements, installed by the last pip step of this file.
COPY requirements_extra.txt /workspace/requirements_extra.txt
# Print the effective PATH into the build log.
RUN echo "${PATH}"
# git is needed for the source checkouts below; the apt lists are removed in
# the same layer.
RUN apt-get update \
    && apt-get install -y git \
    && rm -rf /var/lib/apt/lists/*
# Initialize cu-bridge if it is not already prepared in the base image:
# pre_make runs only when /root/cu-bridge does not exist.
RUN [ -d /root/cu-bridge ] || "${MACA_PATH}/tools/cu-bridge/tools/pre_make"
# Clone all GitHub sources.
# NOTE(review): the original comment says this happens "while the external
# proxy is enabled"; no proxy configuration is visible in this file.
# Any copies of these directories already present under /workspace are removed first.
RUN rm -rf \
        /workspace/ms-swift \
        /workspace/vLLM-metax \
        /workspace/vllm \
        /workspace/Megatron-LM
# Shallow clones at the tag/branch given by each build argument; the target
# directory names are the ones git would pick by default.
RUN git clone --branch "${SWIFT_VERSION}" --depth 1 \
        https://github.com/modelscope/ms-swift.git ms-swift
RUN git clone --branch "${VLLM_METAX_VERSION}" --depth 1 \
        https://github.com/MetaX-MACA/vLLM-metax.git vLLM-metax
RUN git clone --branch "${VLLM_VERSION}" --depth 1 \
        https://github.com/vllm-project/vllm.git vllm
RUN git clone --branch "${MEGATRON_VERSION}" --depth 1 \
        https://github.com/NVIDIA/Megatron-LM.git Megatron-LM
# install cmake
# --no-cache-dir is passed to every pip call so that pip's download/wheel cache
# is not stored in the image layers.
RUN pip install --no-cache-dir cmake ninja
# Step 1: build original vLLM for torch setup
# Runs the repository's use_existing_torch.py script, then installs the build requirements.
RUN cd vllm && \
    python3 use_existing_torch.py && \
    pip install --no-cache-dir -r requirements/build.txt
# Step 2: build vLLM with empty device to avoid CUDA dependency
RUN cd vllm && \
    VLLM_TARGET_DEVICE=empty pip install --no-cache-dir -v . --no-build-isolation
# Step 3: build vLLM-metax
# Runs the repository's use_existing_metax.py script, installs the build
# requirements, builds a wheel in the current environment (`-w -n`: wheel only,
# no isolation) and installs whatever wheel(s) land in dist/.
# --no-cache-dir keeps pip's cache out of the image layer.
RUN cd vLLM-metax && \
    python3 use_existing_metax.py && \
    pip install --no-cache-dir -r requirements/build.txt && \
    python3 -m build -w -n && \
    pip install --no-cache-dir dist/*.whl
# Step 4: patch and install Megatron-LM
# Every occurrence of "nvcc" in the legacy fused-kernels module is rewritten to
# "cucc" before the package is installed.
# --no-cache-dir keeps pip's cache out of the image layer.
RUN sed -i 's/nvcc/cucc/g' /workspace/Megatron-LM/megatron/legacy/fused_kernels/__init__.py && \
    cd /workspace/Megatron-LM && \
    pip install --no-cache-dir .
# Step 5: patch and install ms-swift (the SWIFT_VERSION checkout) with its Megatron extra
# The sed command inserts `import vllm_metax.patch` in front of the first line
# of swift/__init__.py that starts with "from " or "import ".
# transformers is constrained to <5.4.0 before ms-swift is installed; the extra
# requirements file is installed last.
# --no-cache-dir keeps pip's cache out of the image layer.
RUN sed -i '0,/^\(from \|import \)/{s//import vllm_metax.patch\n&/}' ms-swift/swift/__init__.py && \
    cd ms-swift && \
    pip install --no-cache-dir "transformers<5.4.0" && \
    pip install --no-cache-dir '.[megatron]' && \
    pip install --no-cache-dir -r /workspace/requirements_extra.txt
# Default to an interactive shell when the container is started without a command.
CMD ["bash"]

13
docker/Metax/4.1/build.sh Normal file
View File

@@ -0,0 +1,13 @@
#!/usr/bin/env bash
# Build the ms-swift image from Dockerfile.metax (full build on the UBI9 base).
# All component versions are pinned here and passed as build arguments.
#
# -e: stop on the first failing command; -u: treat unset variables as errors.
set -eu

# Dockerfile.metax and the files it COPYs are addressed relative to the build
# context ".", so switch to the directory that contains this script first.
# This lets the script be started from any working directory.
cd -- "$(dirname -- "$0")"

docker build \
    --network host \
    -f Dockerfile.metax \
    -t swift:v4.1.0 \
    --build-arg VLLM_VERSION=v0.17.1 \
    --build-arg VLLM_METAX_VERSION=v0.17.0 \
    --build-arg MACA_VERSION=3.5.3 \
    --build-arg MEGATRON_VERSION=core_v0.16.0 \
    --build-arg SWIFT_VERSION=v4.1.0 \
    --build-arg TE_VERSION=2.8.0 \
    --build-arg CU_BRIDGE_VERSION=3.5.3 \
    --no-cache \
    .

View File

@@ -0,0 +1,10 @@
#!/usr/bin/env bash
# Build the ms-swift image from Dockerfile.with_metax_image (fast build on top
# of a prebuilt Metax release image). Component versions are pinned here and
# passed as build arguments.
#
# -e: stop on the first failing command; -u: treat unset variables as errors.
set -eu

# The Dockerfile and the files it COPYs are addressed relative to the build
# context ".", so switch to the directory that contains this script first.
# This lets the script be started from any working directory.
cd -- "$(dirname -- "$0")"

docker build \
    --network host \
    -f Dockerfile.with_metax_image \
    -t swift:v4.1.0 \
    --build-arg VLLM_VERSION=v0.17.1 \
    --build-arg VLLM_METAX_VERSION=v0.17.0 \
    --build-arg MEGATRON_VERSION=core_v0.16.0 \
    --build-arg SWIFT_VERSION=v4.1.0 \
    --no-cache \
    .

View File

@@ -0,0 +1,5 @@
# Version overrides for uv. Dockerfile.metax copies this file to
# /workspace/override.txt and points UV_OVERRIDE at it.
setuptools>=77.0.3,<80
datasets>=3.0,<4.0
flash-linear-attention
mcoplib
transformers<5.4.0

View File

@@ -0,0 +1,14 @@
# Extra Python packages installed on top of ms-swift. Both Dockerfiles copy this
# file to /workspace/requirements_extra.txt and install it near the end of the build.
decord
diffusers==0.35.2
evalscope>=1.0.0
evalscope[opencompass]
evalscope[vlmeval]
keye_vl_utils>=1.5.2
librosa
mpi4py
optimum==1.27.0
pytorchvideo
qwen_omni_utils>=0.0.9
qwen_vl_utils==0.0.14
soundfile
timm

View File

@@ -0,0 +1,52 @@
# 1. Build swift 4.1 image from a UBI9 base image
This path performs a full build on top of a minimal UBI9 base image (`Dockerfile.metax`). Python packages are installed into a virtual environment located at `/opt/venv`.
## 1.1. Build
``` bash
bash build.sh
```
## 1.2. Run a container
``` bash
docker run -d -it --net=host --uts=host --ipc=host --privileged=true --group-add video \
--shm-size 100gb --ulimit memlock=-1 \
--security-opt seccomp=unconfined --security-opt apparmor=unconfined \
--device=/dev/dri --device=/dev/mxcd \
--name base_image \
${IMAGE_ID} bash
```
## 1.3. Activate the venv environment
``` bash
source /opt/venv/bin/activate
```
## 1.4. Run swift examples
``` bash
cd /workspace/ms-swift
bash examples/train/full/train.sh
```
# 2. Build swift 4.1 image from a Metax release image
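This path is a faster build that starts from a pre-built Metax release image (`Dockerfile.with_metax_image`) and reuses the Python, Torch and Transformer Engine installation that the image already contains. Note that both build scripts tag their result as `swift:v4.1.0`, so running one after the other moves the tag to the most recent build.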
## 2.1. Build
``` bash
bash build_from_metax_image.sh
```
## 2.2. Run a container
``` bash
docker run -d -it --net=host --uts=host --ipc=host --privileged=true --group-add video \
--shm-size 100gb --ulimit memlock=-1 \
--security-opt seccomp=unconfined --security-opt apparmor=unconfined \
--device=/dev/dri --device=/dev/mxcd \
--name base_image \
${IMAGE_ID} bash
```
## 2.3. Run swift examples
``` bash
cd /workspace/ms-swift
bash examples/train/full/train.sh
```