mirror of
https://github.com/modelscope/modelscope.git
synced 2026-05-18 05:05:00 +02:00
Merge commit 'f4dbe65110830518a336eba106ed0d581cc37dda' into release/1.35
This commit is contained in:
214
docker/Metax/4.0/Dockerfile.metax
Normal file
214
docker/Metax/4.0/Dockerfile.metax
Normal file
@@ -0,0 +1,214 @@
|
||||
# ---- Global build arguments ----
# Declared before FROM, so a stage must re-declare an ARG before using it.

# Base image for the build stage.
ARG BUILD_BASE_IMAGE=registry.access.redhat.com/ubi9/ubi:9.6
# Python version requested when the uv virtual environment is created.
ARG PYTHON_VERSION=3.12
# MetaX package index and the host passed to --trusted-host for it.
ARG UV_EXTRA_INDEX_URL=https://repos.metax-tech.com/r/maca-pypi/simple
ARG UV_TRUSTED_HOST=repos.metax-tech.com

# A particular vLLM version may need to be passed at build time.
ARG VLLM_VERSION
ARG MACA_VERSION
# cu-bridge follows the MACA version unless it is overridden explicitly.
ARG CU_BRIDGE_VERSION=${MACA_VERSION}
|
||||
|
||||
#################### BASE BUILD IMAGE ####################
FROM ${BUILD_BASE_IMAGE} AS base

# Re-declare the global arg so it is visible inside this stage.
ARG UV_TRUSTED_HOST

# MACA toolchain locations. These stay separate ENV instructions because
# later values expand variables set by earlier instructions.
ENV MACA_PATH=/opt/maca
ENV MACA_CLANG_PATH=/opt/maca/mxgpu_llvm/bin
ENV CUCC_PATH="${MACA_PATH}/tools/cu-bridge"
ENV CUDA_PATH=/root/cu-bridge/CUDA_DIR
ENV CUCC_CMAKE_ENTRY=2

# Search order: driver and MACA tools, then the uv venv, then the inherited PATH.
ENV PATH=/opt/mxdriver/bin:${MACA_PATH}/bin:${MACA_PATH}/mxgpu_llvm/bin:${MACA_PATH}/tools/cu-bridge/tools:${MACA_PATH}/tools/cu-bridge/bin:/opt/venv/bin:/root/.local/bin:${PATH}

# Append the inherited LD_LIBRARY_PATH only when it is set and non-empty.
# Unconditionally appending ":${LD_LIBRARY_PATH}" leaves an empty trailing
# entry when the base image does not define it, and the dynamic loader treats
# an empty entry as the current working directory.
ENV LD_LIBRARY_PATH=/opt/mxdriver/lib:${MACA_PATH}/lib:${MACA_PATH}/mxgpu_llvm/lib:${MACA_PATH}/ompi/lib:${MACA_PATH}/ucx/lib${LD_LIBRARY_PATH:+:${LD_LIBRARY_PATH}}
|
||||
|
||||
# uv environment variables (literal values, independent of each other)
ENV VIRTUAL_ENV=/opt/venv \
    UV_INDEX_STRATEGY="unsafe-best-match" \
    UV_HTTP_TIMEOUT=6000 \
    UV_LINK_MODE=copy \
    UV_OVERRIDE=/workspace/override.txt

# Extra (MetaX) index, taken from the global build arg of the same name.
ARG UV_EXTRA_INDEX_URL
ENV UV_EXTRA_INDEX_URL=${UV_EXTRA_INDEX_URL}

# Primary index. The build arg is now honoured; previously it was declared but
# the ENV was hard-coded, so --build-arg UV_INDEX_URL=... had no effect.
# An unset or empty arg falls back to the original mirror.
ARG UV_INDEX_URL
ENV UV_INDEX_URL=${UV_INDEX_URL:-http://mirrors.aliyun.com/pypi/simple}

# Host passed to pip's --trusted-host when bootstrapping uv; overridable so it
# can follow a custom UV_INDEX_URL. Defaults to the original value.
ARG UV_TRUSTED_INDEX_HOST
ENV UV_TRUSTED_INDEX_HOST=${UV_TRUSTED_INDEX_HOST:-mirrors.aliyun.com}
|
||||
# vLLM compile option
ENV VLLM_INSTALL_PUNICA_KERNELS=1

# Version selectors, re-declared so they are visible inside this stage.
# Python / accelerator stack:
ARG PYTHON_VERSION
ARG MACA_VERSION
ARG CU_BRIDGE_VERSION
ARG TE_VERSION
# Git branches or tags of the projects cloned below:
ARG VLLM_VERSION
ARG VLLM_METAX_VERSION
ARG MEGATRON_VERSION
ARG SWIFT_VERSION

# All sources are cloned and built under /workspace.
WORKDIR /workspace

# uv override file (referenced by UV_OVERRIDE) and the extra requirements list.
COPY override.txt requirements_extra.txt /workspace/
|
||||
|
||||
# Register the MetaX driver yum repository for the architecture reported by
# `uname -m`. Note that the repository is configured with gpgcheck disabled.
RUN printf '[metax-centos]\nname=Maca Driver Yum Repository\nbaseurl=https://repos.metax-tech.com/r/metax-driver-centos-%s/\nenabled=1\ngpgcheck=0' "$(uname -m)" \
        > /etc/yum.repos.d/metax-driver-centos.repo

# Print the base image's UBI repository definition to the build log.
RUN cat /etc/yum.repos.d/ubi.repo
|
||||
|
||||
# System pip (used only to bootstrap uv) and the hostname utility.
RUN dnf -y install hostname python3-pip \
    && dnf clean all

# Install uv with the system pip, then create the virtual environment at
# /opt/venv with the requested Python version.
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN python3 -m pip install --no-cache-dir uv \
        -i "${UV_INDEX_URL}" \
        --trusted-host "${UV_TRUSTED_INDEX_HOST}" \
    && uv venv /opt/venv --python="${PYTHON_VERSION}"

# Log the interpreter and uv versions. /opt/venv/bin is first on PATH, so
# python3 resolves to the venv interpreter once the venv exists.
RUN python3 --version \
    && uv self version
|
||||
|
||||
# Build toolchain, editor/VCS utilities and RDMA user-space libraries.
RUN yum makecache \
    && yum install -y \
        cmake \
        g++ \
        gcc \
        git \
        libibumad \
        libibverbs \
        librdmacm \
        make \
        ninja-build \
        openblas-devel \
        procps-ng \
        unzip \
        vim \
    && yum clean all
|
||||
|
||||
|
||||
# Fetch every source tree into /workspace at the branch/tag chosen by its
# build arg. Later steps enter these directories by relative path.
RUN git clone --branch ${SWIFT_VERSION} https://github.com/modelscope/ms-swift.git \
    && git clone --branch ${VLLM_METAX_VERSION} https://github.com/MetaX-MACA/vLLM-metax.git \
    && git clone --branch ${VLLM_VERSION} https://github.com/vllm-project/vllm.git \
    && git clone --branch ${MEGATRON_VERSION} https://github.com/NVIDIA/Megatron-LM.git
|
||||
|
||||
# ======================
# Step 1: install MACA SDK Metax-Driver and cu-bridge
# ======================
# The [metax-centos] repository is already registered earlier in this stage
# (/etc/yum.repos.d/metax-driver-centos.repo), so it is not written again here.

# Installs the newest metax-driver package matching ${MACA_VERSION}.
# Metax-Driver mainly contains vbios and kmd files, which are not needed in a
# container; it is installed here to get the mx-smi management tool.
# NOTE(review): the original note says kernel version mismatch errors are
# ignored; nothing in this command suppresses errors explicitly.
# The package spec is quoted so the shell passes the glob to yum untouched
# instead of trying to expand it against files in /workspace.
RUN yum makecache \
    && yum install -y "metax-driver-${MACA_VERSION}*" mxgvm \
    && yum clean all \
    && rm -rf /var/cache/dnf /var/cache/yum /tmp/*
|
||||
|
||||
# Installing MACA SDK
# Register the MACA SDK yum repository for the architecture reported by
# `uname -m` (gpgcheck is disabled for this repository).
RUN printf '[maca-sdk]\nname=Maca Sdk Yum Repository\nbaseurl=https://repos.metax-tech.com/r/maca-sdk-rpm-%s/\nenabled=1\ngpgcheck=0' "$(uname -m)" \
        > /etc/yum.repos.d/maca-sdk-rpm.repo

# Install the SDK build matching ${MACA_VERSION} and drop package caches in
# the same layer. The spec is quoted so the glob reaches yum unexpanded.
RUN yum makecache \
    && yum install -y "maca_sdk-${MACA_VERSION}*" \
    && yum clean all \
    && rm -rf /var/cache/dnf /var/cache/yum /tmp/*
|
||||
|
||||
# Download, build and install cu-bridge into /opt/maca/tools/cu-bridge.
# The archive and source tree are removed at the end of this same RUN: files
# deleted in a later layer would still occupy space in this one.
RUN cd /tmp/ \
    && export MACA_PATH=/opt/maca \
    && curl -LsSf -o "${CU_BRIDGE_VERSION}.zip" \
        "https://gitee.com/metax-maca/cu-bridge/repository/archive/${CU_BRIDGE_VERSION}.zip" \
    && unzip "${CU_BRIDGE_VERSION}.zip" \
    && mv "cu-bridge-${CU_BRIDGE_VERSION}" cu-bridge \
    && chmod 755 cu-bridge -Rf \
    && cd cu-bridge \
    && mkdir build \
    && cd ./build \
    && cmake -DCMAKE_INSTALL_PREFIX=/opt/maca/tools/cu-bridge ../ \
    && make \
    && make install \
    && cd /tmp \
    && rm -rf /tmp/cu-bridge "/tmp/${CU_BRIDGE_VERSION}.zip"
|
||||
|
||||
# ======================
# Step 2: install Metax requirements
# ======================

# Remove four MACA rpm packages without dependency checks, delete every static
# library (*.a) under /opt/maca/, then clear package caches and /tmp.
RUN rpm -e --nodeps \
        mcflashattn_${MACA_VERSION} \
        mcflashinfer_${MACA_VERSION} \
        mxreport-${MACA_VERSION} \
        mccltests-${MACA_VERSION} \
    && find /opt/maca/ -type f -name "*.a" -delete \
    && yum clean all \
    && rm -rf /var/cache/yum /tmp/*
|
||||
|
||||
|
||||
# Log the effective PATH for build debugging.
RUN echo $PATH

# Give the MetaX index args explicit defaults inside this stage.
ARG UV_EXTRA_INDEX_URL=https://repos.metax-tech.com/r/maca-pypi/simple
ARG UV_TRUSTED_HOST=repos.metax-tech.com

# Install vLLM-metax build requirements and the `build` package used later to
# produce its wheel.
RUN cd vLLM-metax \
    && uv pip install -r requirements/build.txt \
    && uv pip install build
|
||||
|
||||
|
||||
|
||||
# Compiler/binutils, process tools, RDMA user-space libraries, OpenBLAS and
# NUMA libraries; caches and /tmp are cleared in the same layer.
RUN yum makecache \
    && yum install -y \
        gcc \
        binutils \
        procps-ng \
        libibverbs \
        librdmacm \
        libibumad \
        openblas \
        numactl-libs \
    && yum clean all \
    && rm -rf /var/cache/yum /tmp/*
|
||||
|
||||
|
||||
# ======================
# Step 3: install Metax python requirements
# ======================

# Install vLLM-metax's MACA requirement set. UV_HTTP_TIMEOUT is lowered to 960
# for this command only; the MetaX host is passed as a trusted host.
RUN cd vLLM-metax \
    && UV_HTTP_TIMEOUT=960 uv pip install \
        -r requirements/maca.txt \
        --trusted-host ${UV_TRUSTED_HOST}
|
||||
|
||||
# ======================
# Step 4: Build vLLM with empty device (to avoid CUDA dependency)
# ======================

# Run vLLM's use_existing_torch.py helper, then install its build requirements.
RUN cd vllm \
    && python3 use_existing_torch.py \
    && uv pip install -r requirements/build.txt

# Install vLLM from source with VLLM_TARGET_DEVICE=empty and without build
# isolation, so the packages already in the venv are used for the build.
RUN cd vllm \
    && VLLM_TARGET_DEVICE=empty uv pip install --no-build-isolation -v .
|
||||
|
||||
# ======================
# Step 5: Build vLLM-metax
# ======================

# Log the packages installed so far.
RUN uv pip list

# Build the vLLM-metax wheel without build isolation and install the result.
RUN cd vLLM-metax \
    && uv pip install -r requirements/build.txt \
    && python3 -m build --wheel --no-isolation \
    && uv pip install dist/*.whl
|
||||
|
||||
|
||||
# ======================
# Step 6: Patch and install Megatron-LM (cloned earlier in this stage)
# ======================

# Replace every "nvcc" with "cucc" in the legacy fused-kernels loader.
RUN sed -i 's/nvcc/cucc/g' /workspace/Megatron-LM/megatron/legacy/fused_kernels/__init__.py

# Install Megatron-LM from the patched source tree.
RUN cd Megatron-LM \
    && uv pip install .
|
||||
|
||||
# ======================
# Step 7: install TE
# ======================

# Install the pinned transformer_engine release from the MetaX package index.
RUN uv pip install transformer_engine==${TE_VERSION} \
        -i https://repos.metax-tech.com/r/maca-pypi/simple \
        --trusted-host ${UV_TRUSTED_HOST}
|
||||
# ======================
# Step 8: Patch and install ms-swift (cloned earlier in this stage)
# ======================

# Insert "import vllm_metax.patch" immediately before the first line of
# swift/__init__.py that starts with "from " or "import ", then install
# ms-swift's requirements and the package itself.
RUN sed -i '0,/^\(from \|import \)/{s//import vllm_metax.patch\n&/}' ms-swift/swift/__init__.py \
    && cd ms-swift \
    && uv pip install -r requirements.txt \
    && uv pip install .
|
||||
|
||||
# ======================
# Step 9: other requirements
# ======================

# deepspeed is taken from the MetaX package index.
RUN uv pip install deepspeed \
        -i https://repos.metax-tech.com/r/maca-pypi/simple \
        --trusted-host ${UV_TRUSTED_HOST}

# Make pip itself available inside the uv-managed venv.
RUN uv pip install pip

# Extra packages listed in /workspace/requirements_extra.txt.
RUN uv pip install -r requirements_extra.txt

# Create ${CUDA_PATH}/bin/cucc as a symlink pointing at ${CUDA_PATH}/bin/nvcc.
RUN ln -sf ${CUDA_PATH}/bin/nvcc ${CUDA_PATH}/bin/cucc

# Fix(hank): don't know why vllm installation also brings in flashinfer-python, remove it here.
RUN uv pip uninstall flashinfer-python cupy-cuda12x
|
||||
#################### FINAL IMAGE ####################
|
||||
72
docker/Metax/4.0/Dockerfile.with_metax_image
Normal file
72
docker/Metax/4.0/Dockerfile.with_metax_image
Normal file
@@ -0,0 +1,72 @@
|
||||
# Pre-built MetaX ms-swift release image used as the base.
ARG BUILD_BASE_IMAGE=mx-devops-acr-cn-shanghai.cr.volces.com/opensource/public-ai-release/maca/ms-swift:3.10.3-maca.ai3.3.0.16-torch2.6-py310-ubuntu22.04-amd64
# NOTE(review): PYTHON_VERSION does not appear to be referenced again in this Dockerfile.
ARG PYTHON_VERSION=3.10

FROM ${BUILD_BASE_IMAGE} AS base

# Git branches/tags to build; a particular vLLM version may need to be passed
# at build time.
ARG VLLM_VERSION
ARG VLLM_METAX_VERSION
ARG MEGATRON_VERSION
ARG SWIFT_VERSION
|
||||
|
||||
# --- Environment variables ---
# These are fixed values: no ARG feeds them, so --build-arg does not change them.
ENV MACA_PATH=/opt/maca \
    CUCC_CMAKE_ENTRY=2 \
    CUDA_PATH=/root/cu-bridge/CUDA_DIR

# Expands MACA_PATH, so it must be a separate instruction after the one above.
ENV CUCC_PATH=${MACA_PATH}/tools/cu-bridge

# Prepend conda, cu-bridge and MACA locations to the inherited search paths.
ENV PATH=/opt/conda/bin:/opt/conda/condabin:${CUDA_PATH}/bin:${CUCC_PATH}/tools:${CUCC_PATH}/bin:${MACA_PATH}/bin:${PATH} \
    LD_LIBRARY_PATH=${CUDA_PATH}/lib64:${MACA_PATH}/lib:${MACA_PATH}/mxgpu_llvm/lib:${LD_LIBRARY_PATH}

# Log the effective PATH for build debugging.
RUN echo $PATH
|
||||
# Install git for the clones below.
# - apt-get (not the interactive `apt` front-end) is the stable scripting CLI.
# - The package index is refreshed in the same layer, so the install does not
#   depend on package lists happening to exist in the base image.
# - ca-certificates is listed explicitly because --no-install-recommends would
#   otherwise not pull it in, and the clones use HTTPS.
# - The package lists are removed again to keep the layer small.
RUN apt-get update \
    && apt-get install -y --no-install-recommends ca-certificates git \
    && rm -rf /var/lib/apt/lists/*
|
||||
# Check for and initialize cu-bridge: run pre_make only when /root/cu-bridge
# does not exist yet.
RUN [ -d /root/cu-bridge ] || ${MACA_PATH}/tools/cu-bridge/tools/pre_make
|
||||
|
||||
# ======================
# Step 1: Clone and build original vLLM (for torch setup)
# ======================
WORKDIR /workspace

# Clone vLLM at the requested branch/tag, run its use_existing_torch.py
# helper, then install its build requirements.
RUN git clone --branch ${VLLM_VERSION} https://github.com/vllm-project/vllm.git \
    && cd vllm \
    && python3 use_existing_torch.py \
    && pip install -r requirements/build.txt
|
||||
|
||||
# ======================
# Step 2: Build vLLM with empty device (to avoid CUDA dependency)
# ======================

# Install vLLM from source with VLLM_TARGET_DEVICE=empty and without build
# isolation, so the packages already in the environment are used.
RUN cd vllm \
    && VLLM_TARGET_DEVICE=empty pip install --no-build-isolation -v .
|
||||
|
||||
|
||||
# ======================
# Step 3: Build vLLM-metax
# ======================

# Clone vLLM-metax, run its use_existing_metax.py helper, install the build
# requirements, build a wheel without isolation and install it.
RUN git clone --branch ${VLLM_METAX_VERSION} https://github.com/MetaX-MACA/vLLM-metax.git \
    && cd vLLM-metax \
    && python3 use_existing_metax.py \
    && pip install -r requirements/build.txt \
    && python3 -m build --wheel --no-isolation \
    && pip install dist/*.whl
|
||||
|
||||
# ======================
# Step 4: Clone and patch Megatron-LM
# ======================

# Clone Megatron-LM and replace every "nvcc" with "cucc" in the legacy
# fused-kernels loader. The tree is only cloned and patched here; no install
# command is run for it in this step.
RUN git clone --branch ${MEGATRON_VERSION} https://github.com/NVIDIA/Megatron-LM.git \
    && sed -i 's/nvcc/cucc/g' /workspace/Megatron-LM/megatron/legacy/fused_kernels/__init__.py
|
||||
|
||||
# ======================
# Step 5: Clone, patch and install ms-swift
# ======================

# Remove any existing /workspace/ms-swift, clone the requested version, insert
# "import vllm_metax.patch" before the first "from "/"import " line of
# swift/__init__.py, then install the requirements and the package.
RUN rm -rf /workspace/ms-swift \
    && git clone --branch ${SWIFT_VERSION} https://github.com/modelscope/ms-swift.git \
    && sed -i '0,/^\(from \|import \)/{s//import vllm_metax.patch\n&/}' ms-swift/swift/__init__.py \
    && cd ms-swift \
    && pip install -r requirements.txt \
    && pip install .
|
||||
|
||||
|
||||
# Default command: start bash when the container is run without arguments.
CMD ["bash"]
|
||||
13
docker/Metax/4.0/build.sh
Normal file
13
docker/Metax/4.0/build.sh
Normal file
@@ -0,0 +1,13 @@
|
||||
# Build image swift:v4.0.0 from Dockerfile.metax, using the current directory
# as the build context, host networking and no layer cache.
docker build \
    --file Dockerfile.metax \
    --tag swift:v4.0.0 \
    --network host \
    --no-cache \
    --build-arg MACA_VERSION=3.3.0 \
    --build-arg CU_BRIDGE_VERSION=3.3.0 \
    --build-arg VLLM_VERSION=v0.11.2 \
    --build-arg VLLM_METAX_VERSION=v0.11.2 \
    --build-arg MEGATRON_VERSION=core_v0.15.0 \
    --build-arg SWIFT_VERSION=v4.0.0 \
    --build-arg TE_VERSION=2.8 \
    .
|
||||
11
docker/Metax/4.0/build_from_metax_image.sh
Normal file
11
docker/Metax/4.0/build_from_metax_image.sh
Normal file
@@ -0,0 +1,11 @@
|
||||
# Build image swift:v4.0.0 from Dockerfile.with_metax_image, using the current
# directory as the build context, host networking, plain progress output and
# no layer cache.
docker build \
    --file Dockerfile.with_metax_image \
    --tag swift:v4.0.0 \
    --network host \
    --progress=plain \
    --no-cache \
    --build-arg VLLM_VERSION=v0.11.2 \
    --build-arg VLLM_METAX_VERSION=v0.11.2 \
    --build-arg MEGATRON_VERSION=core_v0.15.0 \
    --build-arg SWIFT_VERSION=v4.0.0 \
    .
|
||||
3
docker/Metax/4.0/override.txt
Normal file
3
docker/Metax/4.0/override.txt
Normal file
@@ -0,0 +1,3 @@
|
||||
setuptools>=77.0.3,<80
|
||||
flash-linear-attention
|
||||
mcoplib
|
||||
9
docker/Metax/4.0/requirements_extra.txt
Normal file
9
docker/Metax/4.0/requirements_extra.txt
Normal file
@@ -0,0 +1,9 @@
|
||||
diffusers==0.35.2
|
||||
evalscope
|
||||
librosa
|
||||
mpi4py
|
||||
ms-opencompass
|
||||
optimum==1.27.0
|
||||
pytorchvideo
|
||||
qwen_vl_utils==0.0.14
|
||||
timm
|
||||
43
docker/Metax/4.0/swift_building_instructions.md
Normal file
43
docker/Metax/4.0/swift_building_instructions.md
Normal file
@@ -0,0 +1,43 @@
|
||||
# 1. build swift image from a ubi9 docker image
|
||||
Full build from a minimal base image, using a venv virtual environment.
|
||||
## 1.1. build
|
||||
``` bash
|
||||
bash build.sh
|
||||
```
|
||||
## 1.2. run a container
|
||||
``` bash
|
||||
docker run -d -it --net=host --uts=host --ipc=host --privileged=true --group-add video \
|
||||
--shm-size 100gb --ulimit memlock=-1 \
|
||||
--security-opt seccomp=unconfined --security-opt apparmor=unconfined \
|
||||
--device=/dev/dri --device=/dev/mxcd \
|
||||
--name base_image \
|
||||
${IMAGE_ID} bash
|
||||
```
|
||||
## 1.3. activate venv environment
|
||||
Here we use venv rather than conda.
|
||||
``` bash
|
||||
source /opt/venv/bin/activate
|
||||
```
|
||||
## 1.4. run swift examples
|
||||
``` bash
cd /workspace/ms-swift
bash example/train/full/train.sh
```
|
||||
|
||||
# 2. build swift image from metax release image
|
||||
Fast build based on the pre-built Metax release image, using a conda virtual environment.
|
||||
## 2.1. build
|
||||
``` bash
|
||||
bash build_from_metax_image.sh
|
||||
```
|
||||
## 2.2. run a container
|
||||
``` bash
|
||||
docker run -d -it --net=host --uts=host --ipc=host --privileged=true --group-add video \
|
||||
--shm-size 100gb --ulimit memlock=-1 \
|
||||
--security-opt seccomp=unconfined --security-opt apparmor=unconfined \
|
||||
--device=/dev/dri --device=/dev/mxcd \
|
||||
--name base_image \
|
||||
${IMAGE_ID} bash
|
||||
```
|
||||
## 2.3. run swift examples
|
||||
``` bash
cd /workspace/ms-swift
bash example/train/full/train.sh
```
|
||||
Reference in New Issue
Block a user