From 467a2206e30b017e40c99edc5098f84d7a8d1119 Mon Sep 17 00:00:00 2001 From: vx120 <57470515+vx120@users.noreply.github.com> Date: Tue, 10 Mar 2026 16:11:49 +0800 Subject: [PATCH 1/2] add metax dockerfile and its requirements for ms-swift (#1643) --- docker/Metax/4.0/Dockerfile.metax | 214 ++++++++++++++++++ docker/Metax/4.0/Dockerfile.with_metax_image | 72 ++++++ docker/Metax/4.0/build.sh | 13 ++ docker/Metax/4.0/build_from_metax_image.sh | 11 + docker/Metax/4.0/override.txt | 3 + docker/Metax/4.0/requirements_extra.txt | 9 + .../Metax/4.0/swift_building_instructions.md | 43 ++++ 7 files changed, 365 insertions(+) create mode 100644 docker/Metax/4.0/Dockerfile.metax create mode 100644 docker/Metax/4.0/Dockerfile.with_metax_image create mode 100644 docker/Metax/4.0/build.sh create mode 100644 docker/Metax/4.0/build_from_metax_image.sh create mode 100644 docker/Metax/4.0/override.txt create mode 100644 docker/Metax/4.0/requirements_extra.txt create mode 100644 docker/Metax/4.0/swift_building_instructions.md diff --git a/docker/Metax/4.0/Dockerfile.metax b/docker/Metax/4.0/Dockerfile.metax new file mode 100644 index 00000000..5203496f --- /dev/null +++ b/docker/Metax/4.0/Dockerfile.metax @@ -0,0 +1,214 @@ +ARG BUILD_BASE_IMAGE=registry.access.redhat.com/ubi9/ubi:9.6 +ARG PYTHON_VERSION=3.12 +ARG UV_EXTRA_INDEX_URL=https://repos.metax-tech.com/r/maca-pypi/simple +ARG UV_TRUSTED_HOST=repos.metax-tech.com + +# may need passing a particular vllm version during build +ARG VLLM_VERSION +ARG MACA_VERSION +ARG CU_BRIDGE_VERSION=${MACA_VERSION} + +#################### BASE BUILD IMAGE #################### +FROM ${BUILD_BASE_IMAGE} AS base +ARG UV_TRUSTED_HOST +# maca environment variables +ENV MACA_PATH=/opt/maca +ENV MACA_CLANG_PATH=/opt/maca/mxgpu_llvm/bin +ENV CUCC_PATH="${MACA_PATH}/tools/cu-bridge" +ENV CUDA_PATH=/root/cu-bridge/CUDA_DIR +ENV CUCC_CMAKE_ENTRY=2 +ENV PATH="/opt/venv/bin:/root/.local/bin:$PATH" +ENV 
PATH=/opt/mxdriver/bin:${MACA_PATH}/bin:${MACA_PATH}/mxgpu_llvm/bin:${MACA_PATH}/tools/cu-bridge/tools:${MACA_PATH}/tools/cu-bridge/bin:${PATH} +ENV LD_LIBRARY_PATH=/opt/mxdriver/lib:${MACA_PATH}/lib:${MACA_PATH}/mxgpu_llvm/lib:${MACA_PATH}/ompi/lib:${MACA_PATH}/ucx/lib:${LD_LIBRARY_PATH} + +# uv environment variables +ENV VIRTUAL_ENV=/opt/venv +ENV UV_INDEX_STRATEGY="unsafe-best-match" +ENV UV_HTTP_TIMEOUT=6000 +ENV UV_LINK_MODE=copy +ARG UV_EXTRA_INDEX_URL +ENV UV_EXTRA_INDEX_URL=${UV_EXTRA_INDEX_URL} +ARG UV_INDEX_URL +ENV UV_INDEX_URL=http://mirrors.aliyun.com/pypi/simple +ENV UV_TRUSTED_INDEX_HOST=mirrors.aliyun.com +ENV UV_OVERRIDE=/workspace/override.txt +# vllm compile option +ENV VLLM_INSTALL_PUNICA_KERNELS=1 + +# AI version arguments +ARG PYTHON_VERSION +ARG VLLM_VERSION +ARG VLLM_METAX_VERSION +ARG MACA_VERSION +ARG MEGATRON_VERSION +ARG SWIFT_VERSION +ARG CU_BRIDGE_VERSION +ARG TE_VERSION +# ARG UV_INDEX_URL + + +WORKDIR /workspace +COPY override.txt /workspace/override.txt +COPY requirements_extra.txt /workspace/requirements_extra.txt + +RUN printf "[metax-centos]\n\ +name=Maca Driver Yum Repository\n\ +baseurl=https://repos.metax-tech.com/r/metax-driver-centos-$(uname -m)/\n\ +enabled=1\n\ +gpgcheck=0" > /etc/yum.repos.d/metax-driver-centos.repo + +RUN cat /etc/yum.repos.d/ubi.repo + +RUN dnf -y install python3-pip hostname && \ + dnf clean all + +RUN python3 -m pip install uv -i $UV_INDEX_URL --trusted-host ${UV_TRUSTED_INDEX_HOST} && \ + uv venv /opt/venv --python=${PYTHON_VERSION} + +RUN python3 --version && \ + uv self version + +RUN yum makecache && yum install -y \ + unzip vim git openblas-devel make cmake \ + ninja-build gcc g++ procps-ng \ + libibverbs librdmacm libibumad \ + && yum clean all + + +RUN git clone --branch ${SWIFT_VERSION} https://github.com/modelscope/ms-swift.git +RUN git clone --branch ${VLLM_METAX_VERSION} https://github.com/MetaX-MACA/vLLM-metax.git +RUN git clone --branch ${VLLM_VERSION} 
https://github.com/vllm-project/vllm.git +RUN git clone --branch ${MEGATRON_VERSION} https://github.com/NVIDIA/Megatron-LM.git + +# ====================== +# Step 1: install MACA SDK Metax-Driver and cu-bridge +# ====================== +RUN printf "[metax-centos]\n\ +name=Maca Driver Yum Repository\n\ +baseurl=https://repos.metax-tech.com/r/metax-driver-centos-$(uname -m)/\n\ +enabled=1\n\ +gpgcheck=0" > /etc/yum.repos.d/metax-driver-centos.repo + +# would install the newest 3.1.0.x release +# Metax-Driver mainly contains vbios and kmd file, which are not needed in a container. +# Here we want to get the mx-smi management tool. +# kernel version mismatch errors are ignored +RUN yum makecache && \ + yum install -y metax-driver-${MACA_VERSION}* mxgvm && \ + yum clean all && rm -rf /var/cache/yum /tmp/* + +# Installing MACA SDK +RUN printf "[maca-sdk]\n\ +name=Maca Sdk Yum Repository\n\ +baseurl=https://repos.metax-tech.com/r/maca-sdk-rpm-$(uname -m)/\n\ +enabled=1\n\ +gpgcheck=0" > /etc/yum.repos.d/maca-sdk-rpm.repo + +RUN yum makecache && \ + yum install -y maca_sdk-${MACA_VERSION}* && \ + yum clean all && rm -rf /var/cache/yum /tmp/* + +RUN cd /tmp/ && \ + export MACA_PATH=/opt/maca && \ + curl -o ${CU_BRIDGE_VERSION}.zip -LsSf https://gitee.com/metax-maca/cu-bridge/repository/archive/${CU_BRIDGE_VERSION}.zip && \ + unzip ${CU_BRIDGE_VERSION}.zip && \ + mv cu-bridge-${CU_BRIDGE_VERSION} cu-bridge && \ + chmod 755 cu-bridge -Rf && \ + cd cu-bridge && \ + mkdir build && cd ./build && \ + cmake -DCMAKE_INSTALL_PREFIX=/opt/maca/tools/cu-bridge ../ && \ + make && make install + +# ====================== +# Step 2: install Metax requirements +# ====================== + +RUN rpm -e --nodeps \ + mcflashattn_${MACA_VERSION} \ + mcflashinfer_${MACA_VERSION} \ + mxreport-${MACA_VERSION} \ + mccltests-${MACA_VERSION} && \ + find /opt/maca/ -type f -name "*.a" -delete && \ + yum clean all && rm -rf /var/cache/yum /tmp/* + + +RUN echo $PATH +ARG 
UV_EXTRA_INDEX_URL=https://repos.metax-tech.com/r/maca-pypi/simple +ARG UV_TRUSTED_HOST=repos.metax-tech.com +RUN cd vLLM-metax \ + && uv pip install -r requirements/build.txt \ + && uv pip install build + + + +RUN yum makecache && yum install -y \ + gcc \ + binutils \ + procps-ng \ + libibverbs \ + librdmacm \ + libibumad \ + openblas \ + numactl-libs \ + && yum clean all && rm -rf /var/cache/yum /tmp/* + + +# ====================== +# Step 3: install Metax python requirements +# ====================== + + +RUN cd vLLM-metax \ + && UV_HTTP_TIMEOUT=960 uv pip install -r requirements/maca.txt --trusted-host ${UV_TRUSTED_HOST} + +# ====================== +# Step 4: Build vLLM with empty device (to avoid CUDA dependency) +# ====================== +RUN cd vllm \ + && python3 use_existing_torch.py \ + && uv pip install -r requirements/build.txt + +RUN cd vllm \ + && VLLM_TARGET_DEVICE=empty uv pip install -v . --no-build-isolation + +# ====================== +# Step 5: Build vLLM-metax +# ====================== +RUN uv pip list +RUN cd vLLM-metax \ + && uv pip install -r requirements/build.txt \ + && python3 -m build -w -n\ + && uv pip install dist/*.whl + + +# ====================== +# Step 6: Patch and install Megatron-LM (cloned earlier) +# ====================== +RUN sed -i 's/nvcc/cucc/g' /workspace/Megatron-LM/megatron/legacy/fused_kernels/__init__.py +RUN cd Megatron-LM \ + && uv pip install . + +# ====================== +# Step 7: install TE +# ====================== + +RUN uv pip install transformer_engine==${TE_VERSION} -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host ${UV_TRUSTED_HOST} +# ====================== +# Step 8: Patch and install ms-swift (cloned earlier) +# ====================== + +RUN sed -i '0,/^\(from \|import \)/{s//import vllm_metax.patch\n&/}' ms-swift/swift/__init__.py \ + && cd ms-swift \ + && uv pip install -r requirements.txt \ + && uv pip install . 
+ +# ====================== +# Step 9: other requirements +# ====================== +RUN uv pip install deepspeed -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host ${UV_TRUSTED_HOST} +RUN uv pip install pip +RUN uv pip install -r requirements_extra.txt +RUN ln -sf ${CUDA_PATH}/bin/nvcc ${CUDA_PATH}/bin/cucc +# Fix(hank): don't know why vllm installation also brings in flashinfer-python, remove it here. +RUN uv pip uninstall flashinfer-python cupy-cuda12x +#################### FINAL IMAGE #################### diff --git a/docker/Metax/4.0/Dockerfile.with_metax_image b/docker/Metax/4.0/Dockerfile.with_metax_image new file mode 100644 index 00000000..fb39bb36 --- /dev/null +++ b/docker/Metax/4.0/Dockerfile.with_metax_image @@ -0,0 +1,72 @@ +ARG BUILD_BASE_IMAGE=mx-devops-acr-cn-shanghai.cr.volces.com/opensource/public-ai-release/maca/ms-swift:3.10.3-maca.ai3.3.0.16-torch2.6-py310-ubuntu22.04-amd64 +ARG PYTHON_VERSION=3.10 + +FROM ${BUILD_BASE_IMAGE} AS base + + +# may need passing a particular vllm version during build +ARG VLLM_VERSION +ARG VLLM_METAX_VERSION +ARG MEGATRON_VERSION +ARG SWIFT_VERSION + +# --- Set environment variables (hard-coded ENV values; no matching ARG is declared, so --build-arg does not override them) --- +ENV MACA_PATH=/opt/maca +ENV CUCC_CMAKE_ENTRY=2 +ENV CUDA_PATH=/root/cu-bridge/CUDA_DIR +ENV CUCC_PATH=${MACA_PATH}/tools/cu-bridge +ENV PATH=/opt/conda/bin:/opt/conda/condabin:${CUDA_PATH}/bin:${CUCC_PATH}/tools:${CUCC_PATH}/bin:${MACA_PATH}/bin:${PATH} +ENV LD_LIBRARY_PATH=${CUDA_PATH}/lib64:${MACA_PATH}/lib:${MACA_PATH}/mxgpu_llvm/lib:${LD_LIBRARY_PATH} +RUN echo $PATH +RUN apt install -y git +# Check for and initialize cu-bridge if it is missing +RUN if [ ! 
-d /root/cu-bridge ]; then \ + ${MACA_PATH}/tools/cu-bridge/tools/pre_make; \ + fi + +# ====================== +# Step 1: Clone and build original vLLM (for torch setup) +# ====================== +WORKDIR /workspace +RUN git clone --branch ${VLLM_VERSION} https://github.com/vllm-project/vllm.git \ + && cd vllm \ + && python3 use_existing_torch.py \ + && pip install -r requirements/build.txt + +# ====================== +# Step 2: Build vLLM with empty device (to avoid CUDA dependency) +# ====================== +RUN cd vllm \ + && VLLM_TARGET_DEVICE=empty pip install -v . --no-build-isolation + + +# ====================== +# Step 3: Build vLLM-metax +# ====================== +RUN git clone --branch ${VLLM_METAX_VERSION} https://github.com/MetaX-MACA/vLLM-metax.git \ + && cd vLLM-metax \ + && python3 use_existing_metax.py \ + && pip install -r requirements/build.txt \ + && python3 -m build -w -n \ + && pip install dist/*.whl + +# ====================== +# Step 4: Clone and patch Megatron-LM +# ====================== +RUN git clone --branch ${MEGATRON_VERSION} https://github.com/NVIDIA/Megatron-LM.git \ + && sed -i 's/nvcc/cucc/g' /workspace/Megatron-LM/megatron/legacy/fused_kernels/__init__.py + +# ====================== +# Step 5: Clone, patch and install ms-swift +# ====================== +RUN rm -rf /workspace/ms-swift + +RUN git clone --branch ${SWIFT_VERSION} https://github.com/modelscope/ms-swift.git \ + && sed -i '0,/^\(from \|import \)/{s//import vllm_metax.patch\n&/}' ms-swift/swift/__init__.py \ + && cd ms-swift \ + && pip install -r requirements.txt \ + && pip install . 
+ + +# 默认命令 +CMD ["bash"] diff --git a/docker/Metax/4.0/build.sh b/docker/Metax/4.0/build.sh new file mode 100644 index 00000000..c009c218 --- /dev/null +++ b/docker/Metax/4.0/build.sh @@ -0,0 +1,13 @@ +docker build \ + --network host \ + -f Dockerfile.metax \ + -t swift:v4.0.0 \ + --build-arg VLLM_VERSION=v0.11.2 \ + --build-arg VLLM_METAX_VERSION=v0.11.2 \ + --build-arg MACA_VERSION=3.3.0 \ + --build-arg MEGATRON_VERSION=core_v0.15.0 \ + --build-arg SWIFT_VERSION=v4.0.0 \ + --build-arg TE_VERSION=2.8 \ + --build-arg CU_BRIDGE_VERSION=3.3.0 \ + --no-cache \ + . diff --git a/docker/Metax/4.0/build_from_metax_image.sh b/docker/Metax/4.0/build_from_metax_image.sh new file mode 100644 index 00000000..974fc2c4 --- /dev/null +++ b/docker/Metax/4.0/build_from_metax_image.sh @@ -0,0 +1,11 @@ +docker build \ + --network host \ + -f Dockerfile.with_metax_image \ + -t swift:v4.0.0 \ + --build-arg VLLM_VERSION=v0.11.2 \ + --build-arg VLLM_METAX_VERSION=v0.11.2 \ + --build-arg MEGATRON_VERSION=core_v0.15.0 \ + --build-arg SWIFT_VERSION=v4.0.0 \ + --progress=plain \ + --no-cache \ + . diff --git a/docker/Metax/4.0/override.txt b/docker/Metax/4.0/override.txt new file mode 100644 index 00000000..0377780f --- /dev/null +++ b/docker/Metax/4.0/override.txt @@ -0,0 +1,3 @@ +setuptools>=77.0.3,<80 +flash-linear-attention +mcoplib diff --git a/docker/Metax/4.0/requirements_extra.txt b/docker/Metax/4.0/requirements_extra.txt new file mode 100644 index 00000000..d4303247 --- /dev/null +++ b/docker/Metax/4.0/requirements_extra.txt @@ -0,0 +1,9 @@ +diffusers==0.35.2 +evalscope +librosa +mpi4py +ms-opencompass +optimum==1.27.0 +pytorchvideo +qwen_vl_utils==0.0.14 +timm diff --git a/docker/Metax/4.0/swift_building_instructions.md b/docker/Metax/4.0/swift_building_instructions.md new file mode 100644 index 00000000..433bd9d7 --- /dev/null +++ b/docker/Metax/4.0/swift_building_instructions.md @@ -0,0 +1,43 @@ +# 1. 
build swift image from a ubi9 docker image + Full build from a minimal base image, using a venv virtual environment +## 1.1. build + ``` bash + bash build.sh + ``` +## 1.2. run a container + ``` bash + docker run -d -it --net=host --uts=host --ipc=host --privileged=true --group-add video \ + --shm-size 100gb --ulimit memlock=-1 \ + --security-opt seccomp=unconfined --security-opt apparmor=unconfined \ + --device=/dev/dri --device=/dev/mxcd \ + --name base_image \ + ${IMAGE_ID} bash + ``` +## 1.3. activate venv environment + Here we use venv rather than conda + ``` bash + source /opt/venv/bin/activate + ``` +## 1.4. run swift examples + cd /workspace/ms-swift + bash example/train/full/train.sh + +# 2. build swift image from metax release image + Fast build based on the pre-built Metax release image, using a conda virtual environment +## 2.1. build + ``` bash + bash build_from_metax_image.sh + ``` +## 2.2. run a container + ``` bash + docker run -d -it --net=host --uts=host --ipc=host --privileged=true --group-add video \ + --shm-size 100gb --ulimit memlock=-1 \ + --security-opt seccomp=unconfined --security-opt apparmor=unconfined \ + --device=/dev/dri --device=/dev/mxcd \ + --name base_image \ + ${IMAGE_ID} bash + ``` +## 2.3. 
run swift examples + cd /workspace/ms-swift + bash example/train/full/train.sh + ``` From f4dbe65110830518a336eba106ed0d581cc37dda Mon Sep 17 00:00:00 2001 From: "Xingjun.Wang" Date: Fri, 13 Mar 2026 10:36:11 +0800 Subject: [PATCH 2/2] [Fix] fix dataset util (#1645) --- .../msdatasets/utils/hf_datasets_util.py | 67 ++++++++----------- tests/hub/test_download_dataset_file.py | 3 + 2 files changed, 31 insertions(+), 39 deletions(-) diff --git a/modelscope/msdatasets/utils/hf_datasets_util.py b/modelscope/msdatasets/utils/hf_datasets_util.py index ac02443d..bde8965b 100644 --- a/modelscope/msdatasets/utils/hf_datasets_util.py +++ b/modelscope/msdatasets/utils/hf_datasets_util.py @@ -130,40 +130,7 @@ ExpandDatasetProperty_T = Literal[ # Patch datasets features -# In datasets 4.0+, the List type is the native feature type; -# in datasets <4.0, Sequence (a dataclass) serves that role. -_ListBase = DatasetList if DatasetList is not None else SequenceHf - - -@dataclass(repr=False) -class ListMs(_ListBase): - """Feature type for large list data composed of child feature data type. - - It is backed by `pyarrow.ListType`, which uses 32-bit offsets or a fixed length. - - Args: - feature ([`FeatureType`]): - Child feature data type of each item within the large list. - length (optional `int`, default to -1): - Length of the list if it is fixed. - Defaults to -1 which means an arbitrary length. 
- """ - - feature: Any - length: int = -1 - id: Optional[str] = field(default=None, repr=False) - # Automatically constructed - pa_type: ClassVar[Any] = None - _type: str = field(default='List', init=False, repr=False) - - def __repr__(self): - if self.length != -1: - return f'{type(self).__name__}({self.feature}, length={self.length})' - else: - return f'{type(self).__name__}({self.feature})' - - -_FEATURE_TYPES['List'] = ListMs +_NativeList = DatasetList if DatasetList is not None else SequenceHf def generate_from_dict_ms(obj: Any): @@ -202,9 +169,10 @@ def generate_from_dict_ms(obj: Any): if class_type == LargeList: feature = obj.pop('feature') return LargeList(generate_from_dict_ms(feature), **obj) - if class_type == ListMs: + # Handle the native List type (datasets 4.0+) as well as Sequence-based + if _NativeList is not None and (class_type is _NativeList or issubclass(class_type, _NativeList)): feature = obj.pop('feature') - return ListMs(generate_from_dict_ms(feature), **obj) + return _NativeList(generate_from_dict_ms(feature), **obj) field_names = {f.name for f in fields(class_type)} return class_type(**{k: v for k, v in obj.items() if k in field_names}) @@ -213,9 +181,30 @@ def generate_from_dict_ms(obj: Any): def _download_ms(self, url_or_filename: str, download_config: DownloadConfig) -> str: url_or_filename = str(url_or_filename) if url_or_filename.startswith('hf://'): - # hf:// URLs are handled natively by cached_path via HfApi.hf_hub_download, - # which uses config.HF_ENDPOINT (already set to ModelScope endpoint). - pass + # hf:// URLs (e.g. hf://datasets/{owner}/{name}@{revision}/{file_path}) + hf_path = url_or_filename[len('hf://'):] + # Strip leading resource type prefix (e.g. 
"datasets/") + for _prefix in ('datasets/', 'models/'): + if hf_path.startswith(_prefix): + hf_path = hf_path[len(_prefix):] + break + # Extract revision and file_path from "{owner}/{name}@{revision}/{file_path}" + if '@' in hf_path: + at_idx = hf_path.index('@') + after_at = hf_path[at_idx + 1:] + slash_idx = after_at.find('/') + if slash_idx == -1: + revision = after_at + file_path = '' + else: + revision = after_at[:slash_idx] + file_path = after_at[slash_idx + 1:] + else: + parts = hf_path.split('/', 2) + revision = DEFAULT_DATASET_REVISION + file_path = parts[2] if len(parts) > 2 else '' + params = urlencode({'Source': 'SDK', 'Revision': revision, 'FilePath': file_path}) + url_or_filename = self._base_path + params elif is_relative_path(url_or_filename): revision = DEFAULT_DATASET_REVISION # Note: make sure the FilePath is the last param diff --git a/tests/hub/test_download_dataset_file.py b/tests/hub/test_download_dataset_file.py index 8e8712f5..3f8c3779 100644 --- a/tests/hub/test_download_dataset_file.py +++ b/tests/hub/test_download_dataset_file.py @@ -7,6 +7,7 @@ import unittest from modelscope.hub.file_download import dataset_file_download from modelscope.hub.snapshot_download import dataset_snapshot_download +from modelscope.utils.test_utils import test_level class DownloadDatasetTest(unittest.TestCase): @@ -14,6 +15,7 @@ class DownloadDatasetTest(unittest.TestCase): def setUp(self): pass + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_dataset_file_download(self): dataset_id = 'citest/test_dataset_download' file_path = 'open_qa.jsonl' @@ -67,6 +69,7 @@ class DownloadDatasetTest(unittest.TestCase): file_modify_time2 = os.path.getmtime(cache_file_path) assert file_modify_time == file_modify_time2 + @unittest.skipUnless(test_level() >= 2, 'skip test in current test level') def test_dataset_snapshot_download(self): dataset_id = 'citest/test_dataset_download' file_path = 'open_qa.jsonl'