Merge commit 'f4dbe65110830518a336eba106ed0d581cc37dda' into release/1.35

2026-05-18 05:05:00 +02:00 · 2026-03-13 11:10:09 +08:00
parent 94527f0053 f4dbe65110
commit 368f6608cc
9 changed files with 396 additions and 39 deletions
--- a/docker/Metax/4.0/Dockerfile.metax
+++ b/docker/Metax/4.0/Dockerfile.metax
@@ -0,0 +1,214 @@
+ARG BUILD_BASE_IMAGE=registry.access.redhat.com/ubi9/ubi:9.6
+ARG PYTHON_VERSION=3.12
+ARG UV_EXTRA_INDEX_URL=https://repos.metax-tech.com/r/maca-pypi/simple
+ARG UV_TRUSTED_HOST=repos.metax-tech.com
+
+# may need passing a particular vllm version during build
+ARG VLLM_VERSION
+ARG MACA_VERSION
+ARG CU_BRIDGE_VERSION=${MACA_VERSION}
+
+#################### BASE BUILD IMAGE ####################
+FROM ${BUILD_BASE_IMAGE} AS base
+ARG UV_TRUSTED_HOST
+# maca environment variables
+ENV MACA_PATH=/opt/maca
+ENV MACA_CLANG_PATH=/opt/maca/mxgpu_llvm/bin
+ENV CUCC_PATH="${MACA_PATH}/tools/cu-bridge"
+ENV CUDA_PATH=/root/cu-bridge/CUDA_DIR
+ENV CUCC_CMAKE_ENTRY=2
+ENV PATH="/opt/venv/bin:/root/.local/bin:$PATH"
+ENV PATH=/opt/mxdriver/bin:${MACA_PATH}/bin:${MACA_PATH}/mxgpu_llvm/bin:${MACA_PATH}/tools/cu-bridge/tools:${MACA_PATH}/tools/cu-bridge/bin:${PATH}
+ENV LD_LIBRARY_PATH=/opt/mxdriver/lib:${MACA_PATH}/lib:${MACA_PATH}/mxgpu_llvm/lib:${MACA_PATH}/ompi/lib:${MACA_PATH}/ucx/lib:${LD_LIBRARY_PATH}
+
+# uv environment variables
+ENV VIRTUAL_ENV=/opt/venv
+ENV UV_INDEX_STRATEGY="unsafe-best-match"
+ENV UV_HTTP_TIMEOUT=6000
+ENV UV_LINK_MODE=copy
+ARG UV_EXTRA_INDEX_URL
+ENV UV_EXTRA_INDEX_URL=${UV_EXTRA_INDEX_URL}
+ARG UV_INDEX_URL=http://mirrors.aliyun.com/pypi/simple
+ENV UV_INDEX_URL=${UV_INDEX_URL}
+ENV UV_TRUSTED_INDEX_HOST=mirrors.aliyun.com
+ENV UV_OVERRIDE=/workspace/override.txt
+# vllm compile option
+ENV VLLM_INSTALL_PUNICA_KERNELS=1
+
+# AI version arguments
+ARG PYTHON_VERSION
+ARG VLLM_VERSION
+ARG VLLM_METAX_VERSION
+ARG MACA_VERSION
+ARG MEGATRON_VERSION
+ARG SWIFT_VERSION
+ARG CU_BRIDGE_VERSION
+ARG TE_VERSION
+# ARG UV_INDEX_URL
+
+
+WORKDIR /workspace
+COPY override.txt /workspace/override.txt
+COPY requirements_extra.txt /workspace/requirements_extra.txt
+
+RUN printf "[metax-centos]\n\
+name=Maca Driver Yum Repository\n\
+baseurl=https://repos.metax-tech.com/r/metax-driver-centos-$(uname -m)/\n\
+enabled=1\n\
+gpgcheck=0" > /etc/yum.repos.d/metax-driver-centos.repo
+
+RUN cat /etc/yum.repos.d/ubi.repo
+
+RUN dnf -y install python3-pip  hostname && \
+    dnf clean all
+
+RUN python3 -m pip install  uv -i $UV_INDEX_URL --trusted-host ${UV_TRUSTED_INDEX_HOST} && \
+    uv venv /opt/venv --python=${PYTHON_VERSION}
+
+RUN python3 --version && \
+    uv self version
+
+RUN yum makecache && yum install -y \
+    unzip vim git openblas-devel make cmake \
+    ninja-build gcc g++ procps-ng \
+    libibverbs librdmacm libibumad \
+    && yum clean all
+
+
+RUN git clone --branch ${SWIFT_VERSION} https://github.com/modelscope/ms-swift.git
+RUN git clone --branch ${VLLM_METAX_VERSION} https://github.com/MetaX-MACA/vLLM-metax.git
+RUN git clone --branch ${VLLM_VERSION} https://github.com/vllm-project/vllm.git
+RUN git clone --branch ${MEGATRON_VERSION} https://github.com/NVIDIA/Megatron-LM.git
+
+# ======================
+# Step 1: install Metax-Driver, MACA SDK and cu-bridge
+# ======================
+RUN printf "[metax-centos]\n\
+name=Maca Driver Yum Repository\n\
+baseurl=https://repos.metax-tech.com/r/metax-driver-centos-$(uname -m)/\n\
+enabled=1\n\
+gpgcheck=0" > /etc/yum.repos.d/metax-driver-centos.repo
+
+# Installs the newest metax-driver release matching ${MACA_VERSION} (e.g. 3.3.0.x).
+# Metax-Driver mainly contains vbios and kmd file, which are not needed in a container.
+# Here we want to get the mx-smi management tool.
+# kernel version mismatch errors are ignored
+RUN yum makecache && \
+    yum install -y metax-driver-${MACA_VERSION}* mxgvm && \
+    yum clean all && rm -rf /var/cache/yum /tmp/*
+
+# Installing MACA SDK
+RUN printf "[maca-sdk]\n\
+name=Maca Sdk Yum Repository\n\
+baseurl=https://repos.metax-tech.com/r/maca-sdk-rpm-$(uname -m)/\n\
+enabled=1\n\
+gpgcheck=0" > /etc/yum.repos.d/maca-sdk-rpm.repo
+
+RUN yum makecache && \
+    yum install -y maca_sdk-${MACA_VERSION}* && \
+    yum clean all && rm -rf /var/cache/yum /tmp/*
+
+RUN cd /tmp/ && \
+    export MACA_PATH=/opt/maca && \
+    curl -o ${CU_BRIDGE_VERSION}.zip -LsSf https://gitee.com/metax-maca/cu-bridge/repository/archive/${CU_BRIDGE_VERSION}.zip && \
+    unzip ${CU_BRIDGE_VERSION}.zip && \
+    mv cu-bridge-${CU_BRIDGE_VERSION} cu-bridge && \
+    chmod 755 cu-bridge -Rf && \
+    cd cu-bridge && \
+    mkdir build && cd ./build && \
+    cmake -DCMAKE_INSTALL_PREFIX=/opt/maca/tools/cu-bridge ../ && \
+    make && make install
+
+# ======================
+# Step 2: install Metax  requirements
+# ======================
+
+RUN rpm -e --nodeps \
+        mcflashattn_${MACA_VERSION} \
+        mcflashinfer_${MACA_VERSION} \
+        mxreport-${MACA_VERSION} \
+        mccltests-${MACA_VERSION} && \
+    find /opt/maca/ -type f -name "*.a" -delete && \
+    yum clean all && rm -rf /var/cache/yum /tmp/*
+
+
+RUN echo $PATH
+ARG UV_EXTRA_INDEX_URL=https://repos.metax-tech.com/r/maca-pypi/simple
+ARG UV_TRUSTED_HOST=repos.metax-tech.com
+RUN cd vLLM-metax \
+    &&  uv pip install -r requirements/build.txt \
+    && uv pip install build
+
+
+
+RUN yum makecache && yum install -y \
+    gcc \
+    binutils \
+    procps-ng \
+    libibverbs \
+    librdmacm \
+    libibumad \
+    openblas \
+    numactl-libs \
+    && yum clean all && rm -rf /var/cache/yum /tmp/*
+
+
+# ======================
+# Step 3: install Metax python requirements
+# ======================
+
+
+RUN cd vLLM-metax \
+    && UV_HTTP_TIMEOUT=960 uv pip install -r requirements/maca.txt --trusted-host ${UV_TRUSTED_HOST}
+
+# ======================
+# Step 4: Build vLLM with empty device (to avoid CUDA dependency)
+# ======================
+RUN cd vllm \
+    && python3 use_existing_torch.py \
+    && uv pip install -r requirements/build.txt
+
+RUN cd vllm \
+    && VLLM_TARGET_DEVICE=empty uv pip install -v . --no-build-isolation
+
+# ======================
+# Step 5: Build vLLM-metax
+# ======================
+RUN  uv pip list
+RUN cd vLLM-metax \
+    && uv pip install -r requirements/build.txt \
+    && python3 -m build -w -n\
+    && uv pip install dist/*.whl
+
+
+# ======================
+# Step 6: Patch and install Megatron-LM (cloned earlier)
+# ======================
+RUN  sed -i 's/nvcc/cucc/g' /workspace/Megatron-LM/megatron/legacy/fused_kernels/__init__.py
+RUN cd Megatron-LM \
+    && uv pip install .
+
+# ======================
+# Step 7: install TE (transformer_engine)
+# ======================
+
+RUN uv pip install transformer_engine==${TE_VERSION} -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host ${UV_TRUSTED_HOST}
+# ======================
+# Step 8: Patch and install ms-swift (cloned earlier)
+# ======================
+
+RUN sed -i '0,/^\(from \|import \)/{s//import vllm_metax.patch\n&/}' ms-swift/swift/__init__.py \
+    && cd ms-swift \
+    && uv pip install -r requirements.txt \
+    && uv pip install .
+
+# ======================
+# Step 9: other requirements
+# ======================
+RUN uv pip install deepspeed -i https://repos.metax-tech.com/r/maca-pypi/simple --trusted-host ${UV_TRUSTED_HOST}
+RUN uv pip install pip
+RUN uv pip install -r requirements_extra.txt
+RUN ln -sf ${CUDA_PATH}/bin/nvcc ${CUDA_PATH}/bin/cucc
+# FIXME(hank): the vllm installation also pulls in flashinfer-python for unknown reasons; remove it (and cupy-cuda12x) here.
+RUN uv pip uninstall flashinfer-python cupy-cuda12x
+#################### FINAL IMAGE ####################
--- a/docker/Metax/4.0/Dockerfile.with_metax_image
+++ b/docker/Metax/4.0/Dockerfile.with_metax_image
@@ -0,0 +1,72 @@
+ARG BUILD_BASE_IMAGE=mx-devops-acr-cn-shanghai.cr.volces.com/opensource/public-ai-release/maca/ms-swift:3.10.3-maca.ai3.3.0.16-torch2.6-py310-ubuntu22.04-amd64
+ARG PYTHON_VERSION=3.10
+
+FROM ${BUILD_BASE_IMAGE} AS base
+
+
+# may need passing a particular vllm version during build
+ARG VLLM_VERSION
+ARG VLLM_METAX_VERSION
+ARG MEGATRON_VERSION
+ARG SWIFT_VERSION
+
+# --- Set environment variables (NOTE: plain ENV values; no matching ARG is declared, so --build-arg does not override them) ---
+ENV MACA_PATH=/opt/maca
+ENV CUCC_CMAKE_ENTRY=2
+ENV CUDA_PATH=/root/cu-bridge/CUDA_DIR
+ENV CUCC_PATH=${MACA_PATH}/tools/cu-bridge
+ENV PATH=/opt/conda/bin:/opt/conda/condabin:${CUDA_PATH}/bin:${CUCC_PATH}/tools:${CUCC_PATH}/bin:${MACA_PATH}/bin:${PATH}
+ENV LD_LIBRARY_PATH=${CUDA_PATH}/lib64:${MACA_PATH}/lib:${MACA_PATH}/mxgpu_llvm/lib:${LD_LIBRARY_PATH}
+RUN echo $PATH
+RUN apt-get update && apt-get install -y git && rm -rf /var/lib/apt/lists/*
+# Check for cu-bridge and initialize it if /root/cu-bridge does not exist yet
+RUN if [ ! -d /root/cu-bridge ]; then \
+        ${MACA_PATH}/tools/cu-bridge/tools/pre_make; \
+    fi
+
+# ======================
+# Step 1: Clone and build original vLLM (for torch setup)
+# ======================
+WORKDIR /workspace
+RUN git clone --branch ${VLLM_VERSION} https://github.com/vllm-project/vllm.git \
+    && cd vllm \
+    && python3 use_existing_torch.py \
+    && pip install -r requirements/build.txt
+
+# ======================
+# Step 2: Build vLLM with empty device (to avoid CUDA dependency)
+# ======================
+RUN cd vllm \
+    && VLLM_TARGET_DEVICE=empty pip install -v . --no-build-isolation
+
+
+# ======================
+# Step 3: Build vLLM-metax
+# ======================
+RUN git clone --branch ${VLLM_METAX_VERSION} https://github.com/MetaX-MACA/vLLM-metax.git \
+    && cd vLLM-metax \
+    && python3 use_existing_metax.py \
+    && pip install -r requirements/build.txt \
+    && python3 -m build -w -n \
+    && pip install dist/*.whl
+
+# ======================
+# Step 4: Clone and patch Megatron-LM
+# ======================
+RUN git clone --branch ${MEGATRON_VERSION} https://github.com/NVIDIA/Megatron-LM.git \
+    && sed -i 's/nvcc/cucc/g' /workspace/Megatron-LM/megatron/legacy/fused_kernels/__init__.py
+
+# ======================
+# Step 5: Clone, patch and install ms-swift
+# ======================
+RUN rm -rf /workspace/ms-swift
+
+RUN git clone --branch ${SWIFT_VERSION} https://github.com/modelscope/ms-swift.git \
+    && sed -i '0,/^\(from \|import \)/{s//import vllm_metax.patch\n&/}' ms-swift/swift/__init__.py \
+    && cd ms-swift \
+    && pip install -r requirements.txt \
+    && pip install .
+
+
+# Default command
+CMD ["bash"]
--- a/docker/Metax/4.0/build.sh
+++ b/docker/Metax/4.0/build.sh
@@ -0,0 +1,13 @@
+docker build \
+    --network host \
+    -f Dockerfile.metax \
+    -t swift:v4.0.0 \
+    --build-arg VLLM_VERSION=v0.11.2 \
+    --build-arg VLLM_METAX_VERSION=v0.11.2 \
+    --build-arg MACA_VERSION=3.3.0 \
+    --build-arg MEGATRON_VERSION=core_v0.15.0  \
+    --build-arg SWIFT_VERSION=v4.0.0 \
+    --build-arg TE_VERSION=2.8 \
+    --build-arg CU_BRIDGE_VERSION=3.3.0 \
+    --no-cache \
+    .
--- a/docker/Metax/4.0/build_from_metax_image.sh
+++ b/docker/Metax/4.0/build_from_metax_image.sh
@@ -0,0 +1,11 @@
+docker build \
+    --network host \
+    -f Dockerfile.with_metax_image \
+    -t swift:v4.0.0 \
+    --build-arg VLLM_VERSION=v0.11.2 \
+    --build-arg VLLM_METAX_VERSION=v0.11.2 \
+    --build-arg MEGATRON_VERSION=core_v0.15.0  \
+    --build-arg SWIFT_VERSION=v4.0.0 \
+    --progress=plain \
+    --no-cache \
+     .
--- a/docker/Metax/4.0/override.txt
+++ b/docker/Metax/4.0/override.txt
@@ -0,0 +1,3 @@
+setuptools>=77.0.3,<80
+flash-linear-attention
+mcoplib
--- a/docker/Metax/4.0/requirements_extra.txt
+++ b/docker/Metax/4.0/requirements_extra.txt
@@ -0,0 +1,9 @@
+diffusers==0.35.2
+evalscope
+librosa
+mpi4py
+ms-opencompass
+optimum==1.27.0
+pytorchvideo
+qwen_vl_utils==0.0.14
+timm
--- a/docker/Metax/4.0/swift_building_instructions.md
+++ b/docker/Metax/4.0/swift_building_instructions.md
@@ -0,0 +1,43 @@
+# 1. build swift image from a ubi9 docker image
+    Full build from a minimal base image, using a venv virtual environment
+## 1.1. build
+    ``` bash
+    bash build.sh
+    ```
+## 1.2. run a container
+    ``` bash
+    docker run -d -it --net=host --uts=host --ipc=host --privileged=true --group-add video  \
+    --shm-size 100gb --ulimit memlock=-1 \
+    --security-opt seccomp=unconfined --security-opt apparmor=unconfined \
+    --device=/dev/dri --device=/dev/mxcd \
+    --name base_image \
+    ${IMAGE_ID} bash
+    ```
+## 1.3. activate venv environment
+    here we use venv rather than conda
+    ``` bash
+    source /opt/venv/bin/activate
+    ```
+## 1.4. run swift examples
+    cd /workspace/ms-swift
+    bash example/train/full/train.sh
+
+# 2. build swift image from metax release image
+    Fast build based on the pre-built Metax release image, using a conda virtual environment
+## 2.1. build
+    ``` bash
+    bash build_from_metax_image.sh
+    ```
+## 2.2. run a container
+    ``` bash
+    docker run -d -it --net=host --uts=host --ipc=host --privileged=true --group-add video  \
+    --shm-size 100gb --ulimit memlock=-1 \
+    --security-opt seccomp=unconfined --security-opt apparmor=unconfined \
+    --device=/dev/dri --device=/dev/mxcd \
+    --name base_image \
+    ${IMAGE_ID} bash
+    ```
+## 2.3. run swift examples
+    ``` bash
+    cd /workspace/ms-swift && bash example/train/full/train.sh
+    ```
--- a/modelscope/msdatasets/utils/hf_datasets_util.py
+++ b/modelscope/msdatasets/utils/hf_datasets_util.py
@@ -130,40 +130,7 @@ ExpandDatasetProperty_T = Literal[


 # Patch datasets features
-# In datasets 4.0+, the List type is the native feature type;
-# in datasets <4.0, Sequence (a dataclass) serves that role.
-_ListBase = DatasetList if DatasetList is not None else SequenceHf
-
-
-@dataclass(repr=False)
-class ListMs(_ListBase):
-    """Feature type for large list data composed of child feature data type.
-
-    It is backed by `pyarrow.ListType`, which uses 32-bit offsets or a fixed length.
-
-    Args:
-        feature ([`FeatureType`]):
-            Child feature data type of each item within the large list.
-        length (optional `int`, default to -1):
-            Length of the list if it is fixed.
-            Defaults to -1 which means an arbitrary length.
-    """
-
-    feature: Any
-    length: int = -1
-    id: Optional[str] = field(default=None, repr=False)
-    # Automatically constructed
-    pa_type: ClassVar[Any] = None
-    _type: str = field(default='List', init=False, repr=False)
-
-    def __repr__(self):
-        if self.length != -1:
-            return f'{type(self).__name__}({self.feature}, length={self.length})'
-        else:
-            return f'{type(self).__name__}({self.feature})'
-
-
-_FEATURE_TYPES['List'] = ListMs
+_NativeList = DatasetList if DatasetList is not None else SequenceHf


 def generate_from_dict_ms(obj: Any):
@@ -202,9 +169,10 @@ def generate_from_dict_ms(obj: Any):
    if class_type == LargeList:
        feature = obj.pop('feature')
        return LargeList(generate_from_dict_ms(feature), **obj)
-    if class_type == ListMs:
+    # Handle the native List type (datasets 4.0+) as well as the Sequence-based fallback (datasets <4.0)
+    if _NativeList is not None and (class_type is _NativeList or issubclass(class_type, _NativeList)):
        feature = obj.pop('feature')
-        return ListMs(generate_from_dict_ms(feature), **obj)
+        return _NativeList(generate_from_dict_ms(feature), **obj)

    field_names = {f.name for f in fields(class_type)}
    return class_type(**{k: v for k, v in obj.items() if k in field_names})
@@ -213,9 +181,30 @@ def generate_from_dict_ms(obj: Any):
 def _download_ms(self, url_or_filename: str, download_config: DownloadConfig) -> str:
    url_or_filename = str(url_or_filename)
    if url_or_filename.startswith('hf://'):
-        # hf:// URLs are handled natively by cached_path via HfApi.hf_hub_download,
-        # which uses config.HF_ENDPOINT (already set to ModelScope endpoint).
-        pass
+        # Rewrite hf:// URLs (e.g. hf://datasets/{owner}/{name}@{revision}/{file_path}) onto self._base_path
+        hf_path = url_or_filename[len('hf://'):]
+        # Strip leading resource type prefix (e.g. "datasets/")
+        for _prefix in ('datasets/', 'models/'):
+            if hf_path.startswith(_prefix):
+                hf_path = hf_path[len(_prefix):]
+                break
+        # Extract revision and file_path from "{owner}/{name}@{revision}/{file_path}"
+        if '@' in hf_path:
+            at_idx = hf_path.index('@')
+            after_at = hf_path[at_idx + 1:]
+            slash_idx = after_at.find('/')
+            if slash_idx == -1:
+                revision = after_at
+                file_path = ''
+            else:
+                revision = after_at[:slash_idx]
+                file_path = after_at[slash_idx + 1:]
+        else:
+            parts = hf_path.split('/', 2)
+            revision = DEFAULT_DATASET_REVISION
+            file_path = parts[2] if len(parts) > 2 else ''
+        params = urlencode({'Source': 'SDK', 'Revision': revision, 'FilePath': file_path})
+        url_or_filename = self._base_path + params
    elif is_relative_path(url_or_filename):
        revision = DEFAULT_DATASET_REVISION
        # Note: make sure the FilePath is the last param
--- a/tests/hub/test_download_dataset_file.py
+++ b/tests/hub/test_download_dataset_file.py
@@ -7,6 +7,7 @@ import unittest

 from modelscope.hub.file_download import dataset_file_download
 from modelscope.hub.snapshot_download import dataset_snapshot_download
+from modelscope.utils.test_utils import test_level


 class DownloadDatasetTest(unittest.TestCase):
@@ -14,6 +15,7 @@ class DownloadDatasetTest(unittest.TestCase):
    def setUp(self):
        pass

+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_dataset_file_download(self):
        dataset_id = 'citest/test_dataset_download'
        file_path = 'open_qa.jsonl'
@@ -67,6 +69,7 @@ class DownloadDatasetTest(unittest.TestCase):
            file_modify_time2 = os.path.getmtime(cache_file_path)
            assert file_modify_time == file_modify_time2

+    @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
    def test_dataset_snapshot_download(self):
        dataset_id = 'citest/test_dataset_download'
        file_path = 'open_qa.jsonl'