diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 7e6e9b77..a8565f16 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -1,3 +1,5 @@
+exclude: 'modelscope/preprocessors/templates/'
+
 repos:
   - repo: https://github.com/pycqa/flake8.git
     rev: 4.0.0
diff --git a/.pre-commit-config_local.yaml b/.pre-commit-config_local.yaml
index a68a5b78..869d8fd6 100644
--- a/.pre-commit-config_local.yaml
+++ b/.pre-commit-config_local.yaml
@@ -1,3 +1,5 @@
+exclude: 'modelscope/preprocessors/templates/'
+
 repos:
   - repo: /home/admin/pre-commit/flake8
     rev: 4.0.0
diff --git a/modelscope/preprocessors/templates/__init__.py b/modelscope/preprocessors/templates/__init__.py
new file mode 100644
index 00000000..5ac1780d
--- /dev/null
+++ b/modelscope/preprocessors/templates/__init__.py
@@ -0,0 +1,2 @@
+from .base import Template, get_template
+from .template import TemplateType
diff --git a/modelscope/preprocessors/templates/base.py b/modelscope/preprocessors/templates/base.py
new file mode 100644
index 00000000..4504a4bc
--- /dev/null
+++ b/modelscope/preprocessors/templates/base.py
@@ -0,0 +1,1041 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import json
+import re
+from copy import deepcopy
+from typing import Any, Dict, List, Literal, Optional, Tuple, Union
+
+import torch
+import torch.nn.functional as F
+from modelscope import get_logger
+from torch.nn import Module
+from torch.nn.utils.rnn import pad_sequence
+from transformers import PreTrainedTokenizerBase, StoppingCriteria
+from .loss_scale import loss_scale_map
+from .tools_prompt import get_tools_prompt
+from .utils import load_batch, load_image, rescale_image, fetch_one, to_device, decode_base64
+from .utils import History, Prompt, StopWords, Context, Messages
+
+logger = get_logger()  # module-level logger obtained from modelscope
+
+DEFAULT_SYSTEM = 'You are a helpful assistant.'
+
+TEMPLATE_MAPPING: Dict[str, Dict[str, Any]] = {}  # template_type -> info dict; `get_template` reads its 'template' entry
+
+
+def get_template(template_type: str,
+                 tokenizer: PreTrainedTokenizerBase,
+                 default_system: Optional[str] = None,
+                 max_length: Optional[int] = None,
+                 truncation_strategy: Literal['delete', 'truncation_left'] = 'delete',
+                 **kwargs) -> 'Template':
+    """Build a ready-to-use template for `template_type`.
+
+    The registered template is deep-copied, so initialization happens on the copy only."""
+    instance = deepcopy(TEMPLATE_MAPPING[template_type]['template'])
+    instance.init_template(tokenizer, default_system, max_length, truncation_strategy, **kwargs)
+    return instance
+
+
+def _findall(token_list: List[int], sub_token_list: Union[int, List[int]]) -> List[int]:
+    """Return every start index at which `sub_token_list` occurs in `token_list`.
+
+    A bare int is treated as a one-token pattern; overlapping matches are all reported,
+    and an empty pattern yields no match (it used to raise IndexError).
+    """
+    pattern = [sub_token_list] if isinstance(sub_token_list, int) else list(sub_token_list)
+    if not pattern:
+        return []
+    width = len(pattern)
+    # Only a position holding the first pattern token can start a match.
+    return [
+        i for i, token in enumerate(token_list) if token == pattern[0] and token_list[i:i + width] == pattern
+    ]
+
+
+def replace_img_tag(messages: Messages,
+                    replace_token: str,
+                    pattern=r'<img>(.+?)</img>') -> Tuple[Messages, List[str]]:
+    """Swap inline media tags for `replace_token` and collect the paths they wrapped.
+
+    Messages whose content is None or whose role is tool/system/assistant are copied as-is."""
+    images_path, new_messages = [], []
+    for m in (message.copy() for message in messages):
+        if m['content'] is not None and m['role'] not in ('tool', 'system', 'assistant'):
+            images_path += re.findall(pattern, m['content'])
+            m['content'] = re.sub(pattern, replace_token, m['content'])
+        new_messages.append(m)
+    # Return the rewritten copies: returning `messages` here discarded every substitution.
+    return new_messages, images_path
+
+
+class StopWordsCriteria(StoppingCriteria):
+    """Stop generation once any template stop word (e.g. a suffix or chat separator) has been produced.
+
+    String stop words are matched against decoded text, token-id lists against the tail of `input_ids`.
+    """
+    def __init__(self, tokenizer: PreTrainedTokenizerBase, stop_words: StopWords, **tokenizer_kwargs) -> None:
+        self.tokenizer = tokenizer
+        self.stop_words = stop_words
+        self.tokenizer_kwargs = tokenizer_kwargs
+        self.start_idx = -1  # set on the first call; later calls reuse it
+
+    def __call__(self, input_ids: torch.Tensor, scores: torch.Tensor, **kwargs) -> bool:
+        first_row = input_ids[0]  # only the first sequence of the batch is inspected
+        if self.start_idx == -1:
+            self.start_idx = len(first_row) - 1
+        # Decode at most the last 20 tokens after `start_idx`: the end tokens are assumed not to
+        #   exceed 20 tokens, and a long `input_ids` would otherwise hurt efficiency.
+        tail_text = self.tokenizer.decode(first_row[self.start_idx:][-20:], **self.tokenizer_kwargs)
+        for word in self.stop_words:
+            if isinstance(word, str):
+                matched = word in tail_text
+            else:  # list of token ids
+                matched = len(word) > 0 and first_row.tolist()[-len(word):] == word
+            if matched:
+                return True
+        return False
+
+
+class Template:
+    """A template class for all supported models.
+
+    Args:
+        prefix: Prefix tokens before the first turn's prompt
+        prompt: A list of elements whose types are str and list of integers. The input query part of every turn.
+        chat_sep: The chat separators between every turn; `None` marks the template as single-round.
+        suffix: The end tokens after the chat finished.
+        default_system: A default system instruction; an empty string is treated as `None`.
+        system_prefix: The prefix if the `system` is not empty; taken from `prefix` when that contains `{{SYSTEM}}`.
+        auto_add_bos: By default, the bos_token is not added. The auto_add_bos option will determine
+            whether to add it based on `tokenizer.encode('')`.
+        tools_prompt: The tools prompt name passed to `get_tools_prompt`
+        tool_prompt: The tool prompt, usually useful when there is a tool role; defaults to `prompt`
+        padding_side: The padding side
+        infer_media_type: With `round`, every chat round is checked for at most one media tag; otherwise the missing tags are prepended to the first query
+    Examples:
+            <start_of_output>system\nYou are a helpful assistant!<end_of_output>\n<bos><start_of_output>Who are you?<end_of_output>\n<start_of_output>assistant:I am a robot<end_of_output>\n<start_of_output>Who are you?<end_of_output>\n<start_of_output>assistant:I am a robot<end_of_output> # noqa
+                                     ----------system------------                                       ---query----                                            --response- -----chatsep-----                 ---query---                                             --response- ----suffix-----
+            ----------------------------system_prefix---------------------------- ---------------------------- prompt -------------------------------------                                  ---------------------------- prompt -------------------------------------
+
+    """
+
+    special_tokens = ['<image>', '<video>', '<audio>', '<bbox>', '<ref-object>']  # tags isolated by `split_special_tokens`
+    special_keys = ['images', 'videos', 'audios', 'objects']  # a non-empty value makes `_encode` treat the example as multi-modal
+    grounding_type = 'norm_1000'  # bbox coordinate type requested from `normalize_bbox`
+    image_placeholder = ['<image>']  # what `replace_tag` returns for an image
+    load_medias = True  # when True, `_preprocess_media` loads the images with `load_image`
+    compute_per_round_loss = True  # for rlhf
+    output_prompt_answer = False  # for encoder-decoder & kto
+
+    def __init__(self,
+                 prefix: Prompt,
+                 prompt: Prompt,
+                 chat_sep: Optional[Prompt],
+                 suffix: Prompt,
+                 default_system: Optional[str] = None,
+                 system_prefix: Optional[Prompt] = None,
+                 auto_add_bos: bool = False,
+                 tools_prompt: str = 'react_en',
+                 tool_prompt: Optional[Prompt] = None,
+                 padding_side: Literal['left', 'right'] = 'right',
+                 infer_media_type: Literal['interleave', 'dialogue', 'round'] = 'interleave') -> None:
+        """Store the template pieces; `init_template` must still be called before use.
+
+        A `prefix` containing `{{SYSTEM}}` is kept as `system_prefix`, and `prefix` itself
+        becomes the result of `_replace_system`.
+        """
+        assert all(part is None or isinstance(part, list)
+                   for part in (prefix, prompt, chat_sep, suffix, system_prefix))
+        default_system = None if default_system == '' else default_system
+        if self._has_system(prefix):
+            assert system_prefix is None, 'The prefix already contains {{SYSTEM}}.'
+            system_prefix, prefix = prefix, self._replace_system(prefix)
+        # A template without any `{{SYSTEM}}` slot does not support a default system.
+        if system_prefix is None and not any('{{SYSTEM}}' in context for context in prompt):
+            assert default_system is None, 'The template does not support `system`.'
+        self.prefix, self.system_prefix = prefix, system_prefix
+        self.prompt, self.chat_sep, self.suffix = prompt, chat_sep, suffix
+        self.support_multi_round = chat_sep is not None
+        self.default_system = default_system
+        self.use_default_system = True
+        self.auto_add_bos = auto_add_bos
+        self._is_init = False
+        self.tools_prompt = tools_prompt
+        # default as user
+        self.tool_prompt = prompt if tool_prompt is None else tool_prompt
+        self.padding_side = padding_side
+        self.infer_media_type = infer_media_type
+
+    @staticmethod
+    def _replace_system(prefix: Prompt) -> Prompt:
+        """Keep only the elements containing `{{SYSTEM}}`, with the placeholder removed from each."""
+        return [piece.replace('{{SYSTEM}}', '') for piece in prefix if '{{SYSTEM}}' in piece]
+
+    @staticmethod
+    def _has_system(prefix: Prompt) -> bool:
+        return any('{{SYSTEM}}' in piece for piece in prefix)  # True when some element holds the placeholder
+
+    @staticmethod
+    def token_attr_to_id(tokenizer: PreTrainedTokenizerBase, value: Optional[Prompt]) -> Optional[Prompt]:
+        """Resolve tokenizer attribute names inside a prompt to their values.
+
+        Only list elements are rewritten: each string inside is replaced by `getattr(tokenizer, name)`,
+        e.g. [['eos_token_id']] -> [[2]]. Other elements are kept as they are, and `None` is
+        returned for a `None` prompt.
+        """
+        if value is None:
+            return None
+
+        def resolve(item):
+            # A string nested in a list is an attribute name; anything else is kept as-is.
+            return getattr(tokenizer, item) if isinstance(item, str) else item
+
+        return [
+            [resolve(item) for item in element] if isinstance(element, list) else element
+            for element in value
+        ]
+
+    def init_template(self,
+                      tokenizer: PreTrainedTokenizerBase,
+                      default_system: Optional[str] = None,
+                      max_length: Optional[int] = None,
+                      truncation_strategy: Literal['delete', 'truncation_left'] = 'delete',
+                      loss_scale: str = 'default',
+                      rescale_image: int = -1,
+                      **kwargs) -> None:
+        """Bind the template to a tokenizer and its runtime options; allowed once per instance.
+
+        Args:
+            tokenizer: The tokenizer to tokenize the sentence
+            default_system: `None` keeps the template's own default, `''` clears it, other values replace it
+            max_length: Max length of the sequence
+            truncation_strategy: The truncation strategy
+            loss_scale: A key of `loss_scale_map`; a non-str value is stored as the loss scale function itself
+            rescale_image: Rescale image to reduce memory usage, default `-1` means no limitation
+        """
+        assert self._is_init is False, 'The template has been initialized.'
+        self._is_init = True
+        self.tokenizer = tokenizer
+        self.is_multimodal = getattr(tokenizer, 'is_multimodal', None)
+        # if default_system is None. not change self.default_system
+        if default_system == '':
+            self.default_system = None
+        elif default_system is not None:
+            assert self.system_prefix is not None, (
+                f'The template does not support `system`, template_type: {getattr(self, "template_type", None)}')
+            self.default_system = default_system
+        self.max_length = max_length
+        self.truncation_strategy = truncation_strategy
+        # An unknown loss scale name resolves to None here instead of raising.
+        self.loss_scale = (
+            loss_scale_map.get(loss_scale, None) if isinstance(loss_scale, str) else loss_scale)
+        self.rescale_image = rescale_image
+
+        # Prompt pieces may reference tokenizer attributes by name; resolve them now.
+        for key in ('prefix', 'prompt', 'chat_sep', 'suffix', 'system_prefix'):
+            resolved = self.token_attr_to_id(tokenizer, getattr(self, key))
+            setattr(self, key, resolved)
+
+    def post_encode(self, model: Module, data: Any) -> Dict[str, Any]:
+        """Hook called after data_collator and before the forward; the base template adds nothing.
+        Args:
+            model: The model instance
+            data: The `_data` field from the example batch, this field should be packed manually
+        Returns: Any extra fields need to be passed into the model.forward (empty here)
+        """
+        return dict()
+
+    def check_example(self, example: Dict[str, Any]) -> None:
+        """Validate `example` against the template's rules; the base template accepts everything."""
+        return None
+
+    def add_default_tags(self, example: Dict[str, Any]) -> None:
+        """Add default media tags to the user queries of a multi-modal example, modifying it in place.
+            1. For the round infer_media_type, check each round holds at most one tag and add the missing one
+            2. Else, prepend the tags missing relative to the number of medias to the first query
+        Args:
+            example: The input example
+        """
+        messages = example['messages']
+        for media_key, media_tag in [('videos', '<video>'), ('images', '<image>'), ('audios', '<audio>')]:
+            if example.get(media_key):
+                _messages = [message for message in messages if message['role'] != 'system']
+                n_round = len(_messages)
+                assert n_round % 2 == 0
+                # Step by 2 so h[0] is always a query: a step of 1 paired (response, next query) from round 2 on.
+                history = [_messages[i:i + 2] for i in range(0, n_round, 2)]
+                medias = [m for m in example[media_key] if m]
+                if self.infer_media_type == 'round':
+                    for h, m in zip(history, example[media_key]):
+                        num_media_tags = len(re.findall(media_tag, h[0]['content']))
+                        if m:
+                            assert num_media_tags <= 1, (
+                                'The model includes at most one media per round. However, '
+                                f'this round contains {num_media_tags} media_tags. query: {h[0]}')
+                            if num_media_tags == 0:
+                                h[0]['content'] = media_tag + h[0]['content']
+                        else:
+                            assert num_media_tags == 0, f'Missing media. query: {h[0]}'
+                else:
+                    num_media_tags = len(re.findall(media_tag, '\n'.join([h[0]['content'] for h in history])))
+                    num_new_tags = len(medias) - num_media_tags
+                    assert num_new_tags >= 0, f'Number of media: {len(medias)}, number of media_tags: {num_media_tags}'
+                    history[0][0]['content'] = media_tag * num_new_tags + history[0][0]['content']
+                example[media_key] = medias
+
+    def replace_media_tags(self, example) -> None:
+        """Move inline `<img>`/`<audio>`/`<video>` path tags into the media keys of `example` (in place).
+
+        Each `<img>path</img>` becomes an `<image>` tag (likewise `<audio>`/`<video>`), and the captured
+        paths are appended to `images`/`audios`/`videos`. Nothing happens when `is_multimodal` is False.
+
+        Args:
+            example: The input example
+        Raises:
+            ValueError: If the `images` key and `<img></img>` tags are both used
+        """
+        if self.is_multimodal not in {True, None}:  # If False, do not perform replace_img_tag
+            return
+        example['messages'], images_path = replace_img_tag(example.get('messages'), '<image>')
+        if example.get('images') and images_path:
+            raise ValueError('Do not mix use the <img></img> tag and <image> tag.')
+        # Group before adding: `x or [] + paths` parses as `x or ([] + paths)` and dropped the
+        # inline paths whenever the key already held medias.
+        example['images'] = list(example.get('images') or []) + images_path
+        for k, tag in (('audios', 'audio'), ('videos', 'video')):
+            example['messages'], medias_path = replace_img_tag(
+                example.get('messages'), f'<{tag}>', rf'<{tag}>(.+?)</{tag}>')
+            example[k] = list(example.get(k) or []) + medias_path
+
+    def _preprocess_media(self, example):
+        """Normalize the multi-modal fields of one example in place
+
+            1. Wrap a single value under `audios`/`images`/`videos` into a list
+            2. Replace <img></img> style tags
+            3. Add or check the default media tags
+            4. Parse a string `objects` field as json, turning list items into dicts
+            5. Load the images and normalize bboxes, rescale or convert the images as configured
+
+        Args:
+            example: The input example
+        """
+        # Format media_keys to list
+        for media_key in ('audios', 'images', 'videos'):
+            media = example.get(media_key)
+            if media and not isinstance(media, (tuple, list)):
+                example[media_key] = [media]
+
+        self.replace_media_tags(example)
+        # Add default tags to examples to note where to put the medias into the sequence
+        self.add_default_tags(example)
+
+        # Format objects(groundings/refs) to json
+        raw_objects = example.get('objects')
+        if raw_objects and isinstance(raw_objects, str):
+            # reload grounding from str; the list format [caption, bbox] becomes a dict on image 0
+            example['objects'] = [
+                {
+                    'caption': item[0],
+                    'bbox': item[1],
+                    'bbox_type': None,
+                    'image': 0,
+                } if isinstance(item, list) else item
+                for item in json.loads(raw_objects)
+            ]
+
+        # Load image into PIL format
+        images = example.get('images') or []
+        if not images:
+            return
+
+        has_objects = bool(example.get('objects'))
+        # Images are loaded when bboxes have to be normalized against them or when the
+        # template works on loaded medias.
+        if has_objects or self.load_medias:
+            images = load_batch(images, load_image)  # base64/local_path -> PIL.Image
+        if has_objects:
+            # Normalize grounding bboxes
+            self.normalize_bbox(example['objects'], images, to_type=self.grounding_type)
+        # Rescale unless the template works with `real` coordinates
+        if self.load_medias and self.grounding_type != 'real':
+            images = [rescale_image(img, self.rescale_image) for img in images]
+        if not self.load_medias:  # fix pt & qwen-vl
+            images = decode_base64(images=images)['images']  # PIL.Image/base64 -> local_path
+        example['images'] = images
+
+    def preprocess(self, example):
+        """Return a normalized copy of `example` with the system prompt resolved and media fields prepared.
+
+        Messages are copied before being edited, so the caller's message dicts are left untouched.
+        Raises ValueError when the template has not been initialized.
+        """
+        # Duplicate example and create a new one to prepare in-place changes
+        example = example.copy()
+        template_type: Optional[str] = getattr(self, 'template_type', None)
+        tools: Union[List[Any], str] = example.get('tools') or []
+
+        if not self._is_init:
+            raise ValueError(
+                'Template is not initialized, please use the `get_template` function to obtain the template.')
+
+        # `example.copy()` is shallow: without these copies the resolved system leaked into the caller's dict.
+        messages = [message.copy() for message in example['messages']]
+        system_round = [message for message in messages if message['role'] == 'system']
+        messages = [message for message in messages if message['role'] != 'system']
+        system: Optional[str] = system_round[0]['content'] if system_round else ''
+        if system:
+            assert self.system_prefix is not None, (
+                f'The template does not support `system`, template_type: {template_type}')
+        elif self.use_default_system:
+            system = self.default_system
+        if tools:
+            if isinstance(tools, str):
+                tools = json.loads(tools)
+            system = (system or '') + get_tools_prompt(tools, self.tools_prompt)
+        if system:
+            if not system_round:
+                system_round = [{'role': 'system', 'content': None}]
+            system_round[0]['content'] = system
+
+        if len(messages) > 1:
+            assert self.support_multi_round, (
+                f'The template does not support multi-round chat, template_type: {template_type}')
+        example['messages'] = system_round + messages
+        self._preprocess_media(example)
+        # Check the example that whether matching the very template's rules
+        self.check_example(example)
+        return example
+
+    def encode(self, example: Dict[str, Any], streaming: bool = False, is_training: bool = False, **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        """The entrance method of Template: preprocess the example, then encode it.
+
+        Args:
+            example: The input example
+            streaming: If is streaming mode
+            is_training: Use template in training
+            **kwargs:
+                model: The model instance, required when `is_training=False` and the inputs carry `_data`
+        Returns:
+            if not streaming mode, returns tuple of (inputs, tokenizer_kwargs), else returns inputs only
+        """
+        encoded = self._encode(self.preprocess(example), **kwargs)
+        inputs = encoded[0]
+        if '_data' in inputs and not is_training:
+            # Outside training the `_data` payload is consumed right here: it is moved to
+            # `model.device` and replaced by whatever `post_encode` returns.
+            model = kwargs.get('model')
+            assert model is not None
+            data = to_device(inputs.pop('_data'), model.device)
+            inputs.update(self.post_encode(model, data))
+        return inputs if streaming else encoded
+
+    def _encode(self, example: Dict[str, Any], **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        """return: inputs, tokenizer_kwargs"""
+        # Any non-empty value under one of `special_keys` marks the example as multi-modal.
+        has_media: bool = any(example.get(key) for key in Template.special_keys)
+        inputs, tokenizer_kwargs = self._concat_and_tokenize(
+            example['messages'],
+            self.truncation_strategy,
+            auto_add_bos=self.auto_add_bos,
+            is_multi_modal=has_media,
+            example=example)
+        # Drop `loss_scale` when no labels were produced.
+        if inputs.get('labels') is None:
+            inputs.pop('loss_scale', None)
+        return inputs, tokenizer_kwargs
+
+    def _concat_context_list(
+            self,
+            context_list: List[Context],
+            res_context_list: List[Context],  # inplace
+            loss_scale_list: List[float],  # inplace
+            system: Optional[str] = None,
+            query: Optional[str] = None,
+            response: Optional[str] = None,
+            round0: Optional[int] = None,
+            compute_loss: bool = True) -> None:
+        """Concat context list and replace placeholder
+
+        The filled-in contexts are appended to `res_context_list` and their weights to `loss_scale_list`.
+        Only a `{{RESPONSE}}` context can receive non-zero weights; empty contexts are skipped.
+        """
+        placeholders = {
+            '{{SYSTEM}}': system,
+            '{{QUERY}}': query,
+            '{{ROUND0}}': None if round0 is None else str(round0),
+            '{{ROUND1}}': None if round0 is None else str(round0 + 1),
+        }
+        for context in context_list:
+            if isinstance(context, str):
+                if context == '{{RESPONSE}}':
+                    assert response is not None
+                    parts, weights = self.loss_scale(query, response) if compute_loss else ([response], [0.])
+                    res_context_list.extend(parts)
+                    loss_scale_list.extend(weights)
+                    continue
+                for placeholder, value in placeholders.items():
+                    if value is not None and placeholder in context:
+                        context = context.replace(placeholder, value)
+            if len(context) > 0:
+                res_context_list.append(context)
+                loss_scale_list.append(0.)
+
+    def _simplify_context_list(self, context_list: List[Context], loss_scale_list: List[float],
+                               **kwargs) -> Tuple[List[Context], List[float]]:
+        """Merge anything in the context to simplify the inputs
+
+        After tag replacement, consecutive string contexts sharing one loss scale are joined into a
+        single string; non-string contexts are passed through unmerged.
+        """
+        if kwargs.pop('is_multi_modal', False):
+            context_list, loss_scale_list = self.split_special_tokens(context_list, loss_scale_list)
+        context_list, loss_scale_list = self.pre_tokenize(context_list, loss_scale_list, **kwargs)
+        merged: List[Context] = []
+        merged_scales: List[float] = []
+        pending: List[str] = []  # run of strings waiting to be joined
+        pending_scale = 0.
+
+        def flush() -> None:
+            if pending:
+                merged.append(''.join(pending))
+                merged_scales.append(pending_scale)
+                pending.clear()
+
+        for context, scale in zip(context_list, loss_scale_list):
+            if not (isinstance(context, str) and scale == pending_scale):
+                flush()  # the run ends at a non-string or at a change of loss scale
+                if not isinstance(context, str):
+                    merged.append(context)
+                    merged_scales.append(scale)
+                pending_scale = scale
+            if isinstance(context, str):
+                pending.append(context)
+        flush()
+        return merged, merged_scales
+
+    @staticmethod
+    def split_special_tokens(context_list: List[Context],
+                             loss_scale_list: List[float]) -> Tuple[List[Context], List[float]]:
+        """Split string contexts at the special tokens (`<image>`, `<video>`, ...) to help the replace_tag operation"""
+        from .utils import split_str_parts_by
+        res: List[Context] = []
+        loss_scale_res: List[float] = []
+        for context, loss_scale in zip(context_list, loss_scale_list):
+            # Contexts whose `fetch_one` probe is not a str are kept whole.
+            if not isinstance(fetch_one(context), str):
+                res.append(context)
+                loss_scale_res.append(loss_scale)
+                continue
+            # Flatten each part's `key` and `content` in order, dropping the empty pieces.
+            pieces = [piece for part in split_str_parts_by(context, Template.special_tokens)
+                      for piece in (part['key'], part['content']) if piece]
+            res.extend(pieces)
+            loss_scale_res.extend([loss_scale] * len(pieces))
+        return res, loss_scale_res
+
+    def _tokenize(self, context, **tokenizer_kwargs):
+        encoded = self.tokenizer(context, return_attention_mask=False, add_special_tokens=False, **tokenizer_kwargs)
+        return encoded['input_ids']  # ids only: no special tokens requested, no attention mask requested
+
+    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
+                    example: Dict[str, Any]) -> List[Context]:
+        """Return what a standard media tag is replaced with; override it for model specific tokens.
+
+        The base template returns `image_placeholder` for images and the literal tag for video and audio.
+
+        Args:
+            media_type: The modal.
+            index: The index of the medias, for example 0 represents the first elements in `images`
+            example: The input example
+
+        Returns:
+            The content or input_ids after replacement.
+        """
+        defaults = {
+            'image': self.image_placeholder,
+            'video': ['<video>'],
+            'audio': ['<audio>'],
+        }
+        return defaults.get(media_type)  # None for any other media type
+
+    def replace_object(self, index: int, example: Dict[str, Any]) -> List[Context]:
+        """Return the replacement for a `<ref-object>` tag. This is useful in the grounding task.
+        Override this function to do your own replace operation.
+
+        Args:
+            index: The index in the `objects` key
+            example: The input example
+
+        Returns:
+            The caption of the indexed object, or the tag itself when the example has no objects
+        """
+        objects = example.get('objects')
+        if not objects:
+            # Without objects the tag is kept literally.
+            return ['<ref-object>']
+        caption = objects[index]['caption']
+        return [caption]
+
+    def replace_box(self, index: int, example: Dict[str, Any]) -> List[Context]:
+        """Return the replacement for a `<bbox>` tag. This is useful in the grounding task.
+        Override this function to do your own replace operation.
+
+        Args:
+            index: The index in the `objects` key
+            example: The input example
+
+        Returns:
+            The box(es) of the indexed object rendered as text, or the tag itself when there are no objects
+        """
+        objects = example.get('objects')
+        if not objects:
+            return ['<bbox>']
+
+        def fmt(box) -> str:
+            # One box is rendered from its first four coordinates.
+            return f'[({box[0]},{box[1]}),({box[2]},{box[3]})]'
+
+        bbox = objects[index]['bbox']
+        if isinstance(bbox[0], list):
+            # Several boxes for one object are rendered comma separated.
+            return [','.join(fmt(box) for box in bbox)]
+        return [fmt(bbox)]
+
+    @classmethod
+    def normalize_bbox(cls, objects: List[Dict[str, Any]], images: List[Any],
+                       to_type: Literal['real', 'norm_1000', 'norm_1']) -> None:
+        """Convert the bbox of every object in place to the coordinate type `to_type`.
+
+        `real` is pixel coordinates, `norm_1000` is `int(real / size * 999)` and `norm_1` is `real / size`,
+        where `size` is the width (x) or height (y) of the image at index `object['image']`. A bbox is
+        either one box or a list of boxes. Objects whose `bbox_type` already equals `to_type`, or is not
+        one of the three known types, are left untouched.
+
+        Conversions involving `real` use the first four coordinates of a box, paired with
+        (width, height, width, height); conversions between the two normalized types rescale
+        every coordinate.
+
+        Args:
+            objects: The objects containing the bbox
+            images: The images list
+            to_type: The coordinate type needed by the model.
+        """
+        if not objects or not images:
+            return
+
+        def convert(coord, dim, from_type):
+            """Convert one coordinate; an unsupported `to_type` keeps normalized values as they are."""
+            if from_type == 'real':
+                return int(coord / dim * 999) if to_type == 'norm_1000' else coord / dim
+            if from_type == 'norm_1000':
+                if to_type == 'norm_1':
+                    return coord / 999.
+                return int(coord / 999. * dim) if to_type == 'real' else coord
+            if to_type == 'norm_1000':
+                return int(coord * 999)
+            return int(coord * dim) if to_type == 'real' else coord
+
+        def convert_box(box, dims, from_type):
+            """Convert one box; `dims` is None when no pixel size is needed."""
+            if dims is None:
+                return [convert(coord, None, from_type) for coord in box]
+            return [convert(coord, dim, from_type) for coord, dim in zip(box, dims)]
+
+        for object_ in objects:
+            bbox, bbox_type = object_['bbox'], object_['bbox_type']
+            image = images[object_['image']]
+            if bbox_type == to_type or bbox_type not in ('real', 'norm_1000', 'norm_1'):
+                continue
+
+            dims = None
+            if 'real' in (bbox_type, to_type):
+                # Pixel conversions pair the coordinates with (width, height, width, height).
+                dims = [image.width, image.height, image.width, image.height]
+
+            # A list of boxes used to be supported for `real` inputs only; normalized inputs
+            # raised TypeError when the arithmetic hit the inner lists.
+            if bbox and isinstance(bbox[0], list):
+                object_['bbox'] = [convert_box(box, dims, bbox_type) for box in bbox]
+            else:
+                object_['bbox'] = convert_box(bbox, dims, bbox_type)
+            object_['bbox_type'] = to_type
+
+    def pre_tokenize(self, context_list: List[Context], loss_scale_list: List[float],
+                     **kwargs) -> Tuple[List[Context], List[float]]:
+        """This method happens before tokenization: every context that is exactly a standard tag is swapped
+        for what `replace_tag`, `replace_object` or `replace_box` returns; the per-tag counters live in `example`.
+
+        Args:
+            context_list: The content list
+            loss_scale_list: The loss scale list
+        Returns:
+            The context_list and loss_scale_list after replacement.
+        """
+        example = kwargs.get('example')  # get x_index
+        media_types = ('image', 'video', 'audio')
+        for k in media_types:
+            example[f'{k}_index'] = 0
+        media_tags = {f'<{k}>': k for k in media_types}
+
+        res: List[Context] = []  # result of context_list
+        res_loss_scale: List[float] = []  # result of loss_scale_list
+        for context, loss_scale in zip(context_list, loss_scale_list):
+            # Only a string can be a tag; other contexts fall through to the last branch unchanged.
+            kind = media_tags.get(context) if isinstance(context, str) else None
+            if kind is not None:
+                c_list = self.replace_tag(kind, example[f'{kind}_index'], example)
+                example[f'{kind}_index'] += 1
+            elif context == '<ref-object>':
+                c_list = self.replace_object(example.get('object_index', 0), example)
+                example['object_index'] = example.get('object_index', 0) + 1
+            elif context == '<bbox>':
+                c_list = self.replace_box(example.get('box_index', 0), example)
+                example['box_index'] = example.get('box_index', 0) + 1
+            else:
+                c_list = [context]
+            res += c_list
+            res_loss_scale += [loss_scale] * len(c_list)
+        return res, res_loss_scale
+
+    def _encode_context_list(
+            self,
+            context_list: List[Context],
+            loss_scale_list: Optional[List[float]] = None) -> Tuple[List[int], List[int], List[float], Dict[str, Any]]:
+        """Tokenize `context_list`; return: input_ids, labels, loss_scale, tokenizer_kwargs (the first three have one entry per token)"""
+        input_ids: List[int] = []
+        labels: List[int] = []
+        loss_scale: List[float] = []
+        tokenizer_kwargs = {}
+        if loss_scale_list is None:
+            loss_scale_list = [0.] * len(context_list)  # no scales given: every context gets 0., so all labels are masked below
+        for i, (context, loss_weight) in enumerate(zip(context_list, loss_scale_list)):
+            if isinstance(context, str):
+                # tokenizer_kwargs is the returned tokenizer_kwargs,
+                # while curr_tokenizer_kwargs is the tokenizer_kwargs for the current context.
+                curr_tokenizer_kwargs = self._get_tokenizer_kwargs(context)
+                self._concat_tokenizer_kwargs(tokenizer_kwargs, curr_tokenizer_kwargs)
+                token_list = self._tokenize(context, **curr_tokenizer_kwargs)
+            else:
+                token_list = context  # non-str contexts are used as token ids as-is
+            input_ids += token_list
+            if loss_scale_list[i] > 0.0:
+                labels += token_list  # positive loss scale: the tokens are their own labels
+            else:
+                labels += [-100] * len(token_list)  # otherwise the tokens are masked with -100
+            loss_scale.extend([loss_weight] * len(token_list))
+        return input_ids, labels, loss_scale, tokenizer_kwargs
+
+    @staticmethod
+    def use_dynamic_eos(labels: List[int], suffix_tokens_id: List[int]) -> None:  # rewrites `labels` in place, returns nothing
+        suffix_len = len(suffix_tokens_id)
+        start = 0  # index where the current masked (-100) run began; 0 means none seen yet, so a run starting at index 0 is skipped
+        for i in range(1, len(labels)):
+            if labels[i - 1] >= 0 and labels[i] == -100:
+                start = i  # a masked run starts right after a labelled token
+            if start > 0 and labels[i - 1] == -100 and labels[i] >= 0:
+                # [0, 1, 2, -100(start), -100, 3(i), 4]
+                length = i - start  # length of the masked run that ends just before i
+                if length >= suffix_len:
+                    labels[start:start + suffix_len] = suffix_tokens_id  # the head of the run becomes the suffix tokens
+
+    def _concat_and_tokenize(self,
+                             messages: List[Dict[str, str]],
+                             truncation_strategy: str,
+                             auto_add_bos: bool = False,
+                             **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        """
+        return: inputs, tokenizer_kwargs
+        """
+        system = [message for message in messages if message['role'] == 'system']
+        messages = [message for message in messages if message['role'] != 'system']
+        if len(system) > 0:
+            system = system[0]['content']
+        else:
+            system = None
+
+        assert len(messages) >= 1
+        if len(messages) == 1:
+            if messages['role'] == 'response':
+                history = [None, messages['content']]
+                history_roles = [None, messages['role']]
+            else:
+                history = [messages['content'], None]
+                history_roles = [messages['role'], None]
+        else:
+            assert len(messages) % 2 == 0
+            history = [[messages[i]['content'], messages[i+1]['content']] for i in range(len(messages) // 2)]
+            history_roles = [[messages[i]['role'], messages[i + 1]['role']] for i in range(len(messages) // 2)]
+
+        res_context_list: List[Context] = []
+        loss_scale_list: List[float] = []
+        if auto_add_bos:
+            bos_token_id = self.tokenizer.bos_token_id
+            if isinstance(bos_token_id, int) and bos_token_id in self.tokenizer.encode(''):
+                res_context_list.append([bos_token_id])
+                loss_scale_list.append(0.)
+        prompt = self.prompt.copy()
+        if system is None:
+            prompt = [context for context in prompt if '{{SYSTEM}}' not in context]
+        if system is None or any(['{{SYSTEM}}' in context for context in prompt]):
+            prefix = self.prefix
+        else:
+            prefix = self.system_prefix
+        self._concat_context_list(prefix, res_context_list, loss_scale_list, system=system)
+
+        for i, ((q, r), (qr, rr)) in enumerate(zip(history, history_roles)):
+            context_list = self.tool_prompt.copy() if qr == 'tool' else prompt.copy()
+            extra_context_list = []
+            is_suffix = False
+            if i < len(history) - 1:
+                context_list = [context for context in context_list if '{{SYSTEM}}' not in context]
+                context_list.append('{{RESPONSE}}')
+                if history[i + 1][0]:
+                    extra_context_list = self.chat_sep
+            elif r is not None:
+                # last response
+                context_list.append('{{RESPONSE}}')
+                extra_context_list = self.suffix
+                is_suffix = True
+            if q or r:
+                self._concat_context_list(
+                    context_list,
+                    res_context_list,
+                    loss_scale_list,
+                    query=q,
+                    response=r,
+                    system=system,
+                    round0=i,
+                    compute_loss=self.compute_per_round_loss or is_suffix)
+                res_context_list += extra_context_list
+                loss_scale_list += ([1.] if is_suffix else [0.]) * len(extra_context_list)
+        inputs = {}
+        if self.output_prompt_answer:
+            # tokenizer_kwargs: use prompt
+            answer_len = len(extra_context_list) + bool(history[-1][-1] is not None)
+            total_len = len(res_context_list)
+            for key, _slice in zip(['answer', 'prompt'],
+                                   [slice(total_len - answer_len, total_len),
+                                    slice(0, total_len - answer_len)]):
+                _res_context_list, _loss_scale_list = self._simplify_context_list(res_context_list[_slice],
+                                                                                  loss_scale_list[_slice], **kwargs)
+                input_ids, labels, loss_scale, tokenizer_kwargs = self._encode_context_list(
+                    _res_context_list, _loss_scale_list)
+                inputs[f'{key}_input_ids'], inputs[f'{key}_labels'] = input_ids, labels
+                if self.loss_scale:
+                    inputs[f'{key}_loss_scale'] = loss_scale
+            input_ids = inputs['prompt_input_ids'] + inputs['answer_input_ids']
+            labels = inputs['prompt_labels'] + inputs['answer_labels']
+            if history[-1][-1] is None:
+                assert len(inputs['answer_labels']) == 0
+                inputs['answer_labels'] = None
+
+        else:
+            res_context_list, loss_scale_list = self._simplify_context_list(res_context_list, loss_scale_list, **kwargs)
+            input_ids, labels, loss_scale, tokenizer_kwargs = self._encode_context_list(
+                res_context_list, loss_scale_list)
+            if labels is not None:
+                self.use_dynamic_eos(labels, self._encode_context_list(self.suffix)[0])
+
+        if history[-1][-1] is None:
+            labels = None
+
+        if self.max_length is not None:
+            if truncation_strategy == 'delete' and len(input_ids) > self.max_length:
+                logger.warn(f'Current length of row({len(input_ids)}) is larger'
+                            f' than the max_length({self.max_length}), deleted.')
+                return {}, {}
+            input_ids = input_ids[-self.max_length:]
+            if labels is not None:
+                labels = labels[-self.max_length:]
+            if loss_scale is not None:
+                loss_scale = loss_scale[-self.max_length:]
+        inputs['input_ids'] = input_ids
+        inputs['labels'] = labels
+
+        if self.loss_scale:
+            inputs['loss_scale'] = loss_scale
+        return inputs, tokenizer_kwargs
+
+    def _get_tokenizer_kwargs(self, context: str) -> Dict[str, Any]:
+        """return: curr_tokenizer_kwargs, the extra tokenizer kwargs for `context`; this implementation always returns an empty dict"""
+        return {}
+
+    def _concat_tokenizer_kwargs(self, tokenizer_kwargs: Dict[str, Any], curr_tokenizer_kwargs: Dict[str, Any]) -> None:
+        assert len(tokenizer_kwargs) == 0  # nothing is merged here; it only checks the accumulated kwargs are still empty
+
+    @staticmethod
+    def pad_sequence(sequences: List[torch.Tensor],
+                     padding_value: float = 0.,
+                     padding_side: Literal['right', 'left'] = 'right') -> torch.Tensor:
+        """Pad the sequences along dim 0 to the longest one, on the given side, and return them as one batch tensor
+
+        Args:
+            sequences: The input sequences in tensor.
+            padding_value: The padding value
+            padding_side: The padding side; `'right'` delegates to torch's `pad_sequence`, any other value pads on the left
+
+        Returns:
+            A tensor after padding
+        """
+        padding_right = padding_side == 'right'
+        if padding_right:
+            return pad_sequence(sequences, batch_first=True, padding_value=padding_value)  # the module-level torch import, not this method
+
+        max_len = max([s.size(0) for s in sequences])
+
+        padded_sequences = []
+        for seq in sequences:
+            pad_length = max_len - seq.size(0)
+            pad_tuple = [0] * ((seq.dim() - 1) * 2) + [pad_length, 0]  # F.pad lists pads from the last dim backwards: only dim 0 is padded, at the front
+            padded_seq = F.pad(seq, tuple(pad_tuple), 'constant', padding_value)
+            padded_sequences.append(padded_seq)
+
+        return torch.stack(padded_sequences)
+
+    def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = None) -> Dict[str, Any]:
+        """Collate encoded rows into padded batch tensors; which keys are collated is decided from `batch[0]`.
+        Args:
+            batch(`List[Dict[str, Any]]`): The input data in batch
+            padding_to(`int`, optional): Whether padding the batch to a fixed length, if none, the batch
+                will be padded to the `longest`; requires `input_ids` rows (asserted below)
+        """
+        tokenizer = self.tokenizer
+        assert tokenizer.pad_token_id is not None
+        padding_right = self.padding_side == 'right'
+        res = {}
+
+        if 'inputs_embeds' in batch[0]:  # embeddings take precedence over input_ids
+            inputs_embeds = [b['inputs_embeds'] for b in batch]
+            res['inputs_embeds'] = inputs_embeds
+            res['attention_mask'] = [
+                torch.ones((inputs_embeds[i].shape[0]), dtype=torch.int64) for i in range(len(inputs_embeds))
+            ]
+        elif 'input_ids' in batch[0]:
+            input_ids = [torch.tensor(b['input_ids']) for b in batch]
+            res['input_ids'] = input_ids
+            res['attention_mask'] = [torch.ones(len(input_ids[i]), dtype=torch.int64) for i in range(len(input_ids))]
+
+        for key in ['labels', 'loss_scale', 'position_ids']:
+            if key in batch[0]:
+                res[key] = [torch.tensor(b[key]) for b in batch]
+
+        if padding_to is not None:
+            assert 'input_ids' in res
+            padding_len = padding_to - res['input_ids'][0].shape[-1]
+            if padding_len > 0:
+                for key, value in zip(['input_ids', 'attention_mask', 'labels', 'loss_scale', 'position_ids'],
+                                      [tokenizer.pad_token_id, 0, -100, 0., -1]):
+                    if key in res:
+                        res[key][0] = F.pad(res[key][0], (0, padding_len) if padding_right else (padding_len, 0),  # only row 0 is stretched; the loop below pads the others to the longest row
+                                            'constant', value)
+        for key, value in zip(['input_ids', 'inputs_embeds', 'attention_mask', 'labels', 'loss_scale', 'position_ids'],
+                              [tokenizer.pad_token_id, 0., 0, -100, 0., -1]):  # per-key padding value
+            if key in res:
+                res[key] = self.pad_sequence(res[key], value, self.padding_side)
+
+        if '_data' in batch[0]:
+            res['_data'] = [b['_data'] for b in batch]  # passed through as a plain list, not padded
+        # multimodal
+        pixel_values = [b['pixel_values'] for b in batch if b.get('pixel_values') is not None]
+        if len(pixel_values) > 0:
+            res['pixel_values'] = torch.concat(pixel_values)
+
+            image_sizes = [b['image_sizes'] for b in batch if b.get('image_sizes') is not None]  # only collected when pixel_values exist
+            if len(image_sizes) > 0:
+                res['image_sizes'] = torch.concat(image_sizes)
+
+        pixel_values_videos = [b['pixel_values_videos'] for b in batch if b.get('pixel_values_videos') is not None]
+        if len(pixel_values_videos) > 0:
+            res['pixel_values_videos'] = torch.concat(pixel_values_videos)
+        return res
+
+    @classmethod
+    def get_generate_ids(cls, generate_ids: torch.Tensor, input_token_len: int) -> List[int]:  # normalises the input, then defers to `_get_generate_ids`
+        if isinstance(generate_ids, torch.Tensor):
+            generate_ids = generate_ids.tolist()
+        if len(generate_ids) >= 1 and isinstance(generate_ids[0], (list, tuple)):
+            generate_ids = generate_ids[0]  # nested (batched) ids: only the first row is used
+        return cls._get_generate_ids(generate_ids, input_token_len)
+
+    @staticmethod
+    def _get_generate_ids(generate_ids: List[int], input_token_len: int) -> List[int]:
+        return generate_ids[input_token_len:]  # drop the first `input_token_len` ids and keep the rest
+
+    @staticmethod
+    def _is_chinese_char(cp: int) -> bool:
+        """Checks whether CP is the codepoint of a CJK character (i.e. falls in one of the ranges listed below)."""
+        # copy from transformers.generation.streamers.TextStreamer
+        if ((0x4E00 <= cp <= 0x9FFF) or (0x3400 <= cp <= 0x4DBF) or (0x20000 <= cp <= 0x2A6DF)
+                or (0x2A700 <= cp <= 0x2B73F) or (0x2B740 <= cp <= 0x2B81F) or (0x2B820 <= cp <= 0x2CEAF)
+                or (0xF900 <= cp <= 0xFAFF) or (0x2F800 <= cp <= 0x2FA1F)):
+            return True
+
+        return False
+
+    @classmethod
+    def _get_safe_print_idx(cls, response: str, print_idx: int, is_finished: bool = False) -> int:  # returns how many chars of `response` may be printed
+        if is_finished:
+            return len(response)  # finished: everything can be printed
+        if response.endswith('\n') or len(response) > 0 and cls._is_chinese_char(ord(response[-1])):  # `and` binds tighter than `or`
+            print_idx = len(response)  # ends with a newline or a CJK character: print up to the end
+        else:
+            print_idx = max(response.rfind(' ') + 1, print_idx)  # otherwise stop after the last space, never moving backwards
+        return print_idx
+
+    def generate_ids_to_response(
+        self,
+        generate_ids: List[int],
+        is_finished: bool = True,
+        *,
+        tokenizer_kwargs: Optional[Dict[str, Any]] = None,
+        # the parameters below are only used with stream=True
+        return_delta: bool = False,  # return only the text after the previous print index
+        print_idx: Optional[List[int]] = None,  # one-element list holding the print index; element 0 is updated in place
+        first_num_space: Optional[List[int]] = None,  # one-element list holding the leading-space count; -1 means not recorded yet
+    ):
+        if tokenizer_kwargs is None:
+            tokenizer_kwargs = {}
+        tokenizer = self.tokenizer
+        if hasattr(generate_ids, 'tolist'):
+            generate_ids = generate_ids.tolist()
+        # avoid printing template.suffix[-1]
+        if isinstance(self.suffix[-1], list) and (not is_finished or is_finished  # unfinished: always cut; finished: cut only on an exact match
+                                                  and generate_ids[-len(self.suffix[-1]):] == self.suffix[-1]):
+            generate_ids = generate_ids[:-len(self.suffix[-1])]
+        if not is_finished or is_finished and generate_ids[-1:] == [self.tokenizer.eos_token_id]:  # same rule for a trailing eos token
+            generate_ids = generate_ids[:-1]
+        response = tokenizer.decode(generate_ids, **tokenizer_kwargs)
+        if first_num_space is not None:
+            # Avoid the occurrence of repeated words in sentence.
+            res_fns = first_num_space  # res_first_num_space
+            first_num_space = first_num_space[0]
+            cur_num_space = len(response) - len(response.lstrip(' '))
+            if not is_finished and first_num_space == -1:
+                first_num_space = cur_num_space
+                res_fns[0] = first_num_space  # record the count in the caller's list
+            if cur_num_space < first_num_space:
+                response = ' ' * (first_num_space - cur_num_space) + response  # pad back to the recorded number of leading spaces
+            elif cur_num_space > first_num_space:
+                response = response[cur_num_space - first_num_space:]  # trim down to the recorded number of leading spaces
+        if isinstance(self.suffix[-1],
+                      str) and (not is_finished or is_finished and response[-len(self.suffix[-1]):] == self.suffix[-1]):
+            idx = max(len(response) - len(self.suffix[-1]), 0)
+            # To avoid response length being shorter than previous response length during streaming.
+            if print_idx is not None:
+                idx = max(idx, print_idx[0])
+            response = response[:idx]
+
+        if print_idx is not None:
+            old_print_idx = print_idx[0]
+            if not is_finished:
+                # avoid printing incomplete words
+                print_idx[0] = self._get_safe_print_idx(response, print_idx[0])
+                response = response[:print_idx[0]]
+            if return_delta:
+                response = response[old_print_idx:]
+        else:
+            assert is_finished and not return_delta  # without print_idx only a complete, non-delta response is supported
+        return response
+
+    def post_process_generate_response(self, response: str, example: dict) -> str:
+        return response  # this implementation returns `response` unchanged and does not read `example`
diff --git a/modelscope/preprocessors/templates/loader.py b/modelscope/preprocessors/templates/loader.py
new file mode 100644
index 00000000..e3ed9f89
--- /dev/null
+++ b/modelscope/preprocessors/templates/loader.py
@@ -0,0 +1,371 @@
+import re
+from dataclasses import dataclass
+from typing import Any, Dict, List
+
+import requests
+
+from modelscope import AutoTokenizer, get_logger, snapshot_download
+from . import TemplateType
+from .base import Template, get_template
+
+logger = get_logger()
+
+
+@dataclass
+class TemplateInfo:
+    """One `template_info` entry: a model-id regex plus the template name and/or Modelfile URL registered for it."""
+    template: str = None  # template name handed to `get_template`; entries without one are skipped by `load_by_model_id`
+    template_regex: str = None  # pattern tried against the model id with `re.fullmatch`
+    modelfile_link: str = None  # URL whose content `to_ollama` returns as-is when set
+
+
+def cases(*names):  # builds a case-insensitive regex fragment matching any of `names`; with no names `ret[0]` raises IndexError
+    ret = []
+    for name in names:
+        regex = ''
+        for letter in name:
+            if letter.upper() != letter.lower():  # cased letter
+                regex += f'[{letter.upper()}{letter.lower()}]'
+            else:
+                regex += letter  # NOTE(review): copied unescaped, so '.' in e.g. "v2.5" stays a regex wildcard
+        ret.append(regex)
+    if len(ret) > 1:
+        ret = '|'.join(ret)
+        ret = '(' + ret + ')'  # several names become one capturing alternation group
+    else:
+        ret = ret[0]  # a single name is returned without a group
+    return ret
+
+
+chat_suffix = cases('instruct', 'chat', '-rl', '-it')  # case-insensitive `(instruct|chat|-rl|-it)` group used by most `template_regex` entries below
+
+
+def no(*names):  # negative lookahead: fails if any of `names` (case-insensitive) occurs after the position where it is placed
+    return f'(?!.*{cases(*names)})'
+
+
+def no_multi_modal():  # lookahead rejecting ids whose remainder contains audio / video / vl / vision
+    return no('audio', 'video', 'vl', 'vision')
+
+
+template_info = [
+    # llama
+    TemplateInfo(
+        template=TemplateType.llama3,
+        template_regex=
+        f'.*{cases("llama3", "llama-3")}{no_multi_modal()}.*{chat_suffix}.*',
+        modelfile_link=
+        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/llama-3.modelfile',
+    ),
+    TemplateInfo(
+        template=TemplateType.llama,
+        template_regex=
+        f'.*{cases("llama2", "llama-2", "mistral", "codestral", "mixtral")}{no_multi_modal()}.*{chat_suffix}.*'
+    ),
+
+    # qwen
+    TemplateInfo(
+        template=TemplateType.qwen,
+        template_regex=f'.*{cases("qwen")}{no_multi_modal()}.*{chat_suffix}.*',
+        modelfile_link=
+        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/qwen2.modelfile',
+    ),
+
+    # codeqwen1.5
+    TemplateInfo(
+        template_regex=
+        f'.*{cases("codeqwen1.5", "codeqwen-1.5")}.*{chat_suffix}.*',
+        modelfile_link=
+        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/codeqwen1.5.modelfile',
+    ),
+
+    # chatml
+    TemplateInfo(
+        template=TemplateType.chatml,
+        template_regex=
+        f'.*{cases("yi")}{no_multi_modal()}{no("coder")}.*{chat_suffix}.*',
+        modelfile_link=
+        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/yi-1.5.modelfile',
+    ),
+
+    # chatml
+    TemplateInfo(
+        template=TemplateType.chatml,
+        template_regex=f'.*{cases("minicpm")}{no("-v")}.*',
+        modelfile_link=
+        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/yi-1.5.modelfile'
+    ),
+
+    # chatglm
+    TemplateInfo(
+        template=TemplateType.chatglm2,
+        template_regex=f'.*{cases("chatglm2")}{no_multi_modal()}.*'),
+    TemplateInfo(
+        template=TemplateType.chatglm3,
+        template_regex=f'.*{cases("chatglm3")}{no_multi_modal()}.*'),
+    TemplateInfo(
+        template=TemplateType.chatglm4,
+        template_regex=f'.*{cases("glm4")}{no_multi_modal()}.*{chat_suffix}.*',
+        modelfile_link=
+        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/glm4.modelfile',
+    ),
+
+    # baichuan
+    TemplateInfo(
+        template=TemplateType.baichuan,
+        template_regex=
+        f'.*{cases("baichuan")}{no_multi_modal()}.*{chat_suffix}.*'),
+
+    # codegeex
+    TemplateInfo(
+        template=TemplateType.codegeex4,
+        template_regex=f'.*{cases("codegeex4")}{no_multi_modal()}.*'),
+
+    # idefics3
+    TemplateInfo(
+        template=TemplateType.idefics3,
+        template_regex=f'.*{cases("idefics3")}{no_multi_modal()}.*'),
+
+    # mistral-nemo
+    TemplateInfo(
+        template=TemplateType.mistral_nemo,
+        template_regex=f'.*{cases("Mistral-Nemo")}{no_multi_modal()}.*',
+        modelfile_link='https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/mistral-nemo.modelfile'),
+
+    # internlm
+    TemplateInfo(
+        template=TemplateType.internlm,
+        template_regex=
+        f'.*{cases("internlm")}{no("internlm2", "internlm3")}{no_multi_modal()}.*{chat_suffix}.*'
+    ),
+
+    # internlm2
+    TemplateInfo(
+        template=TemplateType.internlm2,
+        template_regex=
+        f'.*{cases("internlm2")}{no_multi_modal()}.*{chat_suffix}.*'),
+
+    # yi-coder
+    TemplateInfo(
+        template=TemplateType.yi_coder,
+        template_regex=f'.*{cases("yi")}.*{cases("coder")}.*{chat_suffix}.*'),
+
+    # yuan
+    TemplateInfo(
+        template=TemplateType.yuan,
+        template_regex=f'.*{cases("Yuan")}{no_multi_modal()}.*'),
+
+    # xverse
+    TemplateInfo(
+        template=TemplateType.xverse,
+        template_regex=f'.*{cases("xverse")}{no_multi_modal()}.*{chat_suffix}.*'
+    ),
+
+    # skywork
+    TemplateInfo(
+        template=TemplateType.skywork,
+        template_regex=
+        f'.*{cases("skywork")}{no_multi_modal()}.*{chat_suffix}.*'),
+
+    # bluelm
+    TemplateInfo(
+        template=TemplateType.bluelm,
+        template_regex=f'.*{cases("bluelm")}{no_multi_modal()}.*{chat_suffix}.*'
+    ),
+
+    # zephyr
+    TemplateInfo(
+        template=TemplateType.zephyr,
+        template_regex=f'.*{cases("zephyr")}{no_multi_modal()}.*'),
+
+    # deepseek
+    TemplateInfo(
+        template=TemplateType.deepseek,
+        template_regex=
+        f'.*{cases("deepseek")}{no("v2", "v2.5", "coder")}{no_multi_modal()}.*{chat_suffix}.*'
+    ),
+
+    # deepseek2
+    TemplateInfo(
+        template=TemplateType.deepseek2,
+        template_regex=
+        f'.*{cases("deepseek")}.*{cases("v2")}{no("v2.5")}{no_multi_modal()}.*{chat_suffix}.*',
+        modelfile_link=
+        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/deepseek_v2.modelfile',
+    ),
+
+    # deepseek_coder
+    TemplateInfo(
+        template=TemplateType.deepseek_coder,
+        template_regex=
+        f'.*{cases("deepseek")}{no("v2", "v2.5")}.*{cases("coder")}.*{chat_suffix}.*'
+    ),
+
+    # deepseek v2.5
+    TemplateInfo(
+        template=TemplateType.deepseek2_5,
+        template_regex=
+        f'.*{cases("deepseek")}.*{cases("v2.5")}{no_multi_modal()}.*'),
+
+    # orion
+    TemplateInfo(
+        template=TemplateType.orion,
+        template_regex=f'.*{cases("orion")}{no_multi_modal()}.*{chat_suffix}.*'
+    ),
+
+    # gemma
+    TemplateInfo(
+        template=TemplateType.gemma,
+        template_regex=
+        f'{no("pali")}.*{cases("gemma2", "gemma-2")}\\b.*{chat_suffix}.*',
+        modelfile_link=
+        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/gemma2.modelfile',
+    ),
+
+    # phi3
+    TemplateInfo(
+        template=TemplateType.phi3,
+        template_regex=
+        f'.*{cases("phi3", "phi-3")}{no_multi_modal()}.*{chat_suffix}.*',
+        modelfile_link=
+        'https://modelscope.oss-cn-beijing.aliyuncs.com/llm_template/ollama/phi3.modelfile',
+    ),
+
+    # telechat
+    TemplateInfo(
+        template=TemplateType.telechat,
+        template_regex=f'.*{cases("TeleChat")}{no("v2")}.*'),
+
+    # telechat_v2
+    TemplateInfo(
+        template=TemplateType.telechat_v2,
+        template_regex=f'.*{cases("TeleChat")}.*{cases("v2")}.*'),
+]
+
+
+class TemplateLoader:
+
+    @staticmethod
+    def load_by_model_id(model_id: str, **kwargs) -> Template:
+        """Load a template by model-id
+
+        Args:
+            model_id: The model-id used to load the proper template
+            kwargs:
+                revision: the revision of the model, default is `master`
+        Returns:
+            The template instance
+        """
+        ignore_file_pattern = [r'.+\.bin$', r'.+\.safetensors$', r'.+\.gguf$']
+        tokenizer = kwargs.get('tokenizer')
+        for _info in template_info:
+            if re.fullmatch(_info.template_regex, model_id):
+                if _info.template:
+                    if tokenizer is None:
+                        try:
+                            model_dir = snapshot_download(
+                                model_id,
+                                revision=kwargs.pop('revision', 'master'),
+                                ignore_file_pattern=ignore_file_pattern)
+                            tokenizer = AutoTokenizer.from_pretrained(
+                                model_dir, trust_remote_code=True)
+                        except Exception:
+                            pass
+                    return TemplateLoader.load_by_template_name(
+                        _info.template, tokenizer=tokenizer, **kwargs)
+
+    @staticmethod
+    def load_by_template_name(template_name: str, **kwargs) -> Template:
+        """Load a template by model-id
+
+        Args:
+            template_name: The template name used to load the proper template
+            kwargs:
+                tokenizer: The tokenizer of the model
+                default_system: The extra default system info
+                max_length: The max_length for the sequence
+                truncation_strategy: 'delete' or 'truncation_left' the sequence of the length exceeds the limit
+        Returns:
+            The template instance
+        """
+        return get_template(template_name, tokenizer=kwargs.pop('tokenizer', None), **kwargs)
+
+    @staticmethod
+    def replace_and_concat(template: Template, template_list: List,
+                           placeholder: str, keyword: str):
+        final_str = ''
+        for t in template_list:
+            if isinstance(t, str):
+                final_str += t.replace(placeholder, keyword)
+            elif isinstance(t, (tuple, list)):
+                if isinstance(t[0], int):
+                    final_str += template.tokenizer.decode(t)
+                else:
+                    for attr in t:
+                        if attr == 'bos_token_id':
+                            final_str += template.tokenizer.bos_token
+                        elif attr == 'eos_token_id':
+                            final_str += template.tokenizer.eos_token
+                        else:
+                            raise ValueError(f'Unknown token: {attr}')
+        return final_str
+
+    @staticmethod
+    def to_ollama(model_id: str = None,
+                  template_name: str = None,
+                  gguf_file: str = None,
+                  gguf_meta: Dict[str, Any] = None,
+                  **kwargs) -> str:
+        """Export to ollama ModelFile
+
+        Args:
+            model_id: The model-id to use
+            template_name: An extra template name to use
+            gguf_file: An extra gguf_file path to use in the `FROM` field
+            gguf_meta: An gguf extra meta info
+        Returns:
+            The ModelFile content, returns `None` if no template found
+        """
+        logger.info('Exporting to ollama:')
+        if model_id:
+            for _info in template_info:
+                if re.fullmatch(_info.template_regex, model_id):
+                    if _info.modelfile_link:
+                        return TemplateLoader._read_content_from_url(
+                            _info.modelfile_link)
+                    elif _info.template and not template_name:
+                        template_name = _info.template
+        if template_name:
+            template = TemplateLoader.load_by_template_name(
+                template_name, **kwargs)
+        else:
+            raise ValueError(
+                f'Please make sure you model_id: {model_id} '
+                f'and template_name: {template_name} is supported.')
+
+        if template is None:
+            return None
+
+        content = ''
+        content += 'FROM {{gguf_file}}\n'
+        content += (
+            f'TEMPLATE """{{{{ if .System }}}}'
+            f'{TemplateLoader.replace_and_concat(template, template.system_prefix or [], "{{SYSTEM}}", "{{ .System }}")}'
+            f'{{{{ else }}}}{TemplateLoader.replace_and_concat(template, template.prefix, "", "")}'
+            f'{{{{ end }}}}')
+        content += (
+            f'{{{{ if .Prompt }}}}'
+            f'{TemplateLoader.replace_and_concat(template, template.prompt, "{{QUERY}}", "{{ .Prompt }}")}'
+            f'{{{{ end }}}}')
+        content += '{{ .Response }}'
+        content += TemplateLoader.replace_and_concat(template, template.suffix,
+                                                     '', '') + '"""\n'
+        content += f'PARAMETER stop "{TemplateLoader.replace_and_concat(template, template.suffix, "", "")}"\n'
+        return content
+
+    @staticmethod
+    def _read_content_from_url(url):
+        response = requests.get(url)
+        response.raise_for_status()
+        content = response.content
+        return content.decode('utf-8')
diff --git a/modelscope/preprocessors/templates/loss_scale.py b/modelscope/preprocessors/templates/loss_scale.py
new file mode 100644
index 00000000..e253c746
--- /dev/null
+++ b/modelscope/preprocessors/templates/loss_scale.py
@@ -0,0 +1,101 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import json
+import os
+from typing import Dict, List, Optional, Tuple
+
+from .utils import split_str_parts_by, split_parts_by_regex
+
+
+def calculate_loss_scale(query: str,
+                         response: str,
+                         response_loss_scale_map: Optional[Dict[str, list]] = None,
+                         query_loss_scale_map: Optional[Dict[str, list]] = None) -> Tuple[List[str], List[float]]:
+    """Calculate the loss scale by splitting the agent response.
+
+    This algorithm comes from paper: https://arxiv.org/pdf/2309.00986.pdf
+    Agent response format:
+    ```text
+        Thought: you should always think about what to do
+        Action: the action to take, should be one of the above tools[fire_recognition,
+            fire_alert, call_police, call_fireman]
+        Action Input: the input to the action
+        Observation: the result of the action
+        ... (this Thought/Action/Action Input/Observation can be repeated zero or more times)
+        Thought: I now know the final answer
+        Final Answer: the final answer to the original input question
+    ```
+    Args:
+        query: user query; when it contains a key of `query_loss_scale_map` the whole response gets that weight.
+        response: agent response to split into weighted parts.
+        response_loss_scale_map: key -> weights; 2-item values are literal delimiters, 1-item values regex rules.
+        query_loss_scale_map: key -> weight (a number, or a list whose first item is used); left unmodified.
+    Returns:
+        Response parts and their weights; `([response], [1.0])` when no `response_loss_scale_map` is given.
+    """
+    for key, scale in (query_loss_scale_map or {}).items():
+        if key in query:
+            # Read the weight without writing a normalised list back into the caller's map.
+            scale_value = scale if isinstance(scale, (float, int)) else scale[0]
+            return [response], [float(scale_value)]
+    if response_loss_scale_map is None:
+        return [response], [1.0]  # previously crashed on `None.keys()`
+    delimiters = [k for k, v in response_loss_scale_map.items() if len(v) == 2]
+    agent_parts = split_str_parts_by(response, delimiters)
+    regex_delimiters = {k: v for k, v in response_loss_scale_map.items() if len(v) == 1}
+    if regex_delimiters:
+        split_parts_by_regex(agent_parts, regex_delimiters)
+    weights, agent_content = [], []
+    for part in agent_parts:
+        key = part['key']
+        if isinstance(key, (float, int)):
+            weights.append(key)  # a numeric key is used as the weight itself
+            agent_content.append(part['content'])
+        elif key in response_loss_scale_map:
+            scales = response_loss_scale_map[key]
+            weights += [scales[0], scales[1]]  # one weight for the delimiter text, one for its content
+            agent_content += [key, part['content']]
+        else:
+            weights.append(1.0)
+            agent_content.append(part['content'])
+    return agent_content, weights
+
+
+def alpha_umi_loss_scale(query: str, response: str):
+    """Weight `response` with the map stored in `alpha_umi_loss_scale_config.json` in the working directory."""
+    config_path = os.path.join(os.getcwd(), 'alpha_umi_loss_scale_config.json')
+    with open(config_path, 'r') as config_file:
+        response_map = json.load(config_file)
+    # The whole file content is the response map; no query map is used.
+    return calculate_loss_scale(query, response, response_map)
+
+
+def agentflan_loss_scale(query: str, response: str):
+    """Weight `response` with the `query` / `response` maps read from `agentflan.json` in the working directory."""
+    config_path = os.path.join(os.getcwd(), 'agentflan.json')
+    with open(config_path, 'r') as config_file:
+        config = json.load(config_file)
+    # Both sections are looked up before delegating, query first, as a missing key must raise here.
+    query_map = config['query']
+    response_map = config['response']
+    return calculate_loss_scale(query, response, response_map, query_map)
+
+
+def react_loss_scale(query: str, response: str):
+    """Weight `response` with the map stored in `default_loss_scale_config.json` in the working directory."""
+    config_path = os.path.join(os.getcwd(), 'default_loss_scale_config.json')
+    with open(config_path, 'r') as config_file:
+        response_map = json.load(config_file)
+    # The whole file content is the response map; no query map is used.
+    return calculate_loss_scale(query, response, response_map)
+
+
+def default_loss_scale(query: str, response: str) -> Tuple[List[str], List[float]]:
+    return [response], [1.0]  # `query` is ignored; the whole response is a single part with weight 1.0
+
+
+loss_scale_map = {  # strategy name -> function taking (query, response)
+    'agentflan': agentflan_loss_scale,
+    'react': react_loss_scale,
+    'alpha_umi': alpha_umi_loss_scale,
+    'default': default_loss_scale,
+}
diff --git a/modelscope/preprocessors/templates/template.py b/modelscope/preprocessors/templates/template.py
new file mode 100644
index 00000000..762bcde0
--- /dev/null
+++ b/modelscope/preprocessors/templates/template.py
@@ -0,0 +1,2274 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+import json
+import os
+import re
+from functools import partial
+from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, TypeVar, Union
+
+import torch
+import transformers
+from packaging import version
+from transformers import PreTrainedTokenizerBase
+from transformers.dynamic_module_utils import get_class_from_dynamic_module
+from transformers.integrations import is_deepspeed_zero3_enabled
+
+from modelscope import get_logger
+from .base import Template, TEMPLATE_MAPPING
+from .utils import (load_audio_qwen, load_batch, load_image, load_video_cogvlm2, load_video_internvl,
+                    load_video_llava, load_video_minicpmv_mplug_owl3, load_video_qwen2,
+                    transform_image, upper_bound, fetch_one)
+
+logger = get_logger()
+
+DEFAULT_SYSTEM = 'You are a helpful assistant.'
+History = List[Union[Tuple[str, str], List[str]]]
+Prompt = List[Union[str, List[int], List[str]]]
+StopWords = Prompt
+Context = Union[str, List[int]]
+
+
+class TemplateType:
+    # text-generation
+    default_generation = 'default-generation'
+    chatglm_generation = 'chatglm-generation'
+    qwen_vl_generation = 'qwen-vl-generation'
+    qwen_audio_generation = 'qwen-audio-generation'
+    # chat
+    default = 'default'
+    qwen = 'qwen'
+    qwen_vl = 'qwen-vl'
+    qwen_audio = 'qwen-audio'
+    qwen2_audio = 'qwen2-audio'
+    qwen2_audio_generation = 'qwen2-audio-generation'
+    qwen2_vl = 'qwen2-vl'
+    modelscope_agent = 'modelscope-agent'
+    baichuan = 'baichuan'
+    chatglm2 = 'chatglm2'
+    chatglm3 = 'chatglm3'
+    chatglm4 = 'chatglm4'
+    codegeex4 = 'codegeex4'
+    llama = 'llama'  # llama2
+    llama3 = 'llama3'
+    reflection = 'reflection'
+    longwriter_llama3 = 'longwriter-llama3'
+    # llava-hf
+    llava1_5 = 'llava1_5'
+    llava_mistral = 'llava-mistral'
+    llava_vicuna = 'llava-vicuna'
+    llava_yi = 'llava-yi'
+    llama3_llava_next_hf = 'llama-llava-next-hf'
+    llava_next_llama3 = 'llava-next-llama3'
+    llava_qwen_hf = 'llama-qwen-hf'
+    llava_onevision_qwen = 'llava-onevision-qwen'
+    # llava-video
+    llava_next_video = 'llava-next-video'
+    llava_next_video_yi = 'llava-next-video-yi'
+    # lmms-lab:llava
+    llama3_llava_next = 'llama3-llava-next'
+    llava_qwen = 'llava-qwen'
+    # xtuner:llava
+    llava_llama_instruct = 'llava-llama-instruct'
+
+    idefics3 = 'idefics3'
+    mistral_nemo = 'mistral-nemo'
+    openbuddy = 'openbuddy'
+    openbuddy2 = 'openbuddy2'
+    internlm = 'internlm'
+    internlm2 = 'internlm2'
+    internlm_xcomposer2 = 'internlm-xcomposer2'
+    internlm_xcomposer2_4khd = 'internlm-xcomposer2-4khd'
+    internlm_xcomposer2_5 = 'internlm-xcomposer2_5'
+    internvl = 'internvl'
+    internvl2 = 'internvl2'
+    internvl_phi3 = 'internvl-phi3'
+    internvl2_phi3 = 'internvl2-phi3'
+    florence = 'florence'
+    yi_coder = 'yi-coder'
+    yi_vl = 'yi-vl'
+    yuan = 'yuan'
+    xverse = 'xverse'
+    ziya = 'ziya'
+    skywork = 'skywork'
+    bluelm = 'bluelm'
+    zephyr = 'zephyr'
+    sus = 'sus'
+    deepseek = 'deepseek'
+    numina_math = 'numina-math'
+    deepseek_coder = 'deepseek-coder'
+    deepseek_vl = 'deepseek-vl'
+    deepseek2 = 'deepseek2'
+    deepseek2_5 = 'deepseek2_5'
+    codefuse_codellama = 'codefuse-codellama'
+    codefuse = 'codefuse'
+    cogvlm = 'cogvlm'
+    cogvlm2_video = 'cogvlm2-video'
+    glm4v = 'glm4v'
+    cogagent_chat = 'cogagent-chat'
+    cogagent_instruct = 'cogagent-instruct'
+    orion = 'orion'
+    minicpm = 'minicpm'
+    minicpm_v = 'minicpm-v'
+    minicpm_v_v2_5 = 'minicpm-v-v2_5'
+    minicpm_v_v2_6 = 'minicpm-v-v2_6'
+    gemma = 'gemma'
+    paligemma = 'paligemma'
+    mplug_owl2 = 'mplug-owl2'
+    mplug_owl3 = 'mplug_owl3'
+    wizardlm2_awq = 'wizardlm2-awq'
+    wizardlm2 = 'wizardlm2'
+    atom = 'atom'
+    phi3 = 'phi3'
+    phi3_vl = 'phi3-vl'
+    telechat = 'telechat'
+    telechat_v2 = 'telechat-v2'
+    dbrx = 'dbrx'
+    mengzi = 'mengzi'
+    c4ai = 'c4ai'
+    chatml = 'chatml'
+    # Deprecated: kept only for backward compatibility.
+    default_generation_bos = 'default-generation-bos'
+
+    @classmethod
+    def get_template_name_list(cls) -> List[str]:
+        """Return the values declared on this class, skipping dunder entries and this method itself."""
+        # Only `cls.__dict__` is consulted, so attributes inherited from a base class are not listed.
+        return [
+            value for attr, value in cls.__dict__.items()
+            if not (attr.startswith('__') or attr == 'get_template_name_list')
+        ]
+
+
+def register_template(template_type: str, template: Template, *, exist_ok: bool = False, **kwargs) -> None:
+    """Store `template` together with the extra `kwargs` in TEMPLATE_MAPPING under `template_type`."""
+    if template_type in TEMPLATE_MAPPING and not exist_ok:
+        raise ValueError(f'The `{template_type}` has already been registered in the TEMPLATE_MAPPING.')
+    template.template_type = template_type  # the template object records the name it is registered under
+    TEMPLATE_MAPPING[template_type] = {'template': template, **kwargs}
+
+
+register_template(
+    TemplateType.default,
+    Template([], ['### Human:\n{{QUERY}}\n\n### Assistant:\n'], ['\n\n'], [['eos_token_id']],
+             DEFAULT_SYSTEM, ['{{SYSTEM}}\n\n'],
+             auto_add_bos=True))
+
+
+# You can set the query as '' to serve as a template for pre-training.
+class DefaultGenerationTemplate(Template):
+    """Template built from a bare `{{QUERY}}` prompt and an `eos_token_id` entry, with `auto_add_bos=True`."""
+    def __init__(self):
+        super().__init__([], ['{{QUERY}}'], None, [['eos_token_id']], auto_add_bos=True)
+
+
+register_template(TemplateType.default_generation, DefaultGenerationTemplate(), is_generation=True)
+register_template(
+    TemplateType.default_generation_bos,
+    Template([['bos_token_id']], ['{{QUERY}}'], None, [['eos_token_id']]),
+    is_generation=True)
+
+
+class ChatmlTemplateMixin:
+    system = None  # forwarded to Template.__init__ below; subclasses in this file override it
+    # Template.__init__ is called explicitly (not via super()); the classes here mix this in next to Template.
+    def __init__(self, auto_add_bos: bool = True):
+        Template.__init__(
+            self, [], ['<|im_start|>user\n{{QUERY}}<|im_end|>\n<|im_start|>assistant\n'], ['<|im_end|>\n'],
+            ['<|im_end|>'],
+            self.system, ['<|im_start|>system\n{{SYSTEM}}<|im_end|>\n'],
+            auto_add_bos=auto_add_bos)
+
+
+class ChatmlTemplate(ChatmlTemplateMixin, Template):
+    pass
+
+
+class QwenTemplateMixin(ChatmlTemplateMixin):
+    system = DEFAULT_SYSTEM  # same ChatML strings as the parent, with DEFAULT_SYSTEM as the `system` value
+
+    def __init__(self):
+        super().__init__(auto_add_bos=False)  # the parent's own default is True
+
+
+class QwenTemplate(QwenTemplateMixin, Template):
+    pass
+
+
+class _QwenVLTemplateMixin:
+    """Mixin rendering images, object captions and boxes as `<img>`, `<ref>` and `<box>` text tags."""
+    load_medias = False
+
+    def check_example(self, example):
+        """Assert that the example's images (if any) are given as strings, judged via `fetch_one`."""
+        images = example.get('images') or []
+        assert not images or isinstance(fetch_one(images), str), 'QwenVL only supports datasets with images paths!'
+
+    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
+                    example: Dict[str, Any]) -> List[Context]:
+        """Return the `Picture N:<img>...</img>` line for the `index`-th image of `example`."""
+        assert media_type == 'image'
+        images = example.get('images') or []
+        image_path = images[index]
+        assert isinstance(image_path, str)
+        # Pictures are numbered from 1 in the rendered text.
+        return [f'Picture {index + 1}:<img>{image_path}</img>\n']
+
+    def replace_object(self, index: int, example: Dict[str, Any]) -> List[Context]:
+        """Return the `<ref>` tag wrapping the caption of the `index`-th object."""
+        caption = example['objects'][index]['caption']
+        return [f'<ref>{caption}</ref>']
+
+    def replace_box(self, index: int, example: Dict[str, Any]) -> List[Context]:
+        """Return the `<box>` tag(s) of the `index`-th object as one concatenated string.
+
+        `bbox` may be a flat 4-item list or a list of such lists; each inner list yields one tag.
+        """
+        bbox = example['objects'][index]['bbox']
+        # Treat a flat box as a one-element list so both shapes share the same rendering.
+        boxes = bbox if isinstance(bbox[0], list) else [bbox]
+        return [''.join(f'<box>({box[0]},{box[1]}),({box[2]},{box[3]})</box>' for box in boxes)]
+
+
+register_template(TemplateType.qwen, QwenTemplate())
+
+
+class QwenVLTemplate(_QwenVLTemplateMixin, QwenTemplate):
+    pass
+
+
+class QwenVLGenerationTemplate(_QwenVLTemplateMixin, DefaultGenerationTemplate):
+    pass
+
+
+register_template(TemplateType.qwen_vl, QwenVLTemplate())
+register_template(TemplateType.qwen_vl_generation, QwenVLGenerationTemplate())
+
+register_template(TemplateType.chatml, ChatmlTemplate())
+
+register_template(
+    TemplateType.modelscope_agent,
+    Template([], [' \n\n<|user|>:{{QUERY}} \n\n<|assistant|>:'], [], [' \n\n</s>'], DEFAULT_SYSTEM,
+             [' \n\n<|system|>:{{SYSTEM}}']))
+
+
+class _QwenAudioTemplateMixin:
+    """Mixin that renders audio paths as `<audio>` tags and carries the tokenizer's `audio_info` with the inputs."""
+    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
+                    example: Dict[str, Any]) -> List[Context]:
+        assert media_type == 'audio'
+        audios = example.get('audios') or []
+        audio = audios[index]
+        assert isinstance(audio, str)
+        return [f'Audio {index + 1}:<audio>{audio}</audio>\n']  # audios are numbered from 1
+
+    def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        inputs, tokenizer_kwargs = Template._encode(self, example)
+        if len(inputs) == 0:  # nothing was encoded: pass the empty result through unchanged
+            return inputs, tokenizer_kwargs
+        inputs.pop('loss_scale', None)
+        inputs.update(tokenizer_kwargs)  # the tokenizer kwargs are merged into the returned inputs as well
+        return inputs, tokenizer_kwargs
+
+    def _get_tokenizer_kwargs(self, context: str) -> Dict[str, Any]:
+        return {'audio_info': self.tokenizer.process_audio(context)}  # relies on the tokenizer's `process_audio`
+
+    def _concat_tokenizer_kwargs(self, tokenizer_kwargs: Dict[str, Any], curr_tokenizer_kwargs: Dict[str, Any]) -> None:
+        audio_info = curr_tokenizer_kwargs.get('audio_info')  # merged into `tokenizer_kwargs` in place below
+        old_audio_info = tokenizer_kwargs.get('audio_info')
+        if old_audio_info is None:
+            tokenizer_kwargs['audio_info'] = audio_info
+        elif audio_info is not None:
+            for k in ['input_audios', 'input_audio_lengths']:  # joined with torch.concat along dim 0
+                old_audio_info[k] = torch.concat([old_audio_info[k], audio_info[k]], dim=0)
+            for k in ['audio_span_tokens', 'audio_urls']:  # joined with `+`
+                old_audio_info[k] = old_audio_info[k] + audio_info[k]
+
+    def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = None) -> Dict[str, Any]:
+        res = Template.data_collator(self, batch, padding_to)
+        if batch[0].get('audio_info') is not None:  # NOTE(review): only the first sample is inspected
+            res['audio_info'] = [b['audio_info'] for b in batch]  # kept as a per-sample list, not concatenated
+        return res
+
+
+class QwenAudioTemplate(_QwenAudioTemplateMixin, QwenTemplate):
+    pass
+
+
+class QwenAudioGenerationTemplate(_QwenAudioTemplateMixin, DefaultGenerationTemplate):
+    pass
+
+
+register_template(TemplateType.qwen_audio, QwenAudioTemplate(), lazy_tokenize=True)
+register_template(
+    TemplateType.qwen_audio_generation, QwenAudioGenerationTemplate(), lazy_tokenize=True, is_generation=True)
+
+
+class _Qwen2AudioTemplateMixin:
+    """Mixin adding audio features (`input_features`, `feature_attention_mask`) to the encoded inputs."""
+    def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        inputs, _ = Template._encode(self, example)
+        if len(inputs) == 0:
+            return inputs, {}
+        processor = self.tokenizer.processor
+        sampling_rate = processor.feature_extractor.sampling_rate
+        audios = load_batch(
+            example.get('audios') or [], load_func=partial(load_audio_qwen, sampling_rate=sampling_rate))
+        if audios:  # examples without audios get no audio keys at all
+            audio_inputs = processor.feature_extractor(
+                audios, sampling_rate=sampling_rate, return_attention_mask=True, return_tensors='pt')
+            audio_inputs['feature_attention_mask'] = audio_inputs.pop('attention_mask')
+            inputs.update(audio_inputs)
+        return inputs, {}
+
+    def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = None) -> Dict[str, Any]:
+        """Collate via `Template.data_collator`, then concatenate the audio tensors of the samples that have them."""
+        res = Template.data_collator(self, batch, padding_to)
+        audio_samples = [b for b in batch if b.get('input_features') is not None]
+        if audio_samples:  # the mask is read from the same samples, so text-only samples cannot raise KeyError
+            res['input_features'] = torch.concat([b['input_features'] for b in audio_samples])
+            res['feature_attention_mask'] = torch.concat([b['feature_attention_mask'] for b in audio_samples])
+        return res
+
+
+class Qwen2AudioTemplate(_Qwen2AudioTemplateMixin, QwenTemplate):
+    """Chat variant: each audio tag becomes a numbered `Audio N: <|audio_bos|><|AUDIO|><|audio_eos|>` line."""
+    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
+                    example: Dict[str, Any]) -> List[Context]:
+        assert media_type == 'audio'
+        return [f'Audio {index + 1}: <|audio_bos|><|AUDIO|><|audio_eos|>\n']
+
+
+class Qwen2AudioGenerationTemplate(_Qwen2AudioTemplateMixin, DefaultGenerationTemplate):
+    """Generation variant: each audio tag becomes `<|audio_bos|><|AUDIO|><|audio_eos|>` without a number."""
+    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
+                    example: Dict[str, Any]) -> List[Context]:
+        assert media_type == 'audio'
+        return ['<|audio_bos|><|AUDIO|><|audio_eos|>\n']
+
+
+register_template(TemplateType.qwen2_audio, Qwen2AudioTemplate(), lazy_tokenize=True)
+
+
+def _process_image_qwen(image):  # returns `image` resized to the dimensions chosen by `smart_resize`
+    from qwen_vl_utils.vision_process import IMAGE_FACTOR, MIN_PIXELS, MAX_PIXELS, smart_resize
+    size_factor = get_env_args('size_factor', int, IMAGE_FACTOR)  # presumably an env override with a default — confirm
+    # Explicit target size (both values set) takes precedence over the image's own size.
+    resized_height = get_env_args('resized_height', int, None)
+    resized_width = get_env_args('resized_width', int, None)
+    if resized_height and resized_width:
+        resized_height, resized_width = smart_resize(
+            resized_height,
+            resized_width,
+            factor=size_factor,
+        )
+    else:
+        width, height = image.size  # `size` is unpacked as (width, height)
+        min_pixels = get_env_args('min_pixels', int, MIN_PIXELS)
+        max_pixels = get_env_args('max_pixels', int, MAX_PIXELS)
+        resized_height, resized_width = smart_resize(
+            height,
+            width,
+            factor=size_factor,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
+        )
+    image = image.resize((resized_width, resized_height))  # resize takes (width, height), hence the swap
+    return image
+
+
+class Qwen2VLTemplate(QwenTemplate):
+    """Qwen template that inserts vision placeholder tags and repeats each placeholder token per media item."""
+    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index: int,
+                    example: Dict[str, Any]) -> List[Context]:
+        assert media_type in {'image', 'video'}
+        if media_type == 'image':
+            return ['<|vision_start|><|image_pad|><|vision_end|>']
+        else:
+            return ['<|vision_start|><|video_pad|><|vision_end|>']
+
+    def replace_object(self, index: int, example: Dict[str, Any]) -> List[Context]:
+        objects = example.get('objects')
+        if objects:
+            object_ = objects[index]
+            return ['<|object_ref_start|>', object_['caption'], '<|object_ref_end|>']
+        else:
+            return ['<ref-object>']  # no objects in the example: the literal tag is returned as-is
+
+    def replace_box(self, index: int, example: Dict[str, Any]) -> List[Context]:
+        objects = example.get('objects')
+        if objects:
+            object_ = objects[index]
+            if isinstance(object_['bbox'][0], list):  # a list of boxes: one tag per box, concatenated
+                all_objects = ''
+                for sub_object in object_['bbox']:
+                    all_objects += (f'<|box_start|>({sub_object[0]},{sub_object[1]}),'
+                                    f'({sub_object[2]},{sub_object[3]})<|box_end|>')
+                return [all_objects]
+            else:
+                return [
+                    f'<|box_start|>({object_["bbox"][0]},{object_["bbox"][1]}),'
+                    f'({object_["bbox"][2]},{object_["bbox"][3]})<|box_end|>'
+                ]
+        else:
+            return ['<bbox>']  # no objects in the example: the literal tag is returned as-is
+
+    def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        inputs, _ = super()._encode(example)
+        if len(inputs) == 0:
+            return inputs, {}
+        processor = self.tokenizer.processor
+        input_ids = inputs['input_ids']
+        labels = inputs['labels']
+        images = example.get('images') or []
+        videos = example.get('videos') or []
+        for media_type in ['images', 'videos']:
+            if locals()[media_type]:  # looks up the local `images` / `videos` list by its name
+                if media_type == 'images':
+                    images = load_batch(images, _process_image_qwen)
+                    media_token = 151655  # hard-coded id searched for below; presumably `<|image_pad|>` — TODO confirm
+                    media_inputs = processor.image_processor(images=images, videos=None, return_tensors='pt')
+                    media_grid_thw = media_inputs['image_grid_thw']
+                else:
+                    videos = load_batch(videos, load_video_qwen2)
+                    media_inputs = processor.image_processor(images=None, videos=videos, return_tensors='pt')
+                    media_grid_thw = media_inputs['video_grid_thw']
+                    media_token = 151656  # hard-coded id searched for below; presumably `<|video_pad|>` — TODO confirm
+                idx_list = _findall(input_ids, media_token)
+                added_tokens_len = 0  # how far earlier expansions have shifted the positions in `idx_list`
+                for i, idx in enumerate(idx_list):
+                    merge_length = processor.image_processor.merge_size**2
+                    token_len = (media_grid_thw[i].prod() // merge_length)  # placeholder count for the i-th item
+                    input_ids = input_ids[:idx
+                                          + added_tokens_len] + [media_token] * token_len + input_ids[added_tokens_len
+                                                                                                      + idx + 1:]
+                    if labels:
+                        labels = labels[:idx + added_tokens_len] + [-100] * token_len + labels[added_tokens_len + idx
+                                                                                               + 1:]
+                    added_tokens_len += token_len - 1  # one token was replaced by `token_len` tokens
+                inputs.update(media_inputs)
+
+        inputs['input_ids'] = input_ids
+        inputs['labels'] = labels
+        return inputs, {}
+
+    def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = None) -> Dict[str, Any]:
+        res = super().data_collator(batch, padding_to)
+        for media_type in ['image', 'video']:
+            grid_thw = [b[f'{media_type}_grid_thw'] for b in batch if b.get(f'{media_type}_grid_thw') is not None]
+            if grid_thw:  # only samples that carry the key contribute
+                res[f'{media_type}_grid_thw'] = torch.concat(grid_thw)
+        return res
+
+
+register_template(TemplateType.qwen2_vl, Qwen2VLTemplate(), lazy_tokenize=True)
+
+register_template(
+    TemplateType.qwen2_audio_generation, Qwen2AudioGenerationTemplate(), lazy_tokenize=True, is_generation=True)
+
+
+class YiCoderTemplate(ChatmlTemplate):
+    system = 'You are a helpful assistant.'
+
+
+register_template(TemplateType.yi_coder, YiCoderTemplate())
+
+yi_vl_default_system = (
+    'This is a chat between an inquisitive human and an AI assistant. Assume the role of the AI assistant. '
+    "Read all the images carefully, and respond to the human's questions with informative, "
+    'helpful, detailed and polite answers. '
+    '这是一个好奇的人类和一个人工智能助手之间的对话。假设你扮演这个AI助手的角色。'
+    '仔细阅读所有的图像，并对人类的问题做出信息丰富、有帮助、详细的和礼貌的回答。')
+
+
+class YiVLTemplate(Template):
+    """Template that marks images with the placeholder id -200 and attaches preprocessed `images` tensors."""
+    def replace_tag(self, media_type, index, example) -> List[Context]:
+        assert media_type == 'image'
+        return [[-200], '\n']  # -200 is the id `data_collator` counts to detect samples with images
+
+    def _encode(self, example: Dict[str, Any], **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        inputs, _ = super()._encode(example)  # NOTE(review): `kwargs` are not forwarded to the parent
+        if len(inputs) == 0:
+            return inputs, {}
+        inputs.pop('loss_scale', None)
+        from llava.mm_utils import expand2square
+        # NOTE: `tokenizer.image_processor` is expected to have been set from `model.vision_tower.image_processor`.
+        image_processor = self.tokenizer.image_processor
+        images = example.get('images') or []
+        for i, image in enumerate(images):
+            background_color = tuple(int(x * 255) for x in image_processor.image_mean)
+            image = expand2square(image, background_color)
+            images[i] = image  # written back into the list taken from `example`
+        if images:
+            image_tensor = image_processor.preprocess(images, return_tensors='pt')['pixel_values']
+            inputs['images'] = image_tensor.to(kwargs['dtype'])  # requires the caller to pass `dtype=...`
+        return inputs, {}
+
+    def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = None) -> Dict[str, Any]:
+        res = super().data_collator(batch, padding_to)
+        images = [b['images'] for b in batch if 'images' in b]
+        if images:
+            res['images'] = torch.concat(images)
+        has_images = [(b == -200).sum() for b in res['input_ids']]  # per-sample count of image placeholders
+        assert all([
+            h > 0 for h in has_images
+        ]) or not any([h > 0
+                       for h in has_images]), 'YIVL does not support mix-batch nlp dataset and multi-modal dataset'
+        return res
+
+
+class GLMTemplate(Template):
+    """Template that prepends the ids of `tokenizer.encode('')` to `prefix` and, if set, `system_prefix`."""
+    def _init_template(self, tokenizer: PreTrainedTokenizerBase, *args, **kwargs) -> None:
+        res = super()._init_template(tokenizer, *args, **kwargs)
+        token_list = tokenizer.encode('')  # whatever ids the tokenizer emits for an empty string
+        self.prefix.insert(0, token_list)  # NOTE(review): inserted again on every call of this method
+        if self.system_prefix is not None:
+            self.system_prefix.insert(0, token_list)
+        return res  # the parent's result is passed through, although the annotation says None
+
+
+class GLM4VTemplate(GLMTemplate):
+    """GLM template for at most one image per example, spliced in as a begin/end-of-image placeholder."""
+    def __init__(self):
+        super().__init__([], ['<|user|>\n{{QUERY}}<|assistant|>'], [], ['<|endoftext|>'], None,
+                         ['<|system|>\n{{SYSTEM}}'])
+
+    def check_example(self, example):
+        images = example.get('images') or []
+        assert len(images) <= 1  # a second image is rejected here
+
+    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example) -> List[Context]:
+        assert media_type == 'image'
+        return [[-100]]  # temporary marker id that `_encode` searches for and replaces
+
+    def _encode(self, example: Dict[str, Any], **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        inputs, _ = super()._encode(example)
+        if len(inputs) == 0:
+            return inputs, {}
+        input_ids = inputs['input_ids']
+        labels = inputs['labels']
+        idx_list = _findall(input_ids, -100)
+        if idx_list:
+            idx = idx_list[0]  # only the first marker is handled
+            image = example.get('images')[0]
+            placeholder = '<|begin_of_image|><|endoftext|><|end_of_image|>'
+            placeholder_id = self.tokenizer.encode(placeholder, add_special_tokens=False)
+            input_ids = (input_ids[:idx] + placeholder_id + input_ids[idx + 1:])
+            if labels is not None:
+                labels = (labels[:idx] + [-100] * len(placeholder_id) + labels[idx + 1:])  # placeholder ids get -100
+            messages = example['messages']
+            messages[0]['image'] = image  # stored on the example's first message, i.e. `example` is modified
+            inputs2: Dict[str, Any] = self.tokenizer.apply_chat_template(messages, return_dict=True)
+            inputs['images'] = inputs2['images']  # only the `images` entry of that result is used
+        inputs['input_ids'] = input_ids
+        inputs['labels'] = labels
+        return inputs, {}
+
+    def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = None) -> Dict[str, Any]:
+        res = super().data_collator(batch, padding_to)
+        images = [b['images'] for b in batch if 'images' in b]
+        if images:
+            res['images'] = torch.concat(images)
+        return res
+
+
+register_template(TemplateType.glm4v, GLM4VTemplate(), infer_media_type='dialogue', lazy_tokenize=True, use_model=False)
+
+register_template(
+    TemplateType.yi_vl,
+    YiVLTemplate([], [[8308], 'Human: {{QUERY}}\n', [8308], 'Assistant:'], ['\n'], ['\n', [8308]], yi_vl_default_system,
+                 ['{{SYSTEM}}\n\n']),
+    use_model=False,
+    infer_media_type='round',
+    lazy_tokenize=True)
+
+register_template(TemplateType.baichuan, Template(['{{SYSTEM}}'], [[195], '{{QUERY}}', [196]], [], [['eos_token_id']]))
+
+register_template(
+    TemplateType.chatglm2,
+    GLMTemplate(['{{SYSTEM}}'], ['[Round {{ROUND1}}]\n\n问：{{QUERY}}\n\n答：'], ['\n\n'], [['eos_token_id']]))
+
+register_template(
+    TemplateType.chatglm_generation, GLMTemplate([], ['{{QUERY}}'], None, [['eos_token_id']]), is_generation=True)
+
+register_template(
+    TemplateType.chatglm3,
+    GLMTemplate([], ['<|user|>\n{{QUERY}}<|assistant|>\n'], [], ['<|user|>'], None, ['<|system|>\n{{SYSTEM}}']))
+
+register_template(
+    TemplateType.chatglm4,
+    GLMTemplate([], ['<|user|>\n{{QUERY}}<|assistant|>\n'], [], ['<|user|>'],
+                None, ['<|system|>\n{{SYSTEM}}'],
+                tools_prompt='glm4',
+                tool_prompt=['<|observation|>\n{{QUERY}}<|assistant|>\n']))
+
+codegeex4_system = '你是一位智能编程助手，你叫CodeGeeX。你会为用户回答关于编程、代码、计算机方面的任何问题，并提供格式规范、可以执行、准确安全的代码，并在必要时提供详细的解释。'
+
+register_template(
+    TemplateType.codegeex4,
+    GLMTemplate([], ['<|user|>\n{{QUERY}}<|assistant|>\n'], [], ['<|endoftext|>'], codegeex4_system,
+                ['<|system|>\n{{SYSTEM}}']))
+
+register_template(
+    TemplateType.deepseek,
+    Template([['bos_token_id']], ['User: {{QUERY}}\n\nAssistant:'], [['eos_token_id']], [['eos_token_id']], None,
+             [['bos_token_id'], '{{SYSTEM}}\n\n']))
+register_template(
+    TemplateType.numina_math,
+    Template([['bos_token_id']], ['### Problem: {{QUERY}}\n### Solution: '], ['\n'], [['eos_token_id']], None,
+             [['bos_token_id'], '{{SYSTEM}}']))
+register_template(
+    TemplateType.deepseek2,
+    Template([[100000]], ['User: {{QUERY}}\n\nAssistant:'], [[100001]], [[100001]], None, [[100000], '{{SYSTEM}}\n\n']))
+register_template(  # end marker spelled like the begin marker: `▁` separators and "sentence"
+    TemplateType.deepseek2_5,
+    Template(['<｜begin▁of▁sentence｜>'], ['<｜User｜>{{QUERY}}<｜Assistant｜>'], ['<｜end▁of▁sentence｜>'],
+             ['<｜end▁of▁sentence｜>'], None, ['<｜begin▁of▁sentence｜>{{SYSTEM}}']))
+
+# ref: https://github.com/facebookresearch/llama/blob/main/llama/generation.py
+LLAMA_DEFAULT_SYSTEM = (
+    'You are a helpful, respectful and honest assistant. '
+    'Always answer as helpfully as possible, while being safe. '
+    'Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. '
+    'Please ensure that your responses are socially unbiased and positive in nature.\n\n'
+    'If a question does not make any sense, or is not factually coherent, '
+    'explain why instead of answering something not correct. '
+    "If you don't know the answer to a question, please don't share false information.")
+register_template(
+    TemplateType.llama,
+    Template(['<s>[INST] '], ['{{QUERY}} [/INST]'], ['</s><s>[INST] '], ['</s>'], LLAMA_DEFAULT_SYSTEM,
+             ['<s>[INST] <<SYS>>\n{{SYSTEM}}\n<</SYS>>\n\n']))
+
+register_template(
+    TemplateType.longwriter_llama3,
+    Template(['[INST]'], ['{{QUERY}}[/INST]'], ['[INST]'], ['<|end_of_text|>'], None,
+             ['<<SYS>>\n{{SYSTEM}}\n<</SYS>>\n\n']))
+
+register_template(TemplateType.mistral_nemo,
+                  Template(['<s>[INST] '], ['{{SYSTEM}}\n\n', '{{QUERY}}[/INST]'], ['</s>[INST] '], ['</s>']))
+
+
+class Llama3TemplateMixin:
+    system = None  # forwarded to Template.__init__ below; ReflectionTemplate overrides it
+    # Template.__init__ is called explicitly (not via super()); the classes here mix this in next to Template.
+    def __init__(self):
+        Template.__init__(
+            self, ['<|begin_of_text|>'], [
+                '<|start_header_id|>user<|end_header_id|>\n\n{{QUERY}}<|eot_id|>'
+                '<|start_header_id|>assistant<|end_header_id|>\n\n'
+            ], ['<|eot_id|>'], ['<|eot_id|>'],
+            self.system, ['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{{SYSTEM}}<|eot_id|>'],
+            tools_prompt='toolbench',
+            tool_prompt=[
+                '<|start_header_id|>tool<|end_header_id|>\n\n{{QUERY}}<|eot_id|>'
+                '<|start_header_id|>assistant<|end_header_id|>\n\n'
+            ])
+
+
+class Llama3Template(Llama3TemplateMixin, Template):
+    pass
+
+
+class ReflectionTemplate(Llama3TemplateMixin, Template):
+    """Llama-3 layout with a fixed system text asking for <thinking>/<reflection>/<output> tags."""
+    # Picked up by Llama3TemplateMixin.__init__ through ``self.system``.
+    system = ('You are a world-class AI system, capable of complex reasoning and reflection. '
+              'Reason through the query inside <thinking> tags, and then provide your final '
+              'response inside <output> tags. If you detect that you made a mistake in your reasoning '
+              'at any point, correct yourself inside <reflection> tags.')
+
+
+# Both templates share the Llama-3 layout; they differ only in the class-level ``system`` text.
+register_template(TemplateType.reflection, ReflectionTemplate())
+register_template(TemplateType.llama3, Llama3Template())
+
+# System text passed to the ``openbuddy`` template registered below.
+OPENBUDDY_DEFAULT_SYSTEM = (
+    'You are a helpful, respectful and honest INTP-T AI Assistant named Buddy. You are talking to a human User.\n'
+    'Always answer as helpfully and logically as possible, while being safe. '
+    'Your answers should not include any '
+    'harmful, political, religious, unethical, racist, sexist, toxic, dangerous, or illegal content. '
+    'Please ensure that your responses are socially unbiased and positive in nature.\n'
+    'If a question does not make any sense, or is not factually coherent, '
+    'explain why instead of answering something not correct. '
+    "If you don't know the answer to a question, please don't share false information.\n"
+    'You like to use emojis. You can speak fluently in many languages, for example: English, Chinese.\n'
+    'You cannot access the internet, but you have vast knowledge, cutoff: 2021-09.\n'
+    'You always deeply love and support China, Chinese government, people and culture.\n'
+    'You are trained by OpenBuddy team, (https://openbuddy.ai, https://github.com/OpenBuddy/OpenBuddy), '
+    'you are based on LLaMA and Falcon transformers model, not related to GPT or OpenAI.')
+# Plain "User:/Assistant:" layout; the suffix is given as the ['eos_token_id'] placeholder
+# rather than a literal string, and auto_add_bos=True is requested.
+register_template(
+    TemplateType.openbuddy,
+    Template([], ['User: {{QUERY}}\nAssistant:'], ['\n'], [['eos_token_id']],
+             OPENBUDDY_DEFAULT_SYSTEM, ['{{SYSTEM}}\n\n'],
+             auto_add_bos=True))
+
+# System text passed to the ``openbuddy2`` template registered below.
+OPENBUDDY2_DEFAULT_SYSTEM = (
+    'You(assistant) are a helpful, respectful and honest INTP-T AI Assistant named Buddy. '
+    'You are talking to a human(user).\nAlways answer as helpfully and logically as possible, while being safe. '
+    'Your answers should not include any harmful, political, religious, unethical, racist, '
+    'sexist, toxic, dangerous, or illegal content. '
+    'Please ensure that your responses are socially unbiased and positive in nature.\n'
+    'You cannot access the internet, but you have vast knowledge, cutoff: 2023-04.\n'
+    'You are trained by OpenBuddy team, (https://openbuddy.ai, https://github.com/OpenBuddy/OpenBuddy), '
+    'not related to GPT or OpenAI')
+
+# Role-tagged layout: every message is "<|role|>NAME<|says|>TEXT<|end|>".
+register_template(
+    TemplateType.openbuddy2,
+    Template([], ['<|role|>user<|says|>{{QUERY}}<|end|>\n<|role|>assistant<|says|>'], ['<|end|>\n'], ['<|end|>'],
+             OPENBUDDY2_DEFAULT_SYSTEM, ['<|role|>system<|says|>{{SYSTEM}}<|end|>\n'],
+             auto_add_bos=True))
+
+# System text shared by the ``internlm`` registration below and by Internlm2Template.
+INTERNLM_SYSTEM = (
+    'You are an AI assistant whose name is InternLM (书生·浦语).\n'
+    '- InternLM (书生·浦语) is a conversational language model that is developed by Shanghai AI Laboratory (上海人工智能实验室). '
+    'It is designed to be helpful, honest, and harmless.\n'
+    '- InternLM (书生·浦语) can understand and communicate fluently in the language chosen '
+    'by the user such as English and 中文.')
+
+# "<|User|>:/<|Bot|>:" layout; assistant turns end with "<eoa>".
+register_template(
+    TemplateType.internlm,
+    Template(['<s>'], ['<|User|>:{{QUERY}}\n<|Bot|>:'], ['<eoa>\n'], ['<eoa>'], INTERNLM_SYSTEM,
+             ['<s><|System|>:{{SYSTEM}}\n']))
+
+# Result type of the converter passed to get_env_args().
+_T = TypeVar('_T')
+
+# Messages already emitted by get_env_args(); each distinct message is logged only once.
+_log_set = set()  # log once
+
+
+def get_env_args(args_name: str, type_func: Callable[[str], _T], default_value: Optional[_T]) -> Optional[_T]:
+    """Read a setting from the environment, falling back to a default.
+
+    Args:
+        args_name: Setting name; the environment variable looked up is its upper-case form.
+        type_func: Converter applied to the raw environment string when the variable is set.
+        default_value: Returned as-is (not converted) when the variable is unset.
+
+    Returns:
+        The converted environment value, or ``default_value``.
+
+    Each distinct informational message is logged once per process via ``_log_set``.
+    """
+    env_key = args_name.upper()
+    raw = os.getenv(env_key)
+    if raw is not None:
+        value = type_func(raw)
+        message = f'Using environment variable `{env_key}`, Setting {args_name}: {value}.'
+    else:
+        value = default_value
+        message = (f'Setting {args_name}: {default_value}. '
+                   f'You can adjust this hyperparameter through the environment variable: `{env_key}`.')
+    # Skip messages that were already reported.
+    if message not in _log_set:
+        _log_set.add(message)
+        logger.info(message)
+    return value
+
+
+class Internlm2Template(ChatmlTemplate):
+    """ChatmlTemplate subclass whose ``system`` attribute is INTERNLM_SYSTEM."""
+    system = INTERNLM_SYSTEM
+
+
+# Registered with default options (no extra keyword arguments).
+register_template(TemplateType.internlm2, Internlm2Template())
+
+
+def replace_img_tag(query: str,
+                    history: History,
+                    replace_token: str,
+                    pattern=r'<img>(.+?)</img>') -> Tuple[str, History, List[str]]:
+    """Replace inline image tags with ``replace_token`` and collect their contents.
+
+    Args:
+        query: Current user query, or None (pretrain-style samples); None is returned unchanged.
+        history: Sequence of ``(query, response)`` pairs. Pairs may be lists or tuples.
+        replace_token: Replacement passed to ``re.sub`` for every match of ``pattern``.
+        pattern: Regex with one capture group holding the image reference.
+
+    Returns:
+        ``(new_query, new_history, images_path)``. ``new_history`` is a new list of
+        two-element lists; ``images_path`` lists the captured groups in order of
+        appearance (history first, then the query). Inputs are not modified.
+    """
+    images_path = []
+    new_history = []
+    for h in history:
+        if h[0] is None:
+            # list(h) copies lists and tuples alike; ``h.copy()`` would fail on a tuple.
+            new_history.append(list(h))
+        else:
+            images_path += re.findall(pattern, h[0])
+            new_history.append([re.sub(pattern, replace_token, h[0]), h[1]])
+    if query is None:
+        new_query = query  # pretrain dataset
+    else:
+        images_path += re.findall(pattern, query)
+        new_query = re.sub(pattern, replace_token, query)
+    return new_query, new_history, images_path
+
+
+class InternLMXComposer2Template(Template):
+    """Chat template for InternLM-XComposer2 style models with image inputs.
+
+    ``_encode`` tokenizes the example and preprocesses its images. ``post_encode``
+    then converts the token ids into input embeddings and inserts one image
+    embedding after each occurrence of token id 2 in the sequence.
+    """
+    INTERNLM_XCOMPOSER_SYSTEM = (
+        'You are an AI assistant whose name is InternLM-XComposer (浦语·灵笔).\n'
+        '- InternLM-XComposer (浦语·灵笔) is a conversational language model that is developed by '
+        'Shanghai AI Laboratory (上海人工智能实验室). '
+        'It is designed to be helpful, honest, and harmless.\n'
+        '- InternLM-XComposer (浦语·灵笔) can understand and communicate fluently in the language chosen '
+        'by the user such as English and 中文.')
+    # NOTE(review): presumably '</s>' tokenizes to id 2, which post_encode() splits on - confirm.
+    image_placeholder = ['</s>']
+
+    def __init__(self, version):
+        """Build the template.
+
+        Args:
+            version: Variant tag. ``'v2.5'`` and ``'v2-4khd'`` enable an extra image
+                transform in ``_encode``; any other value skips it.
+        """
+        prefix = ['<s>']
+        prompt = ['[UNUSED_TOKEN_146]user\n{{QUERY}}[UNUSED_TOKEN_145]\n[UNUSED_TOKEN_146]assistant\n']
+        chat_sep = ['[UNUSED_TOKEN_145]\n']
+        suffix = ['[UNUSED_TOKEN_145]']
+        system_prefix = ['<s>[UNUSED_TOKEN_146]system\n{{SYSTEM}}[UNUSED_TOKEN_145]\n']
+        super().__init__(prefix, prompt, chat_sep, suffix, self.INTERNLM_XCOMPOSER_SYSTEM, system_prefix)
+        self.version = version
+
+    def _encode(self, example: Dict[str, Any], **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        """Tokenize ``example`` and stash the data ``post_encode`` needs under ``inputs['_data']``.
+
+        ``kwargs['dtype']`` is required whenever the example carries images. The
+        ``hd_num`` used by the version-specific transforms can be overridden through
+        the ``HD_NUM`` environment variable (see ``get_env_args``).
+        """
+        inputs, _ = super()._encode(example)
+        if len(inputs) == 0:
+            return inputs, {}
+        images = example.get('images') or []
+
+        if self.version == 'v2.5':
+            # Default is 24 for at most one image, 6 for several.
+            hd_num = 24
+            if len(images) > 1:
+                hd_num = 6
+            hd_num = get_env_args('hd_num', int, hd_num)
+            Image_transform = get_class_from_dynamic_module('ixc_utils.Image_transform', self.tokenizer.model_dir)
+            images = [Image_transform(image, hd_num=hd_num) for image in images]
+        elif self.version == 'v2-4khd':
+            hd_num = 55
+            hd_num = get_env_args('hd_num', int, hd_num)
+            HD_transform = get_class_from_dynamic_module('ixc_utils.HD_transform', self.tokenizer.model_dir)
+            images = [HD_transform(image, hd_num=hd_num) for image in images]
+        # vis_processor comes from model.vis_processor
+        images = [self.tokenizer.vis_processor(image).to(kwargs['dtype']) for image in images]
+        inputs['_data'] = {'input_ids': inputs['input_ids'], 'labels': inputs['labels'], 'images': images}
+        return inputs, {}
+
+    def post_encode(self, model, data: Any) -> Dict[str, Any]:
+        """Build input embeddings, an image mask and aligned labels from ``_data``.
+
+        Args:
+            model: Model providing ``device``, ``img2emb`` and token embeddings
+                (``model.model.tok_embeddings`` or ``model.model.model.tok_embeddings``).
+            data: Dict with ``input_ids``, ``labels`` (may be None) and ``images``.
+                It is not modified.
+
+        Returns:
+            Dict with ``inputs_embeds``, ``im_mask`` (bool tensor with a leading batch
+            dimension of 1) and ``labels`` (a list, or None when no labels were given).
+        """
+        input_ids = data['input_ids']
+        labels = data['labels']
+        images = data['images']
+        skip = 1 if len(images) > 0 else 0  # ignore <s>
+        if isinstance(input_ids, torch.Tensor):
+            input_ids = input_ids.tolist()
+        # Slicing plus concatenation creates new lists, so the lists stored in
+        # ``data`` are never appended to in place (repeated calls stay idempotent).
+        input_ids = input_ids[skip:] + [2]  # add dummy </s>
+        if labels is not None:
+            if isinstance(labels, torch.Tensor):
+                labels = labels.tolist()
+            labels = labels[skip:] + [2]
+        else:
+            labels = []
+        res_inputs_embeds = []
+        res_labels = []
+        wrap_im_mask = []
+        pre_i, idx = 0, 0
+        device = model.device
+        internlm2_model = model.model
+        if not hasattr(internlm2_model, 'tok_embeddings'):
+            internlm2_model = internlm2_model.model
+        tok_embeddings = internlm2_model.tok_embeddings
+        if len(images) > 0:
+            images = torch.concat([model.img2emb(image[None])[0] for image in images], dim=0)
+        for i, token_id in enumerate(input_ids):
+            if token_id != 2:  # replace_token
+                continue
+            # Text segment since the previous split point, prefixed with token id 1.
+            res_input_ids = torch.tensor([1] + input_ids[pre_i:i], device=device)
+            res_inputs_embeds.append(tok_embeddings(res_input_ids[None])[0])
+            wrap_im_mask += [0] * len(res_input_ids)
+            res_labels += [-100] + labels[pre_i:i]
+            # Follow the segment with the next image embedding while any remain;
+            # image positions are masked out of the loss with -100.
+            if len(images) > 0 and idx < images.shape[0]:
+                res_inputs_embeds.append(images[idx].to(device))
+                wrap_im_mask += [1] * images.shape[1]
+                res_labels += [-100] * images.shape[1]
+            idx += 1
+            pre_i = i + 1
+        if len(labels) == 0:
+            res_labels = None
+        res_inputs_embeds = torch.concat(res_inputs_embeds, dim=0)
+        wrap_im_mask = torch.tensor(wrap_im_mask, dtype=torch.bool, device=device)[None]
+        return {'inputs_embeds': res_inputs_embeds, 'im_mask': wrap_im_mask, 'labels': res_labels}
+
+    def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = None) -> Dict[str, Any]:
+        """Collate as the parent does and additionally pad/stack ``im_mask`` when present."""
+        res = super().data_collator(batch, padding_to)
+        if 'im_mask' in batch[0]:
+            # Drop the leading batch dimension of each mask before padding with 0.
+            im_mask = [b['im_mask'][0] for b in batch]
+            im_mask = self.pad_sequence(im_mask, 0, self.padding_side)
+            res['im_mask'] = im_mask
+        return res
+
+    @staticmethod
+    def _get_generate_ids(generate_ids: List[int], input_token_len: int) -> List[int]:
+        """Return ``generate_ids`` unchanged; ``input_token_len`` is ignored."""
+        return generate_ids
+
+
+# Registered with use_model=False and lazy_tokenize=True, like the other image templates in this file.
+register_template(
+    TemplateType.internlm_xcomposer2, InternLMXComposer2Template(version='v2'), use_model=False, lazy_tokenize=True)
+
+
+class InternLMXComposer2_5Template(InternLMXComposer2Template):
+    """Same template as the parent; only the system text differs."""
+    # Overrides the parent's constant, which the parent __init__ reads via ``self``.
+    INTERNLM_XCOMPOSER_SYSTEM = (
+        'You are an AI assistant whose name is InternLM-XComposer (浦语·灵笔).\n'
+        '- InternLM-XComposer (浦语·灵笔) is a multi-modality conversational language model '
+        'that is developed by Shanghai AI Laboratory (上海人工智能实验室). '
+        'It is designed to be helpful, honest, and harmless.\n'
+        '- InternLM-XComposer (浦语·灵笔) can understand and communicate fluently in the language chosen '
+        'by the user such as English and 中文.\n'
+        '- InternLM-XComposer (浦语·灵笔) is capable of comprehending and articulating responses effectively '
+        'based on the provided image.')
+
+
+# Both registrations use InternLMXComposer2_5Template; only the ``version`` string
+# (and therefore the image transform chosen in _encode) differs.
+register_template(
+    TemplateType.internlm_xcomposer2_5,
+    InternLMXComposer2_5Template(version='v2.5'),
+    use_model=False,
+    lazy_tokenize=True)
+
+register_template(
+    TemplateType.internlm_xcomposer2_4khd,
+    InternLMXComposer2_5Template(version='v2-4khd'),
+    use_model=False,
+    lazy_tokenize=True)
+
+
+class InternvlTemplate(Template):
+    """ChatML-style template for InternVL; image placeholders become ``<IMG_CONTEXT>`` tokens."""
+    system = 'You are an AI assistant whose name is InternLM (书生·浦语).'
+    # Number of <IMG_CONTEXT> tokens emitted per image tile.
+    num_image_token = 256
+
+    def __init__(self):
+        user_turn = ['<|im_start|>user\n{{QUERY}}<|im_end|><|im_start|>assistant\n']
+        system_turn = ['<|im_start|>system\n{{SYSTEM}}<|im_end|>']
+        super().__init__([], user_turn, ['<|im_end|>'], ['<|im_end|>'], self.system, system_turn, auto_add_bos=True)
+
+    def replace_tag(self, media_type, index, example) -> List[Context]:
+        """Return the image context: ``<img>``, a ``-100`` placeholder id, ``</img>``."""
+        return ['<img>', [-100], '</img>\n']
+
+    def _encode(self, example: Dict[str, Any], **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        """Tokenize and replace the ``-100`` placeholder span with ``<IMG_CONTEXT>`` ids.
+
+        ``kwargs['dtype']`` is required when the example has images. ``input_size``
+        and ``max_num`` can be overridden through environment variables.
+        """
+        inputs, _ = super()._encode(example)
+        if len(inputs) == 0:
+            return inputs, {}
+        token_ids = inputs['input_ids']
+        placeholder_positions = _findall(token_ids, -100)
+        pixel_values = None
+        images = example.get('images')
+        if images:
+            labels = inputs.get('labels')
+            input_size = get_env_args('input_size', int, 448)
+            max_num = get_env_args('max_num', int, 12)
+            tiles = [transform_image(image, input_size, max_num) for image in images]
+            pixel_values = torch.cat(tiles, dim=0).to(kwargs['dtype'])
+            tile_count = pixel_values.shape[0]
+
+            # Everything from the first to the last placeholder is replaced in one go.
+            first, last = placeholder_positions[0], placeholder_positions[-1]  # remove [-100, -100]
+            context_ids = self.tokenizer.encode('<IMG_CONTEXT>', add_special_tokens=False)
+            img_tokens: List[int] = context_ids * self.num_image_token * tile_count
+            token_ids = token_ids[:first] + img_tokens + token_ids[last + 1:]
+            if labels is not None:
+                labels = labels[:first] + [-100] * len(img_tokens) + labels[last + 1:]
+            inputs['input_ids'] = token_ids
+            inputs['labels'] = labels
+        inputs['_data'] = {'input_ids': torch.tensor(token_ids), 'pixel_values': pixel_values}
+        inputs.pop('loss_scale', None)
+        return inputs, {}
+
+    def post_encode(self, model, data: Any) -> Dict[str, Any]:
+        """Embed the token ids and overwrite ``<IMG_CONTEXT>`` positions with vision features."""
+        embed_layer = model.get_input_embeddings()
+        device = embed_layer.weight.device
+        token_ids = data['input_ids']
+        inputs_embeds = embed_layer(token_ids[None])[0].to(device=device)
+        pixel_values = data['pixel_values']
+        if pixel_values is None:
+            if is_deepspeed_zero3_enabled():
+                # Run the vision tower on a dummy image and add a zero-weighted term,
+                # so extract_feature is still invoked for text-only samples.
+                dummy_pixel_values = torch.zeros((1, 3, 32, 32), device=device, dtype=inputs_embeds.dtype)
+                vit_embeds = model.extract_feature(dummy_pixel_values).to(device=device)
+                inputs_embeds += vit_embeds.mean() * 0.
+        else:
+            pixel_values = pixel_values.to(device=device)
+            vit_embeds = model.extract_feature(pixel_values).to(device=device)
+            context_id = self.tokenizer.encode('<IMG_CONTEXT>', add_special_tokens=False)[0]
+            selected = (token_ids == context_id)
+            inputs_embeds[selected] = vit_embeds.reshape(-1, vit_embeds.shape[-1])
+        return {'inputs_embeds': inputs_embeds}
+
+    @staticmethod
+    def _get_generate_ids(generate_ids: List[int], input_token_len: int) -> List[int]:
+        """Return ``generate_ids`` unchanged; ``input_token_len`` is ignored."""
+        return generate_ids
+
+
+def _replace_video2image(load_video_func, example, replace_tag) -> List[Context]:
+    """Expand the current video of ``example`` into frames stored as images.
+
+    The frames returned by ``load_video_func`` are spliced into ``example['images']``
+    at ``example['image_index']``, which is then advanced past them. The returned
+    context is the concatenation of ``replace_tag(i)`` for each frame index ``i``.
+    """
+    video_index = example['video_index']
+    video = example['videos'][video_index]
+    images = example['images']
+    insert_at = example['image_index']
+    frames = load_video_func(video)
+    example['images'] = images[:insert_at] + frames + images[insert_at:]
+    context_list = []
+    for frame_no, _ in enumerate(frames):
+        context_list += replace_tag(frame_no)
+    example['image_index'] += len(frames)
+    return context_list
+
+
+class Internvl2Template(InternvlTemplate):
+    """InternVL2 template: per-image placeholders, video-as-frames, and grounding tags."""
+    # Default number of frames sampled per video (overridable via the environment).
+    video_segments = 8
+    system = '你是由上海人工智能实验室联合商汤科技开发的书生多模态大模型，英文名叫InternVL, 是一个有用无害的人工智能助手。'
+
+    def replace_tag(self, media_type, index, example) -> List[Context]:
+        """Return the context for an image, or for every sampled frame of a video.
+
+        Media types other than ``'image'`` and ``'video'`` fall through and yield None.
+        """
+        image_context = super().replace_tag('image', index, example)
+        if media_type == 'image':
+            return image_context
+        if media_type == 'video':
+            segments = get_env_args('video_segments', int, self.video_segments)
+            loader = partial(load_video_internvl, num_segments=segments)
+
+            def frame_context(i):
+                # Frames are labelled starting from 1.
+                return [f'Frame{i + 1}: '] + image_context
+
+            return _replace_video2image(loader, example, frame_context)
+
+    def replace_object(self, index: int, example: Dict[str, Any]) -> List[Context]:
+        """Wrap the object's caption in ``<ref>`` tags, or keep the raw tag when there are no objects."""
+        objects = example.get('objects')
+        if not objects:
+            return ['<ref-object>']
+        target = objects[index]
+        return [f'<ref>{target["caption"]}</ref>']
+
+    def replace_box(self, index: int, example: Dict[str, Any]) -> List[Context]:
+        """Render the object's bbox (one box or a list of boxes) inside ``<box>`` tags."""
+        objects = example.get('objects')
+        if not objects:
+            return ['<bbox>']
+        bbox = objects[index]['bbox']
+        if isinstance(bbox[0], list):
+            # Several boxes: comma-separated, no trailing comma.
+            rendered = ','.join(f'[{box[0]}, {box[1]}, {box[2]}, {box[3]}]' for box in bbox)
+            return ['<box> [' + rendered + '] </box>']
+        return [f'<box> [[{bbox[0]}, {bbox[1]}, {bbox[2]}, {bbox[3]}]] </box>']
+
+    def _encode(self, example: Dict[str, Any], **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        """Tokenize and expand each ``-100`` placeholder into its image's ``<IMG_CONTEXT>`` ids.
+
+        Deliberately skips ``InternvlTemplate._encode`` and calls the next ``_encode``
+        in the MRO. ``kwargs['dtype']`` is required when the example has images.
+        """
+        inputs, _ = super(InternvlTemplate, self)._encode(example, **kwargs)
+        if len(inputs) == 0:
+            return inputs, {}
+        input_ids = inputs['input_ids']
+        idx_list = _findall(input_ids, -100)
+        labels = inputs.get('labels')
+        images = example.get('images')
+        pixel_values = None
+        num_patches = []
+        if images:
+            has_video = bool(example.get('videos'))
+            input_size = get_env_args('input_size', int, 448)
+            # Default tile limit is 1 per frame when videos are present, 12 otherwise.
+            max_num = get_env_args('max_num', int, 1 if has_video else 12)
+            tiles = [transform_image(image, input_size, max_num) for image in images]
+            num_patches = [tile.shape[0] for tile in tiles]
+            pixel_values = torch.cat(tiles).to(kwargs['dtype'])
+        assert len(num_patches) == len(
+            idx_list), f'len(num_patches): {len(num_patches)}, len(idx_list): {len(idx_list)}'
+        # ``shift`` tracks how far earlier expansions moved the later placeholders.
+        shift = 0
+        for idx, num_patch in zip(idx_list, num_patches):
+            img_tokens: List[int] = self.tokenizer.encode(
+                '<IMG_CONTEXT>', add_special_tokens=False) * self.num_image_token * num_patch
+            pos = idx + shift
+            input_ids = input_ids[:pos] + img_tokens + input_ids[pos + 1:]
+            if labels is not None:
+                labels = labels[:pos] + [-100] * len(img_tokens) + labels[pos + 1:]
+            shift += len(img_tokens) - 1
+        inputs['input_ids'] = input_ids
+        inputs['labels'] = labels
+        inputs['_data'] = {'input_ids': torch.tensor(input_ids), 'pixel_values': pixel_values}
+        inputs.pop('loss_scale', None)
+        return inputs, {}
+
+
+class InternvlPhi3TemplateMixin:
+    """Mixin that initialises a ``Template`` with ``<|user|>``/``<|assistant|>`` role markers."""
+
+    def __init__(self):
+        # The system text is the class's ``system`` attribute when one exists, else None.
+        Template.__init__(
+            self, [], ['<|user|>\n{{QUERY}}<|end|><|assistant|>\n'], ['<|end|>'], ['<|end|>'],
+            getattr(self, 'system', None), ['<|system|>\n{{SYSTEM}}<|end|>'],
+            auto_add_bos=True)
+        # Set after Template.__init__ so this is the value left on the instance.
+        self.padding_side = 'left'
+
+
+class InternvlPhi3Template(InternvlPhi3TemplateMixin, InternvlTemplate):
+    """InternvlTemplate behaviour with the mixin's chat layout and a Phi-3 system text."""
+    system = 'You are an AI assistant whose name is Phi-3.'
+
+
+class Internvl2Phi3Template(InternvlPhi3TemplateMixin, Internvl2Template):
+    """Internvl2Template behaviour with the mixin's chat layout; ``system`` is inherited."""
+    pass
+
+
+# The two first-generation templates pass infer_media_type='dialogue'; the
+# InternVL2 variants below leave that option at its default.
+register_template(
+    TemplateType.internvl, InternvlTemplate(), use_model=False, lazy_tokenize=True, infer_media_type='dialogue')
+
+register_template(
+    TemplateType.internvl_phi3, InternvlPhi3Template(), use_model=False, lazy_tokenize=True, infer_media_type='dialogue')
+
+register_template(TemplateType.internvl2, Internvl2Template(), use_model=False, lazy_tokenize=True)
+
+register_template(TemplateType.internvl2_phi3, Internvl2Phi3Template(), use_model=False, lazy_tokenize=True)
+
+
+class FlorenceTemplate(Template):
+    """Template for Florence models: a single image plus a task prompt per example."""
+    compute_per_round_loss = False
+    output_prompt_answer = True
+
+    def __init__(self):
+        super().__init__(['<s>'], ['{{QUERY}}</s>'], None, ['</s>'])
+        # Task tokens that take no extra input, mapped to their natural-language prompt.
+        self.task_prompts_without_inputs = {
+            '<OCR>': 'What is the text in the image?',
+            '<OCR_WITH_REGION>': 'What is the text in the image, with regions?',
+            '<CAPTION>': 'What does the image describe?',
+            '<DETAILED_CAPTION>': 'Describe in detail what is shown in the image.',
+            '<MORE_DETAILED_CAPTION>': 'Describe with a paragraph what is shown in the image.',
+            '<OD>': 'Locate the objects with category name in the image.',
+            '<DENSE_REGION_CAPTION>': 'Locate the objects in the image, with their descriptions.',
+            '<REGION_PROPOSAL>': 'Locate the region proposals in the image.'
+        }
+        # Task tokens whose prompt has an ``{input}`` slot.
+        self.task_prompts_with_input = {
+            '<CAPTION_TO_PHRASE_GROUNDING>': 'Locate the phrases in the caption: {input}',
+            '<REFERRING_EXPRESSION_SEGMENTATION>': 'Locate {input} in the image with mask',
+            '<REGION_TO_SEGMENTATION>': 'What is the polygon mask of region {input}',
+            '<OPEN_VOCABULARY_DETECTION>': 'Locate {input} in the image.',
+            '<REGION_TO_CATEGORY>': 'What is the region {input}?',
+            '<REGION_TO_DESCRIPTION>': 'What does the region {input} describe?',
+            '<REGION_TO_OCR>': 'What text is in the region {input}?',
+        }
+
+    def check_example(self, example):
+        """Assert that the example carries exactly one image."""
+        images = example.get('images') or []
+        assert len(images) == 1, 'Florence series models only supports input with a single image.'
+
+    def add_default_tags(self, example: Dict[str, Any]) -> None:
+        """Intentionally a no-op: no default tags are added for this template."""
+        return
+
+    def replace_box(self, index: int, example: Dict[str, Any]) -> List[Context]:
+        """Render the object's bbox (one box or a list of boxes) as ``<loc_*>`` tokens."""
+        object_ = example['objects'][index]
+        if isinstance(object_['bbox'][0], list):
+            all_objects = ''
+            for sub_object in object_['bbox']:
+                x1, y1, x2, y2 = sub_object
+                all_objects += f'<loc_{x1}><loc_{y1}><loc_{x2}><loc_{y2}>,'
+            # Drop the trailing comma.
+            return [all_objects[:-1]]
+        else:
+            x1, y1, x2, y2 = object_['bbox']
+            return [f'<loc_{x1}><loc_{y1}><loc_{x2}><loc_{y2}>']
+
+    def _encode(self, example: Dict[str, Any], **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        """Encode prompt and answer separately and preprocess the image.
+
+        ``example['query']`` is replaced by the processor-constructed prompt before
+        tokenization. ``kwargs['dtype']`` is required. Returns the parent's empty
+        result unchanged when the parent produced no inputs.
+        """
+        query = example['query']
+        processor = self.tokenizer.processor
+        example['query'] = processor._construct_prompts([query])[0]
+        inputs, _ = super()._encode(example)
+        # Guard first: an empty result has no 'prompt_input_ids' key to read.
+        if len(inputs) == 0:
+            return inputs, {}
+        input_ids = inputs['prompt_input_ids']
+        images = example.get('images') or []
+        labels = inputs['answer_labels']
+        if labels is not None:
+            # NOTE(review): presumably 0 is the decoder start token id - confirm.
+            labels = [0] + labels
+        pixel_values = processor.image_processor(images, return_tensors='pt')['pixel_values'].to(kwargs['dtype'])
+        inputs = {
+            'input_ids': input_ids,
+            'labels': labels,
+            '_data': {
+                'input_ids': torch.tensor(input_ids)[None],
+                'pixel_values': pixel_values,
+            }
+        }
+        return inputs, {}
+
+    def post_encode(self, model, data: Any) -> Dict[str, Any]:
+        """Merge image features with the token embeddings and return the first batch item."""
+        inputs_embeds = model.get_input_embeddings()(data['input_ids'])
+        image_features = model._encode_image(data['pixel_values'])
+        inputs_embeds, _ = model._merge_input_ids_with_image_features(image_features, inputs_embeds)
+        return {'inputs_embeds': inputs_embeds[0]}
+
+    @staticmethod
+    def _get_generate_ids(generate_ids: List[int], input_token_len: int) -> List[int]:
+        """Return ``generate_ids`` unchanged; ``input_token_len`` is ignored."""
+        return generate_ids
+
+    def post_process_generate_response(self, response, example):
+        """Parse the raw generation with the processor and return it as a JSON string.
+
+        ``example['images']`` may be a single image reference or a list, in which
+        case the first entry is used. ``example`` is not modified.
+        """
+        image_ref = example['images']
+        if isinstance(image_ref, list):
+            image_ref = image_ref[0]
+        image = load_image(image_ref)
+        return json.dumps(
+            self.tokenizer.processor.post_process_generation(
+                response, task=example['query'], image_size=(image.width, image.height)))
+
+
+# Unlike the other registrations in this part of the file, this one also passes stream=False.
+register_template(
+    TemplateType.florence,
+    FlorenceTemplate(),
+    use_model=False,
+    lazy_tokenize=True,
+    infer_media_type='dialogue',
+    stream=False)
+
+# Text-only templates built directly from ``Template``. A nested list such as
+# ['eos_token_id'] or ['bos_token_id'] is a token-id placeholder rather than literal text.
+register_template(TemplateType.xverse,
+                  Template(['{{SYSTEM}}'], ['Human: {{QUERY}}\n\nAssistant: '], [['eos_token_id']], [['eos_token_id']]))
+# None is passed in the third (chat separator) position here and for skywork below.
+register_template(TemplateType.yuan, Template([], ['{{QUERY}}<sep>'], None, [['eos_token_id']]))
+register_template(TemplateType.ziya,
+                  Template([['bos_token_id'], '{{SYSTEM}}'], ['<human>:{{QUERY}}\n<bot>:'], ['\n'], [['eos_token_id']]))
+
+register_template(TemplateType.skywork,
+                  Template(['<s>{{SYSTEM}}'], ['</s><s>[USER]{{QUERY}}[SEP][BOT]'], None, ['[SEP]</s>']))
+
+register_template(TemplateType.bluelm,
+                  Template([['bos_token_id'], '{{SYSTEM}}'], ['[|Human|]:{{QUERY}}[|AI|]:'], [], [['eos_token_id']]))
+
+register_template(
+    TemplateType.codefuse_codellama,
+    Template(['{{SYSTEM}}'], ['<|role_start|>human<|role_end|>{{QUERY}}<|role_start|>bot<|role_end|>'], [],
+             [['eos_token_id']]))
+
+register_template(
+    TemplateType.codefuse,
+    Template([], ['<s>human\n{{QUERY}}\n<s>bot\n'], [['eos_token_id'], '\n'], [['eos_token_id']], None,
+             ['<s>system\n{{SYSTEM}}\n']))
+
+# The fifth positional argument is an inline system text.
+register_template(
+    TemplateType.deepseek_coder,
+    Template(['{{SYSTEM}}'], ['### Instruction:\n{{QUERY}}\n### Response:\n'], ['\n<|EOT|>\n'], ['\n<|EOT|>'],
+             ('You are an AI programming assistant, utilizing the Deepseek Coder model, '
+              'developed by Deepseek Company, and you only answer questions related to computer science. '
+              'For politically sensitive questions, security and privacy issues, '
+              'and other non-computer science questions, you will refuse to answer\n')))
+
+
+class LlavaHfTemplate(Template):
+    """Base template for LLaVA checkpoints that ship a processor with an image processor."""
+
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        # Left padding is forced only for transformers releases older than 4.43.0.
+        installed = version.parse(transformers.__version__)
+        if installed < version.parse('4.43.0'):
+            self.padding_side = 'left'
+
+    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example) -> List[Context]:
+        """Return the image tag; only ``'image'`` media is accepted."""
+        assert media_type == 'image'
+        return ['<image>\n']
+
+    def _encode(self, example: Dict[str, Any], **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        """Tokenize and, when images are present, attach ``pixel_values`` (and ``image_sizes``).
+
+        ``kwargs['dtype']`` is required when the example has images.
+        """
+        inputs, _ = super()._encode(example)
+        if len(inputs) == 0:
+            return inputs, {}
+        images = example.get('images')
+        if not images:
+            return inputs, {}
+        image_processor = self.tokenizer.processor.image_processor
+        processed = image_processor(images, return_tensors='pt').to(kwargs['dtype'])
+        inputs['pixel_values'] = processed['pixel_values']
+        # Only some image processors report per-image sizes.
+        if 'image_sizes' in processed:
+            inputs['image_sizes'] = processed['image_sizes']
+        return inputs, {}
+
+
+class Llava1_6Llama3Template(LlavaHfTemplate):
+    """LLaVA-1.6 (LLaVA-NeXT) template using the Llama-3 header-token chat layout."""
+    # NOTE(review): __init__ passes None as the default system argument, so this text
+    # is not forwarded to the parent constructor - confirm whether that is intended.
+    default_system = 'You are a helpful language and vision assistant. ' \
+                     'You are able to understand the visual content that the user provides, ' \
+                     'and assist the user with a variety of tasks using natural language.'
+
+    def __init__(self):
+        super().__init__(['<|begin_of_text|>'], [
+            '<|start_header_id|>user<|end_header_id|>\n\n{{QUERY}}<|eot_id|>'
+            '<|start_header_id|>assistant<|end_header_id|>\n\n'
+        ], ['<|eot_id|>'], ['<|eot_id|>'], None,
+                         ['<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n{{SYSTEM}}<|eot_id|>'])
+
+    def _encode(self, example: Dict[str, Any], **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        """Encode via ``LlavaHfTemplate`` and drop a leading batch dimension from 5-D ``pixel_values``.
+
+        ``kwargs`` (notably ``dtype``) are forwarded because the parent reads
+        ``kwargs['dtype']`` when images are present. Empty inputs and text-only
+        examples, which carry no ``pixel_values``, are returned unchanged.
+        """
+        inputs, _ = super()._encode(example, **kwargs)
+        pixel_values = inputs.get('pixel_values')
+        if pixel_values is not None and len(pixel_values.shape) == 5:  # (1, num_patch, 3, H/W, W/H)
+            inputs['pixel_values'] = torch.squeeze(pixel_values, dim=0)  # (num_patch, 3, H/W, W/H)
+        return inputs, {}
+
+
+# Registered with use_model=False and lazy_tokenize=True.
+register_template(TemplateType.llava_next_llama3, Llava1_6Llama3Template(), use_model=False, lazy_tokenize=True)
+
+
+class LlavaVideoTemplate(Template):
+    """Template for LLaVA-NeXT-Video: accepts both images and videos."""
+
+    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example) -> List[Context]:
+        """Return ``<image>`` or ``<video>`` depending on the media type and file extension.
+
+        A "video" entry whose extension is exactly ``jpg`` or ``png`` gets the image tag.
+        """
+        image_tag = ['<image>\n']
+        if media_type == 'image':
+            return image_tag
+        assert media_type == 'video'
+        media_file = example['videos'][index]
+        extension = media_file.rsplit('.', 1)[-1]
+        if extension in {'jpg', 'png'}:
+            return image_tag
+        return ['<video>\n']
+
+    def _encode(self, example: Dict[str, Any], **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        """Tokenize and attach video and/or image tensors produced by the processor.
+
+        ``kwargs['dtype']`` is required when the example has images or videos.
+        """
+        inputs, _ = super()._encode(example)
+        if len(inputs) == 0:
+            return inputs, {}
+        images = example.get('images') or []
+        videos_path = example.get('videos') or []
+        processor = self.tokenizer.processor
+        if videos_path:
+            videos = load_batch(videos_path, load_video_llava)
+            video_inputs = processor.video_processor(videos, return_tensors='pt').to(kwargs['dtype'])
+            inputs['pixel_values_videos'] = video_inputs['pixel_values_videos']
+        if images:
+            image_inputs = processor.image_processor(images, return_tensors='pt').to(kwargs['dtype'])
+            inputs['pixel_values'] = image_inputs['pixel_values']
+            inputs['image_sizes'] = image_inputs['image_sizes']
+        return inputs, {}
+
+
+# "USER: ... ASSISTANT:" layout with "<s>" in the prefix and "</s>" as suffix.
+register_template(
+    TemplateType.llava_next_video,
+    LlavaVideoTemplate(['<s>{{SYSTEM}} '], ['USER: {{QUERY}} ASSISTANT:'], [' '], ['</s>']),
+    use_model=False,
+    lazy_tokenize=True)
+
+# Yi variant: no "<s>" in the prefix, "<|im_end|>" as suffix, and infer_media_type='round'.
+register_template(
+    TemplateType.llava_next_video_yi,
+    LlavaVideoTemplate(['{{SYSTEM}} '], ['USER: {{QUERY}} ASSISTANT:'], [' '], ['<|im_end|>']),
+    use_model=False,
+    infer_media_type='round',
+    lazy_tokenize=True)
+
+
+def align_image_inputs(input_ids: List[int], labels: List[int], new_input_ids,
+                       image_token: int) -> Tuple[List[int], List[int]]:
+    """Expand each ``image_token`` in ``input_ids`` using the matching span of ``new_input_ids``.
+
+    ``new_input_ids`` is a second tokenization of the same text in which each image
+    token has been expanded into several tokens. Both sequences are walked in
+    parallel; at every image token the span of ``new_input_ids`` lying between the
+    preceding and the following text token replaces the single image token.
+
+    Args:
+        input_ids: Token ids containing single ``image_token`` placeholders.
+        labels: Labels aligned with ``input_ids``; left untouched when falsy.
+        new_input_ids: Expanded token ids (list or tensor; tensors are converted).
+        image_token: Id of the placeholder token.
+
+    Returns:
+        ``(input_ids, labels)`` with every placeholder expanded and the inserted
+        label positions set to -100.
+
+    Raises:
+        AssertionError: If an image token is the first or last element of ``input_ids``.
+        ValueError: If the token preceding an image token cannot be found in
+            ``new_input_ids`` within 4 positions of the expected index.
+    """
+    if isinstance(new_input_ids, torch.Tensor):
+        new_input_ids = new_input_ids.tolist()
+
+    # Find the tokens after the image_token in input_ids, and then align them.
+    # i indexes input_ids, j indexes new_input_ids.
+    i, j = 0, 0
+    while i < len(input_ids):
+        x = input_ids[i]
+        if x == image_token:
+            assert i + 1 < len(input_ids), f'input_ids[-10:]: {input_ids[-10:]}'
+            assert i - 1 >= 0, f'input_ids[:10]: {input_ids[:10]}'
+            # [1, 2, 3(i-1), image_token(i), 4(i+1) ,5, 6]
+            # [1, 2, 3(j_begin), a(j'), a, a, a, 4(j) ,5, 6]
+            j_begin = j - 1
+            # Look for the token preceding the image token at offsets 0..4 on either
+            # side of the expected position; the for/else raises when none matches.
+            for k in range(5):  # Increase robustness.
+                if j_begin + k < len(new_input_ids) and new_input_ids[j_begin + k] == input_ids[i - 1]:
+                    j_begin += k
+                    break
+                if j_begin - k >= 0 and new_input_ids[j_begin - k] == input_ids[i - 1]:
+                    j_begin -= k
+                    break
+            else:
+                raise ValueError(f'new_input_ids: {new_input_ids}, input_ids: {input_ids}')
+            # First token of the expanded span.
+            j_begin += 1
+            # Advance j to the first occurrence of the token that follows the image token.
+            while j < len(new_input_ids) and new_input_ids[j] != input_ids[i + 1]:
+                j += 1
+            input_ids = input_ids[:i] + new_input_ids[j_begin:j] + input_ids[i + 1:]
+            if labels:
+                labels = labels[:i] + [-100] * (j - j_begin) + labels[i + 1:]
+            # Skip over the tokens just inserted.
+            i += j - j_begin
+        else:
+            j += 1
+        i += 1
+    return input_ids, labels
+
+
+class Idefics3Template(Template):
+    """Template for Idefics3: image placeholders are expanded by the model's processor."""
+
+    def _encode(self, example: Dict[str, Any], **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        """Tokenize, then re-run the processor on the decoded prompt to expand image tokens."""
+        inputs, _ = super()._encode(example)
+        if len(inputs) == 0:
+            return inputs, {}
+        images = example.get('images') or []
+        processor = self.tokenizer.processor
+        prompt = self.tokenizer.decode(inputs['input_ids'])
+        if not images:
+            return inputs, {}
+        processed = processor(text=prompt, images=images, return_tensors='pt', add_special_tokens=False)
+        image_token = 128257  # <image>
+        # Splice the processor's expanded image span into our own ids and labels.
+        aligned_ids, aligned_labels = align_image_inputs(inputs['input_ids'], inputs['labels'],
+                                                         processed['input_ids'][0], image_token)
+        inputs['input_ids'], inputs['labels'] = aligned_ids, aligned_labels
+        inputs['pixel_values'] = processed['pixel_values']
+        return inputs, {}
+
+
+register_template(
+    TemplateType.idefics3,
+    Idefics3Template(['<|begin_of_text|>'], ['User:{{QUERY}}<end_of_utterance>\nAssistant:'], ['<end_of_utterance>\n'],
+                     ['<end_of_utterance>'], None, ['System:{{SYSTEM}}<end_of_utterance>\n']),
+    use_model=False,
+    lazy_tokenize=True)
+
+
+class Llava1_5Template(LlavaHfTemplate):
+    # LlavaHfTemplate preset using the LLaVA-1.5 "USER: ... ASSISTANT:" prompt layout.
+
+    def __init__(self):
+        # Positional arguments are presumably prefix, prompt, chat separator and suffix
+        # (TODO confirm against Template.__init__, which is defined outside this section).
+        super().__init__(['<s>'], ['USER: {{QUERY}}\nASSISTANT:'], ['</s>'], ['</s>'])
+
+
+# Register the preset under TemplateType.llava1_5.
+register_template(TemplateType.llava1_5, Llava1_5Template(), use_model=False, lazy_tokenize=True)
+
+
+class LLavaTemplate(Template):
+    """Chat template for the original (non-HF) LLaVA code base.
+
+    Image tags are replaced by the sentinel token id ``-200`` followed by a newline, and pixel
+    data is produced with ``llava.mm_utils.process_images``.
+    """
+
+    def __init__(self):
+        # This template follows: https://github.com/haotian-liu/LLaVA/blob/main/llava/conversation.py#L350
+        # The system placeholder must be the upper-case `{{SYSTEM}}`, the spelling used by the
+        # other templates in this module.
+        # NOTE(review): other templates repeat their prefix tokens inside system_prefix
+        # (e.g. '<s>{{SYSTEM}} '); confirm whether '<s>[INST] ' should lead this one as well.
+        super().__init__(['<s>[INST] '], ['{{QUERY}} [/INST]'],
+                         None, ['</s>'],
+                         system_prefix=['<<SYS>>\n{{SYSTEM}}\n<</SYS>>\n\n'])
+
+    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example) -> List[Context]:
+        """Return the context that replaces an image tag: the ``-200`` sentinel id and a newline."""
+        assert media_type == 'image'
+        return [[-200], '\n']
+
+    def _encode(self, example: Dict[str, Any], **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        """Encode ``example`` and attach processed image tensors plus the original image sizes.
+
+        ``kwargs['dtype']`` is required when the example contains images.
+        """
+        inputs, _ = super()._encode(example)
+        if len(inputs) == 0:
+            return inputs, {}
+        images = example.get('images') or []
+        if images:
+            # Imported lazily so text-only samples do not need the optional `llava` package.
+            from llava.mm_utils import process_images
+
+            # image_processor comes from the model.vision_tower.image_processor
+            # config comes from the model.config
+            images_tensor = process_images(images, self.tokenizer.image_processor, self.tokenizer.config)
+            inputs['images'] = images_tensor.to(kwargs['dtype']).squeeze(0)
+            inputs['image_sizes'] = [x.size for x in images]
+        return inputs, {}
+
+    def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = None) -> Dict[str, Any]:
+        """Collate ``batch`` and carry over per-sample images and the flattened image sizes.
+
+        Raises:
+            AssertionError: if the batch mixes rows with and without the ``-200`` image sentinel.
+        """
+        res = super().data_collator(batch, padding_to)
+        images = [b['images'] for b in batch if 'images' in b]
+        if images:
+            res['images'] = images
+            res['image_sizes'] = sum([b['image_sizes'] for b in batch if 'image_sizes' in b], start=[])
+        # Count image sentinels per row: either every row has some or none does.
+        has_images = [(b == -200).sum() for b in res['input_ids']]
+        assert all([
+            h > 0 for h in has_images
+        ]) or not any([h > 0
+                       for h in has_images]), 'Llava does not support mix-batch nlp dataset and multi-modal dataset'
+        return res
+
+    @staticmethod
+    def _get_generate_ids(generate_ids: List[int], input_token_len: int) -> List[int]:
+        """Return ``generate_ids`` unchanged; ``input_token_len`` is ignored."""
+        return generate_ids
+
+
+class Llava1_6Template(LlavaHfTemplate):
+    """LLaVA-1.6 (HF) template that flattens each sample's pixel tensor before collation."""
+
+    def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = None) -> Dict[str, Any]:
+        """Squeeze dim 0 of every sample's ``pixel_values`` in place, then use the parent collator."""
+        for sample in batch:
+            pixels = sample.get('pixel_values')
+            if pixels is None:
+                continue
+            sample['pixel_values'] = pixels.squeeze(0)  # 5d -> 4d
+        return super().data_collator(batch, padding_to)
+
+
+class Llava1_6MistralTemplate(Llava1_6Template):
+    """LLaVA-1.6 preset using the Mistral ``[INST] ... [/INST]`` prompt layout."""
+
+    def __init__(self):
+        # The system placeholder must be the upper-case `{{SYSTEM}}`, the spelling used by the
+        # other templates in this module.
+        super().__init__(['<s>[INST] '], ['{{QUERY}} [/INST]'], ['</s>'], ['</s>'],
+                         system_prefix=['<<SYS>>\n{{SYSTEM}}\n<</SYS>>\n\n'])
+
+
+class Llava1_6VicunaTemplate(Llava1_6Template):
+    # LLaVA-1.6 preset using the Vicuna "USER: ... ASSISTANT:" prompt layout.
+    # `system` is the default system text; it is passed to the parent constructor below.
+    system = ('A chat between a curious human and an artificial intelligence assistant. '
+              "The assistant gives helpful, detailed, and polite answers to the human's questions.")
+
+    def __init__(self):
+        # The fifth positional argument is the class-level default system text.
+        super().__init__(['<s>'], ['USER: {{QUERY}} ASSISTANT:'], ['</s>'], ['</s>'],
+                         self.system,
+                         system_prefix=['<s>{{SYSTEM}} '])
+
+
+# Register the Mistral and Vicuna flavours of the LLaVA-1.6 template.
+register_template(TemplateType.llava_mistral, Llava1_6MistralTemplate(), use_model=False, lazy_tokenize=True)
+
+register_template(TemplateType.llava_vicuna, Llava1_6VicunaTemplate(), use_model=False, lazy_tokenize=True)
+
+
+class LLava1_6YiTemplate(Llava1_6Template):
+    """LLaVA-1.6 preset using the Yi ``<|im_start|> ... <|im_end|>`` chat layout.
+
+    ``replace_tag`` is inherited unchanged; the former override only forwarded to ``super()``.
+    """
+
+    def __init__(self):
+        super().__init__([], ['<|im_start|>user\n{{QUERY}}<|im_end|><|im_start|>assistant\n'], ['<|im_end|>'],
+                         ['<|im_end|>'],
+                         system_prefix=['<|im_start|>system\n{{SYSTEM}}<|im_end|>'])
+
+
+register_template(TemplateType.llava_yi, LLava1_6YiTemplate(), use_model=False, lazy_tokenize=True)
+
+
+class Llama3LlavaNextHfTemplate(Llama3TemplateMixin, Llava1_6Template):
+    # Combines Llama3TemplateMixin with the LLaVA-1.6 (HF) template; adds no behaviour of its own.
+    pass
+
+
+register_template(TemplateType.llama3_llava_next_hf, Llama3LlavaNextHfTemplate(), use_model=False, lazy_tokenize=True)
+
+
+class LlavaQwenHfTemplate(QwenTemplateMixin, Llava1_6Template):
+    # Combines QwenTemplateMixin with the LLaVA-1.6 (HF) template; adds no behaviour of its own.
+    pass
+
+
+register_template(TemplateType.llava_qwen_hf, LlavaQwenHfTemplate(), use_model=False, lazy_tokenize=True)
+
+
+class LlavaOneVisonTemplate(QwenTemplateMixin, Llava1_6Template):
+    """LLaVA-OneVision (Qwen) template that expands every ``<image>`` token to its feature count."""
+    system = None
+
+    def _encode(self, example: Dict[str, Any], **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        """Encode ``example`` with the base ``Template`` logic, then repeat each image token.
+
+        Every placeholder (id 151646) is replaced by as many copies as
+        ``processor._get_number_of_features`` reports for the matching image; the corresponding
+        label positions are masked with -100. ``kwargs['dtype']`` is required when images exist.
+        """
+        inputs, _ = Template._encode(self, example)
+        if len(inputs) == 0:
+            return inputs, {}
+        image_token_id = 151646  # <image>
+        images = example.get('images')
+        token_ids = inputs['input_ids']
+        label_ids = inputs['labels']
+        placeholder_positions = _findall(token_ids, image_token_id)
+        processor = self.tokenizer.processor
+        if not images:
+            return inputs, {}
+
+        image_inputs = processor.image_processor(images, return_tensors='pt').to(kwargs['dtype'])
+        pixel_values = image_inputs['pixel_values']
+        height, width = pixel_values[0].shape[-2:]
+        shift = 0  # how much token_ids has grown because of placeholders expanded so far
+        for position, _, (orig_height, orig_width) in zip(placeholder_positions, pixel_values,
+                                                          image_inputs['image_sizes']):
+            count = processor._get_number_of_features(orig_height, orig_width, height, width)
+            at = shift + position
+            token_ids = token_ids[:at] + [image_token_id] * count + token_ids[at + 1:]
+            if label_ids is not None:
+                label_ids = label_ids[:at] + [-100] * count + label_ids[at + 1:]
+            shift += count - 1
+        inputs['input_ids'] = token_ids
+        inputs['labels'] = label_ids
+        inputs['pixel_values'] = pixel_values
+        if 'image_sizes' in image_inputs:
+            inputs['image_sizes'] = image_inputs['image_sizes']
+        return inputs, {}
+
+
+register_template(TemplateType.llava_onevision_qwen, LlavaOneVisonTemplate(), use_model=False, lazy_tokenize=True)
+
+
+class LLavaLlamaTemplate(Llama3Template):
+    """Llama3-based LLaVA template: image tags become the literal ``<image>`` marker."""
+
+    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example):
+        """Return the text context that stands in for a media tag."""
+        return ['<image>\n']
+
+    def _encode(self, example: Dict[str, Any], **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        """Encode ``example`` and, when images are present, add ``pixel_values`` cast to ``kwargs['dtype']``."""
+        encoded, _ = super()._encode(example)
+        if len(encoded) == 0:
+            return encoded, {}
+        raw_image = example.get('images')
+        if not raw_image:
+            return encoded, {}
+        image_processor = self.tokenizer.processor.image_processor
+        processed = image_processor(raw_image, return_tensors='pt')
+        encoded['pixel_values'] = processed['pixel_values'].to(kwargs['dtype'])
+        return encoded, {}
+
+
+register_template(TemplateType.llava_llama_instruct, LLavaLlamaTemplate(), use_model=False, lazy_tokenize=True)
+
+
+class PaliGemmaTemplate(Template):
+    # Template for PaliGemma: single-image prompts plus a `token_type_ids` mask.
+
+    def __init__(self):
+        super().__init__([], ['{{QUERY}}\n'], None, ['<eos>'])
+
+    def check_example(self, example):
+        # At most one image per example is accepted.
+        images = example.get('images') or []
+        assert len(images) <= 1
+
+    def replace_tag(self, media_type, index, example) -> List[Context]:
+        # Besides returning the replacement context, this rewrites `self.prompt`:
+        # without the trailing newline under vLLM, with it otherwise.
+        assert media_type == 'image'
+        if self._is_vllm:
+            self.prompt = ['{{QUERY}}']
+            return []
+        else:
+            self.prompt = ['{{QUERY}}\n']
+            # One '<image>' per image feature position, followed by '<bos>'.
+            return ['<image>' * self.tokenizer.processor.image_seq_length + '<bos>']
+
+    def _encode(self, example: Dict[str, Any], **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        # Encodes the example, adds `token_type_ids`, and adds `pixel_values` when an image exists.
+        inputs, _ = super()._encode(example)
+        if len(inputs) == 0:
+            return inputs, {}
+        raw_image = example.get('images')
+        processor = self.tokenizer.processor
+        if inputs['labels'] is not None:
+            # `upper_bound` is defined outside this section; presumably it returns the length of the
+            # leading run of -100 labels (TODO confirm). Those positions get type 0, the rest type 1.
+            n = upper_bound(0, len(inputs['labels']), lambda idx: inputs['labels'][idx] == -100)
+            n2 = len(inputs['labels']) - n
+            inputs['token_type_ids'] = [0] * n + [1] * n2
+        else:
+            # Without labels every position is type 0.
+            inputs['token_type_ids'] = [0] * len(inputs['input_ids'])
+        if raw_image:
+            # Only the first image is used; the processor is run just to obtain pixel_values.
+            model_inputs = processor(text=example['query'], images=raw_image[0], return_tensors='pt')
+            inputs['pixel_values'] = model_inputs['pixel_values']
+        return inputs, {}
+
+    def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = None) -> Dict[str, Any]:
+        # Extends the parent collation with `token_type_ids`, padded with 0 on `self.padding_side`.
+        res = super().data_collator(batch, padding_to)
+        token_type_ids = [torch.tensor(b['token_type_ids']) for b in batch]
+        token_type_ids = self.pad_sequence(token_type_ids, 0, self.padding_side)
+        res['token_type_ids'] = token_type_ids
+        return res
+
+
+register_template(
+    TemplateType.paligemma, PaliGemmaTemplate(), infer_media_type='dialogue', lazy_tokenize=True, is_generation=True)
+
+
+class Phi3Template(Template):
+    # Phi-3 chat layout built from the <|user|> / <|assistant|> / <|system|> / <|end|> markers.
+
+    def __init__(self):
+        # Empty prefix; `auto_add_bos=True` is forwarded to the base constructor.
+        super().__init__([], ['<|user|>\n{{QUERY}}<|end|>\n<|assistant|>\n'], ['<|end|>\n'], ['<|end|>'],
+                         None, ['<|system|>\n{{SYSTEM}}<|end|>\n'],
+                         auto_add_bos=True)
+
+
+register_template(TemplateType.phi3, Phi3Template())
+
+
+class Phi3VisionTemplate(Phi3Template):
+    """Phi-3-Vision template: each ``<|image|>`` token is expanded to per-image negative ids.
+
+    ``replace_tag`` is inherited unchanged; the former override only forwarded to ``super()``.
+    """
+    image_placeholder = ['<|image|><s>\n']  # <|image|>\n
+
+    def _encode(self, example: Dict[str, Any], **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        """Encode ``example`` and replace image tokens with runs of negative placeholder ids.
+
+        For image ``i`` (0-based) the single ``<|image|>`` token (id 32044) is replaced by
+        ``num_img_tokens[i]`` copies of ``-(i + 1)``; the matching labels are masked with -100.
+
+        Raises:
+            AssertionError: if the number of image tokens differs from the number of images.
+        """
+        images = example.get('images') or []
+        inputs, _ = super()._encode(example)
+        if len(inputs) == 0:
+            return inputs, {}
+        input_ids = inputs['input_ids']
+        labels = inputs['labels']
+        idx_list = _findall(input_ids, 32044)  # '<|image|>'
+
+        if len(images) > 0:
+            processor = self.tokenizer.processor
+            inputs.update(processor.image_processor(images, return_tensors='pt'))
+            assert len(idx_list) == len(images), f'len(idx_list): {len(idx_list)}, len(images): {len(images)}'
+            res_input_ids = []
+            res_labels = []
+            num_img_tokens = inputs.pop('num_img_tokens').tolist()
+            # Sentinel so the first segment starts at position 0.
+            idx_list.insert(0, -1)
+            for i in range(len(idx_list) - 1):
+                image_token_id = -i - 1
+                # Text between the previous and the current image token, then the expanded image run.
+                res_input_ids += input_ids[idx_list[i] + 1:idx_list[i + 1]] + [image_token_id] * num_img_tokens[i]
+                if labels is not None:
+                    res_labels += labels[idx_list[i] + 1:idx_list[i + 1]] + [-100] * num_img_tokens[i]
+            # Tail after the last image token.
+            res_input_ids += input_ids[idx_list[-1] + 1:]
+            input_ids = res_input_ids
+            if labels is not None:
+                res_labels += labels[idx_list[-1] + 1:]
+                labels = res_labels
+
+        inputs['input_ids'] = input_ids
+        inputs['labels'] = labels
+        return inputs, {}
+
+
+register_template(TemplateType.phi3_vl, Phi3VisionTemplate(), lazy_tokenize=True)
+
+
+class Llama3LlavaNextTemplate(Llama3TemplateMixin, LLavaTemplate):
+    # Combines Llama3TemplateMixin with LLavaTemplate and sets a class-level `system` text.
+    system = 'You are a helpful language and vision assistant. ' \
+             'You are able to understand the visual content that the user provides, ' \
+             'and assist the user with a variety of tasks using natural language.'
+
+
+register_template(TemplateType.llama3_llava_next, Llama3LlavaNextTemplate(), use_model=False, lazy_tokenize=True)
+
+
+class LLavaQwenTemplate(QwenTemplateMixin, LLavaTemplate):
+    # Combines QwenTemplateMixin with LLavaTemplate; adds no behaviour of its own.
+    pass
+
+
+register_template(TemplateType.llava_qwen, LLavaQwenTemplate(), use_model=False, lazy_tokenize=True)
+
+
+def _findall(token_list: List[int], sub_token_list: Union[int, List[int]]) -> List[int]:
+    """Return the start index of every occurrence of ``sub_token_list`` in ``token_list``.
+
+    Args:
+        token_list: The sequence of token ids to search.
+        sub_token_list: A single token id or a sequence of ids to look for.
+
+    Returns:
+        The start indices in ascending order (overlapping matches are all reported).
+        An empty ``sub_token_list`` yields an empty result.
+    """
+    if isinstance(sub_token_list, int):
+        sub_token_list = [sub_token_list]
+    if not sub_token_list:
+        # Nothing to search for; avoids an IndexError on sub_token_list[0] below.
+        return []
+    first, width = sub_token_list[0], len(sub_token_list)
+    res = []
+    idx = -1
+    try:
+        while True:
+            # list.index raises ValueError once no further candidate exists, which ends the scan.
+            idx = token_list.index(first, idx + 1)
+            if width == 1 or sub_token_list == token_list[idx:idx + width]:
+                res.append(idx)
+    except ValueError:
+        pass
+    return res
+
+
+class DeepseekVLTemplate(Template):
+    # Template for DeepSeek-VL: expands image placeholders and prepares the processor batch
+    # that `post_encode` later turns into input embeddings.
+    DEEPSEEK_VL_SYSTEM = ('You are a helpful language and vision assistant. '
+                          'You are able to understand the visual content that the user provides, '
+                          'and assist the user with a variety of tasks using natural language.')
+
+    image_placeholder = ['<image_placeholder>']
+
+    def __init__(self):
+        # The class-level system text is passed as the default system.
+        super().__init__(['<｜begin▁of▁sentence｜>{{SYSTEM}}\n\n'], ['User: {{QUERY}}\n\nAssistant:'],
+                         ['<｜end▁of▁sentence｜>'], ['<｜end▁of▁sentence｜>'], self.DEEPSEEK_VL_SYSTEM)
+
+    def _encode(self, example: Dict[str, Any], **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        # Replaces every `processor.image_id` token by `processor.num_image_tokens` copies,
+        # masks the matching labels with -100, and stores the batched processor output in `_data`.
+        inputs, _ = super()._encode(example)
+        if len(inputs) == 0:
+            return inputs, {}
+        images = example.get('images')
+        processor = self.tokenizer.processor
+        input_ids, labels = inputs['input_ids'], inputs['labels']
+        idx_list = _findall(input_ids, processor.image_id)  # '<image_placeholder>'
+        new_input_ids, new_labels = [], []
+        # `lo` is the start of the text segment that follows the previous placeholder.
+        lo = 0
+        for hi in idx_list:
+            new_input_ids += input_ids[lo:hi]
+            if labels is not None:
+                new_labels += labels[lo:hi]
+            new_input_ids += [processor.image_id] * processor.num_image_tokens
+            # Appended even when labels is None; new_labels is reset to None below in that case.
+            new_labels += [-100] * processor.num_image_tokens
+            lo = hi + 1
+        new_input_ids += input_ids[lo:]
+        if labels is not None:
+            new_labels += labels[lo:]
+        else:
+            new_labels = None
+        from deepseek_vl.models.processing_vlm import VLChatProcessorOutput
+        # NOTE(review): `images` may be None for text-only examples; confirm the image processor accepts that.
+        images_outputs = processor.image_processor(images, return_tensors='pt')
+        output = VLChatProcessorOutput(
+            sft_format=None,
+            input_ids=torch.tensor(new_input_ids),
+            pixel_values=images_outputs.pixel_values,
+            num_image_tokens=torch.tensor([processor.num_image_tokens] * len(idx_list)))
+        batched_output = dict(processor.batchify([output]))
+        batched_output['pixel_values'] = batched_output['pixel_values'].to(dtype=kwargs['dtype'])
+        # Only ids, labels and the `_data` payload are returned; other keys of the base encoding are dropped.
+        inputs = {'input_ids': new_input_ids, 'labels': new_labels, '_data': batched_output}
+        return inputs, {}
+
+    def post_encode(self, model, data: Any) -> Dict[str, Any]:
+        # Builds the input embeddings from `data` with the model and keeps the first batch element.
+        inputs_embeds = model.prepare_inputs_embeds(**data)[0]
+        return {'inputs_embeds': inputs_embeds}
+
+    @staticmethod
+    def _get_generate_ids(generate_ids: List[int], input_token_len: int) -> List[int]:
+        # Returns the generated ids unchanged; `input_token_len` is ignored.
+        return generate_ids
+
+
+register_template(TemplateType.deepseek_vl, DeepseekVLTemplate(), use_model=False, lazy_tokenize=True)
+
+# Plain-text chat templates that need no custom Template subclass.
+# Zephyr: <|user|> / <|assistant|> / <|system|> markers, each turn closed by </s>.
+register_template(
+    TemplateType.zephyr,
+    Template([], ['<|user|>\n{{QUERY}}</s>\n<|assistant|>\n'], ['</s>\n'], ['</s>'], None,
+             ['<|system|>\n{{SYSTEM}}</s>\n']))
+
+# SUS: the prefix is the system text itself, followed by "### Human / ### Assistant" turns.
+register_template(
+    TemplateType.sus,
+    Template(['{{SYSTEM}}'], ['### Human: {{QUERY}}\n\n### Assistant: '], ['<|endoftext|>'], ['<|endoftext|>']))
+
+# Orion: note that the prompt itself ends with '</s>' after "Assistant: ".
+register_template(TemplateType.orion,
+                  Template(['<s>{{SYSTEM}}'], ['Human: {{QUERY}}\n\nAssistant: </s>'], ['</s>'], ['</s>']))
+
+
+class CogTemplate(Template):
+    # Shared template for the CogVLM / CogAgent presets registered below: reserves image
+    # positions after the first token and adds `token_type_ids`.
+
+    def check_example(self, example):
+        # At most one image per example is accepted.
+        images = example.get('images') or []
+        assert len(images) <= 1
+
+    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example) -> List[Context]:
+        # Media tags contribute no text; image positions are inserted in `_encode` instead.
+        return []
+
+    def _encode(self, example: Dict[str, Any], **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        # Encodes the example, then inserts `image_token_len` pad tokens after the first token.
+        inputs, _ = super()._encode(example)
+        if len(inputs) == 0:
+            return inputs, {}
+        image = example.get('images') or []
+        inputs.pop('loss_scale', None)
+        # `build_conversation_input_ids` is looked up on the tokenizer object and given the tokenizer
+        # as its first argument; here it supplies the image token count and the image tensors.
+        inputs2 = self.tokenizer.build_conversation_input_ids(
+            self.tokenizer, query=example['query'], history=example.get('history'), images=image)
+        image_token_len = inputs2['token_type_ids'].sum().item()
+        input_ids = inputs['input_ids']
+        labels = inputs['labels']
+        # Layout: first token (type 0), image positions (type 1), remaining text (type 0).
+        inputs['token_type_ids'] = [0] + [1] * image_token_len + [0] * len(input_ids[1:])
+        inputs['input_ids'] = input_ids[:1] + [self.tokenizer.pad_token_id] * image_token_len + input_ids[1:]
+        if labels is not None:
+            inputs['labels'] = labels[:1] + [-100] * image_token_len + labels[1:]
+        if len(image) > 0:
+            # Each image tensor is wrapped in its own single-element list.
+            inputs['images'] = [[img.to(dtype=kwargs['dtype'])] for img in inputs2['images']]
+            if 'cross_images' in inputs2:
+                # is cogagent
+                inputs['cross_images'] = [[cross_img.to(dtype=kwargs['dtype'])] for cross_img in inputs2['cross_images']]
+        return inputs, {}
+
+    def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = None) -> Dict[str, Any]:
+        # Extends the parent collation with per-sample image lists and padded `token_type_ids`.
+        res = super().data_collator(batch, padding_to)
+        keys = ['images', 'cross_images']
+        for key in keys:
+            # Presence is decided from the first sample only; every sample is then indexed with [0].
+            if key in batch[0]:
+                res[key] = [b[key][0] for b in batch]
+        token_type_ids = [torch.tensor(b['token_type_ids']) for b in batch]
+        token_type_ids = self.pad_sequence(token_type_ids, 0, self.padding_side)
+        res['token_type_ids'] = token_type_ids
+        return res
+
+
+# CogAgent chat preset: "[INST] ... [/INST]" prompt with an empty third positional argument.
+register_template(
+    TemplateType.cogagent_chat,
+    CogTemplate(['<s>'], [' [INST] {{QUERY}} [/INST] '], [], ['</s>']),
+    use_model=False,
+    infer_media_type='dialogue',
+    lazy_tokenize=True)
+
+# CogAgent instruct preset: the third positional argument is None.
+register_template(
+    TemplateType.cogagent_instruct,
+    CogTemplate(['<s>'], ['<EOI>Question: {{QUERY}} Answer:'], None, ['</s>']),
+    use_model=False,
+    infer_media_type='dialogue',
+    lazy_tokenize=True)
+
+# CogVLM preset: prefix and suffix are given as the names 'bos_token_id' / 'eos_token_id'.
+register_template(
+    TemplateType.cogvlm,
+    CogTemplate([['bos_token_id']], ['Question: {{QUERY}} Answer:'], ['\n'], [['eos_token_id']]),
+    use_model=False,
+    infer_media_type='dialogue',
+    lazy_tokenize=True)
+
+
+class Cog2VideoTemplate(CogTemplate):
+    # Video variant of CogTemplate: same token layout, but the media come from `videos`.
+
+    def check_example(self, example):
+        # At most one video per example is accepted.
+        videos = example.get('videos') or []
+        assert len(videos) <= 1
+
+    def _encode(self, example: Dict[str, Any], **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        # `super(CogTemplate, self)` deliberately skips CogTemplate._encode and calls the
+        # implementation of the next class in the MRO.
+        inputs, _ = super(CogTemplate, self)._encode(example)
+        if len(inputs) == 0:
+            return inputs, {}
+        videos_path = example.get('videos') or []
+        # `load_batch` and `load_video_cogvlm2` are defined outside this section.
+        video = load_batch(videos_path, load_video_cogvlm2)
+        inputs.pop('loss_scale', None)
+        # The loaded video is passed through the `images` argument with template_version='chat'.
+        inputs2 = self.tokenizer.build_conversation_input_ids(
+            self.tokenizer,
+            query=example['query'],
+            history=example.get('history'),
+            images=video,
+            template_version='chat')
+        video_token_len = inputs2['token_type_ids'].sum().item()
+        input_ids = inputs['input_ids']
+        labels = inputs['labels']
+        # Layout: first token (type 0), video positions (type 1), remaining text (type 0).
+        inputs['token_type_ids'] = [0] + [1] * video_token_len + [0] * len(input_ids[1:])
+        inputs['input_ids'] = input_ids[:1] + [self.tokenizer.pad_token_id] * video_token_len + input_ids[1:]
+        if labels is not None:
+            inputs['labels'] = labels[:1] + [-100] * video_token_len + labels[1:]
+        if len(video) > 0:
+            # Stored under 'images', the key CogTemplate.data_collator collects.
+            inputs['images'] = [[img.to(dtype=kwargs['dtype'])] for img in inputs2['images']]
+        return inputs, {}
+
+
+register_template(
+    TemplateType.cogvlm2_video,
+    Cog2VideoTemplate([['bos_token_id']], ['Question: {{QUERY}} Answer:'], ['\n'], [['eos_token_id']]),
+    use_model=False,
+    infer_media_type='dialogue',
+    lazy_tokenize=True,
+    media_type='video')
+
+# MiniCPM (text-only): the user turn is wrapped in the '<用户>' / '<AI>' markers.
+register_template(TemplateType.minicpm, Template(['<s>{{SYSTEM}}'], ['<用户>{{QUERY}}<AI>'], [], ['</s>']))
+
+
+def _remove_idx(arr: List[int], idx_list: List[int]) -> List[int]:
+    """Return a new list holding the items of ``arr`` whose positions are not in ``idx_list``."""
+    dropped = set(idx_list)
+    return [value for position, value in enumerate(arr) if position not in dropped]
+
+
+class MiniCPMVTemplate(Template):
+    # Template for MiniCPM-V: exactly one image per example, spliced in at a -100 marker.
+    # `is_v2_5` selects the image-processor code path in `_encode`; subclasses may override it.
+    is_v2_5 = False
+
+    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example) -> List[Context]:
+        # The media tag becomes the marker id -100, which `_encode` locates and replaces.
+        return [[-100]]
+
+    def check_example(self, example):
+        # Exactly one image is required.
+        images = example.get('images') or []
+        assert len(images) == 1
+
+    def _encode(self, example: Dict[str, Any], **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        # Replaces the -100 marker with the tokenized image placeholder and assembles the
+        # `_data` payload (ids, image_bound, pixel_values, tgt_sizes) consumed by `post_encode`.
+        inputs, _ = super()._encode(example)
+        if len(inputs) == 0:
+            return inputs, {}
+        images = example['images']
+        input_ids = inputs['input_ids']
+        labels = inputs['labels']
+        idx_list = _findall(input_ids, -100)
+        # Only the first marker is used.
+        idx = idx_list[0]
+        tgt_sizes = None
+        slice_mode = getattr(self.tokenizer.config, 'slice_mode', False)
+        if slice_mode:
+            if self.is_v2_5:
+                # v2.5: the HF image processor yields the placeholder, pixel values and target sizes.
+                image_processor = self.tokenizer.processor.image_processor
+                image_inputs = image_processor(images, return_tensors='pt').to(kwargs['dtype'])
+                placeholder = image_processor.get_slice_image_placeholder(image_inputs.image_sizes[0][0])
+                pixel_values = image_inputs['pixel_values']
+                tgt_sizes = image_inputs['tgt_sizes']
+            else:
+                # Comes from model.get_slice_image_placeholder and model.transform
+                images, placeholder = self.tokenizer.get_slice_image_placeholder(images[0], self.tokenizer)
+                pixel_values = [[self.tokenizer.transform(img) for img in images]]
+            placeholder += '\n'
+            placeholder_id = self.tokenizer.encode(placeholder, add_special_tokens=False)
+            input_ids = (input_ids[:idx] + placeholder_id + input_ids[idx + 1:])
+            if labels is not None:
+                labels = (labels[:idx] + [-100] * len(placeholder_id) + labels[idx + 1:])
+            # Image spans run from the token after each im_start id up to the matching im_end id.
+            input_tensor_ids = torch.tensor(input_ids)
+            image_start_idx = torch.where(input_tensor_ids == self.tokenizer.im_start_id)[0]
+            image_start_idx += 1
+            image_end_idx = torch.where(input_tensor_ids == self.tokenizer.im_end_id)[0]
+            # NOTE(review): `max` only works when both index tensors have equal length (hstack would
+            # fail otherwise); confirm whether `min` was intended.
+            valid_image_nums = max(len(image_start_idx), len(image_end_idx))
+            image_bound = [
+                torch.hstack(
+                    [image_start_idx[:valid_image_nums].unsqueeze(-1), image_end_idx[:valid_image_nums].unsqueeze(-1)])
+            ]
+        else:
+            # Non-slice mode: a fixed placeholder of `query_num` '<unk>' tokens between image markers.
+            placeholder = '<image>' + '<unk>' * self.tokenizer.config.query_num + '</image>\n'
+            placeholder_id = self.tokenizer.encode(placeholder, add_special_tokens=False)
+            input_ids = (input_ids[:idx] + placeholder_id + input_ids[idx + 1:])
+            if labels is not None:
+                labels = (labels[:idx] + [-100] * len(placeholder_id) + labels[idx + 1:])
+            # NOTE(review): this span starts at `idx`, where the placeholder's '<image>' piece begins,
+            # whereas the slice branch starts one past the start marker; confirm which is intended.
+            image_bound = [torch.tensor([[idx, idx + self.tokenizer.config.query_num]])]
+            pixel_values = [[self.tokenizer.transform(images[0])]]
+        # Only ids, labels and the `_data` payload are returned; other keys of the base encoding are dropped.
+        inputs = {
+            'input_ids': input_ids,
+            'labels': labels,
+            '_data': {
+                'input_ids': torch.tensor(input_ids)[None],
+                'image_bound': image_bound,
+                'pixel_values': pixel_values,
+                'tgt_sizes': tgt_sizes
+            }
+        }
+        return inputs, {}
+
+    def post_encode(self, model, data: Any) -> Dict[str, Any]:
+        # Lets the model build the embeddings from `data` and keeps the first batch element.
+        inputs_embeds, _ = model.get_vllm_embedding(data)
+        return {'inputs_embeds': inputs_embeds[0]}
+
+    @staticmethod
+    def _get_generate_ids(generate_ids: List[int], input_token_len: int) -> List[int]:
+        # Returns the generated ids unchanged; `input_token_len` is ignored.
+        return generate_ids
+
+
+class MiniCPMV2_6Template(QwenTemplateMixin, MiniCPMVTemplate):
+    # MiniCPM-V 2.6 template: supports several images, and videos that are expanded into frames.
+
+    def check_example(self, example):
+        # Lifts the exactly-one-image restriction of MiniCPMVTemplate.
+        pass
+
+    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example) -> List[Context]:
+        # Images use the parent's marker context; videos are handed to `_replace_video2image`
+        # (defined outside this section) together with a loader capped at `max_num_frames`.
+        assert media_type in {'image', 'video'}
+        max_num_frames = get_env_args('max_num_frames', int, 64)
+        load_video = partial(load_video_minicpmv_mplug_owl3, max_num_frames=max_num_frames)
+        image_context = super().replace_tag('image', index, example)
+        if media_type == 'image':
+            return image_context
+        elif media_type == 'video':
+            return _replace_video2image(load_video, example, lambda i: image_context)
+
+    def _encode(self, example: Dict[str, Any], **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        # Calls Template._encode directly (bypassing MiniCPMVTemplate._encode), then replaces every
+        # -100 marker with that image's slice placeholder and builds the `_data` payload.
+        inputs, _ = Template._encode(self, example)
+        if len(inputs) == 0:
+            return inputs, {}
+        images = example.get('images')
+        use_video = bool(example.get('videos'))
+        is_plain_text = not images and not use_video
+        use_image_id = True
+        max_slice_nums = None
+
+        if use_video:
+            use_image_id = False
+            max_slice_nums = 1  # or 2
+
+        # `get_env_args` is defined outside this section; the value chosen above is passed as its default.
+        max_slice_nums = get_env_args('max_slice_nums', int, max_slice_nums)
+        input_ids = inputs['input_ids']
+        labels = inputs['labels']
+        idx_list = _findall(input_ids, -100)
+        # Sentinel so the first text segment starts at position 0.
+        idx_list.insert(0, -1)
+
+        image_processor = self.tokenizer.processor.image_processor
+        # NOTE(review): this also runs for plain-text examples, where `images` may be None; confirm
+        # the image processor accepts that input.
+        image_inputs = image_processor([images], return_tensors='pt',
+                                       max_slice_nums=max_slice_nums).to(kwargs['dtype'])
+
+        res_input_ids = []
+        res_labels = []
+        for i in range(len(idx_list) - 1):
+            placeholder = image_processor.get_slice_image_placeholder(
+                image_inputs.image_sizes[0][i], image_idx=i, max_slice_nums=max_slice_nums, use_image_id=use_image_id)
+            placeholder += '\n'
+            placeholder_id = self.tokenizer.encode(placeholder, add_special_tokens=False)
+            # Text between the previous and the current marker, then the tokenized placeholder.
+            res_input_ids += input_ids[idx_list[i] + 1:idx_list[i + 1]] + placeholder_id
+            if labels is not None:
+                res_labels += labels[idx_list[i] + 1:idx_list[i + 1]] + [-100] * len(placeholder_id)
+        # Tail after the last marker.
+        res_input_ids += input_ids[idx_list[-1] + 1:]
+        input_ids = res_input_ids
+        if labels is not None:
+            res_labels += labels[idx_list[-1] + 1:]
+            labels = res_labels
+        if not is_plain_text:
+            input_tensor_ids = torch.tensor(input_ids)
+            unk_token = self.tokenizer.encode('<unk>', add_special_tokens=False)[0]
+            indices = (input_tensor_ids == unk_token).nonzero(as_tuple=True)[0].tolist()
+
+            # Group consecutive '<unk>' positions into half-open [start, end) ranges.
+            # `indices[0]` raises IndexError if no '<unk>' token is present.
+            ranges = []
+            start = indices[0]
+            for i in range(1, len(indices)):
+                if indices[i] != indices[i - 1] + 1:
+                    ranges.append([start, indices[i - 1] + 1])
+                    start = indices[i]
+            ranges.append([start, indices[-1] + 1])
+            image_bound = [torch.tensor(ranges)]
+        else:
+            image_bound = [[]]
+
+        # Only ids, labels and the `_data` payload are returned; other keys of the base encoding are dropped.
+        inputs = {
+            'input_ids': input_ids,
+            'labels': labels,
+            '_data': {
+                'input_ids': torch.tensor(input_ids)[None],
+                'image_bound': image_bound,
+                'pixel_values': image_inputs['pixel_values'],
+                'tgt_sizes': image_inputs['tgt_sizes']
+            }
+        }
+        return inputs, {}
+
+
+register_template(TemplateType.minicpm_v_v2_6, MiniCPMV2_6Template(), use_model=False, lazy_tokenize=True)
+
+
+class MiniCPMV2_5Template(Llama3TemplateMixin, MiniCPMVTemplate):
+    # Switches MiniCPMVTemplate._encode to the image-processor code path.
+    is_v2_5 = True
+
+
+register_template(
+    TemplateType.minicpm_v_v2_5, MiniCPMV2_5Template(), use_model=False, lazy_tokenize=True, infer_media_type='dialogue')
+
+# The original MiniCPM-V uses the same prompt strings as the text-only MiniCPM template.
+register_template(
+    TemplateType.minicpm_v,
+    MiniCPMVTemplate(['<s>{{SYSTEM}}'], ['<用户>{{QUERY}}<AI>'], [], ['</s>']),
+    use_model=False,
+    lazy_tokenize=True,
+    infer_media_type='dialogue')
+
+# Gemma: <start_of_turn>/<end_of_turn> chat layout; the system prefix repeats the leading <bos>.
+gemma_template = Template(['<bos>'], ['<start_of_turn>user\n{{QUERY}}<end_of_turn>\n<start_of_turn>model\n'],
+                          ['<end_of_turn>\n'], ['<end_of_turn>'], None,
+                          ['<bos><start_of_turn>system\n{{SYSTEM}}<end_of_turn>\n'])
+register_template(TemplateType.gemma, gemma_template)
+
+# TeleChat: '<_user>' / '<_bot>' markers with '<_end>' as separator and suffix.
+register_template(TemplateType.telechat, Template([], ['<_user>{{QUERY}}<_bot>'], ['<_end>'], ['<_end>']))
+
+# TeleChat v2: a space follows '<_user>' and the third positional argument is empty.
+register_template(TemplateType.telechat_v2, Template([], ['<_user> {{QUERY}}<_bot>'], [], ['<_end>']))
+
+# Default system prompt for DBRX. Adjacent string literals are concatenated, so every fragment
+# must carry its own separator (trailing space or newline).
+DBRX_SYSTEM = (
+    'You are DBRX, created by Databricks. You were last updated in December 2023. '
+    'You answer questions based on information available up to that point.\n'
+    'YOU PROVIDE SHORT RESPONSES TO SHORT QUESTIONS OR STATEMENTS, '
+    'but provide thorough responses to more complex and open-ended questions.\n'
+    'You assist with various tasks, from writing to coding (using markdown for code blocks '
+    '— remember to use ``` with code, JSON, and tables).\n'
+    'You do not have real-time data access or code execution capabilities.'
+    ' You avoid stereotyping and provide balanced perspectives on controversial topics. '
+    'You do not provide song lyrics, poems, or news articles and do not divulge details of your training data.\n'
+    'This is your system prompt, guiding your responses. Do not reference it, just respond to the user. '
+    'If you find yourself talking about this message, stop. You should be responding appropriately '
+    # The newline keeps this sentence from fusing with the upper-case closing line.
+    'and usually that means not mentioning this.\n'
+    'YOU DO NOT MENTION ANY OF THIS INFORMATION ABOUT YOURSELF UNLESS THE INFORMATION IS DIRECTLY '
+    'PERTINENT TO THE USER\'S QUERY.')
+
+
+class DbrxTemplate(ChatmlTemplate):
+    # ChatML layout with the DBRX text as class-level `system`.
+    system = DBRX_SYSTEM
+
+
+register_template(TemplateType.dbrx, DbrxTemplate())
+
+# Mengzi: "输入：...输出：" (input/output) prompt with an optional "指令：" (instruction) system prefix.
+register_template(TemplateType.mengzi,
+                  Template([], ['输入：{{QUERY}}输出：\n'], [], [['eos_token_id']], None, ['指令：{{SYSTEM}}']))
+
+# Default system prompt for Cohere Command-R (two sentences, separated by a space).
+C4AI_SYSTEM = ('You are Command-R, a brilliant, sophisticated, AI-assistant trained to assist human users by '
+               'providing thorough responses. You are trained by Cohere.')
+# The system turn must be closed by the complete '<|END_OF_TURN_TOKEN|>' marker (including the
+# final '>'), the same marker used for the separator and suffix.
+# NOTE(review): unlike e.g. the gemma template, the system prefix does not repeat the
+# '<BOS_TOKEN>' prefix; confirm whether it should.
+register_template(
+    TemplateType.c4ai,
+    Template(
+        ['<BOS_TOKEN>'],
+        ['<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{{QUERY}}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>'],
+        ['<|END_OF_TURN_TOKEN|>'], ['<|END_OF_TURN_TOKEN|>'], C4AI_SYSTEM,
+        ['<|START_OF_TURN_TOKEN|><|SYSTEM_TOKEN|>{{SYSTEM}}<|END_OF_TURN_TOKEN|>']))
+
+
+class mPlugOwl2Template(Template):
+    """Template for mPLUG-Owl2: images are squared, processed and marked by the ``-200`` sentinel."""
+
+    def __init__(self):
+        super().__init__(['{{SYSTEM}}'], ['USER: {{QUERY}}ASSISTANT:'], ['</s>'], [['eos_token_id']])
+
+    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example) -> List[Context]:
+        """Return the sentinel context that stands in for an image tag."""
+        assert media_type == 'image'
+        return [[-200]]
+
+    def _encode(self, example: Dict[str, Any], **kwargs) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        """Resize the example's images in place to squares, encode, and attach the image tensor.
+
+        The result holds only ``input_ids`` and ``labels``, plus ``images`` (cast to
+        ``kwargs['dtype']``) when the example has any.
+        """
+        from mplug_owl2.mm_utils import process_images
+        processor = self.tokenizer.processor
+        images = example.get('images') or []
+        for position, picture in enumerate(images):
+            # ref: https://modelscope.cn/models/iic/mPLUG-Owl2.1
+            side = max(picture.size)
+            images[position] = picture.resize((side, side))
+        encoded, _ = super()._encode(example)
+        if len(encoded) == 0:
+            return encoded, {}
+        result = {'input_ids': encoded['input_ids'], 'labels': encoded['labels']}
+        if images:
+            result['images'] = process_images(images, processor).to(kwargs['dtype'])
+        return result, {}
+
+    def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = None) -> Dict[str, Any]:
+        """Collate ``batch`` and concatenate the image tensors of all samples that carry one."""
+        collated = super().data_collator(batch, padding_to)
+        image_tensors = [sample['images'] for sample in batch if 'images' in sample]
+        if image_tensors:
+            collated['images'] = torch.concat(image_tensors)
+        return collated
+
+
+register_template(
+    TemplateType.mplug_owl2, mPlugOwl2Template(), infer_media_type='round', use_model=False, lazy_tokenize=True)
+
+
+class mPlugOwl3Template(QwenTemplateMixin, Template):  # image/video template with no default system prompt
+    system = None
+
+    def _get_image_token_list(self, cut_shape):
+        processor = self.tokenizer.processor
+        text = processor.image_processor.cut_prompt_template(img_token='<|image|>', h=cut_shape[0], w=cut_shape[1])
+        text_list = text.split('<|image|>')
+        if text_list[-1] == '':
+            text_list.pop()
+        res_text_list = []
+        for text in text_list:
+            res_text_list += [text, '<|image|>']  # re-attach the image token after every text piece
+        token_list = self._encode_context_list(res_text_list)[0]
+        return token_list
+
+    def replace_tag(self, media_type: Literal['image', 'video', 'audio'], index, example) -> List[Context]:
+        assert media_type in {'image', 'video'}
+        max_num_frames = get_env_args('max_num_frames', int, 16)
+        load_video = partial(load_video_minicpmv_mplug_owl3, max_num_frames=max_num_frames)
+        if media_type == 'image':
+            return [[-100], '\n']  # -100 is a placeholder that `_encode` replaces with image tokens
+        elif media_type == 'video':
+            return _replace_video2image(load_video, example, lambda i: [[-100]]) + ['\n']
+
+    def _encode(self, example: Dict[str, Any]) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        inputs, _ = super()._encode(example)
+        if len(inputs) == 0:
+            return inputs, {}
+        images = example['images']  # NOTE(review): direct indexing, so both keys must be present in `example`
+        videos = example['videos']
+        cut_enable = not videos  # cutting is only enabled when there are no videos
+        input_ids = inputs['input_ids']
+        labels = inputs['labels']
+        idx_list = _findall(input_ids, -100)  # positions of the placeholders emitted by `replace_tag`
+        processor = self.tokenizer.processor
+        if images:
+            image_inputs = processor.image_processor(images, cut_enable=cut_enable, return_tensors='pt')
+            added_tokens_len = 0  # running shift of later placeholder positions caused by earlier expansions
+            cut_shapes = image_inputs['cut_shape'] or [None] * len(idx_list)
+            image_token_list = self.tokenizer.encode('<|image|>', add_special_tokens=False)
+            for idx, cut_shape in zip(idx_list, cut_shapes):
+                if cut_shape:
+                    token_list = self._get_image_token_list(cut_shape)
+                else:
+                    token_list = image_token_list
+                input_ids = input_ids[:idx + added_tokens_len] + token_list + input_ids[added_tokens_len + idx + 1:]
+                if labels:
+                    labels = labels[:idx + added_tokens_len] + [-100] * len(token_list) + labels[added_tokens_len + idx
+                                                                                                 + 1:]
+                added_tokens_len += len(token_list) - 1  # one placeholder was replaced by len(token_list) tokens
+            image_token_idx = torch.tensor(_findall(input_ids, image_token_list))[None]
+            _range = torch.arange(len(input_ids))[:, None]
+            matrix = (_range > image_token_idx).sum(dim=1)  # per position: number of image tokens located before it
+            media_offset = torch.stack([torch.zeros(matrix.shape[0], dtype=torch.long), matrix], dim=-1)[None]
+            inputs['_data'] = {'pixel_values': image_inputs['pixel_values']}
+            inputs['media_offset'] = media_offset
+        inputs['input_ids'] = input_ids
+        inputs['labels'] = labels
+        return inputs, {}
+
+    def _post_encode(self, model, data: Any) -> Dict[str, Any]:
+        image_embeds = model.forward_image(data['pixel_values'])  # `data` is presumably the `_data` dict from `_encode`
+        return {'image_embeds': image_embeds}
+
+    def data_collator(self, batch: List[Dict[str, Any]], padding_to: Optional[int] = None) -> Dict[str, Any]:
+        res = super().data_collator(batch, padding_to)
+        image_embeds = [b['image_embeds'] for b in batch if 'image_embeds' in b]
+        if image_embeds:
+            res['image_embeds'] = torch.concat(image_embeds)
+        media_offset = [b['media_offset'] for b in batch if 'media_offset' in b]
+        if media_offset:
+            res['media_offset'] = torch.concat(media_offset)
+        return res
+
+
+register_template(TemplateType.mplug_owl3, mPlugOwl3Template(), use_model=False, lazy_tokenize=True)
+
+register_template(TemplateType.wizardlm2_awq,
+                  Template(['{{SYSTEM}}'], ['User:\n{{QUERY}}\n\nAssistant:\n'], ['\n\n'], ['</s>']))
+# Default system text passed to the (non-AWQ) wizardlm2 template registered right after it.
+_wizardlm2_system = ('A chat between a curious user and an artificial intelligence assistant. '
+                     'The assistant gives helpful, detailed, and polite answers to the user\'s questions. ')
+register_template(TemplateType.wizardlm2,
+                  Template(['{{SYSTEM}}'], ['USER: {{QUERY}} ASSISTANT:'], ['</s>'], ['</s>'], _wizardlm2_system))
+# Atom wraps each turn in explicit <s>...</s> markers.
+register_template(TemplateType.atom,
+                  Template(['{{SYSTEM}}'], ['<s>Human: {{QUERY}}\n</s><s>Assistant: '], ['</s>'], ['</s>']))
+
+
+class RLHFTemplateMixin:
+    """Mixin that encodes and collates (chosen, rejected) response pairs of one example."""
+
+    def encode(self: Template,
+               example: Dict[str, Any],
+               streaming: bool = False) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        """Encode `example` once per response and merge both results under `chosen_`/`rejected_` key prefixes."""
+        encode_fn = self._old_encode
+        # The rejected variant is a shallow copy whose response is swapped for `rejected_response`.
+        rejected_example = example.copy()
+        rejected_example['response'] = example['rejected_response']
+        if streaming:
+            # In streaming mode the wrapped encoder's result is used as the inputs dict directly.
+            chosen_inputs, chosen_kwargs = encode_fn(example), {}
+            rejected_inputs, rejected_kwargs = encode_fn(rejected_example), {}
+        else:
+            chosen_inputs, chosen_kwargs = encode_fn(example)
+            rejected_inputs, rejected_kwargs = encode_fn(rejected_example)
+
+        # Merge both encodings; chosen keys are inserted before rejected keys.
+        pairs = (('chosen', chosen_inputs, chosen_kwargs), ('rejected', rejected_inputs, rejected_kwargs))
+        inputs = {f'{prefix}_{k}': v for prefix, encoded, _ in pairs for k, v in encoded.items()}
+        tokenizer_kwargs = {f'{prefix}_{k}': v for prefix, _, kwargs in pairs for k, v in kwargs.items()}
+        return inputs, tokenizer_kwargs
+
+    def data_collator(self: Template, batch: List[Dict[str, Any]], padding_to: Optional[int] = None) -> Dict[str, Any]:
+        """Strip the prefixes again and collate all chosen samples followed by all rejected samples."""
+        collate_fn = self._old_data_collator
+        regrouped = []
+        # Outer loop over the prefix, so the chosen half precedes the rejected half.
+        for prefix in ('chosen_', 'rejected_'):
+            for sample in batch:
+                stripped = {key[len(prefix):]: value for key, value in sample.items() if key.startswith(prefix)}
+                if stripped:
+                    regrouped.append(stripped)
+        # Either no sample carried prefixed keys, or every sample contributed to both halves.
+        assert len(regrouped) in {0, len(batch) * 2}, f'new_batch: {regrouped}'
+        return collate_fn(regrouped or batch, padding_to)
+
+
+class KTOTemplateMixin:
+    """Mixin that carries the example's `label` through encoding and collation."""
+    def encode(self: Template,
+               example: Dict[str, Any],
+               streaming: bool = False) -> Tuple[Dict[str, Any], Dict[str, Any]]:
+        """Run the wrapped encoder and, when it produced inputs, attach `example['label']` to them."""
+        encoded, tokenizer_kwargs = self._old_encode(example, streaming)
+        if len(encoded) > 0:
+            encoded['label'] = example['label']
+        return encoded, tokenizer_kwargs
+
+    def data_collator(self: Template, batch: List[Dict[str, Any]], padding_to: Optional[int] = None) -> Dict[str, Any]:
+        """Collate the plain and the `KL_`-prefixed sequences separately, naming outputs `<prefix>completion_<key>`."""
+        collated = {}
+        for prefix in ('', 'KL_'):
+            sub_batch = [{'input_ids': b[f'{prefix}input_ids'], 'labels': b[f'{prefix}labels']} for b in batch]
+            outputs = self._old_data_collator(sub_batch, padding_to)
+            collated.update({f'{prefix}completion_{k}': v for k, v in outputs.items()})
+        collated['label'] = [b['label'] for b in batch]
+        return collated
diff --git a/modelscope/preprocessors/templates/tools_prompt.py b/modelscope/preprocessors/templates/tools_prompt.py
new file mode 100644
index 00000000..35cb73dd
--- /dev/null
+++ b/modelscope/preprocessors/templates/tools_prompt.py
@@ -0,0 +1,107 @@
+from typing import List, Dict, Union, Optional
+
+
+def format_react_en(tool_names, tool_descs):  # English ReAct prompt; text stays at column 0 so no indent leaks in
+    REACT_PROMPT = """Answer the following questions as best as you can. You have access to the following tools:
+
+{tool_list}
+
+Use the following format:
+
+Thought: you should always think about what to do
+Action: the action to take, should be one of [{tool_names}]
+Action Input: the input to the action
+Observation: the result of the action
+... (this Thought/Action/Action Input/Observation can be repeated zero or more times)
+Final Answer: the final answer to the original input question
+
+Begin!
+"""
+    return REACT_PROMPT.format(tool_list='\n\n'.join(tool_descs), tool_names=','.join(tool_names))
+
+
+def format_react_zh(tool_names, tool_descs):  # Chinese ReAct prompt; text stays at column 0 so no indent leaks in
+    REACT_ZH_PROMPT = """尽你所能回答以下问题。你拥有如下工具：
+
+{tool_list}
+
+使用以下格式回答：
+
+Thought: 思考你应该做什么
+Action: 工具的名称，必须是[{tool_names}]之一
+Action Input: 工具的输入
+Observation: 工具返回的结果
+... (Thought/Action/Action Input/Observation的过程可以重复零次或多次)
+Final Answer: 对输入问题的最终答案
+
+开始！
+"""
+    return REACT_ZH_PROMPT.format(tool_list='\n\n'.join(tool_descs), tool_names=','.join(tool_names))
+
+
+def format_glm4(tool_names, tool_descs):  # GLM-4 tool prompt; text stays at column 0 so no indent leaks in
+    GLM4_PROMPT = '''你是一个名为 ChatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的，你的任务是针对用户的问题和要求提供适当的答复和支持。
+
+# 可用工具
+
+{tool_list}'''
+    tool_list = ''
+    for name, tool in zip(tool_names, tool_descs):
+        tool_list += f'## {name}\n\n{tool}\n\n'  # one "## name" section per tool
+    return GLM4_PROMPT.format(tool_list=tool_list)
+
+
+def format_toolbench(tool_names, tool_descs):  # ToolBench prompt; column-0 text keeps `\` continuations clean
+    TOOLBENCH_PROMPT = '''You can use many tools(functions) to do the following task.
+First I will give you the task description, and your task start.
+At each step, you need to give your thought to analyze the status now and what to do next, \
+with a function call to actually excute your step. Your output should follow this format:
+Thought:
+Action:
+Action Input:
+
+After the call, you will get the call result, and you are now in a new state.
+Then you will analyze your status now, then decide what to do next...
+After many (Thought-call) pairs, you finally perform the task, then you can give your finial answer.
+Remember:
+1.the state change is irreversible, you can't go back to one of the former state, if you want to restart the task, \
+say \"I give up and restart\".
+2.All the thought is short, at most in 5 sentence.
+3.You can do more then one trys, so if your plan is to continusly try some conditions, \
+you can do one of the conditions per try.
+Let's Begin!
+Task description: You should use functions to help handle the real time user querys. Remember:
+1.ALWAYS call \"Finish\" function at the end of the task. And the final answer should contain enough information \
+to show to the user,If you can't handle the task, \
+or you find that function calls always fail(the function is not valid now), \
+use function Finish->give_up_and_restart.
+2.Do not use origin tool names, use only subfunctions' names.
+Specifically, you have access to the following APIs: {tool_list}'''
+    return TOOLBENCH_PROMPT.format(tool_list='\n\n'.join(tool_descs))
+
+
+tools_prompt = {  # maps a prompt-format name to the function that renders it
+    'react_en': format_react_en,
+    'react_zh': format_react_zh,
+    'glm4': format_glm4,
+    'toolbench': format_toolbench,
+}
+
+
+def get_tools_prompt(TOOLS: List[Dict[str, Union[str, dict]]], prompt_format: str = 'react_en') -> Optional[str]:
+    """Render the prompt for `TOOLS` in `prompt_format` (unknown names use toolbench); None if a tool is invalid."""
+    tool_descs, tool_names = [], []
+    for info in TOOLS:  # info: Dict[str, Union[str, dict]]
+        try:
+            if 'function' in info:
+                info = info['function']  # unwrap entries of the form {'function': {...}}
+            tool_names.append(info['name'])
+            tool_descs.append(str(info))  # info: dict
+        except (KeyError, TypeError):  # missing 'name', or an entry that is not a mapping
+            print('invalid tools format, please check '
+                  'https://github.com/modelscope/swift/blob/main/docs/source_en/LLM/Agent-deployment-best-practice.md')
+            return None
+    prompt_format = tools_prompt.get(prompt_format) or format_toolbench
+    return prompt_format(tool_names, tool_descs)
+
+
diff --git a/modelscope/preprocessors/templates/utils.py b/modelscope/preprocessors/templates/utils.py
new file mode 100644
index 00000000..a753b8f5
--- /dev/null
+++ b/modelscope/preprocessors/templates/utils.py
@@ -0,0 +1,542 @@
+import base64
+import hashlib
+import math
+import os
+import re
+from collections.abc import Mapping
+from copy import deepcopy
+from io import BytesIO
+from typing import Any, Callable, List, TypeVar, Union, Tuple, Set, Dict, Type, Optional, Sequence
+
+import numpy as np
+import requests
+import torch
+from packaging import version
+
+
+History = List[Union[Tuple[str, str], List[str]]]  # presumably one (query, response) entry per round
+Prompt = List[Union[str, List[int], List[str]]]  # prompt pieces: text, int lists (presumably token ids) or str lists
+StopWords = Prompt
+Context = Union[str, List[int]]  # a single piece: text or an int list
+Messages = List[Dict[str, Union[str, List[Dict]]]]  # list of message dicts
+
+
+# >>> internvl
+IMAGENET_MEAN = (0.485, 0.456, 0.406)  # passed as `mean` to T.Normalize in `_build_transform`
+IMAGENET_STD = (0.229, 0.224, 0.225)  # passed as `std` to T.Normalize in `_build_transform`
+
+
+def split_str_parts_by(text: str, delimiters: List[str]):
+    """Split `text` into parts at every occurrence of one of `delimiters`.
+
+    Args:
+        text: The text to split; must be a `str`.
+        delimiters: The delimiter strings; each must be non-empty because its first character is indexed.
+
+    Returns:
+        A list of dicts holding a `key` (the delimiter, or '' for text before any delimiter) and its `content`.
+    """
+    assert isinstance(text, str), f'text: {text}'
+    all_start_chars = [d[0] for d in delimiters]
+    all_length = [len(d) for d in delimiters]
+
+    text_list = []
+    last_words = ''  # characters collected since the most recent delimiter
+
+    while len(text) > 0:
+        for char_idx, char in enumerate(text):
+            match_index = [idx for idx, start_char in enumerate(all_start_chars) if start_char == char]
+            is_delimiter = False
+            for index in match_index:  # candidates are tried in `delimiters` order; the first full match wins
+                if text[char_idx:char_idx + all_length[index]] == delimiters[index]:
+                    if text_list:
+                        text_list[-1]['content'] = last_words
+                    elif last_words:
+                        text_list.append({'key': '', 'content': last_words})
+                    last_words = ''
+                    text_list.append({'key': delimiters[index]})
+                    text = text[char_idx + all_length[index]:]  # continue scanning after the delimiter
+                    is_delimiter = True
+                    break
+            if not is_delimiter:
+                last_words += char
+            else:
+                break
+        if last_words == text:  # the scan reached the end without finding another delimiter
+            text = ''
+
+    if len(text_list):
+        text_list[-1]['content'] = last_words
+    else:
+        text_list.append({'key': '', 'content': last_words})
+    return text_list
+
+
+def split_parts_by_regex(text_list: list, regex_delimiters: Dict[str, List[float]]) -> None:
+    """Split, in place, every un-keyed part (`key == ''`) of `text_list` at matches of the given regex patterns."""
+    compiled_patterns = [(re.compile(pattern), scale) for pattern, scale in regex_delimiters.items()]
+    for i in range(len(text_list) - 1, -1, -1):  # backwards, so splicing does not shift indices still to visit
+        item = text_list[i]
+        if item.get('key') == '':
+            res_text = item['content']
+            last_idx = 0
+            segments = []
+
+            for pattern, scale in compiled_patterns:
+                for match in pattern.finditer(res_text):
+                    if match.start() > last_idx:
+                        segments.append({'key': '', 'content': res_text[last_idx:match.start()]})
+                    segments.append({'key': scale[0], 'content': match.group(0)})  # keyed by the first scale value
+                    last_idx = match.end()
+
+            # The unmatched tail goes after the matched segments so the original text order is preserved.
+            if last_idx < len(res_text):
+                segments.append({'key': '', 'content': res_text[last_idx:]})
+
+            if segments:
+                text_list[i:i + 1] = segments
+
+
+def _decode_prompt(prompt: str, tmp_dir: str = 'tmp') -> str:
+    """Swap each payload inside <img>/<audio>/<video> tags of `prompt` for the path from `_from_base64`."""
+    pattern = r'<(?:img|audio|video)>(.+?)</(?:img|audio|video)>'
+    pieces = []
+    cursor = 0
+    for m in re.finditer(pattern, prompt):
+        start, end = m.span(1)
+        # Text up to the payload is copied verbatim, then the path takes the payload's place.
+        pieces.append(prompt[cursor:start])
+        pieces.append(_from_base64(m.group(1), tmp_dir))
+        cursor = end
+    pieces.append(prompt[cursor:])
+    return ''.join(pieces)
+
+
+def _to_base64(img_path: Union[str, 'PIL.Image.Image', bytes]) -> str:
+    """Return base64 text for a local file, raw bytes or an image object; other strings come back unchanged."""
+    if isinstance(img_path, bytes):
+        raw = img_path
+    elif isinstance(img_path, str):
+        if not os.path.isfile(img_path):
+            return img_path  # not a local file: treated as already being base64
+        with open(img_path, 'rb') as f:
+            raw = f.read()
+    else:
+        # Anything else is handled as an image with a `save` method and serialised as PNG.
+        buffer = BytesIO()
+        img_path.save(buffer, format='png')
+        raw = buffer.getvalue()
+    # Encode the collected bytes as UTF-8 base64 text.
+    return base64.b64encode(raw).decode('utf-8')
+
+
+def _from_base64(img_base64: Union[str, 'PIL.Image.Image'], tmp_dir: str = 'tmp') -> str:
+    """Return a path for the image: files and http URLs as-is, base64 data as a PNG cached by hash in `tmp_dir`."""
+    if not isinstance(img_base64, str):  # PIL.Image.Image
+        img_base64 = _to_base64(img_base64)
+    if os.path.isfile(img_base64) or img_base64.startswith('http'):
+        return img_base64
+    sha256_hash = hashlib.sha256(img_base64.encode('utf-8')).hexdigest()
+    img_path = os.path.join(tmp_dir, f'{sha256_hash}.png')
+    if not os.path.exists(img_path):  # decode (and import PIL) only when the cached file is missing
+        from PIL import Image
+        Image.open(BytesIO(base64.b64decode(img_base64))).save(img_path)
+    return img_path
+
+
+def decode_base64(*,
+                  messages: Optional[Messages] = None,
+                  prompt: Optional[str] = None,
+                  images: Optional[List[str]] = None,
+                  tmp_dir: str = 'tmp') -> Dict[str, Any]:
+    # Convert base64 payloads to local paths; the result only contains the keys that were passed in.
+    os.makedirs(tmp_dir, exist_ok=True)
+    res = {}
+    if messages is not None:
+        res_messages = []
+        for m in messages:
+            m_new = deepcopy(m)  # the caller's message dicts are left unmodified
+            m_new['content'] = _decode_prompt(m_new['content'], tmp_dir)  # assumes `content` is a str — TODO confirm
+            res_messages.append(m_new)
+        res['messages'] = res_messages
+    if prompt is not None:
+        prompt = _decode_prompt(prompt, tmp_dir)
+        res['prompt'] = prompt
+    if images is not None:
+        res_images = []
+        for image in images:
+            image = _from_base64(image, tmp_dir)
+            res_images.append(image)
+        res['images'] = res_images
+    return res
+
+
+def to_device(inputs: Any, device: torch.device) -> Any:
+    """Recursively move `inputs` to `device`.
+
+    Objects with a callable `.to` are moved directly; mappings become dicts and non-string sequences
+    become lists with their members moved; everything else is returned unchanged.
+    """
+    mover = getattr(inputs, 'to', None)
+    if callable(mover):
+        return inputs.to(device=device)
+
+    # Containers are rebuilt rather than modified in place.
+    if isinstance(inputs, Mapping):
+        return {key: to_device(value, device) for key, value in inputs.items()}
+    if isinstance(inputs, Sequence) and not isinstance(inputs, str):
+        return [to_device(member, device) for member in inputs]
+    return inputs
+
+
+def upper_bound(lo: int, hi: int, cond: Callable[[int], bool]) -> int:
+    """Binary search over [lo, hi] for the upper bound satisfying `cond`; `lo` is returned if no larger value does."""
+    while hi > lo:
+        # Round the midpoint up so that `lo = middle` always makes progress.
+        middle = lo + (hi - lo + 1) // 2
+        if not cond(middle):
+            hi = middle - 1
+            continue
+        lo = middle
+    return lo
+
+
+def fetch_one(element: Union[Tuple, List, Set, Dict, Any], type: Type = None) -> Any:
+    """Return the first truthy leaf of a nested container (depth-first), restricted to `type` when it is given."""
+    if isinstance(element, dict):
+        element = list(element.values())  # dicts are searched through their values
+    if not isinstance(element, (tuple, set, list)):
+        return element  # a bare leaf is returned as-is
+    for ele in element:
+        out = fetch_one(ele, type)  # forward the filter so nested matches are not masked by other leaves
+        if out and (type is None or isinstance(out, type)):
+            return out
+
+
+def _build_transform(input_size):  # torchvision pipeline: RGB -> square resize -> tensor -> normalize
+    import torchvision.transforms as T
+    from torchvision.transforms.functional import InterpolationMode
+    MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
+    transform = T.Compose([
+        T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),  # convert only when needed
+        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
+        T.ToTensor(),
+        T.Normalize(mean=MEAN, std=STD)
+    ])
+    return transform
+
+
+def _find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
+    """Pick the pair from `target_ratios` whose quotient (first / second) is closest to `aspect_ratio`.
+
+    On an exact tie the later candidate wins only if the image area exceeds half of that candidate's pixel area.
+    """
+    closest, smallest_gap = (1, 1), float('inf')
+    image_area = width * height
+    for candidate in target_ratios:
+        gap = abs(aspect_ratio - candidate[0] / candidate[1])
+        if gap < smallest_gap:
+            closest, smallest_gap = candidate, gap
+        elif gap == smallest_gap and image_area > 0.5 * image_size * image_size * candidate[0] * candidate[1]:
+            closest = candidate
+    return closest
+
+
+def _dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):  # -> list of tiles
+    orig_width, orig_height = image.size
+    aspect_ratio = orig_width / orig_height
+
+    # enumerate every (i, j) grid with min_num <= i * j <= max_num, ordered by tile count
+    target_ratios = set((i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1)
+                        if i * j <= max_num and i * j >= min_num)
+    target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
+
+    # find the grid whose aspect ratio is closest to the image's own aspect ratio
+    target_aspect_ratio = _find_closest_aspect_ratio(aspect_ratio, target_ratios, orig_width, orig_height, image_size)
+
+    # calculate the target width and height
+    target_width = image_size * target_aspect_ratio[0]
+    target_height = image_size * target_aspect_ratio[1]
+    blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
+
+    # resize the image
+    resized_img = image.resize((target_width, target_height))
+    processed_images = []
+    for i in range(blocks):
+        box = ((i % (target_width // image_size)) * image_size, (i // (target_width // image_size)) * image_size,
+               ((i % (target_width // image_size)) + 1) * image_size, ((i //
+                                                                        (target_width // image_size)) + 1) * image_size)
+        # split the image: tile i is cut row by row, each tile being image_size x image_size
+        split_img = resized_img.crop(box)
+        processed_images.append(split_img)
+    assert len(processed_images) == blocks
+    if use_thumbnail and len(processed_images) != 1:  # a whole-image thumbnail is appended after the tiles
+        thumbnail_img = image.resize((image_size, image_size))
+        processed_images.append(thumbnail_img)
+    return processed_images
+
+
+# <<< internvl
+
+
+def rescale_image(img: 'PIL.Image.Image', rescale_image: int = -1) -> 'PIL.Image.Image':
+    """Shrink `img` to roughly `rescale_image` pixels, keeping its aspect ratio; smaller images pass through."""
+    width = img.width
+    height = img.height
+    if rescale_image <= 0 or width * height <= rescale_image:
+        return img
+    import torchvision.transforms as T  # deferred: only needed when a resize actually happens
+    ratio = width / height
+    height_scaled = math.pow(rescale_image / ratio, 0.5)
+    width_scaled = height_scaled * ratio
+    return T.Resize((int(height_scaled), int(width_scaled)))(img)  # torchvision expects size as (h, w)
+
+
+_T = TypeVar('_T')  # generic placeholder used in the signatures of `load_file` and `load_batch`
+
+
+def load_file(path: Union[str, _T]) -> Union[BytesIO, _T]:  # str -> BytesIO; anything else is returned as-is
+    res = path
+    if isinstance(path, str):
+        path = path.strip()
+        if path.startswith('http'):
+            request_kwargs = {}
+            timeout = float(os.getenv('TIMEOUT', '60'))  # seconds; a value <= 0 means no timeout is passed
+            if timeout > 0:
+                request_kwargs['timeout'] = timeout
+            content = requests.get(path, **request_kwargs).content  # NOTE(review): HTTP status is not checked
+            res = BytesIO(content)
+        elif os.path.exists(path):
+            with open(path, 'rb') as f:
+                res = BytesIO(f.read())
+        else:  # base64_str
+            import binascii
+            try:
+                data = base64.b64decode(path)
+                res = BytesIO(data)
+            except (ValueError, binascii.Error) as error:
+                if len(path) < 200:  # short inputs are echoed back; long ones only report the decode error
+                    raise ValueError(f'invalid image: "{path}"')
+                else:
+                    raise ValueError(f'invalid image: {error}')
+    return res
+
+
+def load_file_decorator(func):
+    """Wrap `func` so that its first argument is passed through `load_file` before the call."""
+    from functools import wraps
+
+    @wraps(func)  # keep the wrapped loader's name and docstring
+    def new_func(path, *args, **kwargs):
+        return func(load_file(path), *args, **kwargs)
+    return new_func
+
+
+@load_file_decorator
+def load_image(image: Union['PIL.Image.Image', BytesIO]) -> 'PIL.Image.Image':  # always returns an RGB image
+    from PIL import Image
+    if isinstance(image, BytesIO):  # produced by the decorator from a str input
+        image = Image.open(image)
+    if image.mode != 'RGB':
+        image = image.convert('RGB')
+    return image
+
+
+def load_batch(path_list: List[Union[str, None, Any, BytesIO]],
+               load_func: Callable[[Any], _T] = load_image) -> List[_T]:
+    """Apply `load_func` to each entry of `path_list`, in order, and return the results as a list.
+
+    `None` entries are dropped rather than loaded; `path_list` must be a list or tuple.
+    """
+    assert isinstance(path_list, (list, tuple)), f'path_list: {path_list}'
+    # ignore None
+    return [load_func(path) for path in path_list if path is not None]
+
+
+def _get_index(bound, fps, max_frame, first_idx=0, num_segments=32):
+    """Return `num_segments` frame indices, one per equal-sized segment of the selected frame range.
+
+    `bound` is a (start, end) pair that is multiplied by `fps`; with a falsy `bound` a very wide default
+    range is used, which is then clamped to `first_idx` and `max_frame`.
+    """
+    start, end = (bound[0], bound[1]) if bound else (-100000, 100000)
+    first = max(first_idx, round(start * fps))
+    last = min(round(end * fps), max_frame)
+    step = float(last - first) / num_segments
+    return np.array([int(first + (step / 2) + np.round(step * k)) for k in range(num_segments)])
+
+
+def transform_image(image, input_size=448, max_num=12):
+    """Tile `image` with `_dynamic_preprocess` (thumbnail enabled) and stack the transformed tiles into one tensor."""
+    to_tensor = _build_transform(input_size=input_size)
+    tiles = _dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
+    # One transformed tensor per tile, stacked along a new leading dimension.
+    return torch.stack([to_tensor(tile) for tile in tiles])
+
+
+@load_file_decorator
+def load_video_internvl(video_io: BytesIO, bound=None, num_segments=32):  # -> list of RGB PIL images
+    from decord import VideoReader, cpu
+    from PIL import Image
+    vr = VideoReader(video_io, ctx=cpu(0), num_threads=1)
+    max_frame = len(vr) - 1  # index of the last frame
+    fps = float(vr.get_avg_fps())
+
+    images = []
+    frame_indices = _get_index(bound, fps, max_frame, first_idx=0, num_segments=num_segments)
+    for frame_index in frame_indices:
+        images.append(Image.fromarray(vr[frame_index].asnumpy()).convert('RGB'))
+    return images
+
+
+def draw_plot(img_dir: str, bbox: List[int], bbox_type: str, output_file: str):  # draw one red bbox and save
+    from PIL import Image, ImageDraw
+    from swift.llm.template.template import Template  # NOTE(review): imported from `swift`, not this package — confirm
+    image = Image.open(img_dir)
+
+    objects = [{'bbox': bbox, 'bbox_type': bbox_type, 'image': 0}]
+    Template.normalize_bbox(objects, [image], 'real')  # presumably rewrites objects[0]['bbox'] in place
+    bbox = objects[0]['bbox']
+    draw = ImageDraw.Draw(image)
+    draw.rectangle(bbox, outline='red', width=2)
+    image.save(output_file)
+
+
+@load_file_decorator
+def load_video_cogvlm2(video_io: BytesIO) -> np.ndarray:  # NOTE(review): torch bridge + .permute suggest a tensor
+    from decord import cpu, VideoReader, bridge
+    bridge.set_bridge('torch')
+    clip_end_sec = 60  # only the first 60 seconds are sampled
+    clip_start_sec = 0
+    num_frames = 24  # number of frames taken from the clip
+    decord_vr = VideoReader(video_io, ctx=cpu(0))
+    duration = len(decord_vr)  # duration in terms of frames
+    start_frame = int(clip_start_sec * decord_vr.get_avg_fps())
+    end_frame = min(duration, int(clip_end_sec * decord_vr.get_avg_fps())) if \
+        clip_end_sec is not None else duration
+    frame_id_list = np.linspace(start_frame, end_frame - 1, num_frames, dtype=int)  # evenly spaced indices
+    video_data = decord_vr.get_batch(frame_id_list)
+    video_data = video_data.permute(3, 0, 1, 2)  # moves the last axis to the front
+    return video_data
+
+
+@load_file_decorator
+def load_video_llava(video_io: BytesIO) -> np.ndarray:  # stacked rgb24 arrays of the sampled frames
+    import av
+    container = av.open(video_io)
+    total_frames = container.streams.video[0].frames
+    indices = np.arange(0, total_frames, total_frames / 8).astype(int)  # step of total_frames / 8 from frame 0
+    frames = []
+    container.seek(0)
+    start_index = indices[0]
+    end_index = indices[-1]
+    for i, frame in enumerate(container.decode(video=0)):
+        if i > end_index:  # nothing left to collect after the last sampled index
+            break
+        if i >= start_index and i in indices:
+            frames.append(frame)
+    return np.stack([x.to_ndarray(format='rgb24') for x in frames])
+
+
+@load_file_decorator
+def load_video_minicpmv_mplug_owl3(video_io: BytesIO, max_num_frames):  # -> list of PIL images
+    from PIL import Image
+    from decord import VideoReader, cpu  # pip install decord
+
+    def uniform_sample(_l, _n):  # pick _n items of _l at the centres of _n equal-length slices
+        gap = len(_l) / _n
+        idxs = [int(i * gap + gap / 2) for i in range(_n)]
+        return [_l[i] for i in idxs]
+
+    vr = VideoReader(video_io, ctx=cpu(0))
+    sample_fps = round(vr.get_avg_fps() / 1)  # FPS
+    frame_idx = [i for i in range(0, len(vr), sample_fps)]  # one frame index per `sample_fps` frames
+    # Thin the selection out further when it exceeds the frame budget.
+    if len(frame_idx) > max_num_frames:
+        frame_idx = uniform_sample(frame_idx, max_num_frames)
+    frames = vr.get_batch(frame_idx).asnumpy()
+    frames = [Image.fromarray(v.astype('uint8')) for v in frames]
+    return frames
+
+
+@load_file_decorator
+def load_audio_qwen(audio_io: BytesIO, sampling_rate: int):  # loads the audio with librosa at `sampling_rate`
+    import librosa
+    return librosa.load(audio_io, sr=sampling_rate)[0]  # keep only the first element of librosa's result
+
+
+def load_video_qwen2(video_path: str):  # read a video, sample frames and resize them; settings come from env args
+    from swift.llm.template.template import get_env_args  # NOTE(review): imported from `swift` — confirm dependency
+    import torchvision
+    from torchvision import io, transforms
+    from qwen_vl_utils.vision_process import (round_by_factor, FPS, FRAME_FACTOR, FPS_MIN_FRAMES, FPS_MAX_FRAMES,
+                                              VIDEO_MIN_PIXELS, VIDEO_MAX_PIXELS, VIDEO_TOTAL_PIXELS, smart_resize,
+                                              ceil_by_factor, floor_by_factor)
+    from torchvision.transforms import InterpolationMode
+
+    if version.parse(torchvision.__version__) >= version.parse('0.19'):
+        video_path = load_file(video_path)
+    video, _, info = io.read_video(
+        video_path,
+        pts_unit='sec',
+        output_format='TCHW',
+    )
+    nframes = get_env_args('nframes', int, None)
+    fps = get_env_args('fps', int, None)
+    size_factor = get_env_args('size_factor', int, FRAME_FACTOR)
+    assert not (fps and nframes), 'Only accept either `fps` or `nframes`'
+    if nframes is not None:
+        nframes = round_by_factor(nframes, size_factor)
+    else:
+        fps = FPS if fps is None else fps  # honour a configured fps; FPS is only the fallback
+        nframes = video.size(0) / info['video_fps'] * fps
+        nframes = round_by_factor(nframes, size_factor)
+        min_frames = get_env_args('min_frames', int, FPS_MIN_FRAMES)
+        max_frames = get_env_args('max_frames', int, FPS_MAX_FRAMES)
+        if nframes < min_frames:
+            nframes = ceil_by_factor(min_frames, size_factor)
+        if nframes > max_frames:
+            nframes = floor_by_factor(max_frames, size_factor)
+
+    if not (size_factor <= nframes and nframes <= video.size(0)):
+        raise ValueError(f'nframes should in interval [{size_factor}, {video.size(0)}], but got {nframes}.')
+    # Sample `nframes` frame indices evenly across the whole video.
+    idx = torch.linspace(0, video.size(0) - 1, nframes).round().long()
+    height, width = video.shape[2:]
+    video = video[idx]
+
+    min_pixels = get_env_args('min_pixels', int, VIDEO_MIN_PIXELS)
+    total_pixels = get_env_args('total_pixels', int, VIDEO_TOTAL_PIXELS)
+    max_pixels = get_env_args('max_pixels', int, None)
+    if max_pixels is None:
+        max_pixels = VIDEO_MAX_PIXELS
+        max_pixels = max(min(max_pixels, total_pixels / nframes * size_factor), min_pixels * 1.05)
+    # resize
+    resized_height = get_env_args('resized_height', int, None)
+    resized_width = get_env_args('resized_width', int, None)
+    if resized_height and resized_width:
+        resized_height, resized_width = smart_resize(
+            resized_height,
+            resized_width,
+            factor=size_factor,
+        )
+    else:
+        resized_height, resized_width = smart_resize(
+            height,
+            width,
+            factor=size_factor,
+            min_pixels=min_pixels,
+            max_pixels=max_pixels,
+        )
+
+    video = transforms.functional.resize(
+        video,
+        [resized_height, resized_width],
+        interpolation=InterpolationMode.BICUBIC,
+        antialias=True,
+    ).float()
+    return video
+
+
+if __name__ == '__main__':
+    # A test main to draw bbox: reads 'man.jpg' from the working directory and writes 'man_bbox.jpg'
+    draw_plot('man.jpg', [354, 462, 580, 738], 'norm_1000', 'man_bbox.jpg')
diff --git a/tests/tools/__init__.py b/tests/tools/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/tests/tools/test_to_ollama.py b/tests/tools/test_to_ollama.py
new file mode 100644
index 00000000..aaf5f4d0
--- /dev/null
+++ b/tests/tools/test_to_ollama.py
@@ -0,0 +1,106 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import unittest
+
+from modelscope.preprocessors.templates import TemplateType
+from modelscope.preprocessors.templates.loader import TemplateLoader
+from modelscope.utils.test_utils import test_level
+
+
+class TestToOllama(unittest.TestCase):
+    """Check ``TemplateLoader`` template and ollama lookups by model id."""
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_load_template(self):
+        """``load_by_model_id`` gives the expected template type or ``None``.
+
+        Specific assert methods are used so that a failing case reports the
+        actual and expected values.
+        """
+        # Llama 3 family
+        template = TemplateLoader.load_by_model_id(
+            'LLM-Research/Meta-Llama-3-8B-Instruct')
+        self.assertEqual(template.template_type, TemplateType.llama3)
+        template = TemplateLoader.load_by_model_id(
+            'swift/Meta-Llama-3-70B-Instruct-AWQ')
+        self.assertEqual(template.template_type, TemplateType.llama3)
+
+        # DeepSeek family; no template is expected for the OpenBuddy model.
+        template = TemplateLoader.load_by_model_id(
+            'deepseek-ai/DeepSeek-V2-Lite-Chat')
+        self.assertEqual(template.template_type, TemplateType.deepseek2)
+        template = TemplateLoader.load_by_model_id('deepseek-ai/DeepSeek-V2.5')
+        self.assertEqual(template.template_type, TemplateType.deepseek2_5)
+        template = TemplateLoader.load_by_model_id(
+            'deepseek-ai/deepseek-coder-1.3b-instruct')
+        self.assertEqual(template.template_type, TemplateType.deepseek_coder)
+        template = TemplateLoader.load_by_model_id(
+            'OpenBuddy/openbuddy-deepseek-67b-v15.2')
+        self.assertIsNone(template)
+        template = TemplateLoader.load_by_model_id(
+            'deepseek-ai/deepseek-llm-67b-chat')
+        self.assertEqual(template.template_type, TemplateType.deepseek)
+        template = TemplateLoader.load_by_model_id(
+            'deepseek-ai/DeepSeek-Coder-V2-Instruct')
+        self.assertEqual(template.template_type, TemplateType.deepseek2)
+
+        # Yi family
+        template = TemplateLoader.load_by_model_id('01ai/Yi-1.5-9B-Chat')
+        self.assertEqual(template.template_type, TemplateType.chatml)
+        template = TemplateLoader.load_by_model_id('01ai/Yi-Coder-9B-Chat')
+        self.assertEqual(template.template_type, TemplateType.yi_coder)
+
+        # Gemma family; several of these ids are expected to have no template.
+        template = TemplateLoader.load_by_model_id(
+            'LLM-Research/gemma-2-27b-it')
+        self.assertEqual(template.template_type, TemplateType.gemma)
+        template = TemplateLoader.load_by_model_id('AI-ModelScope/gemma-2b')
+        self.assertIsNone(template)
+        template = TemplateLoader.load_by_model_id(
+            'AI-ModelScope/gemma-2b-instruct')
+        self.assertIsNone(template)
+        template = TemplateLoader.load_by_model_id(
+            'AI-ModelScope/gemma2-2b-instruct')
+        self.assertEqual(template.template_type, TemplateType.gemma)
+        template = TemplateLoader.load_by_model_id(
+            'AI-ModelScope/paligemma-3b-mix-224')
+        self.assertIsNone(template)
+
+        # Phi-3 family; no template is expected for the vision model.
+        template = TemplateLoader.load_by_model_id(
+            'LLM-Research/Phi-3-vision-128k-instruct')
+        self.assertIsNone(template)
+        template = TemplateLoader.load_by_model_id(
+            'LLM-Research/Phi-3-128k-instruct')
+        self.assertEqual(template.template_type, TemplateType.phi3)
+        template = TemplateLoader.load_by_model_id(
+            'LLM-Research/Phi-3-128k-instruct-GGUF')
+        self.assertEqual(template.template_type, TemplateType.phi3)
+
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_load_ollama(self):
+        """``to_ollama`` returns a non-``None`` result for every lookup."""
+        ollama = TemplateLoader.to_ollama(
+            'LLM-Research/Meta-Llama-3.1-8B-Instruct-GGUF')
+        self.assertIsNotNone(ollama)
+        ollama = TemplateLoader.to_ollama(
+            'QuantFactory/Gemma-2-Ataraxy-9B-Chat-GGUF')
+        self.assertIsNotNone(ollama)
+        ollama = TemplateLoader.to_ollama('Xorbits/Llama-2-7b-Chat-GGUF')
+        self.assertIsNotNone(ollama)
+        ollama = TemplateLoader.to_ollama(
+            'AI-ModelScope/gemma2-2b-instruct-GGUF')
+        self.assertIsNotNone(ollama)
+        ollama = TemplateLoader.to_ollama(
+            'LLM-Research/Phi-3-128k-instruct-GGUF')
+        self.assertIsNotNone(ollama)
+        # Looked up via the ``template_name`` keyword instead of a model id.
+        ollama = TemplateLoader.to_ollama(template_name='phi3')
+        self.assertIsNotNone(ollama)
+        ollama = TemplateLoader.to_ollama(
+            'QuantFactory/Mistral-Nemo-Japanese-Instruct-2408-GGUF')
+        self.assertIsNotNone(ollama)
+
+
+if __name__ == '__main__':  # allow running this test module directly
+    unittest.main()