[to #42322933] add fine-tune code for referring video object segmentation

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/10539423
This commit is contained in:
shuying.shu
2022-10-27 19:43:54 +08:00
committed by yingda.chen
parent 3b75623be4
commit ddcb57440d
21 changed files with 1414 additions and 19 deletions

View File

@@ -305,6 +305,7 @@ class Trainers(object):
face_detection_scrfd = 'face-detection-scrfd'
card_detection_scrfd = 'card-detection-scrfd'
image_inpainting = 'image-inpainting'
referring_video_object_segmentation = 'referring-video-object-segmentation'
image_classification_team = 'image-classification-team'
# nlp trainers
@@ -423,6 +424,8 @@ class Metrics(object):
image_inpainting_metric = 'image-inpainting-metric'
# metric for ocr
NED = 'ned'
# metric for referring-video-object-segmentation task
referring_video_object_segmentation_metric = 'referring-video-object-segmentation-metric'
class Optimizers(object):

View File

@@ -20,6 +20,7 @@ if TYPE_CHECKING:
from .accuracy_metric import AccuracyMetric
from .bleu_metric import BleuMetric
from .image_inpainting_metric import ImageInpaintingMetric
from .referring_video_object_segmentation_metric import ReferringVideoObjectSegmentationMetric
else:
_import_structure = {
@@ -40,6 +41,8 @@ else:
'image_inpainting_metric': ['ImageInpaintingMetric'],
'accuracy_metric': ['AccuracyMetric'],
'bleu_metric': ['BleuMetric'],
'referring_video_object_segmentation_metric':
['ReferringVideoObjectSegmentationMetric'],
}
import sys

View File

@@ -43,6 +43,8 @@ task_default_metrics = {
Tasks.visual_question_answering: [Metrics.text_gen_metric],
Tasks.movie_scene_segmentation: [Metrics.movie_scene_segmentation_metric],
Tasks.image_inpainting: [Metrics.image_inpainting_metric],
Tasks.referring_video_object_segmentation:
[Metrics.referring_video_object_segmentation_metric],
}

View File

@@ -0,0 +1,108 @@
# Part of the implementation is borrowed and modified from MTTR,
# publicly available at https://github.com/mttr2021/MTTR
from typing import Dict
import numpy as np
import torch
from pycocotools.coco import COCO
from pycocotools.cocoeval import COCOeval
from pycocotools.mask import decode
from tqdm import tqdm
from modelscope.metainfo import Metrics
from modelscope.utils.registry import default_group
from .base import Metric
from .builder import METRICS, MetricKeys
@METRICS.register_module(
group_key=default_group,
module_name=Metrics.referring_video_object_segmentation_metric)
class ReferringVideoObjectSegmentationMetric(Metric):
"""The metric computation class for movie scene segmentation classes.
"""
def __init__(self,
ann_file=None,
calculate_precision_and_iou_metrics=True):
self.ann_file = ann_file
self.calculate_precision_and_iou_metrics = calculate_precision_and_iou_metrics
self.preds = []
def add(self, outputs: Dict, inputs: Dict):
preds_batch = outputs['pred']
self.preds.extend(preds_batch)
def evaluate(self):
coco_gt = COCO(self.ann_file)
coco_pred = coco_gt.loadRes(self.preds)
coco_eval = COCOeval(coco_gt, coco_pred, iouType='segm')
coco_eval.params.useCats = 0
coco_eval.evaluate()
coco_eval.accumulate()
coco_eval.summarize()
ap_labels = [
'mAP 0.5:0.95', 'AP 0.5', 'AP 0.75', 'AP 0.5:0.95 S',
'AP 0.5:0.95 M', 'AP 0.5:0.95 L'
]
ap_metrics = coco_eval.stats[:6]
eval_metrics = {la: m for la, m in zip(ap_labels, ap_metrics)}
if self.calculate_precision_and_iou_metrics:
precision_at_k, overall_iou, mean_iou = calculate_precision_at_k_and_iou_metrics(
coco_gt, coco_pred)
eval_metrics.update({
f'P@{k}': m
for k, m in zip([0.5, 0.6, 0.7, 0.8, 0.9], precision_at_k)
})
eval_metrics.update({
'overall_iou': overall_iou,
'mean_iou': mean_iou
})
return eval_metrics
def compute_iou(outputs: torch.Tensor, labels: torch.Tensor, EPS=1e-6):
outputs = outputs.int()
intersection = (outputs & labels).float().sum(
(1, 2)) # Will be zero if Truth=0 or Prediction=0
union = (outputs | labels).float().sum(
(1, 2)) # Will be zero if both are 0
iou = (intersection + EPS) / (union + EPS
) # EPS is used to avoid division by zero
return iou, intersection, union
def calculate_precision_at_k_and_iou_metrics(coco_gt: COCO, coco_pred: COCO):
print('evaluating precision@k & iou metrics...')
counters_by_iou = {iou: 0 for iou in [0.5, 0.6, 0.7, 0.8, 0.9]}
total_intersection_area = 0
total_union_area = 0
ious_list = []
for instance in tqdm(coco_gt.imgs.keys()
): # each image_id contains exactly one instance
gt_annot = coco_gt.imgToAnns[instance][0]
gt_mask = decode(gt_annot['segmentation'])
pred_annots = coco_pred.imgToAnns[instance]
pred_annot = sorted(
pred_annots,
key=lambda a: a['score'])[-1] # choose pred with highest score
pred_mask = decode(pred_annot['segmentation'])
iou, intersection, union = compute_iou(
torch.tensor(pred_mask).unsqueeze(0),
torch.tensor(gt_mask).unsqueeze(0))
iou, intersection, union = iou.item(), intersection.item(), union.item(
)
for iou_threshold in counters_by_iou.keys():
if iou > iou_threshold:
counters_by_iou[iou_threshold] += 1
total_intersection_area += intersection
total_union_area += union
ious_list.append(iou)
num_samples = len(ious_list)
precision_at_k = np.array(list(counters_by_iou.values())) / num_samples
overall_iou = total_intersection_area / total_union_area
mean_iou = np.mean(ious_list)
return precision_at_k, overall_iou, mean_iou
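
As a quick sanity check of the IoU helper above, a minimal stand-alone sketch on toy binary masks (the import path is assumed from the metrics package layout in this change; values are illustrative):

import torch

from modelscope.metrics.referring_video_object_segmentation_metric import compute_iou

# two 1 x 4 x 4 binary masks that overlap on exactly one pixel
pred = torch.zeros(1, 4, 4, dtype=torch.int)
pred[0, :2, :2] = 1   # 4 predicted foreground pixels
gt = torch.zeros(1, 4, 4, dtype=torch.int)
gt[0, 1:3, 1:3] = 1   # 4 ground-truth foreground pixels, 1 of them shared

iou, intersection, union = compute_iou(pred, gt)
# intersection == 1, union == 7, so iou is roughly 1/7; EPS keeps the ratio finite
# when both masks are empty.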

View File

@@ -5,11 +5,11 @@ from modelscope.utils.import_utils import LazyImportModule
if TYPE_CHECKING:
from .model import MovieSceneSegmentation
from .model import ReferringVideoObjectSegmentation
else:
_import_structure = {
'model': ['MovieSceneSegmentation'],
'model': ['ReferringVideoObjectSegmentation'],
}
import sys

View File

@@ -1,4 +1,6 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# Part of the implementation is borrowed and modified from MTTR,
# publicly available at https://github.com/mttr2021/MTTR
import os.path as osp
from typing import Any, Dict
@@ -10,7 +12,9 @@ from modelscope.models.builder import MODELS
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger
from .utils import (MTTR, A2DSentencesPostProcess, ReferYoutubeVOSPostProcess,
from .utils import (MTTR, A2DSentencesPostProcess, HungarianMatcher,
ReferYoutubeVOSPostProcess, SetCriterion,
flatten_temporal_batch_dims,
nested_tensor_from_videos_list)
logger = get_logger()
@@ -35,16 +39,66 @@ class ReferringVideoObjectSegmentation(TorchModel):
params_dict = params_dict['model_state_dict']
self.model.load_state_dict(params_dict, strict=True)
dataset_name = self.cfg.pipeline.dataset_name
if dataset_name == 'a2d_sentences' or dataset_name == 'jhmdb_sentences':
self.postprocessor = A2DSentencesPostProcess()
elif dataset_name == 'ref_youtube_vos':
self.postprocessor = ReferYoutubeVOSPostProcess()
self.set_postprocessor(self.cfg.pipeline.dataset_name)
self.set_criterion()
def set_device(self, device, name):
self.device = device
self._device_name = name
def set_postprocessor(self, dataset_name):
if 'a2d_sentences' in dataset_name or 'jhmdb_sentences' in dataset_name:
self.postprocessor = A2DSentencesPostProcess() # fine-tune
elif 'ref_youtube_vos' in dataset_name:
self.postprocessor = ReferYoutubeVOSPostProcess() # inference
else:
assert False, f'postprocessing for dataset: {dataset_name} is not supported'
def forward(self, inputs: Dict[str, Any]) -> Dict[str, torch.Tensor]:
return inputs
def forward(self, inputs: Dict[str, Any]):
samples = inputs['samples']
targets = inputs['targets']
text_queries = inputs['text_queries']
valid_indices = torch.tensor(
[i for i, t in enumerate(targets) if None not in t])
targets = [targets[i] for i in valid_indices.tolist()]
if self._device_name == 'gpu':
samples = samples.to(self.device)
valid_indices = valid_indices.to(self.device)
if isinstance(text_queries, tuple):
text_queries = list(text_queries)
outputs = self.model(samples, valid_indices, text_queries)
losses = -1
if self.training:
loss_dict = self.criterion(outputs, targets)
weight_dict = self.criterion.weight_dict
losses = sum(loss_dict[k] * weight_dict[k]
for k in loss_dict.keys() if k in weight_dict)
predictions = []
if not self.training:
outputs.pop('aux_outputs', None)
outputs, targets = flatten_temporal_batch_dims(outputs, targets)
processed_outputs = self.postprocessor(
outputs,
resized_padded_sample_size=samples.tensors.shape[-2:],
resized_sample_sizes=[t['size'] for t in targets],
orig_sample_sizes=[t['orig_size'] for t in targets])
image_ids = [t['image_id'] for t in targets]
predictions = []
for p, image_id in zip(processed_outputs, image_ids):
for s, m in zip(p['scores'], p['rle_masks']):
predictions.append({
'image_id': image_id,
'category_id':
1, # dummy label, as categories are not predicted in ref-vos
'segmentation': m,
'score': s.item()
})
re = dict(pred=predictions, loss=losses)
return re
def inference(self, **kwargs):
window = kwargs['window']
@@ -63,3 +117,26 @@ class ReferringVideoObjectSegmentation(TorchModel):
def postprocess(self, inputs: Dict[str, Any], **kwargs):
return inputs
def set_criterion(self):
matcher = HungarianMatcher(
cost_is_referred=self.cfg.matcher.set_cost_is_referred,
cost_dice=self.cfg.matcher.set_cost_dice)
weight_dict = {
'loss_is_referred': self.cfg.loss.is_referred_loss_coef,
'loss_dice': self.cfg.loss.dice_loss_coef,
'loss_sigmoid_focal': self.cfg.loss.sigmoid_focal_loss_coef
}
if self.cfg.loss.aux_loss:
aux_weight_dict = {}
for i in range(self.cfg.model.num_decoder_layers - 1):
aux_weight_dict.update(
{k + f'_{i}': v
for k, v in weight_dict.items()})
weight_dict.update(aux_weight_dict)
self.criterion = SetCriterion(
matcher=matcher,
weight_dict=weight_dict,
eos_coef=self.cfg.loss.eos_coef)
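
To make the auxiliary-loss bookkeeping in set_criterion concrete, a small stand-alone sketch of the weight-dict expansion with illustrative coefficients (the real values come from the model's configuration, which is not part of this diff):

# illustrative coefficients; in the model they are read from cfg.loss and cfg.model
weight_dict = {
    'loss_is_referred': 2.0,
    'loss_dice': 5.0,
    'loss_sigmoid_focal': 2.0,
}
num_decoder_layers = 3

aux_weight_dict = {}
for i in range(num_decoder_layers - 1):
    aux_weight_dict.update({k + f'_{i}': v for k, v in weight_dict.items()})
weight_dict.update(aux_weight_dict)

# weight_dict now also holds 'loss_is_referred_0', 'loss_dice_0', ..., 'loss_sigmoid_focal_1',
# matching the '_{i}' suffixes that SetCriterion.forward appends to the losses of each
# intermediate decoder layer, so the weighted sum in forward() covers every entry.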

View File

@@ -1,4 +1,6 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .misc import nested_tensor_from_videos_list
from .criterion import SetCriterion, flatten_temporal_batch_dims
from .matcher import HungarianMatcher
from .misc import interpolate, nested_tensor_from_videos_list
from .mttr import MTTR
from .postprocessing import A2DSentencesPostProcess, ReferYoutubeVOSPostProcess

View File

@@ -0,0 +1,198 @@
# The implementation is adopted from MTTR,
# made publicly available under the Apache 2.0 License at https://github.com/mttr2021/MTTR
# Modified from DETR https://github.com/facebookresearch/detr
import torch
from torch import nn
from .misc import (get_world_size, interpolate, is_dist_avail_and_initialized,
nested_tensor_from_tensor_list)
from .segmentation import dice_loss, sigmoid_focal_loss
class SetCriterion(nn.Module):
""" This class computes the loss for MTTR.
The process happens in two steps:
1) we compute the hungarian assignment between the ground-truth and predicted sequences.
2) we supervise each pair of matched ground-truth / prediction sequences (mask + reference prediction)
"""
def __init__(self, matcher, weight_dict, eos_coef):
""" Create the criterion.
Parameters:
matcher: module able to compute a matching between targets and proposals
weight_dict: dict containing as key the names of the losses and as values their relative weight.
eos_coef: relative classification weight applied to the un-referred category
"""
super().__init__()
self.matcher = matcher
self.weight_dict = weight_dict
self.eos_coef = eos_coef
# make sure that only loss functions with non-zero weights are computed:
losses_to_compute = []
if weight_dict['loss_dice'] > 0 or weight_dict[
'loss_sigmoid_focal'] > 0:
losses_to_compute.append('masks')
if weight_dict['loss_is_referred'] > 0:
losses_to_compute.append('is_referred')
self.losses = losses_to_compute
def forward(self, outputs, targets):
aux_outputs_list = outputs.pop('aux_outputs', None)
# compute the losses for the output of the last decoder layer:
losses = self.compute_criterion(
outputs, targets, losses_to_compute=self.losses)
# In case of auxiliary losses, we repeat this process with the output of each intermediate decoder layer.
if aux_outputs_list is not None:
aux_losses_to_compute = self.losses.copy()
for i, aux_outputs in enumerate(aux_outputs_list):
losses_dict = self.compute_criterion(aux_outputs, targets,
aux_losses_to_compute)
losses_dict = {k + f'_{i}': v for k, v in losses_dict.items()}
losses.update(losses_dict)
return losses
def compute_criterion(self, outputs, targets, losses_to_compute):
# Retrieve the matching between the outputs of the last layer and the targets
indices = self.matcher(outputs, targets)
# T & B dims are flattened so loss functions can be computed per frame (but with same indices per video).
# also, indices are repeated so the same indices can be used for frames of the same video.
T = len(targets)
outputs, targets = flatten_temporal_batch_dims(outputs, targets)
# repeat the indices list T times so the same indices can be used for each video frame
indices = T * indices
# Compute the average number of target masks across all nodes, for normalization purposes
num_masks = sum(len(t['masks']) for t in targets)
num_masks = torch.as_tensor([num_masks],
dtype=torch.float,
device=indices[0][0].device)
if is_dist_avail_and_initialized():
torch.distributed.all_reduce(num_masks)
num_masks = torch.clamp(num_masks / get_world_size(), min=1).item()
# Compute all the requested losses
losses = {}
for loss in losses_to_compute:
losses.update(
self.get_loss(
loss, outputs, targets, indices, num_masks=num_masks))
return losses
def loss_is_referred(self, outputs, targets, indices, **kwargs):
device = outputs['pred_is_referred'].device
bs = outputs['pred_is_referred'].shape[0]
pred_is_referred = outputs['pred_is_referred'].log_softmax(
dim=-1) # note that log-softmax is used here
target_is_referred = torch.zeros_like(pred_is_referred)
# extract indices of object queries that were matched with text-referred target objects
query_referred_indices = self._get_query_referred_indices(
indices, targets)
# by default penalize compared to the no-object class (last token)
target_is_referred[:, :, :] = torch.tensor([0.0, 1.0], device=device)
if 'is_ref_inst_visible' in targets[
0]: # visibility labels are available per-frame for the referred object:
is_ref_inst_visible_per_frame = torch.stack(
[t['is_ref_inst_visible'] for t in targets])
ref_inst_visible_frame_indices = is_ref_inst_visible_per_frame.nonzero(
).squeeze()
# keep only the matched query indices of the frames in which the referred object is visible:
visible_query_referred_indices = query_referred_indices[
ref_inst_visible_frame_indices]
target_is_referred[ref_inst_visible_frame_indices,
visible_query_referred_indices] = torch.tensor(
[1.0, 0.0], device=device)
else: # assume that the referred object is visible in every frame:
target_is_referred[torch.arange(bs),
query_referred_indices] = torch.tensor(
[1.0, 0.0], device=device)
loss = -(pred_is_referred * target_is_referred).sum(-1)
# apply no-object class weights:
eos_coef = torch.full(loss.shape, self.eos_coef, device=loss.device)
eos_coef[torch.arange(bs), query_referred_indices] = 1.0
loss = loss * eos_coef
bs = len(indices)
loss = loss.sum() / bs # sum and normalize the loss by the batch size
losses = {'loss_is_referred': loss}
return losses
def loss_masks(self, outputs, targets, indices, num_masks, **kwargs):
assert 'pred_masks' in outputs
src_idx = self._get_src_permutation_idx(indices)
tgt_idx = self._get_tgt_permutation_idx(indices)
src_masks = outputs['pred_masks']
src_masks = src_masks[src_idx]
masks = [t['masks'] for t in targets]
target_masks, valid = nested_tensor_from_tensor_list(masks).decompose()
target_masks = target_masks.to(src_masks)
target_masks = target_masks[tgt_idx]
# upsample predictions to the target size
src_masks = interpolate(
src_masks[:, None],
size=target_masks.shape[-2:],
mode='bilinear',
align_corners=False)
src_masks = src_masks[:, 0].flatten(1)
target_masks = target_masks.flatten(1)
target_masks = target_masks.view(src_masks.shape)
losses = {
'loss_sigmoid_focal':
sigmoid_focal_loss(src_masks, target_masks, num_masks),
'loss_dice':
dice_loss(src_masks, target_masks, num_masks),
}
return losses
@staticmethod
def _get_src_permutation_idx(indices):
# permute predictions following indices
batch_idx = torch.cat(
[torch.full_like(src, i) for i, (src, _) in enumerate(indices)])
src_idx = torch.cat([src for (src, _) in indices])
return batch_idx, src_idx
@staticmethod
def _get_tgt_permutation_idx(indices):
# permute targets following indices
batch_idx = torch.cat(
[torch.full_like(tgt, i) for i, (_, tgt) in enumerate(indices)])
tgt_idx = torch.cat([tgt for (_, tgt) in indices])
return batch_idx, tgt_idx
@staticmethod
def _get_query_referred_indices(indices, targets):
"""
extract indices of object queries that were matched with text-referred target objects
"""
query_referred_indices = []
for (query_idxs, target_idxs), target in zip(indices, targets):
ref_query_idx = query_idxs[torch.where(
target_idxs == target['referred_instance_idx'])[0]]
query_referred_indices.append(ref_query_idx)
query_referred_indices = torch.cat(query_referred_indices)
return query_referred_indices
def get_loss(self, loss, outputs, targets, indices, **kwargs):
loss_map = {
'masks': self.loss_masks,
'is_referred': self.loss_is_referred,
}
assert loss in loss_map, f'do you really want to compute {loss} loss?'
return loss_map[loss](outputs, targets, indices, **kwargs)
def flatten_temporal_batch_dims(outputs, targets):
for k in outputs.keys():
if isinstance(outputs[k], torch.Tensor):
outputs[k] = outputs[k].flatten(0, 1)
else: # list
outputs[k] = [i for step_t in outputs[k] for i in step_t]
targets = [
frame_t_target for step_t in targets for frame_t_target in step_t
]
return outputs, targets
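
A toy illustration of the [time, batch] flattening that both the criterion and the model's eval path rely on (shapes are illustrative; the import path follows the utils package touched by this change):

import torch

from modelscope.models.cv.referring_video_object_segmentation.utils import \
    flatten_temporal_batch_dims

T, B, Q = 2, 3, 5   # time steps, batch size, object queries
outputs = {
    'pred_is_referred': torch.zeros(T, B, Q, 2),
    'pred_masks': torch.zeros(T, B, Q, 8, 8),
}
targets = [[{'masks': torch.zeros(1, 8, 8)} for _ in range(B)] for _ in range(T)]

outputs, targets = flatten_temporal_batch_dims(outputs, targets)
# outputs['pred_is_referred'].shape == (T * B, Q, 2) and len(targets) == T * B,
# so the per-frame loss terms above can treat every frame as an independent sample.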

View File

@@ -0,0 +1,163 @@
# The implementation is adopted from MTTR,
# made publicly available under the Apache 2.0 License at https://github.com/mttr2021/MTTR
# Modified from DETR https://github.com/facebookresearch/detr
# Module to compute the matching cost and solve the corresponding LSAP.
import torch
from scipy.optimize import linear_sum_assignment
from torch import nn
from .misc import interpolate, nested_tensor_from_tensor_list
class HungarianMatcher(nn.Module):
"""This class computes an assignment between the targets and the predictions of the network
For efficiency reasons, the targets don't include the no_object. Because of this, in general,
there are more predictions than targets. In this case, we do a 1-to-1 matching of the best predictions,
while the others are un-matched (and thus treated as non-objects).
"""
def __init__(self, cost_is_referred: float = 1, cost_dice: float = 1):
"""Creates the matcher
Params:
cost_is_referred: This is the relative weight of the reference cost in the total matching cost
cost_dice: This is the relative weight of the dice cost in the total matching cost
"""
super().__init__()
self.cost_is_referred = cost_is_referred
self.cost_dice = cost_dice
assert cost_is_referred != 0 or cost_dice != 0, 'all costs cannot be 0'
@torch.inference_mode()
def forward(self, outputs, targets):
""" Performs the matching
Params:
outputs: A dict that contains at least these entries:
"pred_is_referred": Tensor of dim [time, batch_size, num_queries, 2] with the reference logits
"pred_masks": Tensor of dim [time, batch_size, num_queries, H, W] with the predicted masks logits
targets: A list of lists of targets (outer - time steps, inner - batch samples). Each target is a dict
which contains mask and reference ground-truth information for a single frame.
Returns:
A list of size batch_size, containing tuples of (index_i, index_j) where:
- index_i is the indices of the selected predictions (in order)
- index_j is the indices of the corresponding selected targets (in order)
For each batch element, it holds:
len(index_i) = len(index_j) = min(num_queries, num_target_masks)
"""
t, bs, num_queries = outputs['pred_masks'].shape[:3]
# We flatten to compute the cost matrices in a batch
out_masks = outputs['pred_masks'].flatten(
1, 2) # [t, batch_size * num_queries, mask_h, mask_w]
# preprocess and concat the target masks
tgt_masks = [[
m for v in t_step_batch for m in v['masks'].unsqueeze(1)
] for t_step_batch in targets]
# pad the target masks to a uniform shape
tgt_masks, valid = list(
zip(*[
nested_tensor_from_tensor_list(t).decompose()
for t in tgt_masks
]))
tgt_masks = torch.stack(tgt_masks).squeeze(2)
# upsample predicted masks to target mask size
out_masks = interpolate(
out_masks,
size=tgt_masks.shape[-2:],
mode='bilinear',
align_corners=False)
# Compute the soft-tokens cost:
if self.cost_is_referred > 0:
cost_is_referred = compute_is_referred_cost(outputs, targets)
else:
cost_is_referred = 0
# Compute the DICE coefficient between the masks:
if self.cost_dice > 0:
cost_dice = -dice_coef(out_masks, tgt_masks)
else:
cost_dice = 0
# Final cost matrix
C = self.cost_is_referred * cost_is_referred + self.cost_dice * cost_dice
C = C.view(bs, num_queries, -1).cpu()
num_traj_per_batch = [
len(v['masks']) for v in targets[0]
] # number of instance trajectories in each batch
indices = [
linear_sum_assignment(c[i])
for i, c in enumerate(C.split(num_traj_per_batch, -1))
]
device = out_masks.device
return [(torch.as_tensor(i, dtype=torch.int64, device=device),
torch.as_tensor(j, dtype=torch.int64, device=device))
for i, j in indices]
def dice_coef(inputs, targets, smooth=1.0):
"""
Compute the DICE coefficient, similar to generalized IOU for masks
Args:
inputs: A float tensor of arbitrary shape.
The predictions for each example.
targets: A float tensor with the same shape as inputs. Stores the binary
classification label for each element in inputs
(0 for the negative class and 1 for the positive class).
"""
inputs = inputs.sigmoid().flatten(2).unsqueeze(2)
targets = targets.flatten(2).unsqueeze(1)
numerator = 2 * (inputs * targets).sum(-1)
denominator = inputs.sum(-1) + targets.sum(-1)
coef = (numerator + smooth) / (denominator + smooth)
coef = coef.mean(
0) # average on the temporal dim to get instance trajectory scores
return coef
def compute_is_referred_cost(outputs, targets):
pred_is_referred = outputs['pred_is_referred'].flatten(1, 2).softmax(
dim=-1) # [t, b*nq, 2]
device = pred_is_referred.device
t = pred_is_referred.shape[0]
# number of instance trajectories in each batch
num_traj_per_batch = torch.tensor([len(v['masks']) for v in targets[0]],
device=device)
total_trajectories = num_traj_per_batch.sum()
# note that ref_indices are shared across time steps:
ref_indices = torch.tensor(
[v['referred_instance_idx'] for v in targets[0]], device=device)
# convert ref_indices to fit flattened batch targets:
ref_indices += torch.cat(
(torch.zeros(1, dtype=torch.long,
device=device), num_traj_per_batch.cumsum(0)[:-1]))
# number of instance trajectories in each batch
target_is_referred = torch.zeros((t, total_trajectories, 2), device=device)
# 'no object' class by default (for un-referred objects)
target_is_referred[:, :, :] = torch.tensor([0.0, 1.0], device=device)
if 'is_ref_inst_visible' in targets[0][
0]: # visibility labels are available per-frame for the referred object:
is_ref_inst_visible = torch.stack([
torch.stack([t['is_ref_inst_visible'] for t in t_step])
for t_step in targets
]).permute(1, 0)
for ref_idx, is_visible in zip(ref_indices, is_ref_inst_visible):
is_visible = is_visible.nonzero().squeeze()
target_is_referred[is_visible,
ref_idx, :] = torch.tensor([1.0, 0.0],
device=device)
else: # assume that the referred object is visible in every frame:
target_is_referred[:, ref_indices, :] = torch.tensor([1.0, 0.0],
device=device)
cost_is_referred = -(pred_is_referred.unsqueeze(2)
* target_is_referred.unsqueeze(1)).sum(dim=-1).mean(
dim=0)
return cost_is_referred
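
The shape contract of dice_coef is easy to lose track of, so here is a minimal sketch with illustrative sizes (dice_coef lives in the matcher module added above, so the import path is assumed accordingly):

import torch

from modelscope.models.cv.referring_video_object_segmentation.utils.matcher import dice_coef

T, NQ, NT, H, W = 2, 4, 3, 16, 16            # time, queries, target trajectories, mask size
pred_mask_logits = torch.randn(T, NQ, H, W)  # already upsampled to the target resolution
tgt_masks = torch.randint(0, 2, (T, NT, H, W)).float()

coef = dice_coef(pred_mask_logits, tgt_masks)
# coef has shape [NQ, NT]: one soft-Dice score per (query, trajectory) pair, averaged over
# time, and the matcher uses -coef as the mask term of its assignment cost.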

View File

@@ -122,8 +122,8 @@ class MultimodalTransformer(nn.Module):
with torch.inference_mode(mode=self.freeze_text_encoder):
encoded_text = self.text_encoder(**tokenized_queries)
# Transpose memory because pytorch's attention expects sequence first
txt_memory = rearrange(encoded_text.last_hidden_state,
'b s c -> s b c')
tmp_last_hidden_state = encoded_text.last_hidden_state.clone()
txt_memory = rearrange(tmp_last_hidden_state, 'b s c -> s b c')
txt_memory = self.txt_proj(
txt_memory) # change text embeddings dim to model dim
# Invert the attention mask we get from huggingface because it's the opposite convention in pytorch's transformer

View File

@@ -123,7 +123,8 @@ class WindowAttention3D(nn.Module):
# define a parameter table of relative position bias
wd, wh, ww = window_size
self.relative_position_bias_table = nn.Parameter(
torch.zeros((2 * wd - 1) * (2 * wh - 1) * (2 * ww - 1), num_heads))
torch.zeros((2 * wd - 1) * (2 * wh - 1) * (2 * ww - 1),
num_heads)) # 2*Wd-1 * 2*Wh-1 * 2*Ww-1, nH
# get pair-wise relative position index for each token inside the window
coords_d = torch.arange(self.window_size[0])

View File

@@ -13,6 +13,7 @@ if TYPE_CHECKING:
from .video_summarization_dataset import VideoSummarizationDataset
from .image_inpainting import ImageInpaintingDataset
from .text_ranking_dataset import TextRankingDataset
from .referring_video_object_segmentation import ReferringVideoObjectSegmentationDataset
else:
_import_structure = {
@@ -29,6 +30,8 @@ else:
'sidd_image_denoising_dataset': ['SiddImageDenoisingDataset'],
'image_portrait_enhancement_dataset':
['ImagePortraitEnhancementDataset'],
'referring_video_object_segmentation':
['ReferringVideoObjectSegmentationDataset'],
}
import sys

View File

@@ -0,0 +1,3 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .referring_video_object_segmentation_dataset import \
ReferringVideoObjectSegmentationDataset

View File

@@ -0,0 +1,361 @@
# Part of the implementation is borrowed and modified from MTTR,
# publicly available at https://github.com/mttr2021/MTTR
from glob import glob
from os import path as osp
import h5py
import json
import numpy as np
import pandas
import torch
import torch.distributed as dist
import torchvision.transforms.functional as F
from pycocotools.mask import area, encode
from torchvision.io import read_video
from tqdm import tqdm
from modelscope.metainfo import Models
from modelscope.models.cv.referring_video_object_segmentation.utils import \
nested_tensor_from_videos_list
from modelscope.msdatasets.task_datasets.builder import TASK_DATASETS
from modelscope.msdatasets.task_datasets.torch_base_dataset import \
TorchTaskDataset
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger
from . import transformers as T
LOGGER = get_logger()
def get_image_id(video_id, frame_idx, ref_instance_a2d_id):
image_id = f'v_{video_id}_f_{frame_idx}_i_{ref_instance_a2d_id}'
return image_id
@TASK_DATASETS.register_module(
Tasks.referring_video_object_segmentation,
module_name=Models.referring_video_object_segmentation)
class ReferringVideoObjectSegmentationDataset(TorchTaskDataset):
def __init__(self, **kwargs):
split_config = kwargs['split_config']
LOGGER.info(kwargs)
data_cfg = kwargs.get('cfg').data_kwargs
trans_cfg = kwargs.get('cfg').transformers_kwargs
distributed = data_cfg.get('distributed', False)
self.data_root = next(iter(split_config.values()))
if not osp.exists(self.data_root):
self.data_root = osp.dirname(self.data_root)
assert osp.exists(self.data_root)
self.window_size = data_cfg.get('window_size', 8)
self.mask_annotations_dir = osp.join(
self.data_root, 'text_annotations/annotation_with_instances')
self.videos_dir = osp.join(self.data_root, 'Release/CLIPS320')
self.subset_type = next(iter(split_config.keys()))
self.text_annotations = self.get_text_annotations(
self.data_root, self.subset_type, distributed)
self.transforms = A2dSentencesTransforms(self.subset_type, **trans_cfg)
self.collator = Collator()
self.ann_file = osp.join(
self.data_root,
data_cfg.get('ann_file',
'a2d_sentences_test_annotations_in_coco_format.json'))
# create ground-truth test annotations for the evaluation process if necessary:
if self.subset_type == 'test' and not osp.exists(self.ann_file):
if (distributed and dist.get_rank() == 0) or not distributed:
create_a2d_sentences_ground_truth_test_annotations(
self.data_root, self.subset_type,
self.mask_annotations_dir, self.ann_file)
if distributed:
dist.barrier()
def __len__(self):
return len(self.text_annotations)
def __getitem__(self, idx):
text_query, video_id, frame_idx, instance_id = self.text_annotations[
idx]
text_query = ' '.join(
text_query.lower().split()) # clean up the text query
# read the source window frames:
video_frames, _, _ = read_video(
osp.join(self.videos_dir, f'{video_id}.mp4'),
pts_unit='sec') # (T, H, W, C)
# get a window of window_size frames with frame frame_idx in the middle.
# note that the original a2d dataset is 1 indexed, so we have to subtract 1 from frame_idx
start_idx, end_idx = frame_idx - 1 - self.window_size // 2, frame_idx - 1 + (
self.window_size + 1) // 2
# extract the window source frames:
source_frames = []
for i in range(start_idx, end_idx):
i = min(max(i, 0),
len(video_frames)
- 1) # pad out of range indices with edge frames
source_frames.append(
F.to_pil_image(video_frames[i].permute(2, 0, 1)))
# read the instance mask:
frame_annot_path = osp.join(self.mask_annotations_dir, video_id,
f'{frame_idx:05d}.h5')
f = h5py.File(frame_annot_path, 'r')
instances = list(f['instance'])
instance_idx = instances.index(
instance_id) # existence was already validated during init
instance_masks = np.array(f['reMask'])
if len(instances) == 1:
instance_masks = instance_masks[np.newaxis, ...]
instance_masks = torch.tensor(instance_masks).transpose(1, 2)
mask_rles = [encode(mask) for mask in instance_masks.numpy()]
mask_areas = area(mask_rles).astype(float)
f.close()
# create the target dict for the center frame:
target = {
'masks': instance_masks,
'orig_size': instance_masks.
shape[-2:], # original frame shape without any augmentations
# size with augmentations, will be changed inside transforms if necessary
'size': instance_masks.shape[-2:],
'referred_instance_idx': torch.tensor(
instance_idx), # idx in 'masks' of the text referred instance
'area': torch.tensor(mask_areas),
'iscrowd':
torch.zeros(len(instance_masks)
), # for compatibility with DETR COCO transforms
'image_id': get_image_id(video_id, frame_idx, instance_id)
}
# create dummy targets for adjacent frames:
targets = self.window_size * [None]
center_frame_idx = self.window_size // 2
targets[center_frame_idx] = target
source_frames, targets, text_query = self.transforms(
source_frames, targets, text_query)
return source_frames, targets, text_query
@staticmethod
def get_text_annotations(root_path, subset, distributed):
saved_annotations_file_path = osp.join(
root_path, f'sentences_single_frame_{subset}_annotations.json')
if osp.exists(saved_annotations_file_path):
with open(saved_annotations_file_path, 'r') as f:
text_annotations_by_frame = [tuple(a) for a in json.load(f)]
return text_annotations_by_frame
elif (distributed and dist.get_rank() == 0) or not distributed:
print(f'building a2d sentences {subset} text annotations...')
# without header=None, pandas will treat the first row as a header and drop the first sample
a2d_data_info = pandas.read_csv(
osp.join(root_path, 'Release/videoset.csv'), header=None)
# 'vid', 'label', 'start_time', 'end_time', 'height', 'width', 'total_frames', 'annotated_frames', 'subset'
a2d_data_info.columns = [
'vid', '', '', '', '', '', '', '', 'subset'
]
with open(
osp.join(root_path, 'text_annotations/missed_videos.txt'),
'r') as f:
unused_videos = f.read().splitlines()
subsets = {'train': 0, 'test': 1}
# filter unused videos and videos which do not belong to our train/test subset:
used_videos = a2d_data_info[
~a2d_data_info.vid.isin(unused_videos)
& (a2d_data_info.subset == subsets[subset])]
used_videos_ids = list(used_videos['vid'])
text_annotations = pandas.read_csv(
osp.join(root_path, 'text_annotations/annotation.txt'))
# filter the text annotations based on the used videos:
used_text_annotations = text_annotations[
text_annotations.video_id.isin(used_videos_ids)]
# remove a single dataset annotation mistake in video: T6bNPuKV-wY
used_text_annotations = used_text_annotations[
used_text_annotations['instance_id'] != '1 (copy)']
# convert data-frame to list of tuples:
used_text_annotations = list(
used_text_annotations.to_records(index=False))
text_annotations_by_frame = []
mask_annotations_dir = osp.join(
root_path, 'text_annotations/annotation_with_instances')
for video_id, instance_id, text_query in tqdm(
used_text_annotations):
frame_annot_paths = sorted(
glob(osp.join(mask_annotations_dir, video_id, '*.h5')))
instance_id = int(instance_id)
for p in frame_annot_paths:
f = h5py.File(p)
instances = list(f['instance'])
if instance_id in instances:
# in case this instance does not appear in this frame it has no ground-truth mask, and thus this
# frame-instance pair is ignored in evaluation, same as SOTA method: CMPC-V. check out:
# https://github.com/spyflying/CMPC-Refseg/blob/094639b8bf00cc169ea7b49cdf9c87fdfc70d963/CMPC_video/build_A2D_batches.py#L98
frame_idx = int(p.split('/')[-1].split('.')[0])
text_query = text_query.lower(
) # lower the text query prior to augmentation & tokenization
text_annotations_by_frame.append(
(text_query, video_id, frame_idx, instance_id))
with open(saved_annotations_file_path, 'w') as f:
json.dump(text_annotations_by_frame, f)
if distributed:
dist.barrier()
with open(saved_annotations_file_path, 'r') as f:
text_annotations_by_frame = [tuple(a) for a in json.load(f)]
return text_annotations_by_frame
class A2dSentencesTransforms:
def __init__(self, subset_type, horizontal_flip_augmentations,
resize_and_crop_augmentations, train_short_size,
train_max_size, eval_short_size, eval_max_size, **kwargs):
self.h_flip_augmentation = subset_type == 'train' and horizontal_flip_augmentations
normalize = T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
scales = [
train_short_size
] # no more scales for now due to GPU memory constraints. might be changed later
transforms = []
if resize_and_crop_augmentations:
if subset_type == 'train':
transforms.append(
T.RandomResize(scales, max_size=train_max_size))
elif subset_type == 'test':
transforms.append(
T.RandomResize([eval_short_size], max_size=eval_max_size))
transforms.extend([T.ToTensor(), normalize])
self.size_transforms = T.Compose(transforms)
def __call__(self, source_frames, targets, text_query):
if self.h_flip_augmentation and torch.rand(1) > 0.5:
source_frames = [F.hflip(f) for f in source_frames]
targets[len(targets) // 2]['masks'] = F.hflip(
targets[len(targets) // 2]['masks'])
# Note - it is possible for both 'right' and 'left' to appear together in the same query, hence this fix:
text_query = text_query.replace('left', '@').replace(
'right', 'left').replace('@', 'right')
source_frames, targets = list(
zip(*[
self.size_transforms(f, t)
for f, t in zip(source_frames, targets)
]))
source_frames = torch.stack(source_frames) # [T, 3, H, W]
return source_frames, targets, text_query
class Collator:
def __call__(self, batch):
samples, targets, text_queries = list(zip(*batch))
samples = nested_tensor_from_videos_list(samples) # [T, B, C, H, W]
# convert targets to a list of tuples. outer list - time steps, inner tuples - time step batch
targets = list(zip(*targets))
batch_dict = {
'samples': samples,
'targets': targets,
'text_queries': text_queries
}
return batch_dict
def get_text_annotations_gt(root_path, subset):
# without header=None, pandas will treat the first row as a header and drop the first sample
a2d_data_info = pandas.read_csv(
osp.join(root_path, 'Release/videoset.csv'), header=None)
# 'vid', 'label', 'start_time', 'end_time', 'height', 'width', 'total_frames', 'annotated_frames', 'subset'
a2d_data_info.columns = ['vid', '', '', '', '', '', '', '', 'subset']
with open(osp.join(root_path, 'text_annotations/missed_videos.txt'),
'r') as f:
unused_videos = f.read().splitlines()
subsets = {'train': 0, 'test': 1}
# filter unused videos and videos which do not belong to our train/test subset:
used_videos = a2d_data_info[~a2d_data_info.vid.isin(unused_videos)
& (a2d_data_info.subset == subsets[subset])]
used_videos_ids = list(used_videos['vid'])
text_annotations = pandas.read_csv(
osp.join(root_path, 'text_annotations/annotation.txt'))
# filter the text annotations based on the used videos:
used_text_annotations = text_annotations[text_annotations.video_id.isin(
used_videos_ids)]
# convert data-frame to list of tuples:
used_text_annotations = list(used_text_annotations.to_records(index=False))
return used_text_annotations
def create_a2d_sentences_ground_truth_test_annotations(dataset_path,
subset_type,
mask_annotations_dir,
output_path):
text_annotations = get_text_annotations_gt(dataset_path, subset_type)
# Note - it is very important to start counting the instance and category ids from 1 (not 0). This is implicitly
# expected by pycocotools as it is the convention of the original coco dataset annotations.
categories_dict = [{
'id': 1,
'name': 'dummy_class'
}] # dummy class, as categories are not used/predicted in RVOS
images_dict = []
annotations_dict = []
images_set = set()
instance_id_counter = 1
for annot in tqdm(text_annotations):
video_id, instance_id, text_query = annot
annot_paths = sorted(
glob(osp.join(mask_annotations_dir, video_id, '*.h5')))
for p in annot_paths:
f = h5py.File(p)
instances = list(f['instance'])
try:
instance_idx = instances.index(int(instance_id))
# in case this instance does not appear in this frame it has no ground-truth mask, and thus this
# frame-instance pair is ignored in evaluation, same as SOTA method: CMPC-V. check out:
# https://github.com/spyflying/CMPC-Refseg/blob/094639b8bf00cc169ea7b49cdf9c87fdfc70d963/CMPC_video/build_A2D_batches.py#L98
except ValueError:
continue # instance_id does not appear in current frame
mask = f['reMask'][instance_idx] if len(
instances) > 1 else np.array(f['reMask'])
mask = mask.transpose()
frame_idx = int(p.split('/')[-1].split('.')[0])
image_id = get_image_id(video_id, frame_idx, instance_id)
assert image_id not in images_set, f'error: image id: {image_id} appeared twice'
images_set.add(image_id)
images_dict.append({
'id': image_id,
'height': mask.shape[0],
'width': mask.shape[1]
})
mask_rle = encode(mask)
mask_rle['counts'] = mask_rle['counts'].decode('ascii')
mask_area = float(area(mask_rle))
bbox = f['reBBox'][:, instance_idx] if len(
instances) > 1 else np.array(
f['reBBox']).squeeze() # x1y1x2y2 form
bbox_xywh = [
bbox[0], bbox[1], bbox[2] - bbox[0], bbox[3] - bbox[1]
]
instance_annot = {
'id': instance_id_counter,
'image_id': image_id,
'category_id':
1, # dummy class, as categories are not used/predicted in ref-vos
'segmentation': mask_rle,
'area': mask_area,
'bbox': bbox_xywh,
'iscrowd': 0,
}
annotations_dict.append(instance_annot)
instance_id_counter += 1
dataset_dict = {
'categories': categories_dict,
'images': images_dict,
'annotations': annotations_dict
}
with open(output_path, 'w') as f:
json.dump(dataset_dict, f)
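
For orientation, a short sketch of the COCO-style image ids produced above and of the A2D-Sentences layout this dataset class expects under data_root (reconstructed from the paths used in the code; the video id is a dummy value and the import path is assumed from the package layout):

from modelscope.msdatasets.task_datasets.referring_video_object_segmentation.referring_video_object_segmentation_dataset import \
    get_image_id

# every (video, frame, referred instance) triple becomes one COCO "image":
print(get_image_id('dummy_video', 25, 3))   # -> 'v_dummy_video_f_25_i_3'

# expected layout under data_root:
#   Release/CLIPS320/<video_id>.mp4          source clips, read with torchvision's read_video
#   Release/videoset.csv                     per-video metadata, including the train/test flag
#   text_annotations/annotation.txt          (video_id, instance_id, text query) rows
#   text_annotations/missed_videos.txt       videos excluded from both subsets
#   text_annotations/annotation_with_instances/<video_id>/<frame_idx:05d>.h5
#                                            per-frame 'instance', 'reMask' and 'reBBox' data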

View File

@@ -0,0 +1,294 @@
# The implementation is adopted from MTTR,
# made publicly available under the Apache 2.0 License at https://github.com/mttr2021/MTTR
# Modified from DETR https://github.com/facebookresearch/detr
import random
import PIL
import torch
import torchvision.transforms as T
import torchvision.transforms.functional as F
from modelscope.models.cv.referring_video_object_segmentation.utils import \
interpolate
def crop(image, target, region):
cropped_image = F.crop(image, *region)
target = target.copy()
i, j, h, w = region
# should we do something wrt the original size?
target['size'] = torch.tensor([h, w])
fields = ['labels', 'area', 'iscrowd']
if 'boxes' in target:
boxes = target['boxes']
max_size = torch.as_tensor([w, h], dtype=torch.float32)
cropped_boxes = boxes - torch.as_tensor([j, i, j, i])
cropped_boxes = torch.min(cropped_boxes.reshape(-1, 2, 2), max_size)
cropped_boxes = cropped_boxes.clamp(min=0)
area = (cropped_boxes[:, 1, :] - cropped_boxes[:, 0, :]).prod(dim=1)
target['boxes'] = cropped_boxes.reshape(-1, 4)
target['area'] = area
fields.append('boxes')
if 'masks' in target:
# FIXME should we update the area here if there are no boxes?
target['masks'] = target['masks'][:, i:i + h, j:j + w]
fields.append('masks')
# remove elements whose boxes or masks have zero area
if 'boxes' in target or 'masks' in target:
# favor boxes selection when defining which elements to keep
# this is compatible with previous implementation
if 'boxes' in target:
cropped_boxes = target['boxes'].reshape(-1, 2, 2)
keep = torch.all(
cropped_boxes[:, 1, :] > cropped_boxes[:, 0, :], dim=1)
else:
keep = target['masks'].flatten(1).any(1)
for field in fields:
target[field] = target[field][keep]
return cropped_image, target
def hflip(image, target):
flipped_image = F.hflip(image)
w, h = image.size
target = target.copy()
if 'boxes' in target:
boxes = target['boxes']
boxes = boxes[:, [2, 1, 0, 3]] * torch.as_tensor(
[-1, 1, -1, 1]) + torch.as_tensor([w, 0, w, 0])
target['boxes'] = boxes
if 'masks' in target:
target['masks'] = target['masks'].flip(-1)
return flipped_image, target
def resize(image, target, size, max_size=None):
# size can be min_size (scalar) or (w, h) tuple
def get_size_with_aspect_ratio(image_size, size, max_size=None):
w, h = image_size
if max_size is not None:
min_original_size = float(min((w, h)))
max_original_size = float(max((w, h)))
if max_original_size / min_original_size * size > max_size:
size = int(
round(max_size * min_original_size / max_original_size))
if (w <= h and w == size) or (h <= w and h == size):
return (h, w)
if w < h:
ow = size
oh = int(size * h / w)
else:
oh = size
ow = int(size * w / h)
return (oh, ow)
def get_size(image_size, size, max_size=None):
if isinstance(size, (list, tuple)):
return size[::-1]
else:
return get_size_with_aspect_ratio(image_size, size, max_size)
size = get_size(image.size, size, max_size)
rescaled_image = F.resize(image, size)
if target is None:
return rescaled_image, None
ratios = tuple(
float(s) / float(s_orig)
for s, s_orig in zip(rescaled_image.size, image.size))
ratio_width, ratio_height = ratios
target = target.copy()
if 'boxes' in target:
boxes = target['boxes']
scaled_boxes = boxes * torch.as_tensor(
[ratio_width, ratio_height, ratio_width, ratio_height])
target['boxes'] = scaled_boxes
if 'area' in target:
area = target['area']
scaled_area = area * (ratio_width * ratio_height)
target['area'] = scaled_area
h, w = size
target['size'] = torch.tensor([h, w])
if 'masks' in target:
target['masks'] = interpolate(
target['masks'][:, None].float(), size, mode='nearest')[:, 0] > 0.5
return rescaled_image, target
def pad(image, target, padding):
# assumes that we only pad on the bottom right corners
padded_image = F.pad(image, (0, 0, padding[0], padding[1]))
if target is None:
return padded_image, None
target = target.copy()
# should we do something wrt the original size?
target['size'] = torch.tensor(padded_image.size[::-1])
if 'masks' in target:
target['masks'] = torch.nn.functional.pad(
target['masks'], (0, padding[0], 0, padding[1]))
return padded_image, target
class RandomCrop(object):
def __init__(self, size):
self.size = size
def __call__(self, img, target):
region = T.RandomCrop.get_params(img, self.size)
return crop(img, target, region)
class RandomSizeCrop(object):
def __init__(self, min_size: int, max_size: int):
self.min_size = min_size
self.max_size = max_size
def __call__(self, img: PIL.Image.Image, target: dict):
w = random.randint(self.min_size, min(img.width, self.max_size))
h = random.randint(self.min_size, min(img.height, self.max_size))
region = T.RandomCrop.get_params(img, [h, w])
return crop(img, target, region)
class CenterCrop(object):
def __init__(self, size):
self.size = size
def __call__(self, img, target):
image_width, image_height = img.size
crop_height, crop_width = self.size
crop_top = int(round((image_height - crop_height) / 2.))
crop_left = int(round((image_width - crop_width) / 2.))
return crop(img, target,
(crop_top, crop_left, crop_height, crop_width))
class RandomHorizontalFlip(object):
def __init__(self, p=0.5):
self.p = p
def __call__(self, img, target):
if random.random() < self.p:
return hflip(img, target)
return img, target
class RandomResize(object):
def __init__(self, sizes, max_size=None):
assert isinstance(sizes, (list, tuple))
self.sizes = sizes
self.max_size = max_size
def __call__(self, img, target=None):
size = random.choice(self.sizes)
return resize(img, target, size, self.max_size)
class RandomPad(object):
def __init__(self, max_pad):
self.max_pad = max_pad
def __call__(self, img, target):
pad_x = random.randint(0, self.max_pad)
pad_y = random.randint(0, self.max_pad)
return pad(img, target, (pad_x, pad_y))
class RandomSelect(object):
"""
Randomly selects between transforms1 and transforms2,
with probability p for transforms1 and (1 - p) for transforms2
"""
def __init__(self, transforms1, transforms2, p=0.5):
self.transforms1 = transforms1
self.transforms2 = transforms2
self.p = p
def __call__(self, img, target):
if random.random() < self.p:
return self.transforms1(img, target)
return self.transforms2(img, target)
class ToTensor(object):
def __call__(self, img, target):
return F.to_tensor(img), target
class RandomErasing(object):
def __init__(self, *args, **kwargs):
self.eraser = T.RandomErasing(*args, **kwargs)
def __call__(self, img, target):
return self.eraser(img), target
class Normalize(object):
def __init__(self, mean, std):
self.mean = mean
self.std = std
def __call__(self, image, target=None):
image = F.normalize(image, mean=self.mean, std=self.std)
if target is None:
return image, None
target = target.copy()
h, w = image.shape[-2:]
if 'boxes' in target:
boxes = target['boxes']
boxes = box_xyxy_to_cxcywh(boxes)
boxes = boxes / torch.tensor([w, h, w, h], dtype=torch.float32)
target['boxes'] = boxes
return image, target
class Compose(object):
def __init__(self, transforms):
self.transforms = transforms
def __call__(self, image, target):
for t in self.transforms:
image, target = t(image, target)
return image, target
def __repr__(self):
format_string = self.__class__.__name__ + '('
for t in self.transforms:
format_string += '\n'
format_string += ' {0}'.format(t)
format_string += '\n)'
return format_string
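
A minimal stand-alone sketch of composing these transforms the way the eval branch of A2dSentencesTransforms does (module path and the short/max sizes are illustrative assumptions):

import torch
from PIL import Image

from modelscope.msdatasets.task_datasets.referring_video_object_segmentation import \
    transformers as T

size_transforms = T.Compose([
    T.RandomResize([360], max_size=640),   # stand-ins for eval_short_size / eval_max_size
    T.ToTensor(),
    T.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225]),
])

img = Image.new('RGB', (427, 240))         # dummy (width, height) frame
target = {
    'masks': torch.zeros(1, 240, 427, dtype=torch.bool),
    'size': torch.tensor([240, 427]),
    'orig_size': torch.tensor([240, 427]),
}
img_t, target = size_transforms(img, target)
# img_t is a normalized [3, 360, 640] tensor and target['masks'] / target['size'] are
# resized consistently with the image.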

View File

@@ -157,7 +157,13 @@ class ReferringVideoObjectSegmentationPipeline(Pipeline):
* text_border_height_per_query, 0, 0))
W, H = vid_frame.size
draw = ImageDraw.Draw(vid_frame)
font = ImageFont.truetype(font='DejaVuSansMono.ttf', size=30)
if self.model.cfg.pipeline.output_font:
font = ImageFont.truetype(
font=self.model.cfg.pipeline.output_font,
size=self.model.cfg.pipeline.output_font_size)
else:
font = ImageFont.load_default()
for i, (text_query, color) in enumerate(
zip(self.text_queries, colors), start=1):
w, h = draw.textsize(text_query, font=font)
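
The font used to render the text queries now comes from the pipeline section of the model configuration; a hypothetical snippet mirroring the fallback above (output_font and output_font_size are the cfg.pipeline keys read by the pipeline; the empty value here simply exercises the default-font branch):

from PIL import ImageFont

output_font = ''        # e.g. 'DejaVuSansMono.ttf' when cfg.pipeline provides one
output_font_size = 30

font = (ImageFont.truetype(font=output_font, size=output_font_size)
        if output_font else ImageFont.load_default())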

View File

@@ -9,7 +9,8 @@ if TYPE_CHECKING:
from .builder import build_trainer
from .cv import (ImageInstanceSegmentationTrainer,
ImagePortraitEnhancementTrainer,
MovieSceneSegmentationTrainer, ImageInpaintingTrainer)
MovieSceneSegmentationTrainer, ImageInpaintingTrainer,
ReferringVideoObjectSegmentationTrainer)
from .multi_modal import CLIPTrainer
from .nlp import SequenceClassificationTrainer, TextRankingTrainer
from .nlp_trainer import NlpEpochBasedTrainer, VecoTrainer, NlpTrainerArguments

View File

@@ -9,6 +9,7 @@ if TYPE_CHECKING:
from .image_portrait_enhancement_trainer import ImagePortraitEnhancementTrainer
from .movie_scene_segmentation_trainer import MovieSceneSegmentationTrainer
from .image_inpainting_trainer import ImageInpaintingTrainer
from .referring_video_object_segmentation_trainer import ReferringVideoObjectSegmentationTrainer
else:
_import_structure = {
@@ -17,7 +18,9 @@ else:
'image_portrait_enhancement_trainer':
['ImagePortraitEnhancementTrainer'],
'movie_scene_segmentation_trainer': ['MovieSceneSegmentationTrainer'],
'image_inpainting_trainer': ['ImageInpaintingTrainer']
'image_inpainting_trainer': ['ImageInpaintingTrainer'],
'referring_video_object_segmentation_trainer':
['ReferringVideoObjectSegmentationTrainer']
}
import sys

View File

@@ -0,0 +1,63 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import torch
from modelscope.metainfo import Trainers
from modelscope.trainers.builder import TRAINERS
from modelscope.trainers.trainer import EpochBasedTrainer
from modelscope.utils.constant import ModeKeys
@TRAINERS.register_module(
module_name=Trainers.referring_video_object_segmentation)
class ReferringVideoObjectSegmentationTrainer(EpochBasedTrainer):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.model.set_postprocessor(self.cfg.dataset.name)
self.train_data_collator = self.train_dataset.collator
self.eval_data_collator = self.eval_dataset.collator
device_name = kwargs.get('device', 'gpu')
self.model.set_device(self.device, device_name)
def train(self, *args, **kwargs):
self.model.criterion.train()
super().train(*args, **kwargs)
def evaluate(self, checkpoint_path=None):
if checkpoint_path is not None and os.path.isfile(checkpoint_path):
from modelscope.trainers.hooks import CheckpointHook
CheckpointHook.load_checkpoint(checkpoint_path, self)
self.model.eval()
self._mode = ModeKeys.EVAL
if self.eval_dataset is None:
self.eval_dataloader = self.get_eval_data_loader()
else:
self.eval_dataloader = self._build_dataloader_with_dataset(
self.eval_dataset,
dist=self._dist,
seed=self._seed,
collate_fn=self.eval_data_collator,
**self.cfg.evaluation.get('dataloader', {}))
self.data_loader = self.eval_dataloader
from modelscope.metrics import build_metric
ann_file = self.eval_dataset.ann_file
metric_classes = []
for metric in self.metrics:
metric.update({'ann_file': ann_file})
metric_classes.append(build_metric(metric))
for m in metric_classes:
m.trainer = self
metric_values = self.evaluation_loop(self.eval_dataloader,
metric_classes)
self._metric_values = metric_values
return metric_values
def prediction_step(self, model, inputs):
pass
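
For completeness, a condensed end-to-end sketch of fine-tuning with this trainer, assembled from the unit test at the bottom of this change (model id and toy dataset are the ones used there):

import os

from modelscope.hub.snapshot_download import snapshot_download
from modelscope.metainfo import Trainers
from modelscope.msdatasets import MsDataset
from modelscope.trainers import build_trainer
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile

model_id = 'damo/cv_swin-t_referring_video-object-segmentation'
cache_path = snapshot_download(model_id)
cfg = Config.from_file(os.path.join(cache_path, ModelFile.CONFIGURATION))

train_dataset = MsDataset.load('referring_vos_toydata', namespace='damo',
                               split='train', cfg=cfg.dataset, test_mode=False)
eval_dataset = MsDataset.load('referring_vos_toydata', namespace='damo',
                              split='test', cfg=cfg.dataset, test_mode=True)

trainer = build_trainer(
    name=Trainers.referring_video_object_segmentation,
    default_args=dict(
        model=model_id,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        work_dir='./work_dir'))
trainer.train()
metrics = trainer.evaluate()
# metrics holds the COCO-style AP entries plus P@0.5..0.9, overall_iou and mean_iou
# produced by ReferringVideoObjectSegmentationMetric.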

View File

@@ -62,7 +62,10 @@ def single_gpu_test(trainer,
if 'nsentences' in data:
batch_size = data['nsentences']
else:
batch_size = len(next(iter(data.values())))
try:
batch_size = len(next(iter(data.values())))
except Exception:
batch_size = data_loader.batch_size
else:
batch_size = len(data)
for _ in range(batch_size):

View File

@@ -0,0 +1,101 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import shutil
import tempfile
import unittest
import zipfile
from modelscope.hub.snapshot_download import snapshot_download
from modelscope.metainfo import Trainers
from modelscope.models.cv.referring_video_object_segmentation import \
ReferringVideoObjectSegmentation
from modelscope.msdatasets import MsDataset
from modelscope.trainers import build_trainer
from modelscope.utils.config import Config, ConfigDict
from modelscope.utils.constant import ModelFile
from modelscope.utils.test_utils import test_level
class TestReferringVideoObjectSegmentationTrainer(unittest.TestCase):
model_id = 'damo/cv_swin-t_referring_video-object-segmentation'
dataset_name = 'referring_vos_toydata'
def setUp(self):
print(('Testing %s.%s' % (type(self).__name__, self._testMethodName)))
cache_path = snapshot_download(self.model_id)
config_path = os.path.join(cache_path, ModelFile.CONFIGURATION)
cfg = Config.from_file(config_path)
max_epochs = cfg.train.max_epochs
train_data_cfg = ConfigDict(
name=self.dataset_name,
split='train',
test_mode=False,
cfg=cfg.dataset)
test_data_cfg = ConfigDict(
name=self.dataset_name,
split='test',
test_mode=True,
cfg=cfg.dataset)
self.train_dataset = MsDataset.load(
dataset_name=train_data_cfg.name,
split=train_data_cfg.split,
cfg=train_data_cfg.cfg,
namespace='damo',
test_mode=train_data_cfg.test_mode)
assert next(
iter(self.train_dataset.config_kwargs['split_config'].values()))
self.test_dataset = MsDataset.load(
dataset_name=test_data_cfg.name,
split=test_data_cfg.split,
cfg=test_data_cfg.cfg,
namespace='damo',
test_mode=test_data_cfg.test_mode)
assert next(
iter(self.test_dataset.config_kwargs['split_config'].values()))
self.max_epochs = max_epochs
@unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
def test_trainer(self):
kwargs = dict(
model=self.model_id,
train_dataset=self.train_dataset,
eval_dataset=self.test_dataset,
work_dir='./work_dir')
trainer = build_trainer(
name=Trainers.referring_video_object_segmentation,
default_args=kwargs)
trainer.train()
results_files = os.listdir(trainer.work_dir)
self.assertIn(f'{trainer.timestamp}.log.json', results_files)
@unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
def test_trainer_with_model_and_args(self):
cache_path = snapshot_download(self.model_id)
model = ReferringVideoObjectSegmentation.from_pretrained(cache_path)
kwargs = dict(
cfg_file=os.path.join(cache_path, ModelFile.CONFIGURATION),
model=model,
train_dataset=self.train_dataset,
eval_dataset=self.test_dataset,
work_dir='./work_dir')
trainer = build_trainer(
name=Trainers.referring_video_object_segmentation,
default_args=kwargs)
trainer.train()
results_files = os.listdir(trainer.work_dir)
self.assertIn(f'{trainer.timestamp}.log.json', results_files)
if __name__ == '__main__':
unittest.main()