From e57424eaf0979917d96a1994de2342c1de3d4d0d Mon Sep 17 00:00:00 2001
From: wjq264216 <wjq264216@alibaba-inc.com>
Date: Wed, 28 Dec 2022 06:26:15 +0800
Subject: [PATCH] [to #42322933] Add ocr-detection-vlpt-pipeline to maas lib   
      Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/11213036

---
 data/test/images/ocr_detection_vlpt.jpg       |   3 +
 .../pipelines/cv/ocr_detection_pipeline.py    | 271 ++++++-----
 modelscope/pipelines/cv/ocr_utils/__init__.py |   7 +-
 .../pipelines/cv/ocr_utils/model_vlpt.py      | 431 ++++++++++++++++++
 modelscope/pipelines/cv/ocr_utils/utils.py    | 101 ++++
 tests/pipelines/test_ocr_detection.py         |   7 +
 6 files changed, 712 insertions(+), 108 deletions(-)
 create mode 100644 data/test/images/ocr_detection_vlpt.jpg
 create mode 100644 modelscope/pipelines/cv/ocr_utils/model_vlpt.py

diff --git a/data/test/images/ocr_detection_vlpt.jpg b/data/test/images/ocr_detection_vlpt.jpg
new file mode 100644
index 00000000..e6e14e28
--- /dev/null
+++ b/data/test/images/ocr_detection_vlpt.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3f24570355f178d2a8226112d1443d735837e59573545cfff12458dd791ae341
+size 308158
diff --git a/modelscope/pipelines/cv/ocr_detection_pipeline.py b/modelscope/pipelines/cv/ocr_detection_pipeline.py
index 292ec2c5..682b05c4 100644
--- a/modelscope/pipelines/cv/ocr_detection_pipeline.py
+++ b/modelscope/pipelines/cv/ocr_detection_pipeline.py
@@ -1,22 +1,25 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
+import math
 import os.path as osp
 from typing import Any, Dict
 
 import cv2
 import numpy as np
 import tensorflow as tf
+import torch
 
 from modelscope.metainfo import Pipelines
 from modelscope.outputs import OutputKeys
 from modelscope.pipelines.base import Input, Pipeline
 from modelscope.pipelines.builder import PIPELINES
+from modelscope.pipelines.cv.ocr_utils.model_vlpt import VLPTModel
 from modelscope.preprocessors import LoadImage
 from modelscope.utils.constant import ModelFile, Tasks
 from modelscope.utils.device import device_placement
 from modelscope.utils.logger import get_logger
 from .ocr_utils import (SegLinkDetector, cal_width, combine_segments_python,
                         decode_segments_links_python, nms_python,
-                        rboxes_to_polygons)
+                        polygons_from_bitmap, rboxes_to_polygons)
 
 if tf.__version__ >= '2.0':
     import tf_slim as slim
@@ -53,132 +56,188 @@ class OCRDetectionPipeline(Pipeline):
             model: model id on modelscope hub.
         """
         super().__init__(model=model, **kwargs)
-        tf.reset_default_graph()
-        model_path = osp.join(
-            osp.join(self.model, ModelFile.TF_CHECKPOINT_FOLDER),
-            'checkpoint-80000')
-        self._graph = tf.get_default_graph()
-        config = tf.ConfigProto(allow_soft_placement=True)
-        config.gpu_options.allow_growth = True
-        self._session = tf.Session(config=config)
+        if 'vlpt' in self.model:
+            model_path = osp.join(self.model, ModelFile.TORCH_MODEL_FILE)
+            logger.info(f'loading model from {model_path}')
 
-        with self._graph.as_default():
-            with device_placement(self.framework, self.device_name):
-                self.input_images = tf.placeholder(
-                    tf.float32, shape=[1, 1024, 1024, 3], name='input_images')
-                self.output = {}
+            self.thresh = 0.3
+            self.image_short_side = 736
+            self.device = torch.device(
+                'cuda' if torch.cuda.is_available() else 'cpu')
+            self.infer_model = VLPTModel().to(self.device)
+            self.infer_model.eval()
+            checkpoint = torch.load(model_path, map_location=self.device)
+            if 'state_dict' in checkpoint:
+                self.infer_model.load_state_dict(checkpoint['state_dict'])
+            else:
+                self.infer_model.load_state_dict(checkpoint)
+        else:
+            tf.reset_default_graph()
+            model_path = osp.join(
+                osp.join(self.model, ModelFile.TF_CHECKPOINT_FOLDER),
+                'checkpoint-80000')
+            self._graph = tf.get_default_graph()
+            config = tf.ConfigProto(allow_soft_placement=True)
+            config.gpu_options.allow_growth = True
+            self._session = tf.Session(config=config)
 
-                with tf.variable_scope('', reuse=tf.AUTO_REUSE):
-                    global_step = tf.get_variable(
-                        'global_step', [],
-                        initializer=tf.constant_initializer(0),
-                        dtype=tf.int64,
-                        trainable=False)
-                    variable_averages = tf.train.ExponentialMovingAverage(
-                        0.997, global_step)
+            with self._graph.as_default():
+                with device_placement(self.framework, self.device_name):
+                    self.input_images = tf.placeholder(
+                        tf.float32,
+                        shape=[1, 1024, 1024, 3],
+                        name='input_images')
+                    self.output = {}
 
-                    # detector
-                    detector = SegLinkDetector()
-                    all_maps = detector.build_model(
-                        self.input_images, is_training=False)
+                    with tf.variable_scope('', reuse=tf.AUTO_REUSE):
+                        global_step = tf.get_variable(
+                            'global_step', [],
+                            initializer=tf.constant_initializer(0),
+                            dtype=tf.int64,
+                            trainable=False)
+                        variable_averages = tf.train.ExponentialMovingAverage(
+                            0.997, global_step)
 
-                    # decode local predictions
-                    all_nodes, all_links, all_reg = [], [], []
-                    for i, maps in enumerate(all_maps):
-                        cls_maps, lnk_maps, reg_maps = maps[0], maps[1], maps[
-                            2]
-                        reg_maps = tf.multiply(reg_maps, OFFSET_VARIANCE)
+                        # detector
+                        detector = SegLinkDetector()
+                        all_maps = detector.build_model(
+                            self.input_images, is_training=False)
 
-                        cls_prob = tf.nn.softmax(tf.reshape(cls_maps, [-1, 2]))
+                        # decode local predictions
+                        all_nodes, all_links, all_reg = [], [], []
+                        for i, maps in enumerate(all_maps):
+                            cls_maps, lnk_maps, reg_maps = maps[0], maps[
+                                1], maps[2]
+                            reg_maps = tf.multiply(reg_maps, OFFSET_VARIANCE)
 
-                        lnk_prob_pos = tf.nn.softmax(
-                            tf.reshape(lnk_maps, [-1, 4])[:, :2])
-                        lnk_prob_mut = tf.nn.softmax(
-                            tf.reshape(lnk_maps, [-1, 4])[:, 2:])
-                        lnk_prob = tf.concat([lnk_prob_pos, lnk_prob_mut],
-                                             axis=1)
+                            cls_prob = tf.nn.softmax(
+                                tf.reshape(cls_maps, [-1, 2]))
 
-                        all_nodes.append(cls_prob)
-                        all_links.append(lnk_prob)
-                        all_reg.append(reg_maps)
+                            lnk_prob_pos = tf.nn.softmax(
+                                tf.reshape(lnk_maps, [-1, 4])[:, :2])
+                            lnk_prob_mut = tf.nn.softmax(
+                                tf.reshape(lnk_maps, [-1, 4])[:, 2:])
+                            lnk_prob = tf.concat([lnk_prob_pos, lnk_prob_mut],
+                                                 axis=1)
 
-                    # decode segments and links
-                    image_size = tf.shape(self.input_images)[1:3]
-                    segments, group_indices, segment_counts, _ = decode_segments_links_python(
-                        image_size,
-                        all_nodes,
-                        all_links,
-                        all_reg,
-                        anchor_sizes=list(detector.anchor_sizes))
+                            all_nodes.append(cls_prob)
+                            all_links.append(lnk_prob)
+                            all_reg.append(reg_maps)
 
-                    # combine segments
-                    combined_rboxes, combined_counts = combine_segments_python(
-                        segments, group_indices, segment_counts)
-                    self.output['combined_rboxes'] = combined_rboxes
-                    self.output['combined_counts'] = combined_counts
+                        # decode segments and links
+                        image_size = tf.shape(self.input_images)[1:3]
+                        segments, group_indices, segment_counts, _ = decode_segments_links_python(
+                            image_size,
+                            all_nodes,
+                            all_links,
+                            all_reg,
+                            anchor_sizes=list(detector.anchor_sizes))
 
-                with self._session.as_default() as sess:
-                    logger.info(f'loading model from {model_path}')
-                    # load model
-                    model_loader = tf.train.Saver(
-                        variable_averages.variables_to_restore())
-                    model_loader.restore(sess, model_path)
+                        # combine segments
+                        combined_rboxes, combined_counts = combine_segments_python(
+                            segments, group_indices, segment_counts)
+                        self.output['combined_rboxes'] = combined_rboxes
+                        self.output['combined_counts'] = combined_counts
+
+                    with self._session.as_default() as sess:
+                        logger.info(f'loading model from {model_path}')
+                        # load model
+                        model_loader = tf.train.Saver(
+                            variable_averages.variables_to_restore())
+                        model_loader.restore(sess, model_path)
 
     def preprocess(self, input: Input) -> Dict[str, Any]:
-        img = LoadImage.convert_to_ndarray(input)
+        if 'vlpt' in self.model:  # torch path; channels get reversed below
+            img = LoadImage.convert_to_ndarray(input)[:, :, ::-1]
 
-        h, w, c = img.shape
-        img_pad = np.zeros((max(h, w), max(h, w), 3), dtype=np.float32)
-        img_pad[:h, :w, :] = img
+            height, width, _ = img.shape
+            if height < width:  # short side becomes image_short_side
+                new_height = self.image_short_side
+                new_width = int(  # long side, rounded up to a 32 multiple
+                    math.ceil(new_height / height * width / 32) * 32)
+            else:
+                new_width = self.image_short_side
+                new_height = int(  # long side, rounded up to a 32 multiple
+                    math.ceil(new_width / width * height / 32) * 32)
+            resized_img = cv2.resize(img, (new_width, new_height))
 
-        resize_size = 1024
-        img_pad_resize = cv2.resize(img_pad, (resize_size, resize_size))
-        img_pad_resize = cv2.cvtColor(img_pad_resize, cv2.COLOR_RGB2BGR)
-        img_pad_resize = img_pad_resize - np.array([123.68, 116.78, 103.94],
-                                                   dtype=np.float32)
+            resized_img = resized_img - np.array([123.68, 116.78, 103.94],
+                                                 dtype=np.float32)
+            resized_img /= 255.  # scale after the per-channel mean is removed
+            resized_img = torch.from_numpy(resized_img).permute(  # HWC->CHW
+                2, 0, 1).float().unsqueeze(0)  # add a leading batch axis
 
-        with self._graph.as_default():
-            resize_size = tf.stack([resize_size, resize_size])
-            orig_size = tf.stack([max(h, w), max(h, w)])
-            self.output['orig_size'] = orig_size
-            self.output['resize_size'] = resize_size
+            result = {'img': resized_img, 'org_shape': [height, width]}
+            return result
+        else:  # TensorFlow path: zero-pad to a square, resize to 1024x1024
+            img = LoadImage.convert_to_ndarray(input)
 
-        result = {'img': np.expand_dims(img_pad_resize, axis=0)}
-        return result
+            h, w, c = img.shape
+            img_pad = np.zeros((max(h, w), max(h, w), 3), dtype=np.float32)
+            img_pad[:h, :w, :] = img  # image sits in the top-left corner
+
+            resize_size = 1024
+            img_pad_resize = cv2.resize(img_pad, (resize_size, resize_size))
+            img_pad_resize = cv2.cvtColor(img_pad_resize, cv2.COLOR_RGB2BGR)
+            img_pad_resize = img_pad_resize - np.array(
+                [123.68, 116.78, 103.94], dtype=np.float32)
+            # NOTE(review): each call adds new tf.stack ops to the graph.
+            with self._graph.as_default():
+                resize_size = tf.stack([resize_size, resize_size])
+                orig_size = tf.stack([max(h, w), max(h, w)])
+                self.output['orig_size'] = orig_size  # fetched by forward()
+                self.output['resize_size'] = resize_size
+
+            result = {'img': np.expand_dims(img_pad_resize, axis=0)}
+            return result
 
     def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
-        with self._graph.as_default():
-            with self._session.as_default():
-                feed_dict = {self.input_images: input['img']}
-                sess_outputs = self._session.run(
-                    self.output, feed_dict=feed_dict)
-                return sess_outputs
+        if 'vlpt' in self.model:
+            # The model lives on self.device; inference needs no autograd.
+            with torch.no_grad():
+                pred = self.infer_model(input['img'].to(self.device))
+            return {'results': pred, 'org_shape': input['org_shape']}
+        else:
+            with self._graph.as_default():
+                with self._session.as_default():
+                    feed_dict = {self.input_images: input['img']}
+                    return self._session.run(self.output, feed_dict=feed_dict)
 
     def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
-        rboxes = inputs['combined_rboxes'][0]
-        count = inputs['combined_counts'][0]
-        if count == 0 or count < rboxes.shape[0]:
-            raise Exception('modelscope error: No text detected')
-        rboxes = rboxes[:count, :]
+        if 'vlpt' in self.model:  # torch path
+            pred = inputs['results'][0]  # first item along the batch axis
+            height, width = inputs['org_shape']  # size before resizing
+            segmentation = pred > self.thresh  # boolean mask of the map
 
-        # convert rboxes to polygons and find its coordinates on the original image
-        orig_h, orig_w = inputs['orig_size']
-        resize_h, resize_w = inputs['resize_size']
-        polygons = rboxes_to_polygons(rboxes)
-        scale_y = float(orig_h) / float(resize_h)
-        scale_x = float(orig_w) / float(resize_w)
+            boxes, scores = polygons_from_bitmap(pred, segmentation, width,
+                                                 height)
+            result = {OutputKeys.POLYGONS: np.array(boxes)}
+            return result  # NOTE(review): boxes may be ragged; confirm
+        else:  # TensorFlow path
+            rboxes = inputs['combined_rboxes'][0]
+            count = inputs['combined_counts'][0]
+            if count == 0 or count < rboxes.shape[0]:
+                raise Exception('modelscope error: No text detected')
+            rboxes = rboxes[:count, :]
 
-        # confine polygons inside image
-        polygons[:, ::2] = np.maximum(
-            0, np.minimum(polygons[:, ::2] * scale_x, orig_w - 1))
-        polygons[:, 1::2] = np.maximum(
-            0, np.minimum(polygons[:, 1::2] * scale_y, orig_h - 1))
-        polygons = np.round(polygons).astype(np.int32)
+            # turn rboxes into polygons and derive the resize->original scale
+            orig_h, orig_w = inputs['orig_size']
+            resize_h, resize_w = inputs['resize_size']
+            polygons = rboxes_to_polygons(rboxes)
+            scale_y = float(orig_h) / float(resize_h)
+            scale_x = float(orig_w) / float(resize_w)
 
-        # nms
-        dt_n9 = [o + [cal_width(o)] for o in polygons.tolist()]
-        dt_nms = nms_python(dt_n9)
-        dt_polygons = np.array([o[:8] for o in dt_nms])
+            # rescale, then clamp x (even columns) and y (odd columns)
+            polygons[:, ::2] = np.maximum(
+                0, np.minimum(polygons[:, ::2] * scale_x, orig_w - 1))
+            polygons[:, 1::2] = np.maximum(
+                0, np.minimum(polygons[:, 1::2] * scale_y, orig_h - 1))
+            polygons = np.round(polygons).astype(np.int32)
 
-        result = {OutputKeys.POLYGONS: dt_polygons}
-        return result
+            # nms: cal_width(o) is appended as a 9th value, dropped afterwards
+            dt_n9 = [o + [cal_width(o)] for o in polygons.tolist()]
+            dt_nms = nms_python(dt_n9)
+            dt_polygons = np.array([o[:8] for o in dt_nms])
+
+            result = {OutputKeys.POLYGONS: dt_polygons}
+            return result
diff --git a/modelscope/pipelines/cv/ocr_utils/__init__.py b/modelscope/pipelines/cv/ocr_utils/__init__.py
index 312445a9..979ea82c 100644
--- a/modelscope/pipelines/cv/ocr_utils/__init__.py
+++ b/modelscope/pipelines/cv/ocr_utils/__init__.py
@@ -6,12 +6,15 @@ from modelscope.utils.import_utils import LazyImportModule
 if TYPE_CHECKING:
     from .model_resnet_mutex_v4_linewithchar import SegLinkDetector
     from .ops import decode_segments_links_python, combine_segments_python
-    from .utils import rboxes_to_polygons, cal_width, nms_python
+    from .utils import rboxes_to_polygons, cal_width, nms_python, polygons_from_bitmap
 else:
     _import_structure = {
         'model_resnet_mutex_v4_linewithchar': ['SegLinkDetector'],
         'ops': ['decode_segments_links_python', 'combine_segments_python'],
-        'utils': ['rboxes_to_polygons', 'cal_width', 'nms_python']
+        'utils': [
+            'rboxes_to_polygons', 'cal_width', 'nms_python',
+            'polygons_from_bitmap'
+        ]
     }
 
     import sys
diff --git a/modelscope/pipelines/cv/ocr_utils/model_vlpt.py b/modelscope/pipelines/cv/ocr_utils/model_vlpt.py
new file mode 100644
index 00000000..19ac9807
--- /dev/null
+++ b/modelscope/pipelines/cv/ocr_utils/model_vlpt.py
@@ -0,0 +1,431 @@
+# ------------------------------------------------------------------------------
+# Part of implementation is adopted from ViLT,
+# made publicly available under the Apache License 2.0 at https://github.com/dandelin/ViLT.
+# ------------------------------------------------------------------------------
+
+import math
+import os
+import sys
+
+import torch
+import torch.nn as nn
+
+BatchNorm2d = nn.BatchNorm2d
+
+
+def constant_init(module, constant, bias=0):
+    nn.init.constant_(module.weight, constant)  # fill every weight entry
+    if getattr(module, 'bias', None) is not None:  # bias=False leaves None
+        nn.init.constant_(module.bias, bias)
+
+
+def conv3x3(in_planes, out_planes, stride=1):
+    """Return a bias-free 3x3 ``nn.Conv2d`` that pads each border by one.
+
+    ``in_planes`` and ``out_planes`` are the input and output channel
+    counts; ``stride`` is forwarded to the convolution unchanged.
+    """
+    conv_kwargs = dict(kernel_size=3, stride=stride, padding=1, bias=False)
+    layer = nn.Conv2d(in_planes, out_planes, **conv_kwargs)
+    return layer
+
+
+class BasicBlock(nn.Module):
+    """Two-conv residual block; conv2 is optionally a deformable conv."""
+    expansion = 1
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None, dcn=None):
+        super(BasicBlock, self).__init__()
+        self.with_dcn = dcn is not None
+        self.conv1 = conv3x3(inplanes, planes, stride)
+        self.bn1 = BatchNorm2d(planes)
+        self.relu = nn.ReLU(inplace=True)
+        self.with_modulated_dcn = False
+        if self.with_dcn:
+            self.with_modulated_dcn = dcn.get('modulated', False)
+            # A stride fallback keeps conv2 plain, so forward must not use DCN.
+            self.with_dcn = not dcn.get('fallback_on_stride', False)
+        if not self.with_dcn:
+            self.conv2 = nn.Conv2d(
+                planes, planes, kernel_size=3, padding=1, bias=False)
+        else:
+            deformable_groups = dcn.get('deformable_groups', 1)
+            if not self.with_modulated_dcn:
+                from assets.ops.dcn import DeformConv
+                conv_op = DeformConv
+                offset_channels = 18
+            else:
+                from assets.ops.dcn import ModulatedDeformConv
+                conv_op = ModulatedDeformConv
+                offset_channels = 27
+            self.conv2_offset = nn.Conv2d(
+                planes,
+                deformable_groups * offset_channels,
+                kernel_size=3, padding=1)
+            self.conv2 = conv_op(
+                planes,
+                planes,
+                kernel_size=3,
+                padding=1,
+                deformable_groups=deformable_groups,
+                bias=False)
+        self.bn2 = BatchNorm2d(planes)
+        self.downsample = downsample
+        self.stride = stride
+
+    def forward(self, x):
+        residual = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        # Dispatch on the kind of conv2 that __init__ actually built.
+        if not self.with_dcn:
+            out = self.conv2(out)
+        elif self.with_modulated_dcn:
+            offset_mask = self.conv2_offset(out)
+            offset = offset_mask[:, :18, :, :]  # first 18 channels: offsets
+            mask = offset_mask[:, -9:, :, :].sigmoid()  # last 9 channels: mask
+            out = self.conv2(out, offset, mask)
+        else:
+            offset = self.conv2_offset(out)
+            out = self.conv2(out, offset)
+        out = self.bn2(out)
+
+        if self.downsample is not None:
+            residual = self.downsample(x)
+
+        out += residual
+        out = self.relu(out)
+
+        return out
+
+
+class Bottleneck(nn.Module):
+    """1x1 -> 3x3 -> 1x1 residual block; the 3x3 conv may be deformable."""
+    expansion = 4
+
+    def __init__(self, inplanes, planes, stride=1, downsample=None, dcn=None):
+        super(Bottleneck, self).__init__()
+        self.with_dcn = dcn is not None
+        self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
+        self.bn1 = BatchNorm2d(planes)
+        fallback_on_stride = False
+        self.with_modulated_dcn = False
+        if self.with_dcn:
+            fallback_on_stride = dcn.get('fallback_on_stride', False)
+            self.with_modulated_dcn = dcn.get('modulated', False)
+        if not self.with_dcn or fallback_on_stride:
+            self.conv2 = nn.Conv2d(
+                planes,
+                planes,
+                kernel_size=3,
+                stride=stride,
+                padding=1,
+                bias=False)
+        else:
+            deformable_groups = dcn.get('deformable_groups', 1)
+            if not self.with_modulated_dcn:
+                from assets.ops.dcn import DeformConv
+                conv_op = DeformConv
+                offset_channels = 18
+            else:
+                from assets.ops.dcn import ModulatedDeformConv
+                conv_op = ModulatedDeformConv
+                offset_channels = 27
+            self.conv2_offset = nn.Conv2d(
+                planes, deformable_groups * offset_channels,
+                kernel_size=3, padding=1)
+            self.conv2 = conv_op(
+                planes,
+                planes,
+                kernel_size=3,
+                padding=1,
+                stride=stride,
+                deformable_groups=deformable_groups,
+                bias=False)
+        self.bn2 = BatchNorm2d(planes)
+        self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
+        self.bn3 = BatchNorm2d(planes * 4)
+        self.relu = nn.ReLU(inplace=True)
+        self.downsample = downsample
+        self.stride = stride
+        self.dcn = dcn
+        # Stay False after a fallback so forward() takes the plain conv2.
+        self.with_dcn = dcn is not None and not fallback_on_stride
+
+    def forward(self, x):
+        residual = x
+
+        out = self.conv1(x)
+        out = self.bn1(out)
+        out = self.relu(out)
+
+        # Dispatch on the kind of conv2 that __init__ actually built.
+        if not self.with_dcn:
+            out = self.conv2(out)
+        elif self.with_modulated_dcn:
+            offset_mask = self.conv2_offset(out)
+            offset = offset_mask[:, :18, :, :]  # first 18 channels: offsets
+            mask = offset_mask[:, -9:, :, :].sigmoid()  # last 9 channels: mask
+            out = self.conv2(out, offset, mask)
+        else:
+            offset = self.conv2_offset(out)
+            out = self.conv2(out, offset)
+        out = self.bn2(out)
+        out = self.relu(out)
+
+        out = self.conv3(out)
+        out = self.bn3(out)
+
+        if self.downsample is not None:
+            residual = self.downsample(x)
+
+        out += residual
+        out = self.relu(out)
+
+        return out
+
+
+class ResNet(nn.Module):
+    """ResNet trunk whose forward returns the outputs of all four stages."""
+
+    def __init__(self,
+                 block,
+                 layers,
+                 num_classes=1000,
+                 dcn=None,
+                 stage_with_dcn=(False, False, False, False)):
+        super(ResNet, self).__init__()  # must precede attribute assignment
+        self.dcn = dcn
+        self.stage_with_dcn = stage_with_dcn
+        self.inplanes = 64
+        self.conv1 = nn.Conv2d(
+            3, 64, kernel_size=7, stride=2, padding=3, bias=False)
+        self.bn1 = BatchNorm2d(64)
+        self.relu = nn.ReLU(inplace=True)
+        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        self.layer1 = self._make_layer(block, 64, layers[0])
+        self.layer2 = self._make_layer(
+            block, 128, layers[1], stride=2, dcn=dcn)
+        self.layer3 = self._make_layer(
+            block, 256, layers[2], stride=2, dcn=dcn)
+        self.layer4 = self._make_layer(
+            block, 512, layers[3], stride=2, dcn=dcn)
+        # No pooling or classification head is built, so ``num_classes`` is
+        # unused; ``stage_with_dcn`` is stored but never consulted.
+
+        # Convs: normal init, std sqrt(2 / (k*k*out)); BatchNorm: 1 / 0.
+        for m in self.modules():
+            if isinstance(m, nn.Conv2d):
+                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
+                m.weight.data.normal_(0, math.sqrt(2. / n))
+            elif isinstance(m, BatchNorm2d):
+                m.weight.data.fill_(1)
+                m.bias.data.zero_()
+        if self.dcn is not None:
+            for m in self.modules():
+                if isinstance(m, Bottleneck) or isinstance(m, BasicBlock):
+                    if hasattr(m, 'conv2_offset'):
+                        constant_init(m.conv2_offset, 0)
+
+    def _make_layer(self, block, planes, blocks, stride=1, dcn=None):
+        downsample = None  # identity shortcut unless the shape changes
+        if stride != 1 or self.inplanes != planes * block.expansion:
+            downsample = nn.Sequential(
+                nn.Conv2d(
+                    self.inplanes,
+                    planes * block.expansion,
+                    kernel_size=1,
+                    stride=stride,
+                    bias=False),
+                BatchNorm2d(planes * block.expansion),
+            )
+
+        layers = []
+        layers.append(  # only the first block strides / projects
+            block(self.inplanes, planes, stride, downsample, dcn=dcn))
+        self.inplanes = planes * block.expansion
+        for _ in range(1, blocks):
+            layers.append(block(self.inplanes, planes, dcn=dcn))
+
+        return nn.Sequential(*layers)
+
+    def forward(self, x):
+        x = self.conv1(x)
+        x = self.bn1(x)
+        x = self.relu(x)
+        x = self.maxpool(x)
+
+        x2 = self.layer1(x)
+        x3 = self.layer2(x2)
+        x4 = self.layer3(x3)
+        x5 = self.layer4(x4)
+
+        return x2, x3, x4, x5
+
+
+class SegDetector(nn.Module):
+    """Fuses four backbone feature maps and predicts a one-channel map."""
+
+    def __init__(self,
+                 in_channels=(64, 128, 256, 512),
+                 inner_channels=256,
+                 k=10,
+                 bias=False,
+                 adaptive=False,
+                 smooth=False,
+                 serial=False,
+                 *args,
+                 **kwargs):
+        '''
+        bias: Whether conv layers have bias or not.
+        adaptive: If true, also build the ``thresh`` branch.
+        smooth: If true, upsample with nearest + conv instead of deconv.
+        serial: If true, thresh prediction will combine segmentation result as input.
+        '''
+        super(SegDetector, self).__init__()
+        self.k = k
+        self.serial = serial
+        self.up5 = nn.Upsample(scale_factor=2, mode='nearest')
+        self.up4 = nn.Upsample(scale_factor=2, mode='nearest')
+        self.up3 = nn.Upsample(scale_factor=2, mode='nearest')
+
+        self.in5 = nn.Conv2d(in_channels[-1], inner_channels, 1, bias=bias)
+        self.in4 = nn.Conv2d(in_channels[-2], inner_channels, 1, bias=bias)
+        self.in3 = nn.Conv2d(in_channels[-3], inner_channels, 1, bias=bias)
+        self.in2 = nn.Conv2d(in_channels[-4], inner_channels, 1, bias=bias)
+
+        self.out5 = nn.Sequential(
+            nn.Conv2d(
+                inner_channels, inner_channels // 4, 3, padding=1, bias=bias),
+            nn.Upsample(scale_factor=8, mode='nearest'))
+        self.out4 = nn.Sequential(
+            nn.Conv2d(
+                inner_channels, inner_channels // 4, 3, padding=1, bias=bias),
+            nn.Upsample(scale_factor=4, mode='nearest'))
+        self.out3 = nn.Sequential(
+            nn.Conv2d(
+                inner_channels, inner_channels // 4, 3, padding=1, bias=bias),
+            nn.Upsample(scale_factor=2, mode='nearest'))
+        self.out2 = nn.Conv2d(
+            inner_channels, inner_channels // 4, 3, padding=1, bias=bias)
+
+        self.binarize = nn.Sequential(
+            nn.Conv2d(
+                inner_channels, inner_channels // 4, 3, padding=1, bias=bias),
+            BatchNorm2d(inner_channels // 4), nn.ReLU(inplace=True),
+            nn.ConvTranspose2d(inner_channels // 4, inner_channels // 4, 2, 2),
+            BatchNorm2d(inner_channels // 4), nn.ReLU(inplace=True),
+            nn.ConvTranspose2d(inner_channels // 4, 1, 2, 2), nn.Sigmoid())
+        self.binarize.apply(self.weights_init)
+
+        self.adaptive = adaptive
+        if adaptive:
+            self.thresh = self._init_thresh(
+                inner_channels, serial=serial, smooth=smooth, bias=bias)
+            self.thresh.apply(self.weights_init)
+
+        self.in5.apply(self.weights_init)
+        self.in4.apply(self.weights_init)
+        self.in3.apply(self.weights_init)
+        self.in2.apply(self.weights_init)
+        self.out5.apply(self.weights_init)
+        self.out4.apply(self.weights_init)
+        self.out3.apply(self.weights_init)
+        self.out2.apply(self.weights_init)
+
+    def weights_init(self, m):
+        classname = m.__class__.__name__  # matched by substring below
+        if classname.find('Conv') != -1:
+            nn.init.kaiming_normal_(m.weight.data)
+        elif classname.find('BatchNorm') != -1:
+            m.weight.data.fill_(1.)
+            m.bias.data.fill_(1e-4)
+
+    def _init_thresh(self,
+                     inner_channels,
+                     serial=False,
+                     smooth=False,
+                     bias=False):
+        in_channels = inner_channels
+        if serial:
+            in_channels += 1
+        self.thresh = nn.Sequential(
+            nn.Conv2d(
+                in_channels, inner_channels // 4, 3, padding=1, bias=bias),
+            BatchNorm2d(inner_channels // 4), nn.ReLU(inplace=True),
+            self._init_upsample(
+                inner_channels // 4,
+                inner_channels // 4,
+                smooth=smooth,
+                bias=bias), BatchNorm2d(inner_channels // 4),
+            nn.ReLU(inplace=True),
+            self._init_upsample(
+                inner_channels // 4, 1, smooth=smooth, bias=bias),
+            nn.Sigmoid())
+        return self.thresh
+
+    def _init_upsample(self,
+                       in_channels,
+                       out_channels,
+                       smooth=False,
+                       bias=False):
+        if smooth:
+            inter_out_channels = out_channels
+            if out_channels == 1:
+                inter_out_channels = in_channels
+            module_list = [
+                nn.Upsample(scale_factor=2, mode='nearest'),
+                nn.Conv2d(in_channels, inter_out_channels, 3, 1, 1, bias=bias)
+            ]
+            if out_channels == 1:
+                # NOTE(review): padding=1 on a 1x1 kernel enlarges the map
+                # by two pixels per axis -- confirm this is intended.
+                module_list.append(
+                    nn.Conv2d(
+                        in_channels, out_channels,
+                        kernel_size=1, stride=1, padding=1, bias=True))
+
+            # nn.Sequential takes modules as positional args, not a list.
+            return nn.Sequential(*module_list)
+        else:
+            return nn.ConvTranspose2d(in_channels, out_channels, 2, 2)
+
+    def forward(self, features, gt=None, masks=None, training=False):
+        c2, c3, c4, c5 = features
+        in5 = self.in5(c5)
+        in4 = self.in4(c4)
+        in3 = self.in3(c3)
+        in2 = self.in2(c2)
+
+        out4 = self.up5(in5) + in4  # 1/16
+        out3 = self.up4(out4) + in3  # 1/8
+        out2 = self.up3(out3) + in2  # 1/4
+
+        p5 = self.out5(in5)
+        p4 = self.out4(out4)
+        p3 = self.out3(out3)
+        p2 = self.out2(out2)
+
+        fuse = torch.cat((p5, p4, p3, p2), 1)
+        # ``binarize`` is really the prediction head; the name is kept so
+        # that the keys of the trained model still match.
+        binary = self.binarize(fuse)
+        return binary
+
+    def step_function(self, x, y):
+        return torch.reciprocal(1 + torch.exp(-self.k * (x - y)))
+
+
+class VLPTModel(nn.Module):
+
+    def __init__(self, *args, **kwargs):
+        super(VLPTModel, self).__init__()  # positional ``args`` are unused
+        self.backbone = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
+        self.decoder = SegDetector(  # receives the same kwargs as backbone
+            in_channels=[256, 512, 1024, 2048], adaptive=True, k=50, **kwargs)
+
+    def forward(self, x):
+        return self.decoder(self.backbone(x))  # backbone maps -> decoder
diff --git a/modelscope/pipelines/cv/ocr_utils/utils.py b/modelscope/pipelines/cv/ocr_utils/utils.py
index 1d0fb297..b024844d 100644
--- a/modelscope/pipelines/cv/ocr_utils/utils.py
+++ b/modelscope/pipelines/cv/ocr_utils/utils.py
@@ -1,6 +1,8 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 import cv2
 import numpy as np
+import pyclipper
+from shapely.geometry import Polygon
 
 
 def rboxes_to_polygons(rboxes):
@@ -107,3 +109,102 @@ def point_line_dist(px, py, x1, y1, x2, y2):
     div = np.sqrt(dx * dx + dy * dy) + eps
     dist = np.abs(px * dy - py * dx + x2 * y1 - y2 * x1) / div
     return dist
+
+
+# Part of the implementation is borrowed and modified from DB,
+# publicly available at https://github.com/MhLiao/DB.
+def polygons_from_bitmap(pred, _bitmap, dest_width, dest_height):
+    """Extract scored polygons from a binarized map.
+
+    pred: score map; `pred[0]` is averaged inside each candidate polygon.
+    _bitmap: single map with shape (1, H, W),
+        whose values are binarized as {0, 1}
+    dest_width, dest_height: size the x / y coordinates are rescaled to.
+    Returns `(boxes, scores)` for the polygons that pass every filter.
+    """
+    assert _bitmap.size(0) == 1
+    bitmap = _bitmap.cpu().numpy()[0]
+    pred = pred.cpu().detach().numpy()[0]
+    height, width = bitmap.shape
+    # Loop-invariant: unwrap each destination size once, independently.
+    if hasattr(dest_width, 'item'):
+        dest_width = dest_width.item()
+    if hasattr(dest_height, 'item'):
+        dest_height = dest_height.item()
+    boxes, scores = [], []
+
+    contours, _ = cv2.findContours((bitmap * 255).astype(np.uint8),
+                                   cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
+    for contour in contours[:100]:
+        epsilon = 0.01 * cv2.arcLength(contour, True)
+        approx = cv2.approxPolyDP(contour, epsilon, True)
+        points = approx.reshape((-1, 2))
+        if points.shape[0] < 4:
+            continue
+
+        score = box_score_fast(pred, points)
+        if score < 0.7:
+            continue
+
+        # Need exactly one expanded polygon; an empty result is unusable.
+        box = unclip(points, unclip_ratio=2.0)
+        if len(box) != 1:
+            continue
+        box = box.reshape(-1, 2)
+        _, sside = get_mini_boxes(box.reshape((-1, 1, 2)))
+        if sside < 3 + 2:
+            continue
+
+        box[:, 0] = np.clip(
+            np.round(box[:, 0] / width * dest_width), 0, dest_width)
+        box[:, 1] = np.clip(
+            np.round(box[:, 1] / height * dest_height), 0, dest_height)
+        boxes.append(box.tolist())
+        scores.append(score)
+    return boxes, scores
+
+
+def box_score_fast(bitmap, _box):
+    """Return the mean of `bitmap` over the pixels inside polygon `_box`."""
+    h, w = bitmap.shape[:2]
+    box = _box.copy()  # shifted below, so leave the caller's array intact
+    xmin = int(np.clip(np.floor(box[:, 0].min()), 0, w - 1))
+    xmax = int(np.clip(np.ceil(box[:, 0].max()), 0, w - 1))
+    ymin = int(np.clip(np.floor(box[:, 1].min()), 0, h - 1))
+    ymax = int(np.clip(np.ceil(box[:, 1].max()), 0, h - 1))
+    mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)
+    box[:, 0] = box[:, 0] - xmin
+    box[:, 1] = box[:, 1] - ymin
+    cv2.fillPoly(mask, box.reshape(1, -1, 2).astype(np.int32), 1)
+    return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0]
+
+
+def unclip(box, unclip_ratio=1.5):
+    """Offset polygon `box`; unequal-length results give an object array."""
+    poly = Polygon(box)
+    offset = pyclipper.PyclipperOffset()
+    offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
+    out = offset.Execute(poly.area * unclip_ratio / poly.length)
+    return np.array(out, object if len(set(map(len, out))) > 1 else None)
+
+
+def get_mini_boxes(contour):
+    """Fit a minimum-area rectangle around `contour`.
+
+    Returns `(box, short_side)`. `box` lists the four corners in the
+    order: low-x pair's smaller-y corner, high-x pair's smaller-y
+    corner, high-x pair's larger-y corner, low-x pair's larger-y corner.
+    `short_side` is `min()` of the rectangle's second field.
+    """
+    rect = cv2.minAreaRect(contour)
+    # corners sorted by x: indices 0-1 are the low-x pair, 2-3 the high-x
+    corners = sorted(list(cv2.boxPoints(rect)), key=lambda pt: pt[0])
+
+    # on a y tie the higher index comes first, as in the comparisons below
+    first, last = (0, 1) if corners[1][1] > corners[0][1] else (1, 0)
+    second, third = (2, 3) if corners[3][1] > corners[2][1] else (3, 2)
+
+    ordered = [
+        corners[idx] for idx in (first, second, third, last)
+    ]
+    return ordered, min(rect[1])
diff --git a/tests/pipelines/test_ocr_detection.py b/tests/pipelines/test_ocr_detection.py
index e0591496..f1c20f47 100644
--- a/tests/pipelines/test_ocr_detection.py
+++ b/tests/pipelines/test_ocr_detection.py
@@ -12,7 +12,9 @@ class OCRDetectionTest(unittest.TestCase, DemoCompatibilityCheck):
 
     def setUp(self) -> None:
         self.model_id = 'damo/cv_resnet18_ocr-detection-line-level_damo'
+        self.model_id_vlpt = 'damo/cv_resnet50_ocr-detection-vlpt'
         self.test_image = 'data/test/images/ocr_detection.jpg'
+        self.test_image_vlpt = 'data/test/images/ocr_detection_vlpt.jpg'
         self.task = Tasks.ocr_detection
 
     def pipeline_inference(self, pipeline: Pipeline, input_location: str):
@@ -25,6 +27,11 @@ class OCRDetectionTest(unittest.TestCase, DemoCompatibilityCheck):
         ocr_detection = pipeline(Tasks.ocr_detection, model=self.model_id)
         self.pipeline_inference(ocr_detection, self.test_image)
 
+    @unittest.skipUnless(test_level() >= 0, 'skip test in current test level')
+    def test_run_with_vlpt_with_model_from_modelhub(self):  # VLPT hub model
+        detector = pipeline(Tasks.ocr_detection, model=self.model_id_vlpt)
+        self.pipeline_inference(detector, self.test_image_vlpt)
+
     @unittest.skipUnless(test_level() >= 2, 'skip test in current test level')
     def test_run_modelhub_default_model(self):
         ocr_detection = pipeline(Tasks.ocr_detection)