From a272d00c540f1c37c55fb5f97a576bc46f33ad24 Mon Sep 17 00:00:00 2001
From: "jinmao.yk"
Date: Tue, 31 Jan 2023 14:34:05 +0000
Subject: [PATCH] adjust video_human_matting output of video to support demo
 service

Link: https://code.alibaba-inc.com/Ali-MaaS/MaaS-lib/codereview/11486472
---
 modelscope/outputs/outputs.py                    |  3 ++-
 .../pipelines/cv/video_human_matting_pipeline.py | 16 +++++++++-------
 2 files changed, 11 insertions(+), 8 deletions(-)

diff --git a/modelscope/outputs/outputs.py b/modelscope/outputs/outputs.py
index dec5084f..76a5c779 100644
--- a/modelscope/outputs/outputs.py
+++ b/modelscope/outputs/outputs.py
@@ -534,8 +534,9 @@ TASK_OUTPUTS = {
     # video human matting result for a single video
     # {
     #   "masks": [np.array # 2D array with shape [height, width]]
+    #   "output_video": "path_to_matting_video"
     # }
-    Tasks.video_human_matting: [OutputKeys.MASKS],
+    Tasks.video_human_matting: [OutputKeys.MASKS, OutputKeys.OUTPUT_VIDEO],
 
     # ============ nlp tasks ===================
 
diff --git a/modelscope/pipelines/cv/video_human_matting_pipeline.py b/modelscope/pipelines/cv/video_human_matting_pipeline.py
index e9a05d84..a9035dd3 100644
--- a/modelscope/pipelines/cv/video_human_matting_pipeline.py
+++ b/modelscope/pipelines/cv/video_human_matting_pipeline.py
@@ -37,9 +37,11 @@ class VideoHumanMattingPipeline(Pipeline):
     def preprocess(self, input) -> Input:
         return input
 
-    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
+    def forward(self, input: Dict[str, Any],
+                **forward_params) -> Dict[str, Any]:
         video_path = input['video_input_path']
         out_path = input['output_path']
+        render = forward_params.get('render', False)
         video_input = cv2.VideoCapture(video_path)
         fps = video_input.get(cv2.CAP_PROP_FPS)
         fourcc = cv2.VideoWriter_fourcc(*'mp4v')
@@ -58,19 +60,19 @@
             frame_tensor = preprocess(frame)
             pha, *rec = self.model.model(
                 frame_tensor.to(self.device), *rec, downsample_ratio=scale)
-            com = pha * 255
-            com = com.repeat(1, 3, 1, 1)
-            com = com[0].data.cpu().numpy().transpose(1, 2,
-                                                      0).astype(np.uint8)
+            mask = pha * 255
+            mask = mask[0].data.cpu().numpy().transpose(1, 2, 0)
+            com = mask.repeat(3, 2).astype(np.uint8)
             video_save.write(com)
-            masks.append(com / 255)
+            masks.append((mask / 255).astype(np.uint8))
             success, frame = video_input.read()
         logger.info('matting process done')
         video_input.release()
         video_save.release()
         return {
-            OutputKeys.MASKS: masks,
+            OutputKeys.MASKS: None if render else masks,
+            OutputKeys.OUTPUT_VIDEO: out_path
         }
 
     def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
         return inputs
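
A minimal usage sketch of the new output contract, for reference. Two assumptions
are made here that the patch itself does not confirm: the model id
'damo/cv_effnetv2_video-human-matting', and that keyword arguments on the
pipeline call are routed into forward() as forward_params (how kwargs are split
across preprocess/forward/postprocess can differ between modelscope versions).
The file paths are placeholders.

    # Hedged sketch: assumes the model id below and that pipeline-call
    # kwargs reach forward() as forward_params; paths are placeholders.
    from modelscope.outputs import OutputKeys
    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks

    matting = pipeline(
        Tasks.video_human_matting,
        model='damo/cv_effnetv2_video-human-matting')  # assumed model id

    result = matting(
        {
            'video_input_path': 'input.mp4',   # placeholder input video
            'output_path': 'matting_out.mp4',  # placeholder output video
        },
        render=True)  # demo service mode: skip returning the mask arrays

    print(result[OutputKeys.OUTPUT_VIDEO])  # path of the rendered video
    print(result[OutputKeys.MASKS])         # None when render=True

With render left at its default of False, OutputKeys.MASKS still carries the
per-frame masks, so existing callers are unaffected; the demo service passes
render=True and reads only OUTPUT_VIDEO, keeping the response small.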