diff --git a/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py b/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py
index f1b1a6c7..0cc040c6 100644
--- a/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py
+++ b/modelscope/models/multi_modal/mmr/models/clip_for_mm_video_embedding.py
@@ -236,8 +236,10 @@ class VideoCLIPForMultiModalEmbedding(TorchModel):
             logger.info('text feature: {}'.format(sequence_output[0][0][0]))
             logger.info('video feature: {}'.format(visual_output[0][0][0]))
 
-        output[OutputKeys.VIDEO_EMBEDDING] = visual_output
-        output[OutputKeys.TEXT_EMBEDDING] = sequence_output
+        output[
+            OutputKeys.VIDEO_EMBEDDING] = visual_output.cpu().detach().numpy()
+        output[OutputKeys.TEXT_EMBEDDING] = sequence_output.cpu().detach(
+        ).numpy()
         return output
 
     def postprocess(self, inputs: Dict[str, Any]) -> Dict[str, Any]:
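
The change above converts the returned embeddings from torch tensors (possibly on GPU and still attached to the autograd graph) into plain numpy arrays, so downstream consumers of `output` don't need torch or a CUDA device. A minimal sketch of why the `.cpu().detach().numpy()` chain is needed, using an illustrative tensor (the shape is a placeholder, not taken from the model):

```python
import torch

# Stand-in for visual_output / sequence_output: a tensor that is part
# of the autograd graph and may live on the GPU. Shape is illustrative.
feat = torch.randn(1, 12, 512, requires_grad=True)
if torch.cuda.is_available():
    feat = feat.cuda()

# feat.numpy() would raise a RuntimeError here: numpy() cannot be
# called on a tensor that requires grad, and a CUDA tensor must be
# copied to host memory first.
arr = feat.cpu().detach().numpy()  # host copy, detached from the graph
print(type(arr), arr.shape)        # <class 'numpy.ndarray'> (1, 12, 512)
```

Note that `.detach().cpu().numpy()` would work equally well; the essential points are detaching from the graph and moving to host memory before the numpy conversion.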