Mirror of https://github.com/modelscope/modelscope.git (synced 2025-12-24 03:59:23 +01:00)

Commit: Merge remote-tracking branch 'origin/master' into ofa/finetune

# Conflicts:
#	modelscope/metrics/__init__.py
data/test/audios/noise_2ch.wav (new file)
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e8d653a9a1ee49789c3df38e8da96af7118e0d8336d6ed12cd6458efa015071d
size 2327764

data/test/audios/wake_word_with_label_xyxy.wav (new file)
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c589d77404ea17d4d24daeb8624dce7e1ac919dc75e6bed44ea9d116f0514150
size 68524

data/test/images/auto_demo.jpg (new file)
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:76bf84536edbaf192a8a699efc62ba2b06056bac12c426ecfcc2e003d91fbd32
size 53219

data/test/images/card_detection.jpg (new file)
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ecbc9d0827cfb92e93e7d75868b1724142685dc20d3b32023c3c657a7b688a9c
size 254845

data/test/images/face_detection2.jpeg (new file)
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d510ab26ddc58ffea882c8ef850c1f9bd4444772f2bce7ebea3e76944536c3ae
size 48909

data/test/images/image_body_reshaping.jpg (new file)
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b2c1119e3d521cf2e583b1e85fc9c9afd1d44954b433135039a98050a730932d
size 1127557

data/test/images/image_inpainting/image_inpainting.png (new file)
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:46db348eae61448f1668ce282caec21375e96c3268d53da44aa67ec32cbf4fa5
size 2747938

(new file; filename not captured)
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:709c1828ed2d56badf2f19a40194da9a5e5e6db2fb73ef55d047407f49bc7a15
size 27616

(deleted file; filename not captured)
@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:379e11d7fc3734d3ec95afd0d86460b4653fbf4bb1f57f993610d6a6fd30fd3d
size 1702339

data/test/images/keypoints_detect/img_test_wholebody.jpg (new file)
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dec0fbb931cb609bf481e56b89cd2fbbab79839f22832c3bbe69a8fae2769cdd
size 167407

(modified file; filename not captured)
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:9103ce2bc89212f67fb49ce70783b7667e376900d0f70fb8f5c4432eb74bc572
-size 60801
+oid sha256:33ecc221513559a042ff975a38cc16aa47674545bc349362722c774c83f8d90c
+size 61239

(modified file; filename not captured)
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:2d4dee34c7e83b77db04fb2f0d1200bfd37c7c24954c58e185da5cb96445975c
-size 60801
+oid sha256:803c2e3ff7688abf0f83702b3904830a9f6f71e41e252de3c559354a9effefd1
+size 61115

(new file; filename not captured)
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a49c9bc74a60860c360a4bf4509fe9db915279aaabd953f354f2c38e9be1e6cb
size 2924691

data/test/videos/test_realtime_vod.mp4 (new file)
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f58df1d25590c158ae0a04b3999bd44b610cdaddb17d78afd84c34b3f00d4e87
size 4068783
@@ -76,7 +76,7 @@ RUN pip install --no-cache-dir --upgrade pip && \
ENV SHELL=/bin/bash

# install special package
-RUN pip install --no-cache-dir mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 datasets==2.1.0 numpy==1.18.5 ipykernel fairseq
+RUN pip install --no-cache-dir mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 datasets==2.1.0 numpy==1.18.5 ipykernel fairseq fasttext https://modelscope.oss-cn-beijing.aliyuncs.com/releases/dependencies/xtcocotools-1.12-cp37-cp37m-linux_x86_64.whl

RUN if [ "$USE_GPU" = "True" ] ; then \
    pip install --no-cache-dir dgl-cu113 dglgo -f https://data.dgl.ai/wheels/repo.html; \
@@ -24,20 +24,17 @@ from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
                                       DownloadMode)
from modelscope.utils.logger import get_logger
from .errors import (InvalidParameter, NotExistError, RequestError,
-                     datahub_raise_on_error, handle_http_response, is_ok,
-                     raise_on_error)
-from .utils.utils import (get_dataset_hub_endpoint, get_endpoint,
-                          model_id_to_group_owner_name)
+                     datahub_raise_on_error, handle_http_post_error,
+                     handle_http_response, is_ok, raise_on_error)
+from .utils.utils import get_endpoint, model_id_to_group_owner_name

logger = get_logger()


class HubApi:

-    def __init__(self, endpoint=None, dataset_endpoint=None):
+    def __init__(self, endpoint=None):
        self.endpoint = endpoint if endpoint is not None else get_endpoint()
-        self.dataset_endpoint = dataset_endpoint if dataset_endpoint is not None else get_dataset_hub_endpoint(
-        )

    def login(
        self,
@@ -105,17 +102,15 @@ class HubApi:

        path = f'{self.endpoint}/api/v1/models'
        owner_or_group, name = model_id_to_group_owner_name(model_id)
-        r = requests.post(
-            path,
-            json={
-                'Path': owner_or_group,
-                'Name': name,
-                'ChineseName': chinese_name,
-                'Visibility': visibility,  # server check
-                'License': license
-            },
-            cookies=cookies)
-        r.raise_for_status()
+        body = {
+            'Path': owner_or_group,
+            'Name': name,
+            'ChineseName': chinese_name,
+            'Visibility': visibility,  # server check
+            'License': license
+        }
+        r = requests.post(path, json=body, cookies=cookies)
+        handle_http_post_error(r, path, body)
        raise_on_error(r.json())
        model_repo_url = f'{get_endpoint()}/{model_id}'
        return model_repo_url
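For orientation, a minimal sketch of calling the refactored method (the model id, license and visibility values below are placeholders, and a prior login is assumed so that the server-side cookie check passes):

    from modelscope.hub.api import HubApi

    api = HubApi()
    # assumes api.login(...) has already been performed
    url = api.create_model(
        model_id='my-group/my-model',  # placeholder id
        visibility=5,                  # 5 == public, 1 == private (per the upload_folder docs below)
        license='Apache License 2.0',
        chinese_name=None)
    print(url)  # '<endpoint>/my-group/my-model'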
@@ -290,7 +285,7 @@ class HubApi:
        return files

    def list_datasets(self):
-        path = f'{self.dataset_endpoint}/api/v1/datasets'
+        path = f'{self.endpoint}/api/v1/datasets'
        headers = None
        params = {}
        r = requests.get(path, params=params, headers=headers)
@@ -317,13 +312,13 @@ class HubApi:
                               cache_dir):
            shutil.rmtree(cache_dir)
        os.makedirs(cache_dir, exist_ok=True)
-        datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}'
+        datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}'
        r = requests.get(datahub_url)
        resp = r.json()
        datahub_raise_on_error(datahub_url, resp)
        dataset_id = resp['Data']['Id']
        dataset_type = resp['Data']['Type']
-        datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={revision}'
+        datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={revision}'
        r = requests.get(datahub_url)
        resp = r.json()
        datahub_raise_on_error(datahub_url, resp)
@@ -341,7 +336,7 @@ class HubApi:
            file_path = file_info['Path']
            extension = os.path.splitext(file_path)[-1]
            if extension in dataset_meta_format:
-                datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
+                datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
                              f'Revision={revision}&FilePath={file_path}'
                r = requests.get(datahub_url)
                r.raise_for_status()
@@ -365,7 +360,7 @@ class HubApi:
                       namespace: str,
                       revision: Optional[str] = DEFAULT_DATASET_REVISION):
        if file_name.endswith('.csv'):
-            file_name = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
+            file_name = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
                        f'Revision={revision}&FilePath={file_name}'
        return file_name
@@ -374,7 +369,7 @@ class HubApi:
                         dataset_name: str,
                         namespace: str,
                         revision: Optional[str] = DEFAULT_DATASET_REVISION):
-        datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \
+        datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \
                      f'ststoken?Revision={revision}'
        return self.datahub_remote_call(datahub_url)
@@ -385,7 +380,7 @@ class HubApi:
                              namespace: str,
                              revision: Optional[str] = DEFAULT_DATASET_REVISION):

-        datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \
+        datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \
                      f'ststoken?Revision={revision}'

        cookies = requests.utils.dict_from_cookiejar(cookies)
@@ -394,6 +389,19 @@ class HubApi:
        raise_on_error(resp)
        return resp['Data']

+    def list_oss_dataset_objects(self, dataset_name, namespace, max_limit,
+                                 is_recursive, is_filter_dir, revision,
+                                 cookies):
+        url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/oss/tree/?' \
+              f'MaxLimit={max_limit}&Revision={revision}&Recursive={is_recursive}&FilterDir={is_filter_dir}'
+        cookies = requests.utils.dict_from_cookiejar(cookies)
+
+        resp = requests.get(url=url, cookies=cookies)
+        resp = resp.json()
+        raise_on_error(resp)
+        resp = resp['Data']
+        return resp
+
    def on_dataset_download(self, dataset_name: str, namespace: str) -> None:
        url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/increase'
        r = requests.post(url)
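A hedged usage sketch of the new OSS listing helper (dataset name, namespace and revision are placeholders; the cookie jar comes from a prior login, and the method will fail on a None cookie jar):

    from modelscope.hub.api import HubApi, ModelScopeConfig

    api = HubApi()
    cookies = ModelScopeConfig.get_cookies()  # None unless logged in
    objects = api.list_oss_dataset_objects(
        dataset_name='my-dataset', namespace='my-namespace',  # placeholders
        max_limit=100, is_recursive=True, is_filter_dir=False,
        revision='master', cookies=cookies)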
@@ -4,6 +4,10 @@ from http import HTTPStatus

from requests.exceptions import HTTPError

+from modelscope.utils.logger import get_logger
+
+logger = get_logger()
+

class NotExistError(Exception):
    pass
@@ -45,15 +49,24 @@ def is_ok(rsp):
    return rsp['Code'] == HTTPStatus.OK and rsp['Success']


+def handle_http_post_error(response, url, request_body):
+    try:
+        response.raise_for_status()
+    except HTTPError as error:
+        logger.error('Request %s with body: %s exception' %
+                     (url, request_body))
+        raise error
+
+
def handle_http_response(response, logger, cookies, model_id):
    try:
        response.raise_for_status()
-    except HTTPError:
+    except HTTPError as error:
        if cookies is None:  # code in [403] and
            logger.error(
                f'Authentication token does not exist, failed to access model {model_id} which may not exist or may be \
private. Please login first.')
-        raise
+        raise error


def raise_on_error(rsp):
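To see the new helper's behavior in isolation, a small sketch (the URL is a placeholder endpoint that deliberately returns a 500, so the helper logs the request body and re-raises):

    import requests
    from modelscope.hub.errors import handle_http_post_error

    body = {'Name': 'demo'}
    r = requests.post('https://httpbin.org/status/500', json=body)  # placeholder URL
    try:
        handle_http_post_error(r, r.url, body)
    except requests.exceptions.HTTPError:
        pass  # the error was already logged with the offending URL and body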
@@ -1,6 +1,7 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

+import os
import re
import subprocess
from typing import List
from xmlrpc.client import Boolean
@@ -138,8 +139,8 @@ class GitCommandWrapper(metaclass=Singleton):
            repo_base_dir, repo_name, user_name)
        response = self._run_git_command(*config_user_name_args.split(' '))
        logger.debug(response.stdout.decode('utf8'))
-        config_user_email_args = '-C %s/%s config user.name %s' % (
-            repo_base_dir, repo_name, user_name)
+        config_user_email_args = '-C %s/%s config user.email %s' % (
+            repo_base_dir, repo_name, user_email)
        response = self._run_git_command(
            *config_user_email_args.split(' '))
        logger.debug(response.stdout.decode('utf8'))
@@ -177,6 +178,15 @@ class GitCommandWrapper(metaclass=Singleton):
        cmds = ['-C', '%s' % repo_dir, 'checkout', '-b', revision]
        return self._run_git_command(*cmds)

+    def get_remote_branches(self, repo_dir: str):
+        cmds = ['-C', '%s' % repo_dir, 'branch', '-r']
+        rsp = self._run_git_command(*cmds)
+        info = [
+            line.strip()
+            for line in rsp.stdout.decode('utf8').strip().split(os.linesep)
+        ][1:]
+        return ['/'.join(line.split('/')[1:]) for line in info]
+
    def pull(self, repo_dir: str):
        cmds = ['-C', repo_dir, 'pull']
        return self._run_git_command(*cmds)
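For clarity, this is how the parsing in get_remote_branches behaves on typical `git branch -r` output (the sample text below is illustrative, not captured from a real repository): the `[1:]` drops the first line, usually the `origin/HEAD -> origin/master` pointer, and the join strips the leading `origin/` segment:

    import os

    stdout = os.linesep.join([
        '  origin/HEAD -> origin/master',
        '  origin/master',
        '  origin/ofa/finetune',
    ])
    info = [line.strip() for line in stdout.strip().split(os.linesep)][1:]
    print(['/'.join(line.split('/')[1:]) for line in info])
    # ['master', 'ofa/finetune']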
modelscope/hub/upload.py (new file)
@@ -0,0 +1,117 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import datetime
import os
import shutil
import tempfile
import uuid
from typing import Dict, Optional
from uuid import uuid4

from filelock import FileLock

from modelscope import __version__
from modelscope.hub.api import HubApi, ModelScopeConfig
from modelscope.hub.errors import InvalidParameter, NotLoginException
from modelscope.hub.git import GitCommandWrapper
from modelscope.hub.repository import Repository
from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile
from modelscope.utils.logger import get_logger

logger = get_logger()


def upload_folder(model_id: str,
                  model_dir: str,
                  visibility: int = 0,
                  license: str = None,
                  chinese_name: Optional[str] = None,
                  commit_message: Optional[str] = None,
                  revision: Optional[str] = DEFAULT_MODEL_REVISION):
    """
    Upload a model from a given directory to a given repository. A valid model
    directory must contain a configuration.json file.

    This function uploads the files in the given directory to the given
    repository. If the repository does not exist on the remote, it is created
    automatically with the given visibility, license and chinese_name
    parameters. If the revision does not exist in the remote repository,
    a new branch is created for it.

    This function must be called after logging in via HubApi's login with a
    valid token, which can be obtained from ModelScope's website.

    Args:
        model_id (`str`):
            The model id to upload to; the caller must have write permission for it.
        model_dir (`str`):
            The absolute path of the finetune result.
        visibility (`int`, defaults to `0`):
            Visibility of the newly created model (1-private, 5-public). If the
            model does not exist on ModelScope, this function creates a new
            model with this visibility, and the parameter is then required.
            You can omit it if you are sure the model already exists.
        license (`str`, defaults to `None`):
            License of the newly created model (see License). If the model does
            not exist on ModelScope, this function creates a new model with
            this license, and the parameter is then required. You can omit it
            if you are sure the model already exists.
        chinese_name (`str`, *optional*, defaults to `None`):
            Chinese name of the newly created model.
        commit_message (`str`, *optional*, defaults to `None`):
            Commit message of the push request.
        revision (`str`, *optional*, defaults to DEFAULT_MODEL_REVISION):
            Which branch to push to. If the branch does not exist, a new
            branch is created and pushed to.
    """
    if model_id is None:
        raise InvalidParameter('model_id cannot be empty!')
    if model_dir is None:
        raise InvalidParameter('model_dir cannot be empty!')
    if not os.path.exists(model_dir) or os.path.isfile(model_dir):
        raise InvalidParameter('model_dir must be a valid directory.')
    cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION)
    if not os.path.exists(cfg_file):
        raise ValueError(f'{model_dir} must contain a configuration.json.')
    cookies = ModelScopeConfig.get_cookies()
    if cookies is None:
        raise NotLoginException('Must login before upload!')
    files_to_save = os.listdir(model_dir)
    api = HubApi()
    try:
        api.get_model(model_id=model_id)
    except Exception:
        if visibility is None or license is None:
            raise InvalidParameter(
                'visibility and license cannot be empty if want to create new repo'
            )
        logger.info('Create new model %s' % model_id)
        api.create_model(
            model_id=model_id,
            visibility=visibility,
            license=license,
            chinese_name=chinese_name)
    tmp_dir = tempfile.mkdtemp()
    git_wrapper = GitCommandWrapper()
    try:
        repo = Repository(model_dir=tmp_dir, clone_from=model_id)
        branches = git_wrapper.get_remote_branches(tmp_dir)
        if revision not in branches:
            logger.info('Create new branch %s' % revision)
            git_wrapper.new_branch(tmp_dir, revision)
        git_wrapper.checkout(tmp_dir, revision)
        for f in files_to_save:
            if f[0] != '.':
                src = os.path.join(model_dir, f)
                if os.path.isdir(src):
                    shutil.copytree(src, os.path.join(tmp_dir, f))
                else:
                    shutil.copy(src, tmp_dir)
        if not commit_message:
            date = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
            commit_message = '[automsg] push model %s to hub at %s' % (
                model_id, date)
        repo.push(commit_message=commit_message, branch=revision)
    except Exception:
        raise
    finally:
        shutil.rmtree(tmp_dir, ignore_errors=True)
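A minimal usage sketch of the new API (the model id and directory below are placeholders; a prior login is required, since the function raises NotLoginException otherwise):

    from modelscope.hub.upload import upload_folder

    upload_folder(
        model_id='my-namespace/my-finetuned-model',  # placeholder
        model_dir='/path/to/finetune/output',  # must contain configuration.json
        visibility=5,  # 5 == public, 1 == private
        license='Apache License 2.0',
        commit_message='upload finetuned weights',
        revision='v1.0')  # created as a new branch if it does not exist yet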
@@ -4,8 +4,7 @@ import hashlib
import os
from typing import Optional

-from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DATA_ENDPOINT,
-                                      DEFAULT_MODELSCOPE_DOMAIN,
+from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DOMAIN,
                                      DEFAULT_MODELSCOPE_GROUP,
                                      MODEL_ID_SEPARATOR,
                                      MODELSCOPE_URL_SCHEME)
@@ -44,11 +43,6 @@ def get_endpoint():
    return MODELSCOPE_URL_SCHEME + modelscope_domain


-def get_dataset_hub_endpoint():
-    return os.environ.get('HUB_DATASET_ENDPOINT',
-                          DEFAULT_MODELSCOPE_DATA_ENDPOINT)
-
-
def compute_hash(file_path):
    BUFFER_SIZE = 1024 * 64  # 64k buffer size
    sha256_hash = hashlib.sha256()
@@ -14,6 +14,7 @@ class Models(object):
    # vision models
    detection = 'detection'
    realtime_object_detection = 'realtime-object-detection'
    realtime_video_object_detection = 'realtime-video-object-detection'
    scrfd = 'scrfd'
    classification_model = 'ClassificationModel'
    nafnet = 'nafnet'
@@ -27,11 +28,13 @@ class Models(object):
    face_2d_keypoints = 'face-2d-keypoints'
    panoptic_segmentation = 'swinL-panoptic-segmentation'
    image_reid_person = 'passvitb'
    image_inpainting = 'FFTInpainting'
    video_summarization = 'pgl-video-summarization'
    swinL_semantic_segmentation = 'swinL-semantic-segmentation'
    vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation'
    text_driven_segmentation = 'text-driven-segmentation'
    resnet50_bert = 'resnet50-bert'
    referring_video_object_segmentation = 'swinT-referring-video-object-segmentation'
    fer = 'fer'
    retinaface = 'retinaface'
    shop_segmentation = 'shop-segmentation'
@@ -39,14 +42,18 @@ class Models(object):
    mtcnn = 'mtcnn'
    ulfd = 'ulfd'
    video_inpainting = 'video-inpainting'
    human_wholebody_keypoint = 'human-wholebody-keypoint'
    hand_static = 'hand-static'
    face_human_hand_detection = 'face-human-hand-detection'
    face_emotion = 'face-emotion'
    product_segmentation = 'product-segmentation'
    image_body_reshaping = 'image-body-reshaping'

    # EasyCV models
    yolox = 'YOLOX'
    segformer = 'Segformer'
    hand_2d_keypoints = 'HRNet-Hand2D-Keypoints'
    image_object_detection_auto = 'image-object-detection-auto'

    # nlp models
    bert = 'bert'
@@ -66,6 +73,7 @@ class Models(object):
    gcnncrf = 'gcnn-crf'
    bart = 'bart'
    gpt3 = 'gpt3'
    gpt_neo = 'gpt-neo'
    plug = 'plug'
    bert_for_ds = 'bert-for-document-segmentation'
    ponet = 'ponet'
@@ -96,6 +104,7 @@ class TaskModels(object):
    information_extraction = 'information-extraction'
    fill_mask = 'fill-mask'
    feature_extraction = 'feature-extraction'
    text_generation = 'text-generation'


class Heads(object):
@@ -111,6 +120,8 @@ class Heads(object):
    token_classification = 'token-classification'
    # extraction
    information_extraction = 'information-extraction'
    # text gen
    text_generation = 'text-generation'


class Pipelines(object):
@@ -144,6 +155,7 @@ class Pipelines(object):
    salient_detection = 'u2net-salient-detection'
    image_classification = 'image-classification'
    face_detection = 'resnet-face-detection-scrfd10gkps'
    card_detection = 'resnet-card-detection-scrfd34gkps'
    ulfd_face_detection = 'manual-face-detection-ulfd'
    facial_expression_recognition = 'vgg19-facial-expression-recognition-fer'
    retina_face_detection = 'resnet50-face-detection-retinaface'
@@ -160,6 +172,7 @@ class Pipelines(object):
    face_image_generation = 'gan-face-image-generation'
    product_retrieval_embedding = 'resnet50-product-retrieval-embedding'
    realtime_object_detection = 'cspnet_realtime-object-detection_yolox'
    realtime_video_object_detection = 'cspnet_realtime-video-object-detection_streamyolo'
    face_recognition = 'ir101-face-recognition-cfglint'
    image_instance_segmentation = 'cascade-mask-rcnn-swin-image-instance-segmentation'
    image2image_translation = 'image-to-image-translation'
@@ -168,6 +181,7 @@ class Pipelines(object):
    ocr_recognition = 'convnextTiny-ocr-recognition'
    image_portrait_enhancement = 'gpen-image-portrait-enhancement'
    image_to_image_generation = 'image-to-image-generation'
    image_object_detection_auto = 'yolox_image-object-detection-auto'
    skin_retouching = 'unet-skin-retouching'
    tinynas_classification = 'tinynas-classification'
    tinynas_detection = 'tinynas-detection'
@@ -178,15 +192,19 @@ class Pipelines(object):
    video_summarization = 'googlenet_pgl_video_summarization'
    image_semantic_segmentation = 'image-semantic-segmentation'
    image_reid_person = 'passvitb-image-reid-person'
    image_inpainting = 'fft-inpainting'
    text_driven_segmentation = 'text-driven-segmentation'
    movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation'
    shop_segmentation = 'shop-segmentation'
    video_inpainting = 'video-inpainting'
    human_wholebody_keypoint = 'hrnetw48_human-wholebody-keypoint_image'
    pst_action_recognition = 'patchshift-action-recognition'
    hand_static = 'hand-static'
    face_human_hand_detection = 'face-human-hand-detection'
    face_emotion = 'face-emotion'
    product_segmentation = 'product-segmentation'
    image_body_reshaping = 'flow-based-body-reshaping'
    referring_video_object_segmentation = 'referring-video-object-segmentation'

    # nlp tasks
    automatic_post_editing = 'automatic-post-editing'
@@ -211,6 +229,7 @@ class Pipelines(object):
    zero_shot_classification = 'zero-shot-classification'
    text_error_correction = 'text-error-correction'
    plug_generation = 'plug-generation'
    gpt3_generation = 'gpt3-generation'
    faq_question_answering = 'faq-question-answering'
    conversational_text_to_sql = 'conversational-text-to-sql'
    table_question_answering_pipeline = 'table-question-answering-pipeline'
@@ -219,6 +238,9 @@ class Pipelines(object):
    relation_extraction = 'relation-extraction'
    document_segmentation = 'document-segmentation'
    feature_extraction = 'feature-extraction'
    translation_en_to_de = 'translation_en_to_de'  # keep it underscore
    translation_en_to_ro = 'translation_en_to_ro'  # keep it underscore
    translation_en_to_fr = 'translation_en_to_fr'  # keep it underscore

    # audio tasks
    sambert_hifigan_tts = 'sambert-hifigan-tts'
@@ -263,6 +285,9 @@ class Trainers(object):
    image_portrait_enhancement = 'image-portrait-enhancement'
    video_summarization = 'video-summarization'
    movie_scene_segmentation = 'movie-scene-segmentation'
    face_detection_scrfd = 'face-detection-scrfd'
    card_detection_scrfd = 'card-detection-scrfd'
    image_inpainting = 'image-inpainting'

    # nlp trainers
    bert_sentiment_analysis = 'bert-sentiment-analysis'
@@ -274,6 +299,7 @@ class Trainers(object):

    # audio trainers
    speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k'
    speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield'


class Preprocessors(object):
@@ -302,6 +328,8 @@ class Preprocessors(object):
    bert_seq_cls_tokenizer = 'bert-seq-cls-tokenizer'
    text_gen_tokenizer = 'text-gen-tokenizer'
    text2text_gen_preprocessor = 'text2text-gen-preprocessor'
    text_gen_jieba_tokenizer = 'text-gen-jieba-tokenizer'
    text2text_translate_preprocessor = 'text2text-translate-preprocessor'
    token_cls_tokenizer = 'token-cls-tokenizer'
    ner_tokenizer = 'ner-tokenizer'
    nli_tokenizer = 'nli-tokenizer'
@@ -324,6 +352,7 @@ class Preprocessors(object):
    re_tokenizer = 're-tokenizer'
    document_segmentation = 'document-segmentation'
    feature_extraction = 'feature-extraction'
    sentence_piece = 'sentence-piece'

    # audio preprocessor
    linear_aec_fbank = 'linear-aec-fbank'
@@ -365,6 +394,8 @@ class Metrics(object):
    video_summarization_metric = 'video-summarization-metric'
    # metric for movie-scene-segmentation task
    movie_scene_segmentation_metric = 'movie-scene-segmentation-metric'
    # metric for inpainting task
    image_inpainting_metric = 'image-inpainting-metric'


class Optimizers(object):
@@ -406,6 +437,9 @@ class Hooks(object):
    IterTimerHook = 'IterTimerHook'
    EvaluationHook = 'EvaluationHook'

    # Compression
    SparsityHook = 'SparsityHook'


class LR_Schedulers(object):
    """learning rate scheduler is defined here
@@ -421,6 +455,8 @@ class Datasets(object):
    """
    ClsDataset = 'ClsDataset'
    Face2dKeypointsDataset = 'Face2dKeypointsDataset'
    HandCocoWholeBodyDataset = 'HandCocoWholeBodyDataset'
    HumanWholeBodyKeypointDataset = 'HumanWholeBodyKeypointDataset'
    SegDataset = 'SegDataset'
    DetDataset = 'DetDataset'
    DetImagesMixDataset = 'DetImagesMixDataset'
@@ -19,6 +19,7 @@ if TYPE_CHECKING:
    from .movie_scene_segmentation_metric import MovieSceneSegmentationMetric
    from .accuracy_metric import AccuracyMetric
    from .bleu_metric import BleuMetric
    from .image_inpainting_metric import ImageInpaintingMetric

else:
    _import_structure = {
@@ -36,6 +37,7 @@ else:
        'token_classification_metric': ['TokenClassificationMetric'],
        'video_summarization_metric': ['VideoSummarizationMetric'],
        'movie_scene_segmentation_metric': ['MovieSceneSegmentationMetric'],
        'image_inpainting_metric': ['ImageInpaintingMetric'],
        'accuracy_metric': ['AccuracyMetric'],
        'bleu_metric': ['BleuMetric'],
    }
@@ -35,6 +35,8 @@ class AudioNoiseMetric(Metric):
        total_loss = avg_loss + avg_amp + avg_phase + avg_sisnr
        return {
            'total_loss': total_loss.item(),
-            'avg_sisnr': avg_sisnr.item(),
+            # the model uses the opposite sign of sisnr as a calculation
+            # shortcut; revert it in the evaluation result
+            'avg_sisnr': -avg_sisnr.item(),
            MetricKeys.AVERAGE_LOSS: avg_loss.item()
        }
@@ -18,6 +18,7 @@ class MetricKeys(object):
    SSIM = 'ssim'
    AVERAGE_LOSS = 'avg_loss'
    FScore = 'fscore'
    FID = 'fid'
    BLEU_1 = 'bleu-1'
    BLEU_4 = 'bleu-4'
    ROUGE_1 = 'rouge-1'
@@ -39,6 +40,7 @@ task_default_metrics = {
    Tasks.image_captioning: [Metrics.text_gen_metric],
    Tasks.visual_question_answering: [Metrics.text_gen_metric],
    Tasks.movie_scene_segmentation: [Metrics.movie_scene_segmentation_metric],
    Tasks.image_inpainting: [Metrics.image_inpainting_metric],
}
@@ -1,12 +1,16 @@
# ------------------------------------------------------------------------
# Copyright (c) Alibaba, Inc. and its affiliates.
# ------------------------------------------------------------------------
# modified from https://github.com/megvii-research/NAFNet/blob/main/basicsr/metrics/psnr_ssim.py
# ------------------------------------------------------------------------
from typing import Dict

import cv2
import numpy as np
-from skimage.metrics import peak_signal_noise_ratio, structural_similarity
+import torch

from modelscope.metainfo import Metrics
from modelscope.utils.registry import default_group
from modelscope.utils.tensor_utils import (torch_nested_detach,
                                           torch_nested_numpify)
from .base import Metric
from .builder import METRICS, MetricKeys
@@ -20,26 +24,249 @@ class ImageDenoiseMetric(Metric):
    label_name = 'target'

    def __init__(self):
        super(ImageDenoiseMetric, self).__init__()
        self.preds = []
        self.labels = []

    def add(self, outputs: Dict, inputs: Dict):
        ground_truths = outputs[ImageDenoiseMetric.label_name]
        eval_results = outputs[ImageDenoiseMetric.pred_name]
-        self.preds.append(
-            torch_nested_numpify(torch_nested_detach(eval_results)))
-        self.labels.append(
-            torch_nested_numpify(torch_nested_detach(ground_truths)))
+        self.preds.append(eval_results)
+        self.labels.append(ground_truths)

    def evaluate(self):
        psnr_list, ssim_list = [], []
        for (pred, label) in zip(self.preds, self.labels):
-            psnr_list.append(
-                peak_signal_noise_ratio(label[0], pred[0], data_range=255))
-            ssim_list.append(
-                structural_similarity(
-                    label[0], pred[0], multichannel=True, data_range=255))
+            psnr_list.append(calculate_psnr(label[0], pred[0], crop_border=0))
+            ssim_list.append(calculate_ssim(label[0], pred[0], crop_border=0))
        return {
            MetricKeys.PSNR: np.mean(psnr_list),
            MetricKeys.SSIM: np.mean(ssim_list)
        }


def reorder_image(img, input_order='HWC'):
    """Reorder images to 'HWC' order.

    If the input_order is (h, w), return (h, w, 1);
    If the input_order is (c, h, w), return (h, w, c);
    If the input_order is (h, w, c), return as it is.

    Args:
        img (ndarray): Input image.
        input_order (str): Whether the input order is 'HWC' or 'CHW'.
            If the input image shape is (h, w), input_order has no
            effect. Default: 'HWC'.

    Returns:
        ndarray: reordered image.
    """
    if input_order not in ['HWC', 'CHW']:
        raise ValueError(
            f"Wrong input_order {input_order}. Supported input_orders are 'HWC' and 'CHW'"
        )
    if len(img.shape) == 2:
        img = img[..., None]
    if input_order == 'CHW':
        img = img.transpose(1, 2, 0)
    return img


def calculate_psnr(img1, img2, crop_border, input_order='HWC'):
    """Calculate PSNR (Peak Signal-to-Noise Ratio).

    Ref: https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio

    Args:
        img1 (ndarray/tensor): Images with range [0, 255]/[0, 1].
        img2 (ndarray/tensor): Images with range [0, 255]/[0, 1].
        crop_border (int): Cropped pixels in each edge of an image. These
            pixels are not involved in the PSNR calculation.
        input_order (str): Whether the input order is 'HWC' or 'CHW'.
            Default: 'HWC'.

    Returns:
        float: psnr result.
    """
    assert img1.shape == img2.shape, (
        f'Image shapes are different: {img1.shape}, {img2.shape}.')
    if input_order not in ['HWC', 'CHW']:
        raise ValueError(
            f'Wrong input_order {input_order}. Supported input_orders are '
            '"HWC" and "CHW"')
    if type(img1) == torch.Tensor:
        if len(img1.shape) == 4:
            img1 = img1.squeeze(0)
        img1 = img1.detach().cpu().numpy().transpose(1, 2, 0)
    if type(img2) == torch.Tensor:
        if len(img2.shape) == 4:
            img2 = img2.squeeze(0)
        img2 = img2.detach().cpu().numpy().transpose(1, 2, 0)

    img1 = reorder_image(img1, input_order=input_order)
    img2 = reorder_image(img2, input_order=input_order)
    img1 = img1.astype(np.float64)
    img2 = img2.astype(np.float64)

    if crop_border != 0:
        img1 = img1[crop_border:-crop_border, crop_border:-crop_border, ...]
        img2 = img2[crop_border:-crop_border, crop_border:-crop_border, ...]

    def _psnr(img1, img2):
        mse = np.mean((img1 - img2)**2)
        if mse == 0:
            return float('inf')
        max_value = 1. if img1.max() <= 1 else 255.
        return 20. * np.log10(max_value / np.sqrt(mse))

    return _psnr(img1, img2)


def calculate_ssim(img1, img2, crop_border, input_order='HWC', ssim3d=True):
    """Calculate SSIM (structural similarity).

    Ref:
    Image quality assessment: From error visibility to structural similarity

    The results are the same as those of the officially released MATLAB code in
    https://ece.uwaterloo.ca/~z70wang/research/ssim/.

    For three-channel images, SSIM is calculated for each channel and then
    averaged.

    Args:
        img1 (ndarray): Images with range [0, 255].
        img2 (ndarray): Images with range [0, 255].
        crop_border (int): Cropped pixels in each edge of an image. These
            pixels are not involved in the SSIM calculation.
        input_order (str): Whether the input order is 'HWC' or 'CHW'.
            Default: 'HWC'.
        ssim3d (bool): Whether to compute SSIM with a 3-D Gaussian window over
            the H, W and C axes jointly (requires CUDA). Default: True.

    Returns:
        float: ssim result.
    """
    assert img1.shape == img2.shape, (
        f'Image shapes are different: {img1.shape}, {img2.shape}.')
    if input_order not in ['HWC', 'CHW']:
        raise ValueError(
            f'Wrong input_order {input_order}. Supported input_orders are '
            '"HWC" and "CHW"')

    if type(img1) == torch.Tensor:
        if len(img1.shape) == 4:
            img1 = img1.squeeze(0)
        img1 = img1.detach().cpu().numpy().transpose(1, 2, 0)
    if type(img2) == torch.Tensor:
        if len(img2.shape) == 4:
            img2 = img2.squeeze(0)
        img2 = img2.detach().cpu().numpy().transpose(1, 2, 0)

    img1 = reorder_image(img1, input_order=input_order)
    img2 = reorder_image(img2, input_order=input_order)

    img1 = img1.astype(np.float64)
    img2 = img2.astype(np.float64)

    if crop_border != 0:
        img1 = img1[crop_border:-crop_border, crop_border:-crop_border, ...]
        img2 = img2[crop_border:-crop_border, crop_border:-crop_border, ...]

    def _cal_ssim(img1, img2):
        ssims = []

        max_value = 1 if img1.max() <= 1 else 255
        with torch.no_grad():
            final_ssim = _ssim_3d(img1, img2, max_value) if ssim3d else _ssim(
                img1, img2, max_value)
            ssims.append(final_ssim)

        return np.array(ssims).mean()

    return _cal_ssim(img1, img2)


def _ssim(img, img2, max_value):
    """Calculate SSIM (structural similarity) for one channel images.

    It is called by func:`calculate_ssim`.

    Args:
        img (ndarray): Images with range [0, 255] with order 'HWC'.
        img2 (ndarray): Images with range [0, 255] with order 'HWC'.

    Returns:
        float: SSIM result.
    """
    c1 = (0.01 * max_value)**2
    c2 = (0.03 * max_value)**2

    img = img.astype(np.float64)
    img2 = img2.astype(np.float64)
    kernel = cv2.getGaussianKernel(11, 1.5)
    window = np.outer(kernel, kernel.transpose())

    mu1 = cv2.filter2D(img, -1, window)[5:-5,
                                        5:-5]  # valid mode for window size 11
    mu2 = cv2.filter2D(img2, -1, window)[5:-5, 5:-5]
    mu1_sq = mu1**2
    mu2_sq = mu2**2
    mu1_mu2 = mu1 * mu2
    sigma1_sq = cv2.filter2D(img**2, -1, window)[5:-5, 5:-5] - mu1_sq
    sigma2_sq = cv2.filter2D(img2**2, -1, window)[5:-5, 5:-5] - mu2_sq
    sigma12 = cv2.filter2D(img * img2, -1, window)[5:-5, 5:-5] - mu1_mu2

    tmp1 = (2 * mu1_mu2 + c1) * (2 * sigma12 + c2)
    tmp2 = (mu1_sq + mu2_sq + c1) * (sigma1_sq + sigma2_sq + c2)
    ssim_map = tmp1 / tmp2
    return ssim_map.mean()


def _3d_gaussian_calculator(img, conv3d):
    out = conv3d(img.unsqueeze(0).unsqueeze(0)).squeeze(0).squeeze(0)
    return out


def _generate_3d_gaussian_kernel():
    kernel = cv2.getGaussianKernel(11, 1.5)
    window = np.outer(kernel, kernel.transpose())
    kernel_3 = cv2.getGaussianKernel(11, 1.5)
    kernel = torch.tensor(np.stack([window * k for k in kernel_3], axis=0))
    conv3d = torch.nn.Conv3d(
        1,
        1, (11, 11, 11),
        stride=1,
        padding=(5, 5, 5),
        bias=False,
        padding_mode='replicate')
    conv3d.weight.requires_grad = False
    conv3d.weight[0, 0, :, :, :] = kernel
    return conv3d


def _ssim_3d(img1, img2, max_value):
    assert len(img1.shape) == 3 and len(img2.shape) == 3
    """Calculate SSIM (structural similarity) for one channel images.

    It is called by func:`calculate_ssim`.

    Args:
        img1 (ndarray): Images with range [0, 255]/[0, 1] with order 'HWC'.
        img2 (ndarray): Images with range [0, 255]/[0, 1] with order 'HWC'.

    Returns:
        float: ssim result.
    """
    C1 = (0.01 * max_value)**2
    C2 = (0.03 * max_value)**2
    img1 = img1.astype(np.float64)
    img2 = img2.astype(np.float64)

    kernel = _generate_3d_gaussian_kernel().cuda()

    img1 = torch.tensor(img1).float().cuda()
    img2 = torch.tensor(img2).float().cuda()

    mu1 = _3d_gaussian_calculator(img1, kernel)
    mu2 = _3d_gaussian_calculator(img2, kernel)

    mu1_sq = mu1**2
    mu2_sq = mu2**2
    mu1_mu2 = mu1 * mu2
    sigma1_sq = _3d_gaussian_calculator(img1**2, kernel) - mu1_sq
    sigma2_sq = _3d_gaussian_calculator(img2**2, kernel) - mu2_sq
    sigma12 = _3d_gaussian_calculator(img1 * img2, kernel) - mu1_mu2

    tmp1 = (2 * mu1_mu2 + C1) * (2 * sigma12 + C2)
    tmp2 = (mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)
    ssim_map = tmp1 / tmp2
    return float(ssim_map.mean())
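As a quick sanity check of the new helpers on synthetic data (the module path and values are illustrative; ssim3d=False keeps the check CPU-only, since _ssim_3d moves tensors to CUDA):

    import numpy as np
    from modelscope.metrics.image_denoise_metric import calculate_psnr, calculate_ssim

    rng = np.random.default_rng(0)
    clean = rng.integers(0, 256, size=(64, 64, 3)).astype(np.float64)
    noisy = np.clip(clean + rng.normal(0, 5.0, size=clean.shape), 0, 255)

    print(calculate_psnr(clean, noisy, crop_border=0))  # roughly 34 dB for sigma=5 noise
    print(calculate_ssim(clean, noisy, crop_border=0, ssim3d=False))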
modelscope/metrics/image_inpainting_metric.py (new file)
@@ -0,0 +1,210 @@
"""
Part of the implementation is borrowed and modified from LaMa, publicly available at
https://github.com/saic-mdal/lama
"""
from typing import Dict

import numpy as np
import torch
import torch.nn.functional as F
from scipy import linalg

from modelscope.metainfo import Metrics
from modelscope.models.cv.image_inpainting.modules.inception import InceptionV3
from modelscope.utils.registry import default_group
from modelscope.utils.tensor_utils import (torch_nested_detach,
                                           torch_nested_numpify)
from .base import Metric
from .builder import METRICS, MetricKeys


def fid_calculate_activation_statistics(act):
    mu = np.mean(act, axis=0)
    sigma = np.cov(act, rowvar=False)
    return mu, sigma


def calculate_frechet_distance(activations_pred, activations_target, eps=1e-6):
    mu1, sigma1 = fid_calculate_activation_statistics(activations_pred)
    mu2, sigma2 = fid_calculate_activation_statistics(activations_target)

    diff = mu1 - mu2

    # Product might be almost singular
    covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
    if not np.isfinite(covmean).all():
        offset = np.eye(sigma1.shape[0]) * eps
        covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))

    # Numerical error might give slight imaginary component
    if np.iscomplexobj(covmean):
        # if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
        if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-2):
            m = np.max(np.abs(covmean.imag))
            raise ValueError('Imaginary component {}'.format(m))
        covmean = covmean.real

    tr_covmean = np.trace(covmean)

    return (diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2)
            - 2 * tr_covmean)
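For reference, the value returned above is the squared Fréchet distance between Gaussians fitted to the two activation sets (a standard identity, stated here for clarity):

    d^2 = \lVert \mu_1 - \mu_2 \rVert_2^2 + \operatorname{Tr}\left(\Sigma_1 + \Sigma_2 - 2\,(\Sigma_1 \Sigma_2)^{1/2}\right)

where (\mu_1, \Sigma_1) and (\mu_2, \Sigma_2) are the mean and covariance of the predicted and target activations, respectively.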
class FIDScore(torch.nn.Module):

    def __init__(self, dims=2048, eps=1e-6):
        super().__init__()
        if getattr(FIDScore, '_MODEL', None) is None:
            block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims]
            FIDScore._MODEL = InceptionV3([block_idx]).eval()
        self.model = FIDScore._MODEL
        self.eps = eps
        self.reset()

    def forward(self, pred_batch, target_batch, mask=None):
        activations_pred = self._get_activations(pred_batch)
        activations_target = self._get_activations(target_batch)

        self.activations_pred.append(activations_pred.detach().cpu())
        self.activations_target.append(activations_target.detach().cpu())

    def get_value(self):
        activations_pred, activations_target = (self.activations_pred,
                                                self.activations_target)
        activations_pred = torch.cat(activations_pred).cpu().numpy()
        activations_target = torch.cat(activations_target).cpu().numpy()

        total_distance = calculate_frechet_distance(
            activations_pred, activations_target, eps=self.eps)

        self.reset()
        return total_distance

    def reset(self):
        self.activations_pred = []
        self.activations_target = []

    def _get_activations(self, batch):
        activations = self.model(batch)[0]
        if activations.shape[2] != 1 or activations.shape[3] != 1:
            assert False, \
                'We should not have got here, because Inception always scales inputs to 299x299'
        activations = activations.squeeze(-1).squeeze(-1)
        return activations


class SSIM(torch.nn.Module):
    """SSIM. Modified from:
    https://github.com/Po-Hsun-Su/pytorch-ssim/blob/master/pytorch_ssim/__init__.py
    """

    def __init__(self, window_size=11, size_average=True):
        super().__init__()
        self.window_size = window_size
        self.size_average = size_average
        self.channel = 1
        self.register_buffer('window',
                             self._create_window(window_size, self.channel))

    def forward(self, img1, img2):
        assert len(img1.shape) == 4

        channel = img1.size()[1]

        if channel == self.channel and self.window.data.type(
        ) == img1.data.type():
            window = self.window
        else:
            window = self._create_window(self.window_size, channel)
            window = window.type_as(img1)
            self.window = window
            self.channel = channel

        return self._ssim(img1, img2, window, self.window_size, channel,
                          self.size_average)

    def _gaussian(self, window_size, sigma):
        gauss = torch.Tensor([
            np.exp(-(x - (window_size // 2))**2 / float(2 * sigma**2))
            for x in range(window_size)
        ])
        return gauss / gauss.sum()

    def _create_window(self, window_size, channel):
        _1D_window = self._gaussian(window_size, 1.5).unsqueeze(1)
        _2D_window = _1D_window.mm(
            _1D_window.t()).float().unsqueeze(0).unsqueeze(0)
        return _2D_window.expand(channel, 1, window_size,
                                 window_size).contiguous()

    def _ssim(self,
              img1,
              img2,
              window,
              window_size,
              channel,
              size_average=True):
        mu1 = F.conv2d(
            img1, window, padding=(window_size // 2), groups=channel)
        mu2 = F.conv2d(
            img2, window, padding=(window_size // 2), groups=channel)

        mu1_sq = mu1.pow(2)
        mu2_sq = mu2.pow(2)
        mu1_mu2 = mu1 * mu2

        sigma1_sq = F.conv2d(
            img1 * img1, window, padding=(window_size // 2),
            groups=channel) - mu1_sq
        sigma2_sq = F.conv2d(
            img2 * img2, window, padding=(window_size // 2),
            groups=channel) - mu2_sq
        sigma12 = F.conv2d(
            img1 * img2, window, padding=(window_size // 2),
            groups=channel) - mu1_mu2

        C1 = 0.01**2
        C2 = 0.03**2

        ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / \
                   ((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2))

        if size_average:
            return ssim_map.mean()

        return ssim_map.mean(1).mean(1).mean(1)

    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                              missing_keys, unexpected_keys, error_msgs):
        return


@METRICS.register_module(
    group_key=default_group, module_name=Metrics.image_inpainting_metric)
class ImageInpaintingMetric(Metric):
    """The metric computation class for the image inpainting task."""

    def __init__(self):
        self.preds = []
        self.targets = []
        self.SSIM = SSIM(window_size=11, size_average=False).eval()
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.FID = FIDScore().to(device)

    def add(self, outputs: Dict, inputs: Dict):
        pred = outputs['inpainted']
        target = inputs['image']
        self.preds.append(torch_nested_detach(pred))
        self.targets.append(torch_nested_detach(target))

    def evaluate(self):
        ssim_list = []
        for (pred, target) in zip(self.preds, self.targets):
            ssim_list.append(self.SSIM(pred, target))
            self.FID(pred, target)
        ssim_list = torch_nested_numpify(ssim_list)
        fid = self.FID.get_value()
        return {MetricKeys.SSIM: np.mean(ssim_list), MetricKeys.FID: fid}
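A tiny check of the SSIM module on its own (values illustrative; identical inputs must score 1, which makes this a handy regression probe — exercising FIDScore meaningfully needs many samples, since covariance estimates from a handful of activations are degenerate):

    import torch
    from modelscope.metrics.image_inpainting_metric import SSIM

    ssim = SSIM(window_size=11, size_average=False).eval()
    a = torch.rand(2, 3, 64, 64)
    print(ssim(a, a.clone()))  # ≈ tensor([1., 1.]) — identical images score 1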
@@ -1,3 +1,6 @@
# Part of the implementation is borrowed and modified from PGL-SUM,
# publicly available at https://github.com/e-apostolidis/PGL-SUM

from typing import Dict

import numpy as np
@@ -1,3 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import os
from typing import Any, Dict
@@ -1,15 +1,14 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import os
-from typing import Dict
-
-import torch
+from typing import Dict, Optional

from modelscope.metainfo import Models
from modelscope.models import TorchModel
from modelscope.models.base import Tensor
from modelscope.models.builder import MODELS
-from modelscope.utils.constant import ModelFile, Tasks
+from modelscope.utils.audio.audio_utils import update_conf
+from modelscope.utils.constant import Tasks
from .fsmn_sele_v2 import FSMNSeleNetV2
@@ -20,48 +19,38 @@ class FSMNSeleNetV2Decorator(TorchModel):

    MODEL_TXT = 'model.txt'
    SC_CONFIG = 'sound_connect.conf'
-    SC_CONF_ITEM_KWS_MODEL = '${kws_model}'

-    def __init__(self, model_dir: str, *args, **kwargs):
+    def __init__(self,
+                 model_dir: str,
+                 training: Optional[bool] = False,
+                 *args,
+                 **kwargs):
        """Initialize the DFSMN model from the `model_dir` path.

        Args:
            model_dir (str): the model path.
        """
        super().__init__(model_dir, *args, **kwargs)
-        sc_config_file = os.path.join(model_dir, self.SC_CONFIG)
-        model_txt_file = os.path.join(model_dir, self.MODEL_TXT)
-        model_bin_file = os.path.join(model_dir,
-                                      ModelFile.TORCH_MODEL_BIN_FILE)
-        self._model = None
-        if os.path.exists(model_bin_file):
-            kwargs.pop('device')
-            self._model = FSMNSeleNetV2(*args, **kwargs)
-            checkpoint = torch.load(model_bin_file)
-            self._model.load_state_dict(checkpoint, strict=False)
-
-        self._sc = None
-        if os.path.exists(model_txt_file):
-            with open(sc_config_file) as f:
-                lines = f.readlines()
-            with open(sc_config_file, 'w') as f:
-                for line in lines:
-                    if self.SC_CONF_ITEM_KWS_MODEL in line:
-                        line = line.replace(self.SC_CONF_ITEM_KWS_MODEL,
-                                            model_txt_file)
-                    f.write(line)
-            import py_sound_connect
-            self._sc = py_sound_connect.SoundConnect(sc_config_file)
-            self.size_in = self._sc.bytesPerBlockIn()
-            self.size_out = self._sc.bytesPerBlockOut()
-
-        if self._model is None and self._sc is None:
-            raise Exception(
-                f'Invalid model directory! Neither {model_txt_file} nor {model_bin_file} exists.'
-            )
+        if training:
+            self.model = FSMNSeleNetV2(*args, **kwargs)
+        else:
+            sc_config_file = os.path.join(model_dir, self.SC_CONFIG)
+            model_txt_file = os.path.join(model_dir, self.MODEL_TXT)
+            self._sc = None
+            if os.path.exists(model_txt_file):
+                conf_dict = dict(mode=56542, kws_model=model_txt_file)
+                update_conf(sc_config_file, sc_config_file, conf_dict)
+                import py_sound_connect
+                self._sc = py_sound_connect.SoundConnect(sc_config_file)
+                self.size_in = self._sc.bytesPerBlockIn()
+                self.size_out = self._sc.bytesPerBlockOut()
+            else:
+                raise Exception(
+                    f'Invalid model directory! Failed to load model file: {model_txt_file}.'
+                )

    def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
-        ...
+        return self.model.forward(input)

    def forward_decode(self, data: bytes):
        result = {'pcm': self._sc.process(data, self.size_out)}
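For context, the refactor replaces the old inline '${kws_model}' string substitution with update_conf. A hypothetical illustration of the substitution the old code performed and that the helper now presumably encapsulates (this is not the library implementation, whose exact behavior lives in modelscope.utils.audio.audio_utils):

    # Hypothetical sketch of ${key}-style placeholder rendering.
    conf_dict = dict(mode=56542, kws_model='/path/model.txt')

    def render_placeholders(text: str, values: dict) -> str:
        for key, value in values.items():
            text = text.replace('${%s}' % key, str(value))
        return text

    print(render_placeholders('kws_model = ${kws_model}', conf_dict))
    # kws_model = /path/model.txt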
@@ -1,3 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import os
from typing import Any, Dict
modelscope/models/audio/tts/models/datasets/__init__.py (mode changed: Executable file → Normal file, 0 lines changed)
@@ -4,14 +4,16 @@
from . import (action_recognition, animal_recognition, body_2d_keypoints,
               body_3d_keypoints, cartoon, cmdssl_video_embedding,
               crowd_counting, face_2d_keypoints, face_detection,
-               face_generation, image_classification, image_color_enhance,
-               image_colorization, image_denoise, image_instance_segmentation,
+               face_generation, human_wholebody_keypoint, image_classification,
+               image_color_enhance, image_colorization, image_denoise,
+               image_inpainting, image_instance_segmentation,
               image_panoptic_segmentation, image_portrait_enhancement,
               image_reid_person, image_semantic_segmentation,
               image_to_image_generation, image_to_image_translation,
               movie_scene_segmentation, object_detection,
               product_retrieval_embedding, realtime_object_detection,
-               salient_detection, shop_segmentation, super_resolution,
+               referring_video_object_segmentation, salient_detection,
+               shop_segmentation, super_resolution,
               video_single_object_tracking, video_summarization, virual_tryon)

# yapf: enable
@@ -1,3 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import os
from typing import Any, Dict, Optional, Union
@@ -1,10 +1,10 @@
-# ------------------------------------------------------------------------------
-# Copyright (c) Microsoft
-# Licensed under the MIT License.
-# Written by Bin Xiao (Bin.Xiao@microsoft.com)
-# Modified by Ke Sun (sunk@mail.ustc.edu.cn)
-# https://github.com/HRNet/HRNet-Image-Classification/blob/master/lib/models/cls_hrnet.py
-# ------------------------------------------------------------------------------
+"""
+Copyright (c) Microsoft
+Licensed under the MIT License.
+Written by Bin Xiao (Bin.Xiao@microsoft.com)
+Modified by Ke Sun (sunk@mail.ustc.edu.cn)
+https://github.com/HRNet/HRNet-Image-Classification/blob/master/lib/models/cls_hrnet.py
+"""

import functools
import logging
@@ -8,12 +8,14 @@ if TYPE_CHECKING:
    from .mtcnn import MtcnnFaceDetector
    from .retinaface import RetinaFaceDetection
    from .ulfd_slim import UlfdFaceDetector
+    from .scrfd import ScrfdDetect
else:
    _import_structure = {
        'ulfd_slim': ['UlfdFaceDetector'],
        'retinaface': ['RetinaFaceDetection'],
        'mtcnn': ['MtcnnFaceDetector'],
-        'mogface': ['MogFaceDetector']
+        'mogface': ['MogFaceDetector'],
+        'scrfd': ['ScrfdDetect']
    }

    import sys
@@ -1,189 +0,0 @@
"""
The implementation here is modified based on insightface, originally MIT license and publicly available at
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/transforms.py
"""
import numpy as np
from mmdet.datasets.builder import PIPELINES
from numpy import random


@PIPELINES.register_module()
class RandomSquareCrop(object):
    """Random crop the image & bboxes, the cropped patches have minimum IoU
    requirement with original image & bboxes, the IoU threshold is randomly
    selected from min_ious.

    Args:
        min_ious (tuple): minimum IoU threshold for all intersections with
            bounding boxes
        min_crop_size (float): minimum crop's size (i.e. h,w := a*h, a*w,
            where a >= min_crop_size).

    Note:
        The keys for bboxes, labels and masks should be paired. That is, \
        `gt_bboxes` corresponds to `gt_labels` and `gt_masks`, and \
        `gt_bboxes_ignore` to `gt_labels_ignore` and `gt_masks_ignore`.
    """

    def __init__(self,
                 crop_ratio_range=None,
                 crop_choice=None,
                 bbox_clip_border=True):

        self.crop_ratio_range = crop_ratio_range
        self.crop_choice = crop_choice
        self.bbox_clip_border = bbox_clip_border

        assert (self.crop_ratio_range is None) ^ (self.crop_choice is None)
        if self.crop_ratio_range is not None:
            self.crop_ratio_min, self.crop_ratio_max = self.crop_ratio_range

        self.bbox2label = {
            'gt_bboxes': 'gt_labels',
            'gt_bboxes_ignore': 'gt_labels_ignore'
        }
        self.bbox2mask = {
            'gt_bboxes': 'gt_masks',
            'gt_bboxes_ignore': 'gt_masks_ignore'
        }

    def __call__(self, results):
        """Call function to crop images and bounding boxes with minimum IoU
        constraint.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Result dict with images and bounding boxes cropped, \
            'img_shape' key is updated.
        """

        if 'img_fields' in results:
            assert results['img_fields'] == ['img'], \
                'Only single img_fields is allowed'
        img = results['img']
        assert 'bbox_fields' in results
        assert 'gt_bboxes' in results
        boxes = results['gt_bboxes']
        h, w, c = img.shape
        scale_retry = 0
        if self.crop_ratio_range is not None:
            max_scale = self.crop_ratio_max
        else:
            max_scale = np.amax(self.crop_choice)
        while True:
            scale_retry += 1

            if scale_retry == 1 or max_scale > 1.0:
                if self.crop_ratio_range is not None:
                    scale = np.random.uniform(self.crop_ratio_min,
                                              self.crop_ratio_max)
                elif self.crop_choice is not None:
                    scale = np.random.choice(self.crop_choice)
            else:
                scale = scale * 1.2

            for i in range(250):
                short_side = min(w, h)
                cw = int(scale * short_side)
                ch = cw

                # TODO +1
                if w == cw:
                    left = 0
                elif w > cw:
                    left = random.randint(0, w - cw)
                else:
                    left = random.randint(w - cw, 0)
                if h == ch:
                    top = 0
                elif h > ch:
                    top = random.randint(0, h - ch)
                else:
                    top = random.randint(h - ch, 0)

                patch = np.array(
                    (int(left), int(top), int(left + cw), int(top + ch)),
                    dtype=np.int)

                # center of boxes should be inside the crop img
                # only adjust boxes and instance masks when the gt is not empty
                # adjust boxes
                def is_center_of_bboxes_in_patch(boxes, patch):
                    # TODO >=
                    center = (boxes[:, :2] + boxes[:, 2:]) / 2
                    mask = \
                        ((center[:, 0] > patch[0])
                         * (center[:, 1] > patch[1])
                         * (center[:, 0] < patch[2])
                         * (center[:, 1] < patch[3]))
                    return mask

                mask = is_center_of_bboxes_in_patch(boxes, patch)
                if not mask.any():
                    continue
                for key in results.get('bbox_fields', []):
                    boxes = results[key].copy()
                    mask = is_center_of_bboxes_in_patch(boxes, patch)
                    boxes = boxes[mask]
                    if self.bbox_clip_border:
                        boxes[:, 2:] = boxes[:, 2:].clip(max=patch[2:])
                        boxes[:, :2] = boxes[:, :2].clip(min=patch[:2])
                    boxes -= np.tile(patch[:2], 2)

                    results[key] = boxes
                    # labels
                    label_key = self.bbox2label.get(key)
                    if label_key in results:
                        results[label_key] = results[label_key][mask]

                    # keypoints field
                    if key == 'gt_bboxes':
                        for kps_key in results.get('keypoints_fields', []):
                            keypointss = results[kps_key].copy()
                            keypointss = keypointss[mask, :, :]
                            if self.bbox_clip_border:
                                keypointss[:, :, :2] = \
                                    keypointss[:, :, :2].clip(max=patch[2:])
                                keypointss[:, :, :2] = \
                                    keypointss[:, :, :2].clip(min=patch[:2])
                            keypointss[:, :, 0] -= patch[0]
                            keypointss[:, :, 1] -= patch[1]
                            results[kps_key] = keypointss

                    # mask fields
                    mask_key = self.bbox2mask.get(key)
                    if mask_key in results:
                        results[mask_key] = \
                            results[mask_key][mask.nonzero()[0]].crop(patch)

                # adjust the img no matter whether the gt is empty before crop
                rimg = np.ones((ch, cw, 3), dtype=img.dtype) * 128
                patch_from = patch.copy()
                patch_from[0] = max(0, patch_from[0])
                patch_from[1] = max(0, patch_from[1])
                patch_from[2] = min(img.shape[1], patch_from[2])
                patch_from[3] = min(img.shape[0], patch_from[3])
                patch_to = patch.copy()
                patch_to[0] = max(0, patch_to[0] * -1)
                patch_to[1] = max(0, patch_to[1] * -1)
                patch_to[2] = patch_to[0] + (patch_from[2] - patch_from[0])
                patch_to[3] = patch_to[1] + (patch_from[3] - patch_from[1])
                rimg[patch_to[1]:patch_to[3],
                     patch_to[0]:patch_to[2], :] = img[
                         patch_from[1]:patch_from[3],
                         patch_from[0]:patch_from[2], :]
                img = rimg
                results['img'] = img
                results['img_shape'] = img.shape

                return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(min_ious={self.min_iou}, '
        repr_str += f'crop_size={self.crop_size})'
        return repr_str
@@ -1,3 +1,5 @@
+# The implementation is based on MogFace, available at
+# https://github.com/damo-cv/MogFace
 import os

 import cv2
2	modelscope/models/cv/face_detection/scrfd/__init__.py	Normal file
@@ -0,0 +1,2 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .scrfd_detect import ScrfdDetect
@@ -6,7 +6,7 @@ import numpy as np
 import torch


-def bbox2result(bboxes, labels, num_classes, kps=None):
+def bbox2result(bboxes, labels, num_classes, kps=None, num_kps=5):
     """Convert detection results to a list of numpy arrays.

     Args:
@@ -17,7 +17,7 @@ def bbox2result(bboxes, labels, num_classes, kps=None):
     Returns:
         list(ndarray): bbox results of each class
     """
-    bbox_len = 5 if kps is None else 5 + 10  # if has kps, add 10 kps into bbox
+    bbox_len = 5 if kps is None else 5 + num_kps * 2  # if has kps, add num_kps*2 into bbox
     if bboxes.shape[0] == 0:
         return [
             np.zeros((0, bbox_len), dtype=np.float32)
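To make the new `num_kps` plumbing concrete, a small sketch with dummy tensors. It assumes, as `bbox_len` suggests, that the remainder of the function concatenates the keypoints onto each bbox row:

import torch

bboxes = torch.tensor([[10., 20., 50., 60., 0.9]])  # x1, y1, x2, y2, score
labels = torch.tensor([0])
kps = torch.rand(1, 8)  # 4 keypoints * 2 coordinates

out = bbox2result(bboxes, labels, num_classes=1, kps=kps, num_kps=4)
# one array per class; each row is [x1, y1, x2, y2, score, kx1, ky1, ..., kx4, ky4]
assert out[0].shape == (1, 5 + 4 * 2)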
@@ -17,6 +17,7 @@ def multiclass_nms(multi_bboxes,

     Args:
         multi_bboxes (Tensor): shape (n, #class*4) or (n, 4)
+        multi_kps (Tensor): shape (n, #class*num_kps*2) or (n, num_kps*2)
         multi_scores (Tensor): shape (n, #class), where the last column
             contains scores of the background class, but this will be ignored.
         score_thr (float): bbox threshold, bboxes with scores lower than it
@@ -36,16 +37,18 @@ def multiclass_nms(multi_bboxes,
     num_classes = multi_scores.size(1) - 1
     # exclude background category
     kps = None
+    if multi_kps is not None:
+        num_kps = int((multi_kps.shape[1] / num_classes) / 2)
     if multi_bboxes.shape[1] > 4:
         bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4)
         if multi_kps is not None:
-            kps = multi_kps.view(multi_scores.size(0), -1, 10)
+            kps = multi_kps.view(multi_scores.size(0), -1, num_kps * 2)
     else:
         bboxes = multi_bboxes[:, None].expand(
             multi_scores.size(0), num_classes, 4)
         if multi_kps is not None:
             kps = multi_kps[:, None].expand(
-                multi_scores.size(0), num_classes, 10)
+                multi_scores.size(0), num_classes, num_kps * 2)

     scores = multi_scores[:, :-1]
     if score_factors is not None:
@@ -56,7 +59,7 @@ def multiclass_nms(multi_bboxes,

     bboxes = bboxes.reshape(-1, 4)
     if kps is not None:
-        kps = kps.reshape(-1, 10)
+        kps = kps.reshape(-1, num_kps * 2)
     scores = scores.reshape(-1)
     labels = labels.reshape(-1)
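The shape bookkeeping behind the new `num_kps` inference, as a runnable sketch:

import torch

n, num_classes, num_kps = 100, 1, 4
multi_kps = torch.rand(n, num_classes * num_kps * 2)

# num_kps is recovered from the flattened keypoint tensor:
assert int((multi_kps.shape[1] / num_classes) / 2) == num_kps

kps = multi_kps.view(n, -1, num_kps * 2)  # (100, 1, 8), one slot per class
kps = kps.reshape(-1, num_kps * 2)        # (100, 8), flattened before NMS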
@@ -2,6 +2,12 @@
 The implementation here is modified based on insightface, originally MIT license and publicly available at
 https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines
 """
+from .auto_augment import RotateV2
+from .formating import DefaultFormatBundleV2
+from .loading import LoadAnnotationsV2
 from .transforms import RandomSquareCrop

-__all__ = ['RandomSquareCrop']
+__all__ = [
+    'RandomSquareCrop', 'LoadAnnotationsV2', 'RotateV2',
+    'DefaultFormatBundleV2'
+]
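As a usage sketch, the registered transforms exported here would typically be chained in an mmdet-style pipeline config like this (field values are illustrative, not taken from the repository):

train_pipeline = [
    dict(type='LoadAnnotationsV2', with_bbox=True, with_keypoints=True),
    dict(type='RotateV2', level=5, prob=0.5, max_rotate_angle=90),
    dict(type='RandomSquareCrop', crop_choice=[0.3, 0.45, 0.6, 0.8, 1.0]),
    dict(type='DefaultFormatBundleV2'),
]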
@@ -0,0 +1,271 @@
"""
The implementation here is modified based on insightface, originally MIT license and publicly available at
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/auto_augment.py
"""
import copy

import cv2
import mmcv
import numpy as np
from mmdet.datasets.builder import PIPELINES

_MAX_LEVEL = 10


def level_to_value(level, max_value):
    """Map from level to values based on max_value."""
    return (level / _MAX_LEVEL) * max_value


def random_negative(value, random_negative_prob):
    """Randomly negate value based on random_negative_prob."""
    return -value if np.random.rand() < random_negative_prob else value


def bbox2fields():
    """The key correspondence from bboxes to labels, masks and
    segmentations."""
    bbox2label = {
        'gt_bboxes': 'gt_labels',
        'gt_bboxes_ignore': 'gt_labels_ignore'
    }
    bbox2mask = {
        'gt_bboxes': 'gt_masks',
        'gt_bboxes_ignore': 'gt_masks_ignore'
    }
    bbox2seg = {
        'gt_bboxes': 'gt_semantic_seg',
    }
    return bbox2label, bbox2mask, bbox2seg


@PIPELINES.register_module()
class RotateV2(object):
    """Apply Rotate Transformation to image (and its corresponding bbox, mask,
    segmentation).

    Args:
        level (int | float): The level should be in range (0,_MAX_LEVEL].
        scale (int | float): Isotropic scale factor. Same in
            ``mmcv.imrotate``.
        center (int | float | tuple[float]): Center point (w, h) of the
            rotation in the source image. If None, the center of the
            image will be used. Same in ``mmcv.imrotate``.
        img_fill_val (int | float | tuple): The fill value for image border.
            If float, the same value will be used for all the three
            channels of image. If tuple, it should be 3 elements (e.g.
            equals the number of channels for image).
        seg_ignore_label (int): The fill value used for segmentation map.
            Note this value must equal ``ignore_label`` in ``semantic_head``
            of the corresponding config. Default 255.
        prob (float): The probability for performing the transformation,
            which should be in range 0 to 1.
        max_rotate_angle (int | float): The maximum angle for the rotate
            transformation.
        random_negative_prob (float): The probability that turns the
            offset negative.
    """

    def __init__(self,
                 level,
                 scale=1,
                 center=None,
                 img_fill_val=128,
                 seg_ignore_label=255,
                 prob=0.5,
                 max_rotate_angle=30,
                 random_negative_prob=0.5):
        assert isinstance(level, (int, float)), \
            f'The level must be type int or float. got {type(level)}.'
        assert 0 <= level <= _MAX_LEVEL, \
            f'The level should be in range (0,{_MAX_LEVEL}]. got {level}.'
        assert isinstance(scale, (int, float)), \
            f'The scale must be type int or float. got type {type(scale)}.'
        if isinstance(center, (int, float)):
            center = (center, center)
        elif isinstance(center, tuple):
            assert len(center) == 2, 'center with type tuple must have '\
                f'2 elements. got {len(center)} elements.'
        else:
            assert center is None, 'center must be None or type int, '\
                f'float or tuple, got type {type(center)}.'
        if isinstance(img_fill_val, (float, int)):
            img_fill_val = tuple([float(img_fill_val)] * 3)
        elif isinstance(img_fill_val, tuple):
            assert len(img_fill_val) == 3, 'img_fill_val as tuple must '\
                f'have 3 elements. got {len(img_fill_val)}.'
            img_fill_val = tuple([float(val) for val in img_fill_val])
        else:
            raise ValueError(
                'img_fill_val must be float or tuple with 3 elements.')
        assert np.all([0 <= val <= 255 for val in img_fill_val]), \
            'all elements of img_fill_val should between range [0,255]. '\
            f'got {img_fill_val}.'
        assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. '\
            f'got {prob}.'
        assert isinstance(max_rotate_angle, (int, float)), 'max_rotate_angle '\
            f'should be type int or float. got type {type(max_rotate_angle)}.'
        self.level = level
        self.scale = scale
        # Rotation angle in degrees. Positive values mean
        # clockwise rotation.
        self.angle = level_to_value(level, max_rotate_angle)
        self.center = center
        self.img_fill_val = img_fill_val
        self.seg_ignore_label = seg_ignore_label
        self.prob = prob
        self.max_rotate_angle = max_rotate_angle
        self.random_negative_prob = random_negative_prob

    def _rotate_img(self, results, angle, center=None, scale=1.0):
        """Rotate the image.

        Args:
            results (dict): Result dict from loading pipeline.
            angle (float): Rotation angle in degrees, positive values
                mean clockwise rotation. Same in ``mmcv.imrotate``.
            center (tuple[float], optional): Center point (w, h) of the
                rotation. Same in ``mmcv.imrotate``.
            scale (int | float): Isotropic scale factor. Same in
                ``mmcv.imrotate``.
        """
        for key in results.get('img_fields', ['img']):
            img = results[key].copy()
            img_rotated = mmcv.imrotate(
                img, angle, center, scale, border_value=self.img_fill_val)
            results[key] = img_rotated.astype(img.dtype)
            results['img_shape'] = results[key].shape

    def _rotate_bboxes(self, results, rotate_matrix):
        """Rotate the bboxes."""
        h, w, c = results['img_shape']
        for key in results.get('bbox_fields', []):
            min_x, min_y, max_x, max_y = np.split(
                results[key], results[key].shape[-1], axis=-1)
            coordinates = np.stack([[min_x, min_y], [max_x, min_y],
                                    [min_x, max_y],
                                    [max_x, max_y]])  # [4, 2, nb_bbox, 1]
            # pad 1 to convert from format [x, y] to homogeneous
            # coordinates format [x, y, 1]
            coordinates = np.concatenate(
                (coordinates,
                 np.ones((4, 1, coordinates.shape[2], 1), coordinates.dtype)),
                axis=1)  # [4, 3, nb_bbox, 1]
            coordinates = coordinates.transpose(
                (2, 0, 1, 3))  # [nb_bbox, 4, 3, 1]
            rotated_coords = np.matmul(rotate_matrix,
                                       coordinates)  # [nb_bbox, 4, 2, 1]
            rotated_coords = rotated_coords[..., 0]  # [nb_bbox, 4, 2]
            min_x, min_y = np.min(
                rotated_coords[:, :, 0], axis=1), np.min(
                    rotated_coords[:, :, 1], axis=1)
            max_x, max_y = np.max(
                rotated_coords[:, :, 0], axis=1), np.max(
                    rotated_coords[:, :, 1], axis=1)
            results[key] = np.stack([min_x, min_y, max_x, max_y],
                                    axis=-1).astype(results[key].dtype)

    def _rotate_keypoints90(self, results, angle):
        """Rotate the keypoints, only valid when angle is in [-90, 90, -180, 180]."""
        if angle not in [-90, 90, 180, -180
                         ] or self.scale != 1 or self.center is not None:
            return
        for key in results.get('keypoints_fields', []):
            k = results[key]
            if angle == 90:
                w, h, c = results['img'].shape
                new = np.stack([h - k[..., 1], k[..., 0], k[..., 2]], axis=-1)
            elif angle == -90:
                w, h, c = results['img'].shape
                new = np.stack([k[..., 1], w - k[..., 0], k[..., 2]], axis=-1)
            else:
                h, w, c = results['img'].shape
                new = np.stack([w - k[..., 0], h - k[..., 1], k[..., 2]],
                               axis=-1)
            # a kps is invalid if its third value is -1
            kps_invalid = new[..., -1][:, -1] == -1
            new[kps_invalid] = np.zeros(new.shape[1:]) - 1
            results[key] = new

    def _rotate_masks(self,
                      results,
                      angle,
                      center=None,
                      scale=1.0,
                      fill_val=0):
        """Rotate the masks."""
        h, w, c = results['img_shape']
        for key in results.get('mask_fields', []):
            masks = results[key]
            results[key] = masks.rotate((h, w), angle, center, scale, fill_val)

    def _rotate_seg(self,
                    results,
                    angle,
                    center=None,
                    scale=1.0,
                    fill_val=255):
        """Rotate the segmentation map."""
        for key in results.get('seg_fields', []):
            seg = results[key].copy()
            results[key] = mmcv.imrotate(
                seg, angle, center, scale,
                border_value=fill_val).astype(seg.dtype)

    def _filter_invalid(self, results, min_bbox_size=0):
        """Filter bboxes and corresponding masks too small after rotate
        augmentation."""
        bbox2label, bbox2mask, _ = bbox2fields()
        for key in results.get('bbox_fields', []):
            bbox_w = results[key][:, 2] - results[key][:, 0]
            bbox_h = results[key][:, 3] - results[key][:, 1]
            valid_inds = (bbox_w > min_bbox_size) & (bbox_h > min_bbox_size)
            valid_inds = np.nonzero(valid_inds)[0]
            results[key] = results[key][valid_inds]
            # label fields. e.g. gt_labels and gt_labels_ignore
            label_key = bbox2label.get(key)
            if label_key in results:
                results[label_key] = results[label_key][valid_inds]
            # mask fields, e.g. gt_masks and gt_masks_ignore
            mask_key = bbox2mask.get(key)
            if mask_key in results:
                results[mask_key] = results[mask_key][valid_inds]

    def __call__(self, results):
        """Call function to rotate images, bounding boxes, masks and semantic
        segmentation maps.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Rotated results.
        """
        if np.random.rand() > self.prob:
            return results
        h, w = results['img'].shape[:2]
        center = self.center
        if center is None:
            center = ((w - 1) * 0.5, (h - 1) * 0.5)
        angle = random_negative(self.angle, self.random_negative_prob)
        self._rotate_img(results, angle, center, self.scale)
        rotate_matrix = cv2.getRotationMatrix2D(center, -angle, self.scale)
        self._rotate_bboxes(results, rotate_matrix)
        self._rotate_keypoints90(results, angle)
        self._rotate_masks(results, angle, center, self.scale, fill_val=0)
        self._rotate_seg(
            results, angle, center, self.scale, fill_val=self.seg_ignore_label)
        self._filter_invalid(results)
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(level={self.level}, '
        repr_str += f'scale={self.scale}, '
        repr_str += f'center={self.center}, '
        repr_str += f'img_fill_val={self.img_fill_val}, '
        repr_str += f'seg_ignore_label={self.seg_ignore_label}, '
        repr_str += f'prob={self.prob}, '
        repr_str += f'max_rotate_angle={self.max_rotate_angle}, '
        repr_str += f'random_negative_prob={self.random_negative_prob})'
        return repr_str
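A quick sanity check of the level-to-angle mapping defined above (values chosen for illustration):

rotate = RotateV2(level=5, max_rotate_angle=30, prob=1.0)
# angle = level / _MAX_LEVEL * max_rotate_angle = 5 / 10 * 30
assert rotate.angle == 15.0  # the sign is then flipped with random_negative_prob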
@@ -0,0 +1,113 @@
"""
The implementation here is modified based on insightface, originally MIT license and publicly available at
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/formating.py
"""
from collections.abc import Sequence

import mmcv
import numpy as np
import torch
from mmcv.parallel import DataContainer as DC
from mmdet.datasets.builder import PIPELINES


def to_tensor(data):
    """Convert objects of various python types to :obj:`torch.Tensor`.

    Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
    :class:`Sequence`, :class:`int` and :class:`float`.

    Args:
        data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to
            be converted.
    """

    if isinstance(data, torch.Tensor):
        return data
    elif isinstance(data, np.ndarray):
        return torch.from_numpy(data)
    elif isinstance(data, Sequence) and not mmcv.is_str(data):
        return torch.tensor(data)
    elif isinstance(data, int):
        return torch.LongTensor([data])
    elif isinstance(data, float):
        return torch.FloatTensor([data])
    else:
        raise TypeError(f'type {type(data)} cannot be converted to tensor.')
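# Doctest-style illustration of to_tensor's dispatch (added for this write-up;
# values are arbitrary):
#
#   >>> to_tensor(np.ones((2, 2))).shape
#   torch.Size([2, 2])
#   >>> to_tensor(3)
#   tensor([3])
#   >>> to_tensor(0.5)
#   tensor([0.5000])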
@PIPELINES.register_module()
class DefaultFormatBundleV2(object):
    """Default formatting bundle.

    It simplifies the pipeline of formatting common fields, including "img",
    "proposals", "gt_bboxes", "gt_labels", "gt_masks" and "gt_semantic_seg".
    These fields are formatted as follows.

    - img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True)
    - proposals: (1)to tensor, (2)to DataContainer
    - gt_bboxes: (1)to tensor, (2)to DataContainer
    - gt_bboxes_ignore: (1)to tensor, (2)to DataContainer
    - gt_labels: (1)to tensor, (2)to DataContainer
    - gt_masks: (1)to tensor, (2)to DataContainer (cpu_only=True)
    - gt_semantic_seg: (1)unsqueeze dim-0 (2)to tensor, \
      (3)to DataContainer (stack=True)
    """

    def __call__(self, results):
        """Call function to transform and format common fields in results.

        Args:
            results (dict): Result dict contains the data to convert.

        Returns:
            dict: The result dict contains the data that is formatted with \
            default bundle.
        """

        if 'img' in results:
            img = results['img']
            # add default meta keys
            results = self._add_default_meta_keys(results)
            if len(img.shape) < 3:
                img = np.expand_dims(img, -1)
            img = np.ascontiguousarray(img.transpose(2, 0, 1))
            results['img'] = DC(to_tensor(img), stack=True)
        for key in [
                'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_keypointss',
                'gt_labels'
        ]:
            if key not in results:
                continue
            results[key] = DC(to_tensor(results[key]))
        if 'gt_masks' in results:
            results['gt_masks'] = DC(results['gt_masks'], cpu_only=True)
        if 'gt_semantic_seg' in results:
            results['gt_semantic_seg'] = DC(
                to_tensor(results['gt_semantic_seg'][None, ...]), stack=True)
        return results

    def _add_default_meta_keys(self, results):
        """Add default meta keys.

        We set default meta keys including `pad_shape`, `scale_factor` and
        `img_norm_cfg` to avoid the case where no `Resize`, `Normalize` and
        `Pad` are implemented during the whole pipeline.

        Args:
            results (dict): Result dict contains the data to convert.

        Returns:
            results (dict): Updated result dict contains the data to convert.
        """
        img = results['img']
        results.setdefault('pad_shape', img.shape)
        results.setdefault('scale_factor', 1.0)
        num_channels = 1 if len(img.shape) < 3 else img.shape[2]
        results.setdefault(
            'img_norm_cfg',
            dict(
                mean=np.zeros(num_channels, dtype=np.float32),
                std=np.ones(num_channels, dtype=np.float32),
                to_rgb=False))
        return results

    def __repr__(self):
        return self.__class__.__name__
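A minimal usage sketch of the bundle on a hand-built result dict (dummy data, not from the repository's tests):

import numpy as np

bundle = DefaultFormatBundleV2()
results = dict(
    img=np.zeros((640, 640, 3), dtype=np.uint8),
    gt_bboxes=np.array([[10., 10., 50., 50.]], dtype=np.float32),
    gt_labels=np.array([0], dtype=np.int64))
results = bundle(results)
# results['img'] is now a DataContainer around a (3, 640, 640) tensor, and
# 'pad_shape', 'scale_factor' and 'img_norm_cfg' hold default values.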
@@ -0,0 +1,225 @@
"""
The implementation here is modified based on insightface, originally MIT license and publicly available at
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/loading.py
"""
import os.path as osp

import numpy as np
import pycocotools.mask as maskUtils
from mmdet.core import BitmapMasks, PolygonMasks
from mmdet.datasets.builder import PIPELINES


@PIPELINES.register_module()
class LoadAnnotationsV2(object):
    """Load multiple types of annotations.

    Args:
        with_bbox (bool): Whether to parse and load the bbox annotation.
            Default: True.
        with_label (bool): Whether to parse and load the label annotation.
            Default: True.
        with_keypoints (bool): Whether to parse and load the keypoints annotation.
            Default: False.
        with_mask (bool): Whether to parse and load the mask annotation.
            Default: False.
        with_seg (bool): Whether to parse and load the semantic segmentation
            annotation. Default: False.
        poly2mask (bool): Whether to convert the instance masks from polygons
            to bitmaps. Default: True.
        file_client_args (dict): Arguments to instantiate a FileClient.
            See :class:`mmcv.fileio.FileClient` for details.
            Defaults to ``dict(backend='disk')``.
    """

    def __init__(self,
                 with_bbox=True,
                 with_label=True,
                 with_keypoints=False,
                 with_mask=False,
                 with_seg=False,
                 poly2mask=True,
                 file_client_args=dict(backend='disk')):
        self.with_bbox = with_bbox
        self.with_label = with_label
        self.with_keypoints = with_keypoints
        self.with_mask = with_mask
        self.with_seg = with_seg
        self.poly2mask = poly2mask
        self.file_client_args = file_client_args.copy()
        self.file_client = None

    def _load_bboxes(self, results):
        """Private function to load bounding box annotations.

        Args:
            results (dict): Result dict from :obj:`mmdet.CustomDataset`.

        Returns:
            dict: The dict contains loaded bounding box annotations.
        """

        ann_info = results['ann_info']
        results['gt_bboxes'] = ann_info['bboxes'].copy()

        gt_bboxes_ignore = ann_info.get('bboxes_ignore', None)
        if gt_bboxes_ignore is not None:
            results['gt_bboxes_ignore'] = gt_bboxes_ignore.copy()
            results['bbox_fields'].append('gt_bboxes_ignore')
        results['bbox_fields'].append('gt_bboxes')
        return results

    def _load_keypoints(self, results):
        """Private function to load keypoints annotations.

        Args:
            results (dict): Result dict from :obj:`mmdet.CustomDataset`.

        Returns:
            dict: The dict contains loaded keypoints annotations.
        """

        ann_info = results['ann_info']
        results['gt_keypointss'] = ann_info['keypointss'].copy()

        results['keypoints_fields'] = ['gt_keypointss']
        return results

    def _load_labels(self, results):
        """Private function to load label annotations.

        Args:
            results (dict): Result dict from :obj:`mmdet.CustomDataset`.

        Returns:
            dict: The dict contains loaded label annotations.
        """

        results['gt_labels'] = results['ann_info']['labels'].copy()
        return results

    def _poly2mask(self, mask_ann, img_h, img_w):
        """Private function to convert masks represented with polygon to
        bitmaps.

        Args:
            mask_ann (list | dict): Polygon mask annotation input.
            img_h (int): The height of output mask.
            img_w (int): The width of output mask.

        Returns:
            numpy.ndarray: The decoded bitmap mask of shape (img_h, img_w).
        """

        if isinstance(mask_ann, list):
            # polygon -- a single object might consist of multiple parts
            # we merge all parts into one mask rle code
            rles = maskUtils.frPyObjects(mask_ann, img_h, img_w)
            rle = maskUtils.merge(rles)
        elif isinstance(mask_ann['counts'], list):
            # uncompressed RLE
            rle = maskUtils.frPyObjects(mask_ann, img_h, img_w)
        else:
            # rle
            rle = mask_ann
        mask = maskUtils.decode(rle)
        return mask

    def process_polygons(self, polygons):
        """Convert polygons to list of ndarray and filter invalid polygons.

        Args:
            polygons (list[list]): Polygons of one instance.

        Returns:
            list[numpy.ndarray]: Processed polygons.
        """

        polygons = [np.array(p) for p in polygons]
        valid_polygons = []
        for polygon in polygons:
            if len(polygon) % 2 == 0 and len(polygon) >= 6:
                valid_polygons.append(polygon)
        return valid_polygons

    def _load_masks(self, results):
        """Private function to load mask annotations.

        Args:
            results (dict): Result dict from :obj:`mmdet.CustomDataset`.

        Returns:
            dict: The dict contains loaded mask annotations.
                If ``self.poly2mask`` is set ``True``, `gt_mask` will contain
                :obj:`BitmapMasks`. Otherwise, :obj:`PolygonMasks` is used.
        """

        h, w = results['img_info']['height'], results['img_info']['width']
        gt_masks = results['ann_info']['masks']
        if self.poly2mask:
            gt_masks = BitmapMasks(
                [self._poly2mask(mask, h, w) for mask in gt_masks], h, w)
        else:
            gt_masks = PolygonMasks(
                [self.process_polygons(polygons) for polygons in gt_masks], h,
                w)
        results['gt_masks'] = gt_masks
        results['mask_fields'].append('gt_masks')
        return results

    def _load_semantic_seg(self, results):
        """Private function to load semantic segmentation annotations.

        Args:
            results (dict): Result dict from :obj:`dataset`.

        Returns:
            dict: The dict contains loaded semantic segmentation annotations.
        """
        import mmcv
        if self.file_client is None:
            self.file_client = mmcv.FileClient(**self.file_client_args)

        filename = osp.join(results['seg_prefix'],
                            results['ann_info']['seg_map'])
        img_bytes = self.file_client.get(filename)
        results['gt_semantic_seg'] = mmcv.imfrombytes(
            img_bytes, flag='unchanged').squeeze()
        results['seg_fields'].append('gt_semantic_seg')
        return results

    def __call__(self, results):
        """Call function to load multiple types of annotations.

        Args:
            results (dict): Result dict from :obj:`mmdet.CustomDataset`.

        Returns:
            dict: The dict contains loaded bounding box, label, mask and
                semantic segmentation annotations.
        """

        if self.with_bbox:
            results = self._load_bboxes(results)
            if results is None:
                return None
        if self.with_label:
            results = self._load_labels(results)
        if self.with_keypoints:
            results = self._load_keypoints(results)
        if self.with_mask:
            results = self._load_masks(results)
        if self.with_seg:
            results = self._load_semantic_seg(results)
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(with_bbox={self.with_bbox}, '
        repr_str += f'with_label={self.with_label}, '
        repr_str += f'with_keypoints={self.with_keypoints}, '
        repr_str += f'with_mask={self.with_mask}, '
        repr_str += f'with_seg={self.with_seg}, '
        repr_str += f'poly2mask={self.poly2mask}, '
        repr_str += f'file_client_args={self.file_client_args})'
        return repr_str
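A sketch of the loader on a hand-built annotation dict (dummy arrays; the `(x, y, weight)` keypoint layout follows the SCRFD convention used elsewhere in this diff):

import numpy as np

loader = LoadAnnotationsV2(with_bbox=True, with_keypoints=True)
results = dict(
    ann_info=dict(
        bboxes=np.array([[10., 10., 50., 50.]], dtype=np.float32),
        labels=np.array([0], dtype=np.int64),
        keypointss=np.zeros((1, 5, 3), dtype=np.float32)),  # 1 face, 5 kps
    bbox_fields=[])
results = loader(results)
assert results['bbox_fields'] == ['gt_bboxes']
assert results['keypoints_fields'] == ['gt_keypointss']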
@@ -0,0 +1,737 @@
"""
The implementation here is modified based on insightface, originally MIT license and publicly available at
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/transforms.py
"""
import mmcv
import numpy as np
from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps
from mmdet.datasets.builder import PIPELINES
from numpy import random


@PIPELINES.register_module()
class ResizeV2(object):
    """Resize images & bbox & mask & kps.

    This transform resizes the input image to some scale. Bboxes and masks are
    then resized with the same scale factor. If the input dict contains the key
    "scale", then the scale in the input dict is used, otherwise the specified
    scale in the init method is used. If the input dict contains the key
    "scale_factor" (if MultiScaleFlipAug does not give img_scale but
    scale_factor), the actual scale will be computed by image shape and
    scale_factor.

    `img_scale` can either be a tuple (single-scale) or a list of tuples
    (multi-scale). There are 3 multiscale modes:

    - ``ratio_range is not None``: randomly sample a ratio from the ratio \
      range and multiply it with the image scale.
    - ``ratio_range is None`` and ``multiscale_mode == "range"``: randomly \
      sample a scale from the multiscale range.
    - ``ratio_range is None`` and ``multiscale_mode == "value"``: randomly \
      sample a scale from multiple scales.

    Args:
        img_scale (tuple or list[tuple]): Image scales for resizing.
        multiscale_mode (str): Either "range" or "value".
        ratio_range (tuple[float]): (min_ratio, max_ratio)
        keep_ratio (bool): Whether to keep the aspect ratio when resizing the
            image.
        bbox_clip_border (bool, optional): Whether to clip the objects outside
            the border of the image. Defaults to True.
        backend (str): Image resize backend, choices are 'cv2' and 'pillow'.
            These two backends generate slightly different results. Defaults
            to 'cv2'.
        override (bool, optional): Whether to override `scale` and
            `scale_factor` so as to call resize twice. Default False. If True,
            after the first resizing, the existing `scale` and `scale_factor`
            will be ignored so the second resizing can be allowed.
            This option is a work-around for multiple times of resize in DETR.
            Defaults to False.
    """

    def __init__(self,
                 img_scale=None,
                 multiscale_mode='range',
                 ratio_range=None,
                 keep_ratio=True,
                 bbox_clip_border=True,
                 backend='cv2',
                 override=False):
        if img_scale is None:
            self.img_scale = None
        else:
            if isinstance(img_scale, list):
                self.img_scale = img_scale
            else:
                self.img_scale = [img_scale]
            assert mmcv.is_list_of(self.img_scale, tuple)

        if ratio_range is not None:
            # mode 1: given a scale and a range of image ratio
            assert len(self.img_scale) == 1
        else:
            # mode 2: given multiple scales or a range of scales
            assert multiscale_mode in ['value', 'range']

        self.backend = backend
        self.multiscale_mode = multiscale_mode
        self.ratio_range = ratio_range
        self.keep_ratio = keep_ratio
        # TODO: refactor the override option in Resize
        self.override = override
        self.bbox_clip_border = bbox_clip_border

    @staticmethod
    def random_select(img_scales):
        """Randomly select an img_scale from given candidates.

        Args:
            img_scales (list[tuple]): Image scales for selection.

        Returns:
            (tuple, int): Returns a tuple ``(img_scale, scale_idx)``, \
                where ``img_scale`` is the selected image scale and \
                ``scale_idx`` is the selected index in the given candidates.
        """

        assert mmcv.is_list_of(img_scales, tuple)
        scale_idx = np.random.randint(len(img_scales))
        img_scale = img_scales[scale_idx]
        return img_scale, scale_idx

    @staticmethod
    def random_sample(img_scales):
        """Randomly sample an img_scale when ``multiscale_mode=='range'``.

        Args:
            img_scales (list[tuple]): Image scale range for sampling.
                There must be two tuples in img_scales, which specify the lower
                and upper bound of image scales.

        Returns:
            (tuple, None): Returns a tuple ``(img_scale, None)``, where \
                ``img_scale`` is the sampled scale and None is just a placeholder \
                to be consistent with :func:`random_select`.
        """

        assert mmcv.is_list_of(img_scales, tuple) and len(img_scales) == 2
        img_scale_long = [max(s) for s in img_scales]
        img_scale_short = [min(s) for s in img_scales]
        long_edge = np.random.randint(
            min(img_scale_long),
            max(img_scale_long) + 1)
        short_edge = np.random.randint(
            min(img_scale_short),
            max(img_scale_short) + 1)
        img_scale = (long_edge, short_edge)
        return img_scale, None

    @staticmethod
    def random_sample_ratio(img_scale, ratio_range):
        """Randomly sample an img_scale when ``ratio_range`` is specified.

        A ratio will be randomly sampled from the range specified by
        ``ratio_range``. Then it would be multiplied with ``img_scale`` to
        generate the sampled scale.

        Args:
            img_scale (tuple): Image scale base to multiply with ratio.
            ratio_range (tuple[float]): The minimum and maximum ratio to scale
                the ``img_scale``.

        Returns:
            (tuple, None): Returns a tuple ``(scale, None)``, where \
                ``scale`` is the sampled ratio multiplied with ``img_scale`` and \
                None is just a placeholder to be consistent with \
                :func:`random_select`.
        """

        assert isinstance(img_scale, tuple) and len(img_scale) == 2
        min_ratio, max_ratio = ratio_range
        assert min_ratio <= max_ratio
        ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio
        scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio)
        return scale, None

    def _random_scale(self, results):
        """Randomly sample an img_scale according to ``ratio_range`` and
        ``multiscale_mode``.

        If ``ratio_range`` is specified, a ratio will be sampled and be
        multiplied with ``img_scale``.
        If multiple scales are specified by ``img_scale``, a scale will be
        sampled according to ``multiscale_mode``.
        Otherwise, a single scale will be used.

        Args:
            results (dict): Result dict from :obj:`dataset`.

        Returns:
            dict: Two new keys 'scale` and 'scale_idx` are added into \
                ``results``, which would be used by subsequent pipelines.
        """

        if self.ratio_range is not None:
            scale, scale_idx = self.random_sample_ratio(
                self.img_scale[0], self.ratio_range)
        elif len(self.img_scale) == 1:
            scale, scale_idx = self.img_scale[0], 0
        elif self.multiscale_mode == 'range':
            scale, scale_idx = self.random_sample(self.img_scale)
        elif self.multiscale_mode == 'value':
            scale, scale_idx = self.random_select(self.img_scale)
        else:
            raise NotImplementedError

        results['scale'] = scale
        results['scale_idx'] = scale_idx

    def _resize_img(self, results):
        """Resize images with ``results['scale']``."""
        for key in results.get('img_fields', ['img']):
            if self.keep_ratio:
                img, scale_factor = mmcv.imrescale(
                    results[key],
                    results['scale'],
                    return_scale=True,
                    backend=self.backend)
                # the w_scale and h_scale has minor difference
                # a real fix should be done in the mmcv.imrescale in the future
                new_h, new_w = img.shape[:2]
                h, w = results[key].shape[:2]
                w_scale = new_w / w
                h_scale = new_h / h
            else:
                img, w_scale, h_scale = mmcv.imresize(
                    results[key],
                    results['scale'],
                    return_scale=True,
                    backend=self.backend)
            results[key] = img

            scale_factor = np.array([w_scale, h_scale, w_scale, h_scale],
                                    dtype=np.float32)
            results['img_shape'] = img.shape
            # in case that there is no padding
            results['pad_shape'] = img.shape
            results['scale_factor'] = scale_factor
            results['keep_ratio'] = self.keep_ratio

    def _resize_bboxes(self, results):
        """Resize bounding boxes with ``results['scale_factor']``."""
        for key in results.get('bbox_fields', []):
            bboxes = results[key] * results['scale_factor']
            if self.bbox_clip_border:
                img_shape = results['img_shape']
                bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1])
                bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0])
            results[key] = bboxes

    def _resize_keypoints(self, results):
        """Resize keypoints with ``results['scale_factor']``."""
        for key in results.get('keypoints_fields', []):
            keypointss = results[key].copy()
            factors = results['scale_factor']
            assert factors[0] == factors[2]
            assert factors[1] == factors[3]
            keypointss[:, :, 0] *= factors[0]
            keypointss[:, :, 1] *= factors[1]
            if self.bbox_clip_border:
                img_shape = results['img_shape']
                keypointss[:, :, 0] = np.clip(keypointss[:, :, 0], 0,
                                              img_shape[1])
                keypointss[:, :, 1] = np.clip(keypointss[:, :, 1], 0,
                                              img_shape[0])
            results[key] = keypointss

    def _resize_masks(self, results):
        """Resize masks with ``results['scale']``"""
        for key in results.get('mask_fields', []):
            if results[key] is None:
                continue
            if self.keep_ratio:
                results[key] = results[key].rescale(results['scale'])
            else:
                results[key] = results[key].resize(results['img_shape'][:2])

    def _resize_seg(self, results):
        """Resize semantic segmentation map with ``results['scale']``."""
        for key in results.get('seg_fields', []):
            if self.keep_ratio:
                gt_seg = mmcv.imrescale(
                    results[key],
                    results['scale'],
                    interpolation='nearest',
                    backend=self.backend)
            else:
                gt_seg = mmcv.imresize(
                    results[key],
                    results['scale'],
                    interpolation='nearest',
                    backend=self.backend)
            results['gt_semantic_seg'] = gt_seg

    def __call__(self, results):
        """Call function to resize images, bounding boxes, masks, semantic
        segmentation map.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Resized results, 'img_shape', 'pad_shape', 'scale_factor', \
                'keep_ratio' keys are added into result dict.
        """

        if 'scale' not in results:
            if 'scale_factor' in results:
                img_shape = results['img'].shape[:2]
                scale_factor = results['scale_factor']
                assert isinstance(scale_factor, float)
                results['scale'] = tuple(
                    [int(x * scale_factor) for x in img_shape][::-1])
            else:
                self._random_scale(results)
        else:
            if not self.override:
                assert 'scale_factor' not in results, (
                    'scale and scale_factor cannot be both set.')
            else:
                results.pop('scale')
                if 'scale_factor' in results:
                    results.pop('scale_factor')
                self._random_scale(results)

        self._resize_img(results)
        self._resize_bboxes(results)
        self._resize_keypoints(results)
        self._resize_masks(results)
        self._resize_seg(results)
        return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(img_scale={self.img_scale}, '
        repr_str += f'multiscale_mode={self.multiscale_mode}, '
        repr_str += f'ratio_range={self.ratio_range}, '
        repr_str += f'keep_ratio={self.keep_ratio}, '
        repr_str += f'bbox_clip_border={self.bbox_clip_border})'
        return repr_str
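# Illustration (added for this write-up): with keep_ratio=True and
# results['scale'] = (640, 640), a 1280x960 image is rescaled so that its
# longer side fits 640, and scale_factor becomes
# [w_scale, h_scale, w_scale, h_scale], so boxes resize with one multiply:
#
#     bboxes = results[key] * results['scale_factor']
#
# Keypoints reuse the same factors, which is why _resize_keypoints asserts
# factors[0] == factors[2] and factors[1] == factors[3].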
@PIPELINES.register_module()
class RandomFlipV2(object):
    """Flip the image & bbox & mask & kps.

    If the input dict contains the key "flip", then the flag will be used,
    otherwise it will be randomly decided by a ratio specified in the init
    method.

    When random flip is enabled, ``flip_ratio``/``direction`` can either be a
    float/string or tuple of float/string. There are 3 flip modes:

    - ``flip_ratio`` is float, ``direction`` is string: the image will be
      ``direction``ly flipped with probability of ``flip_ratio``.
      E.g., ``flip_ratio=0.5``, ``direction='horizontal'``,
      then image will be horizontally flipped with probability of 0.5.
    - ``flip_ratio`` is float, ``direction`` is list of string: the image will
      be ``direction[i]``ly flipped with probability of
      ``flip_ratio/len(direction)``.
      E.g., ``flip_ratio=0.5``, ``direction=['horizontal', 'vertical']``,
      then image will be horizontally flipped with probability of 0.25,
      vertically with probability of 0.25.
    - ``flip_ratio`` is list of float, ``direction`` is list of string:
      given ``len(flip_ratio) == len(direction)``, the image will
      be ``direction[i]``ly flipped with probability of ``flip_ratio[i]``.
      E.g., ``flip_ratio=[0.3, 0.5]``, ``direction=['horizontal',
      'vertical']``, then image will be horizontally flipped with probability
      of 0.3, vertically with probability of 0.5.

    Args:
        flip_ratio (float | list[float], optional): The flipping probability.
            Default: None.
        direction(str | list[str], optional): The flipping direction. Options
            are 'horizontal', 'vertical', 'diagonal'. Default: 'horizontal'.
            If input is a list, the length must equal ``flip_ratio``. Each
            element in ``flip_ratio`` indicates the flip probability of
            corresponding direction.
    """

    def __init__(self, flip_ratio=None, direction='horizontal'):
        if isinstance(flip_ratio, list):
            assert mmcv.is_list_of(flip_ratio, float)
            assert 0 <= sum(flip_ratio) <= 1
        elif isinstance(flip_ratio, float):
            assert 0 <= flip_ratio <= 1
        elif flip_ratio is None:
            pass
        else:
            raise ValueError('flip_ratios must be None, float, '
                             'or list of float')
        self.flip_ratio = flip_ratio

        valid_directions = ['horizontal', 'vertical', 'diagonal']
        if isinstance(direction, str):
            assert direction in valid_directions
        elif isinstance(direction, list):
            assert mmcv.is_list_of(direction, str)
            assert set(direction).issubset(set(valid_directions))
        else:
            raise ValueError('direction must be either str or list of str')
        self.direction = direction

        if isinstance(flip_ratio, list):
            assert len(self.flip_ratio) == len(self.direction)
        self.count = 0

    def bbox_flip(self, bboxes, img_shape, direction):
        """Flip bboxes horizontally.

        Args:
            bboxes (numpy.ndarray): Bounding boxes, shape (..., 4*k)
            img_shape (tuple[int]): Image shape (height, width)
            direction (str): Flip direction. Options are 'horizontal',
                'vertical'.

        Returns:
            numpy.ndarray: Flipped bounding boxes.
        """

        assert bboxes.shape[-1] % 4 == 0
        flipped = bboxes.copy()
        if direction == 'horizontal':
            w = img_shape[1]
            flipped[..., 0::4] = w - bboxes[..., 2::4]
            flipped[..., 2::4] = w - bboxes[..., 0::4]
        elif direction == 'vertical':
            h = img_shape[0]
            flipped[..., 1::4] = h - bboxes[..., 3::4]
            flipped[..., 3::4] = h - bboxes[..., 1::4]
        elif direction == 'diagonal':
            w = img_shape[1]
            h = img_shape[0]
            flipped[..., 0::4] = w - bboxes[..., 2::4]
            flipped[..., 1::4] = h - bboxes[..., 3::4]
            flipped[..., 2::4] = w - bboxes[..., 0::4]
            flipped[..., 3::4] = h - bboxes[..., 1::4]
        else:
            raise ValueError(f"Invalid flipping direction '{direction}'")
        return flipped

    def keypoints_flip(self, keypointss, img_shape, direction):
        """Flip keypoints horizontally."""

        assert direction == 'horizontal'
        assert keypointss.shape[-1] == 3
        num_kps = keypointss.shape[1]
        assert num_kps in [4, 5], f'Only Support num_kps=4 or 5, got:{num_kps}'
        assert keypointss.ndim == 3
        flipped = keypointss.copy()
        if num_kps == 5:
            flip_order = [1, 0, 2, 4, 3]
        elif num_kps == 4:
            flip_order = [3, 2, 1, 0]
        for idx, a in enumerate(flip_order):
            flipped[:, idx, :] = keypointss[:, a, :]
        w = img_shape[1]
        flipped[..., 0] = w - flipped[..., 0]
        return flipped

    def __call__(self, results):
        """Call function to flip bounding boxes, masks, semantic segmentation
        maps.

        Args:
            results (dict): Result dict from loading pipeline.

        Returns:
            dict: Flipped results, 'flip', 'flip_direction' keys are added \
                into result dict.
        """
        if 'flip' not in results:
            if isinstance(self.direction, list):
                # None means non-flip
                direction_list = self.direction + [None]
            else:
                # None means non-flip
                direction_list = [self.direction, None]

            if isinstance(self.flip_ratio, list):
                non_flip_ratio = 1 - sum(self.flip_ratio)
                flip_ratio_list = self.flip_ratio + [non_flip_ratio]
            else:
                non_flip_ratio = 1 - self.flip_ratio
                # exclude non-flip
                single_ratio = self.flip_ratio / (len(direction_list) - 1)
                flip_ratio_list = [single_ratio] * (len(direction_list)
                                                    - 1) + [non_flip_ratio]

            cur_dir = np.random.choice(direction_list, p=flip_ratio_list)

            results['flip'] = cur_dir is not None
        if 'flip_direction' not in results:
            results['flip_direction'] = cur_dir
        if results['flip']:
            # flip image
            for key in results.get('img_fields', ['img']):
                results[key] = mmcv.imflip(
                    results[key], direction=results['flip_direction'])
            # flip bboxes
            for key in results.get('bbox_fields', []):
                results[key] = self.bbox_flip(results[key],
                                              results['img_shape'],
                                              results['flip_direction'])
            # flip kps
            for key in results.get('keypoints_fields', []):
                results[key] = self.keypoints_flip(results[key],
                                                   results['img_shape'],
                                                   results['flip_direction'])
            # flip masks
            for key in results.get('mask_fields', []):
                results[key] = results[key].flip(results['flip_direction'])

            # flip segs
            for key in results.get('seg_fields', []):
                results[key] = mmcv.imflip(
                    results[key], direction=results['flip_direction'])
        return results

    def __repr__(self):
        return self.__class__.__name__ + f'(flip_ratio={self.flip_ratio})'
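# Probability bookkeeping (illustrative): with flip_ratio=0.5 and
# direction=['horizontal', 'vertical'], direction_list becomes
# ['horizontal', 'vertical', None] and flip_ratio_list [0.25, 0.25, 0.5]:
# each direction gets flip_ratio / len(direction), and None (no flip)
# absorbs the remaining probability mass.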
@PIPELINES.register_module()
|
||||
class RandomSquareCrop(object):
|
||||
"""Random crop the image & bboxes, the cropped patches have minimum IoU
|
||||
requirement with original image & bboxes, the IoU threshold is randomly
|
||||
selected from min_ious.
|
||||
|
||||
Args:
|
||||
min_ious (tuple): minimum IoU threshold for all intersections with
|
||||
bounding boxes
|
||||
min_crop_size (float): minimum crop's size (i.e. h,w := a*h, a*w,
|
||||
where a >= min_crop_size).
|
||||
|
||||
Note:
|
||||
The keys for bboxes, labels and masks should be paired. That is, \
|
||||
`gt_bboxes` corresponds to `gt_labels` and `gt_masks`, and \
|
||||
`gt_bboxes_ignore` to `gt_labels_ignore` and `gt_masks_ignore`.
|
||||
"""
|
||||
|
||||
def __init__(self,
|
||||
crop_ratio_range=None,
|
||||
crop_choice=None,
|
||||
bbox_clip_border=True,
|
||||
big_face_ratio=0,
|
||||
big_face_crop_choice=None):
|
||||
|
||||
self.crop_ratio_range = crop_ratio_range
|
||||
self.crop_choice = crop_choice
|
||||
self.big_face_crop_choice = big_face_crop_choice
|
||||
self.bbox_clip_border = bbox_clip_border
|
||||
|
||||
assert (self.crop_ratio_range is None) ^ (self.crop_choice is None)
|
||||
if self.crop_ratio_range is not None:
|
||||
self.crop_ratio_min, self.crop_ratio_max = self.crop_ratio_range
|
||||
|
||||
self.bbox2label = {
|
||||
'gt_bboxes': 'gt_labels',
|
||||
'gt_bboxes_ignore': 'gt_labels_ignore'
|
||||
}
|
||||
self.bbox2mask = {
|
||||
'gt_bboxes': 'gt_masks',
|
||||
'gt_bboxes_ignore': 'gt_masks_ignore'
|
||||
}
|
||||
assert big_face_ratio >= 0 and big_face_ratio <= 1.0
|
||||
self.big_face_ratio = big_face_ratio
|
||||
|
||||
def __call__(self, results):
|
||||
"""Call function to crop images and bounding boxes with minimum IoU
|
||||
constraint.
|
||||
|
||||
Args:
|
||||
results (dict): Result dict from loading pipeline.
|
||||
|
||||
Returns:
|
||||
dict: Result dict with images and bounding boxes cropped, \
|
||||
'img_shape' key is updated.
|
||||
"""
|
||||
|
||||
if 'img_fields' in results:
|
||||
assert results['img_fields'] == ['img'], \
|
||||
'Only single img_fields is allowed'
|
||||
img = results['img']
|
||||
assert 'bbox_fields' in results
|
||||
assert 'gt_bboxes' in results
|
||||
# try augment big face images
|
||||
find_bigface = False
|
||||
if np.random.random() < self.big_face_ratio:
|
||||
min_size = 100 # h and w
|
||||
expand_ratio = 0.3 # expand ratio of croped face alongwith both w and h
|
||||
bbox = results['gt_bboxes'].copy()
|
||||
lmks = results['gt_keypointss'].copy()
|
||||
label = results['gt_labels'].copy()
|
||||
# filter small faces
|
||||
size_mask = ((bbox[:, 2] - bbox[:, 0]) > min_size) * (
|
||||
(bbox[:, 3] - bbox[:, 1]) > min_size)
|
||||
bbox = bbox[size_mask]
|
||||
lmks = lmks[size_mask]
|
||||
label = label[size_mask]
|
||||
# randomly choose a face that has no overlap with others
|
||||
if len(bbox) > 0:
|
||||
overlaps = bbox_overlaps(bbox, bbox)
|
||||
overlaps -= np.eye(overlaps.shape[0])
|
||||
iou_mask = np.sum(overlaps, axis=1) == 0
|
||||
bbox = bbox[iou_mask]
|
||||
lmks = lmks[iou_mask]
|
||||
label = label[iou_mask]
|
||||
if len(bbox) > 0:
|
||||
choice = np.random.randint(len(bbox))
|
||||
bbox = bbox[choice]
|
||||
lmks = lmks[choice]
|
||||
label = [label[choice]]
|
||||
w = bbox[2] - bbox[0]
|
||||
h = bbox[3] - bbox[1]
|
||||
x1 = bbox[0] - w * expand_ratio
|
||||
x2 = bbox[2] + w * expand_ratio
|
||||
y1 = bbox[1] - h * expand_ratio
|
||||
y2 = bbox[3] + h * expand_ratio
|
||||
x1, x2 = np.clip([x1, x2], 0, img.shape[1])
|
||||
y1, y2 = np.clip([y1, y2], 0, img.shape[0])
|
||||
bbox -= np.tile([x1, y1], 2)
|
||||
lmks -= (x1, y1, 0)
|
||||
|
||||
find_bigface = True
|
||||
            img = img[int(y1):int(y2), int(x1):int(x2), :]
            results['gt_bboxes'] = np.expand_dims(bbox, axis=0)
            results['gt_keypointss'] = np.expand_dims(lmks, axis=0)
            results['gt_labels'] = np.array(label)
            results['img'] = img

        boxes = results['gt_bboxes']
        h, w, c = img.shape

        if self.crop_ratio_range is not None:
            max_scale = self.crop_ratio_max
        else:
            max_scale = np.amax(self.crop_choice)
        scale_retry = 0
        while True:
            scale_retry += 1
            if scale_retry == 1 or max_scale > 1.0:
                if self.crop_ratio_range is not None:
                    scale = np.random.uniform(self.crop_ratio_min,
                                              self.crop_ratio_max)
                elif self.crop_choice is not None:
                    scale = np.random.choice(self.crop_choice)
            else:
                scale = scale * 1.2

            if find_bigface:
                # select a scale from big_face_crop_choice if in big_face mode
                scale = np.random.choice(self.big_face_crop_choice)

            for i in range(250):
                long_side = max(w, h)
                cw = int(scale * long_side)
                ch = cw

                # TODO +1
                if w == cw:
                    left = 0
                elif w > cw:
                    left = random.randint(0, w - cw)
                else:
                    left = random.randint(w - cw, 0)
                if h == ch:
                    top = 0
                elif h > ch:
                    top = random.randint(0, h - ch)
                else:
                    top = random.randint(h - ch, 0)

                patch = np.array(
                    (int(left), int(top), int(left + cw), int(top + ch)),
                    dtype=np.int32)

                # centers of boxes should be inside the cropped image;
                # only adjust boxes and instance masks when the gt is not empty
                def is_center_of_bboxes_in_patch(boxes, patch):
                    # TODO >=
                    center = (boxes[:, :2] + boxes[:, 2:]) / 2
                    mask = ((center[:, 0] > patch[0])
                            * (center[:, 1] > patch[1])
                            * (center[:, 0] < patch[2])
                            * (center[:, 1] < patch[3]))
                    return mask

                mask = is_center_of_bboxes_in_patch(boxes, patch)
                if not mask.any():
                    continue
                for key in results.get('bbox_fields', []):
                    boxes = results[key].copy()
                    mask = is_center_of_bboxes_in_patch(boxes, patch)
                    boxes = boxes[mask]
                    if self.bbox_clip_border:
                        boxes[:, 2:] = boxes[:, 2:].clip(max=patch[2:])
                        boxes[:, :2] = boxes[:, :2].clip(min=patch[:2])
                    boxes -= np.tile(patch[:2], 2)

                    results[key] = boxes
                    # labels
                    label_key = self.bbox2label.get(key)
                    if label_key in results:
                        results[label_key] = results[label_key][mask]

                    # keypoints fields
                    if key == 'gt_bboxes':
                        for kps_key in results.get('keypoints_fields', []):
                            keypointss = results[kps_key].copy()
                            keypointss = keypointss[mask, :, :]
                            if self.bbox_clip_border:
                                keypointss[:, :, :2] = \
                                    keypointss[:, :, :2].clip(max=patch[2:])
                                keypointss[:, :, :2] = \
                                    keypointss[:, :, :2].clip(min=patch[:2])
                            keypointss[:, :, 0] -= patch[0]
                            keypointss[:, :, 1] -= patch[1]
                            results[kps_key] = keypointss

                    # mask fields
                    mask_key = self.bbox2mask.get(key)
                    if mask_key in results:
                        results[mask_key] = results[mask_key][
                            mask.nonzero()[0]].crop(patch)

                # adjust the img no matter whether the gt is empty before crop
                rimg = np.ones((ch, cw, 3), dtype=img.dtype) * 128
                patch_from = patch.copy()
                patch_from[0] = max(0, patch_from[0])
                patch_from[1] = max(0, patch_from[1])
                patch_from[2] = min(img.shape[1], patch_from[2])
                patch_from[3] = min(img.shape[0], patch_from[3])
                patch_to = patch.copy()
                patch_to[0] = max(0, patch_to[0] * -1)
                patch_to[1] = max(0, patch_to[1] * -1)
                patch_to[2] = patch_to[0] + (patch_from[2] - patch_from[0])
                patch_to[3] = patch_to[1] + (patch_from[3] - patch_from[1])
                rimg[patch_to[1]:patch_to[3],
                     patch_to[0]:patch_to[2], :] = img[
                         patch_from[1]:patch_from[3],
                         patch_from[0]:patch_from[2], :]
                img = rimg
                results['img'] = img
                results['img_shape'] = img.shape

                return results

    def __repr__(self):
        repr_str = self.__class__.__name__
        repr_str += f'(min_ious={self.min_iou}, '
        repr_str += f'crop_size={self.crop_size})'
        return repr_str
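For orientation, a minimal sketch of feeding this transform (imported elsewhere in this commit as RandomSquareCrop) a results dict carrying the fields referenced above; the constructor arguments are assumptions for illustration, not the verified signature:

# Hypothetical wiring; crop_choice is an assumed constructor argument.
import numpy as np

transform = RandomSquareCrop(crop_choice=[0.3, 0.45, 0.6, 0.8, 1.0])
results = {
    'img': np.zeros((480, 640, 3), dtype=np.uint8),
    'gt_bboxes': np.array([[100., 120., 200., 240.]], dtype=np.float32),
    'gt_labels': np.array([0]),
    'gt_keypointss': np.zeros((1, 5, 3), dtype=np.float32),
    'bbox_fields': ['gt_bboxes'],
    'keypoints_fields': ['gt_keypointss'],
}
results = transform(results)  # square patch plus consistently shifted gt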
@@ -13,7 +13,7 @@ class RetinaFaceDataset(CustomDataset):
     CLASSES = ('FG', )

     def __init__(self, min_size=None, **kwargs):
-        self.NK = 5
+        self.NK = kwargs.pop('num_kps', 5)
         self.cat2label = {cat: i for i, cat in enumerate(self.CLASSES)}
         self.min_size = min_size
         self.gt_path = kwargs.get('gt_path')
@@ -33,7 +33,8 @@ class RetinaFaceDataset(CustomDataset):
         if len(values) > 4:
             if len(values) > 5:
                 kps = np.array(
-                    values[4:19], dtype=np.float32).reshape((self.NK, 3))
+                    values[4:4 + self.NK * 3], dtype=np.float32).reshape(
+                        (self.NK, 3))
                 for li in range(kps.shape[0]):
                     if (kps[li, :] == -1).all():
                         kps[li][2] = 0.0  # weight = 0, ignore
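This change makes the keypoint span of an annotation row depend on self.NK instead of the fixed slice 4:19. A small sketch of the expected row layout (values are illustrative):

# Annotation row: x1 y1 x2 y2 followed by NK triplets of (x, y, visibility).
import numpy as np

NK = 5
line = '10 20 110 140 ' + ' '.join(['30 40 1'] * NK)
values = line.split()
kps = np.array(values[4:4 + NK * 3], dtype=np.float32).reshape((NK, 3))
assert kps.shape == (5, 3)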
@@ -103,6 +103,7 @@ class SCRFDHead(AnchorHead):
                  scale_mode=1,
                  dw_conv=False,
                  use_kps=False,
+                 num_kps=5,
                  loss_kps=dict(
                      type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.1),
                  **kwargs):
@@ -116,7 +117,7 @@ class SCRFDHead(AnchorHead):
         self.scale_mode = scale_mode
         self.use_dfl = True
         self.dw_conv = dw_conv
-        self.NK = 5
+        self.NK = num_kps
         self.extra_flops = 0.0
         if loss_dfl is None or not loss_dfl:
             self.use_dfl = False
@@ -323,8 +324,8 @@ class SCRFDHead(AnchorHead):
             batch_size, -1, self.cls_out_channels).sigmoid()
         bbox_pred = bbox_pred.permute(0, 2, 3,
                                       1).reshape(batch_size, -1, 4)
-        kps_pred = kps_pred.permute(0, 2, 3, 1).reshape(batch_size, -1, 10)
+        kps_pred = kps_pred.permute(0, 2, 3,
+                                    1).reshape(batch_size, -1, self.NK * 2)
         return cls_score, bbox_pred, kps_pred

     def forward_train(self,
@@ -788,7 +789,7 @@ class SCRFDHead(AnchorHead):
             if self.use_dfl:
                 kps_pred = self.integral(kps_pred) * stride[0]
             else:
-                kps_pred = kps_pred.reshape((-1, 10)) * stride[0]
+                kps_pred = kps_pred.reshape((-1, self.NK * 2)) * stride[0]

             nms_pre = cfg.get('nms_pre', -1)
             if nms_pre > 0 and scores.shape[0] > nms_pre:
@@ -815,7 +816,7 @@ class SCRFDHead(AnchorHead):
         mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor)
         if mlvl_kps is not None:
             scale_factor2 = torch.tensor(
                 [scale_factor[0], scale_factor[1]] * 5)
-                [scale_factor[0], scale_factor[1]] * 5)
+                [scale_factor[0], scale_factor[1]] * self.NK)
             mlvl_kps /= scale_factor2.to(mlvl_kps.device)

         mlvl_scores = torch.cat(mlvl_scores)
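The recurring edit in this head replaces the hard-coded 10 (5 keypoints × 2 coordinates) with self.NK * 2. A quick shape check of the reshape, assuming an illustrative 20×20 feature map:

import torch

batch_size, NK = 2, 5
kps_map = torch.randn(batch_size, NK * 2, 20, 20)  # NCHW head output
kps_pred = kps_map.permute(0, 2, 3, 1).reshape(batch_size, -1, NK * 2)
print(kps_pred.shape)  # torch.Size([2, 400, 10])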
@@ -54,7 +54,13 @@ class SCRFD(SingleStageDetector):
                                               gt_bboxes_ignore)
         return losses

-    def simple_test(self, img, img_metas, rescale=False):
+    def simple_test(self,
+                    img,
+                    img_metas,
+                    rescale=False,
+                    repeat_head=1,
+                    output_kps_var=0,
+                    output_results=1):
         """Test function without test time augmentation.

         Args:
@@ -62,6 +68,9 @@ class SCRFD(SingleStageDetector):
             img_metas (list[dict]): List of image information.
             rescale (bool, optional): Whether to rescale the results.
                 Defaults to False.
+            repeat_head (int): number of times to repeat inference in the head
+            output_kps_var (int): whether to output the keypoint variance as a quality estimate
+            output_results (int): 0: nothing, 1: bbox only, 2: both bbox and kps

         Returns:
             list[list[np.ndarray]]: BBox results of each image and classes.
@@ -69,40 +78,71 @@ class SCRFD(SingleStageDetector):
                 corresponds to each class.
         """
         x = self.extract_feat(img)
-        outs = self.bbox_head(x)
-        if torch.onnx.is_in_onnx_export():
-            print('single_stage.py in-onnx-export')
-            print(outs.__class__)
-            cls_score, bbox_pred, kps_pred = outs
-            for c in cls_score:
-                print(c.shape)
-            for c in bbox_pred:
-                print(c.shape)
-            if self.bbox_head.use_kps:
-                for c in kps_pred:
-                    print(c.shape)
-                return (cls_score, bbox_pred, kps_pred)
-            else:
-                return (cls_score, bbox_pred)
-        bbox_list = self.bbox_head.get_bboxes(
-            *outs, img_metas, rescale=rescale)
-
-        # return kps if use_kps
-        if len(bbox_list[0]) == 2:
-            bbox_results = [
-                bbox2result(det_bboxes, det_labels, self.bbox_head.num_classes)
-                for det_bboxes, det_labels in bbox_list
-            ]
-        elif len(bbox_list[0]) == 3:
-            bbox_results = [
-                bbox2result(
-                    det_bboxes,
-                    det_labels,
-                    self.bbox_head.num_classes,
-                    kps=det_kps)
-                for det_bboxes, det_labels, det_kps in bbox_list
-            ]
-        return bbox_results
+        assert repeat_head >= 1
+        kps_out0 = []
+        kps_out1 = []
+        kps_out2 = []
+        for i in range(repeat_head):
+            outs = self.bbox_head(x)
+            kps_out0 += [outs[2][0].detach().cpu().numpy()]
+            kps_out1 += [outs[2][1].detach().cpu().numpy()]
+            kps_out2 += [outs[2][2].detach().cpu().numpy()]
+        if output_kps_var:
+            var0 = np.var(np.vstack(kps_out0), axis=0).mean()
+            var1 = np.var(np.vstack(kps_out1), axis=0).mean()
+            var2 = np.var(np.vstack(kps_out2), axis=0).mean()
+            var = np.mean([var0, var1, var2])
+        else:
+            var = None
+
+        if output_results > 0:
+            if torch.onnx.is_in_onnx_export():
+                print('single_stage.py in-onnx-export')
+                print(outs.__class__)
+                cls_score, bbox_pred, kps_pred = outs
+                for c in cls_score:
+                    print(c.shape)
+                for c in bbox_pred:
+                    print(c.shape)
+                if self.bbox_head.use_kps:
+                    for c in kps_pred:
+                        print(c.shape)
+                    return (cls_score, bbox_pred, kps_pred)
+                else:
+                    return (cls_score, bbox_pred)
+            bbox_list = self.bbox_head.get_bboxes(
+                *outs, img_metas, rescale=rescale)
+
+            # return kps if use_kps
+            if len(bbox_list[0]) == 2:
+                bbox_results = [
+                    bbox2result(det_bboxes, det_labels,
+                                self.bbox_head.num_classes)
+                    for det_bboxes, det_labels in bbox_list
+                ]
+            elif len(bbox_list[0]) == 3:
+                if output_results == 2:
+                    bbox_results = [
+                        bbox2result(
+                            det_bboxes,
+                            det_labels,
+                            self.bbox_head.num_classes,
+                            kps=det_kps,
+                            num_kps=self.bbox_head.NK)
+                        for det_bboxes, det_labels, det_kps in bbox_list
+                    ]
+                elif output_results == 1:
+                    bbox_results = [
+                        bbox2result(det_bboxes, det_labels,
+                                    self.bbox_head.num_classes)
+                        for det_bboxes, det_labels, _ in bbox_list
+                    ]
+        else:
+            bbox_results = None
+        if var is not None:
+            return bbox_results, var
+        else:
+            return bbox_results

     def feature_test(self, img):
         x = self.extract_feat(img)
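The new arguments make it possible to probe prediction stability. A plausible call, assuming a built SCRFD detector and preprocessed img/img_metas; note that with a fully deterministic head the repeated passes coincide and the variance is zero, so this is only informative when some stochastic layer (e.g. dropout) stays active:

results, kps_var = detector.simple_test(
    img,
    img_metas,
    rescale=True,
    repeat_head=5,
    output_kps_var=1,
    output_results=2)
# kps_var: mean variance of the repeated keypoint maps, a crude quality score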
modelscope/models/cv/face_detection/scrfd/scrfd_detect.py (new file, 71 lines)
@@ -0,0 +1,71 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os.path as osp
from copy import deepcopy
from typing import Any, Dict

import torch

from modelscope.metainfo import Models
from modelscope.models.base import TorchModel
from modelscope.models.builder import MODELS
from modelscope.outputs import OutputKeys
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger

logger = get_logger()

__all__ = ['ScrfdDetect']


@MODELS.register_module(Tasks.face_detection, module_name=Models.scrfd)
class ScrfdDetect(TorchModel):

    def __init__(self, model_dir: str, *args, **kwargs):
        """Initialize the face detection model from the `model_dir` path.

        Args:
            model_dir (str): the model path.
        """
        super().__init__(model_dir, *args, **kwargs)
        from mmcv import Config
        from mmcv.parallel import MMDataParallel
        from mmcv.runner import load_checkpoint
        from mmdet.models import build_detector
        from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets import RetinaFaceDataset
        from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets.pipelines import RandomSquareCrop
        from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.backbones import ResNetV1e
        from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.dense_heads import SCRFDHead
        from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.detectors import SCRFD
        cfg = Config.fromfile(osp.join(model_dir, 'mmcv_scrfd.py'))
        ckpt_path = osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE)
        cfg.model.test_cfg.score_thr = kwargs.get('score_thr', 0.3)
        detector = build_detector(cfg.model)
        logger.info(f'loading model from {ckpt_path}')
        device = torch.device(
            f'cuda:{0}' if torch.cuda.is_available() else 'cpu')
        load_checkpoint(detector, ckpt_path, map_location=device)
        detector = MMDataParallel(detector, device_ids=[0])
        detector.eval()
        self.detector = detector
        logger.info('load model done')

    def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
        result = self.detector(
            return_loss=False,
            rescale=True,
            img=[input['img'][0].unsqueeze(0)],
            img_metas=[[dict(input['img_metas'][0].data)]],
            output_results=2)
        assert result is not None
        result = result[0][0]
        bboxes = result[:, :4].tolist()
        kpss = result[:, 5:].tolist()
        scores = result[:, 4].tolist()
        return {
            OutputKeys.SCORES: scores,
            OutputKeys.BOXES: bboxes,
            OutputKeys.KEYPOINTS: kpss
        }

    def postprocess(self, input: Dict[str, Any], **kwargs) -> Dict[str, Any]:
        return input
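Once registered, the model is normally reached through the pipeline API. A plausible invocation (the model id string is an assumption for illustration):

from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

face_detection = pipeline(
    Tasks.face_detection, model='damo/cv_resnet_facedetection_scrfd10gkps')
result = face_detection('data/test/images/face_detection2.jpeg')
print(result[OutputKeys.SCORES], result[OutputKeys.BOXES])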
modelscope/models/cv/hand_2d_keypoints/__init__.py (new file, 20 lines)
@@ -0,0 +1,20 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    from .hand_2d_keypoints import Hand2dKeyPoints

else:
    _import_structure = {'hand_2d_keypoints': ['Hand2dKeyPoints']}

    import sys

    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )

modelscope/models/cv/hand_2d_keypoints/hand_2d_keypoints.py (new file, 16 lines)
@@ -0,0 +1,16 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from easycv.models.pose import TopDown

from modelscope.metainfo import Models
from modelscope.models.builder import MODELS
from modelscope.models.cv.easycv_base import EasyCVBaseModel
from modelscope.utils.constant import Tasks


@MODELS.register_module(
    group_key=Tasks.hand_2d_keypoints, module_name=Models.hand_2d_keypoints)
class Hand2dKeyPoints(EasyCVBaseModel, TopDown):

    def __init__(self, model_dir=None, *args, **kwargs):
        EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
        TopDown.__init__(self, *args, **kwargs)
modelscope/models/cv/human_wholebody_keypoint/__init__.py (new file, 22 lines)
@@ -0,0 +1,22 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    from .human_wholebody_keypoint import HumanWholeBodyKeypoint

else:
    _import_structure = {
        'human_wholebody_keypoint': ['HumanWholeBodyKeypoint']
    }

    import sys

    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )

modelscope/models/cv/human_wholebody_keypoint/human_wholebody_keypoint.py (new file, 17 lines)
@@ -0,0 +1,17 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from easycv.models.pose.top_down import TopDown

from modelscope.metainfo import Models
from modelscope.models.builder import MODELS
from modelscope.models.cv.easycv_base import EasyCVBaseModel
from modelscope.utils.constant import Tasks


@MODELS.register_module(
    group_key=Tasks.human_wholebody_keypoint,
    module_name=Models.human_wholebody_keypoint)
class HumanWholeBodyKeypoint(EasyCVBaseModel, TopDown):

    def __init__(self, model_dir=None, *args, **kwargs):
        EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
        TopDown.__init__(self, *args, **kwargs)
modelscope/models/cv/image_body_reshaping/__init__.py (new file, 20 lines)
@@ -0,0 +1,20 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING

from modelscope.utils.import_utils import LazyImportModule

if TYPE_CHECKING:
    from .image_body_reshaping import ImageBodyReshaping

else:
    _import_structure = {'image_body_reshaping': ['ImageBodyReshaping']}

    import sys

    sys.modules[__name__] = LazyImportModule(
        __name__,
        globals()['__file__'],
        _import_structure,
        module_spec=__spec__,
        extra_objects={},
    )
modelscope/models/cv/image_body_reshaping/image_body_reshaping.py (new file, 128 lines)
@@ -0,0 +1,128 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from typing import Any, Dict

import cv2
import numpy as np
import torch

from modelscope.metainfo import Models
from modelscope.models.base import Tensor, TorchModel
from modelscope.models.builder import MODELS
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger
from .model import FlowGenerator
from .person_info import PersonInfo
from .pose_estimator.body import Body
from .slim_utils import image_warp_grid1, resize_on_long_side

logger = get_logger()

__all__ = ['ImageBodyReshaping']


@MODELS.register_module(
    Tasks.image_body_reshaping, module_name=Models.image_body_reshaping)
class ImageBodyReshaping(TorchModel):

    def __init__(self, model_dir: str, *args, **kwargs):
        """Initialize the image body reshaping model from the `model_dir` path.

        Args:
            model_dir (str): the model path.
        """
        super().__init__(model_dir, *args, **kwargs)

        if torch.cuda.is_available():
            self.device = torch.device('cuda')
        else:
            self.device = torch.device('cpu')

        self.degree = 1.0
        self.reshape_model = FlowGenerator(n_channels=16).to(self.device)
        model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE)
        checkpoints = torch.load(model_path, map_location=torch.device('cpu'))
        self.reshape_model.load_state_dict(
            checkpoints['state_dict'], strict=True)
        self.reshape_model.eval()
        logger.info('load body reshaping model done')

        pose_model_ckpt = os.path.join(model_dir, 'body_pose_model.pth')
        self.pose_esti = Body(pose_model_ckpt, self.device)
        logger.info('load pose model done')

    def pred_joints(self, img):
        if img is None:
            return None
        small_src, resize_scale = resize_on_long_side(img, 300)
        body_joints = self.pose_esti(small_src)

        if body_joints.shape[0] >= 1:
            body_joints[:, :, :2] = body_joints[:, :, :2] / resize_scale

        return body_joints

    def pred_flow(self, img):
        body_joints = self.pred_joints(img)
        small_size = 1200

        if img.shape[0] > small_size or img.shape[1] > small_size:
            _img, _scale = resize_on_long_side(img, small_size)
            body_joints[:, :, :2] = body_joints[:, :, :2] * _scale
        else:
            _img = img

        # We only reshape one person
        if body_joints.shape[0] != 1:
            return None

        person = PersonInfo(body_joints[0])

        with torch.no_grad():
            person_pred = person.pred_flow(_img, self.reshape_model,
                                           self.device)

        flow = np.dstack((person_pred['rDx'], person_pred['rDy']))

        scale = img.shape[0] * 1.0 / flow.shape[0]

        flow = cv2.resize(flow, (img.shape[1], img.shape[0]))
        flow *= scale

        return flow

    def warp(self, src_img, flow):
        X_flow = flow[..., 0]
        Y_flow = flow[..., 1]

        X_flow = np.ascontiguousarray(X_flow)
        Y_flow = np.ascontiguousarray(Y_flow)

        pred = image_warp_grid1(X_flow, Y_flow, src_img, 1.0, 0, 0)
        return pred

    def inference(self, img):
        img = img.cpu().numpy()
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        flow = self.pred_flow(img)

        if flow is None:
            return img

        assert flow.shape[:2] == img.shape[:2]

        mag, ang = cv2.cartToPolar(flow[..., 0] + 1e-8, flow[..., 1] + 1e-8)
        mag -= 3
        mag[mag <= 0] = 0

        x, y = cv2.polarToCart(mag, ang, angleInDegrees=False)
        flow = np.dstack((x, y))

        flow *= self.degree
        pred = self.warp(img, flow)
        out_img = np.clip(pred, 0, 255)
        logger.info('model inference done')

        return out_img.astype(np.uint8)
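As with the face detector, the usual entry point is a pipeline. A plausible invocation (the model id is an assumption for illustration):

import cv2
from modelscope.outputs import OutputKeys
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

body_reshaping = pipeline(
    Tasks.image_body_reshaping,
    model='damo/cv_flow-based-body-reshaping_damo')
result = body_reshaping('data/test/images/image_body_reshaping.jpg')
cv2.imwrite('reshaped.jpg', result[OutputKeys.OUTPUT_IMG])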
modelscope/models/cv/image_body_reshaping/model.py (new file, 189 lines)
@@ -0,0 +1,189 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import torch
import torch.nn as nn
import torch.nn.functional as F


class ConvLayer(nn.Module):

    def __init__(self, in_ch, out_ch):
        super(ConvLayer, self).__init__()

        self.conv = nn.Sequential(
            nn.ReflectionPad2d(1),
            nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=0),
            nn.BatchNorm2d(out_ch), nn.ReLU(inplace=True))

    def forward(self, x):
        x = self.conv(x)
        return x


class SASA(nn.Module):

    def __init__(self, in_dim):
        super(SASA, self).__init__()
        self.channel_in = in_dim

        self.query_conv = nn.Conv2d(
            in_channels=in_dim, out_channels=in_dim // 8, kernel_size=1)
        self.key_conv = nn.Conv2d(
            in_channels=in_dim, out_channels=in_dim // 8, kernel_size=1)
        self.value_conv = nn.Conv2d(
            in_channels=in_dim, out_channels=in_dim, kernel_size=1)
        self.mag_conv = nn.Conv2d(
            in_channels=5, out_channels=in_dim // 32, kernel_size=1)

        self.gamma = nn.Parameter(torch.zeros(1))

        self.softmax = nn.Softmax(dim=-1)
        self.sigmoid = nn.Sigmoid()

    def structure_encoder(self, paf_mag, target_height, target_width):
        torso_mask = torch.sum(paf_mag[:, 1:3, :, :], dim=1, keepdim=True)
        torso_mask = torch.clamp(torso_mask, 0, 1)

        arms_mask = torch.sum(paf_mag[:, 4:8, :, :], dim=1, keepdim=True)
        arms_mask = torch.clamp(arms_mask, 0, 1)

        legs_mask = torch.sum(paf_mag[:, 8:12, :, :], dim=1, keepdim=True)
        legs_mask = torch.clamp(legs_mask, 0, 1)

        fg_mask = paf_mag[:, 12, :, :].unsqueeze(1)
        bg_mask = 1 - fg_mask
        Y = torch.cat((arms_mask, torso_mask, legs_mask, fg_mask, bg_mask),
                      dim=1)
        Y = F.interpolate(Y, size=(target_height, target_width), mode='area')
        return Y

    def forward(self, X, PAF_mag):
        """Apply structure-aware self-attention.

        Args:
            X: input feature maps (B x C x H x W)
            PAF_mag: PAF magnitude maps; 1 denotes connectivity, 0 denotes
                non-connectivity

        Returns:
            out: self-attention value + input feature (B x C x H x W)
            Y: down-sampled structure masks (B x 5 x H x W)
        """
        m_batchsize, C, height, width = X.size()

        Y = self.structure_encoder(PAF_mag, height, width)

        connectivity_mask_vec = self.mag_conv(Y).view(m_batchsize, -1,
                                                      width * height)
        affinity = torch.bmm(
            connectivity_mask_vec.permute(0, 2, 1), connectivity_mask_vec)
        affinity_centered = affinity - torch.mean(affinity)
        affinity_sigmoid = self.sigmoid(affinity_centered)

        proj_query = self.query_conv(X).view(m_batchsize, -1,
                                             width * height).permute(0, 2, 1)
        proj_key = self.key_conv(X).view(m_batchsize, -1, width * height)
        selfatten_map = torch.bmm(proj_query, proj_key)
        selfatten_centered = selfatten_map - torch.mean(
            selfatten_map)  # centering
        selfatten_sigmoid = self.sigmoid(selfatten_centered)

        SASA_map = selfatten_sigmoid * affinity_sigmoid

        proj_value = self.value_conv(X).view(m_batchsize, -1, width * height)

        out = torch.bmm(proj_value, SASA_map.permute(0, 2, 1))
        out = out.view(m_batchsize, C, height, width)

        out = self.gamma * out + X
        return out, Y


class FlowGenerator(nn.Module):

    def __init__(self, n_channels, deep_supervision=False):
        super(FlowGenerator, self).__init__()
        self.deep_supervision = deep_supervision

        self.Encoder = nn.Sequential(
            ConvLayer(n_channels, 64),
            ConvLayer(64, 64),
            nn.MaxPool2d(2),
            ConvLayer(64, 128),
            ConvLayer(128, 128),
            nn.MaxPool2d(2),
            ConvLayer(128, 256),
            ConvLayer(256, 256),
            nn.MaxPool2d(2),
            ConvLayer(256, 512),
            ConvLayer(512, 512),
            nn.MaxPool2d(2),
            ConvLayer(512, 1024),
            ConvLayer(1024, 1024),
            ConvLayer(1024, 1024),
            ConvLayer(1024, 1024),
            ConvLayer(1024, 1024),
        )

        self.SASA = SASA(in_dim=1024)

        self.Decoder = nn.Sequential(
            ConvLayer(1024, 1024),
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True),
            ConvLayer(1024, 512),
            ConvLayer(512, 512),
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True),
            ConvLayer(512, 256),
            ConvLayer(256, 256),
            ConvLayer(256, 128),
            ConvLayer(128, 64),
            ConvLayer(64, 32),
            nn.Conv2d(32, 2, kernel_size=1, padding=0),
            nn.Tanh(),
            nn.Upsample(scale_factor=4, mode='bilinear', align_corners=True),
        )

        dilation_ksize = 17
        self.dilation = torch.nn.MaxPool2d(
            kernel_size=dilation_ksize,
            stride=1,
            padding=int((dilation_ksize - 1) / 2))

    def warp(self, x, flow, mode='bilinear', padding_mode='zeros', coff=0.2):
        n, c, h, w = x.size()
        yv, xv = torch.meshgrid([torch.arange(h), torch.arange(w)])
        xv = xv.float() / (w - 1) * 2.0 - 1
        yv = yv.float() / (h - 1) * 2.0 - 1
        grid = torch.cat((xv.unsqueeze(-1), yv.unsqueeze(-1)), -1).unsqueeze(0)
        grid = grid.to(flow.device)
        grid_x = grid + 2 * flow * coff
        warp_x = F.grid_sample(x, grid_x, mode=mode, padding_mode=padding_mode)
        return warp_x

    def forward(self, img, skeleton_map, coef=0.2):
        """Predict a warping flow from the image and skeleton map, then warp.

        Args:
            img: input image tensor
            skeleton_map: skeleton map of the input image
            coef: warp degree

        Returns:
            warp_x: warped image
            flow: predicted flow
        """
        img_concat = torch.cat((img, skeleton_map), dim=1)
        X = self.Encoder(img_concat)

        _, _, height, width = X.size()

        # directly get PAF magnitude from skeleton maps via dilation
        PAF_mag = self.dilation((skeleton_map + 1.0) * 0.5)

        out, Y = self.SASA(X, PAF_mag)
        flow = self.Decoder(out)

        flow = flow.permute(0, 2, 3, 1)  # [n, 2, h, w] ==> [n, h, w, 2]

        warp_x = self.warp(img, flow, coff=coef)
        warp_x = torch.clamp(warp_x, min=-1.0, max=1.0)

        return warp_x, flow
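A quick shape sanity check for the generator, assuming n_channels=16 (3 image channels plus the 13-channel skeleton map produced by gen_skeleton_map later in this commit); the 256×256 size is illustrative:

import torch

net = FlowGenerator(n_channels=16).eval()
img = torch.randn(1, 3, 256, 256)                 # normalized to [-1, 1]
skeleton_map = torch.rand(1, 13, 256, 256) * 2 - 1
with torch.no_grad():
    warped, flow = net(img, skeleton_map)
print(warped.shape, flow.shape)  # (1, 3, 256, 256), (1, 256, 256, 2)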
modelscope/models/cv/image_body_reshaping/person_info.py (new file, 339 lines)
@@ -0,0 +1,339 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import copy

import cv2
import numpy as np
import torch

from .slim_utils import (enlarge_box_tblr, gen_skeleton_map,
                         get_map_fusion_map_cuda, get_mask_bbox,
                         resize_on_long_side)


class PersonInfo(object):

    def __init__(self, joints):
        self.joints = joints
        self.flow = None
        self.pad_boder = False
        self.height_expand = 0
        self.width_expand = 0
        self.coeff = 0.2
        self.network_input_W = 256
        self.network_input_H = 256
        self.divider = 20
        self.flow_scales = ['upper_2']

    def update_attribute(self, pad_boder, height_expand, width_expand):
        self.pad_boder = pad_boder
        self.height_expand = height_expand
        self.width_expand = width_expand
        if pad_boder:
            self.joints[:, 0] += width_expand
            self.joints[:, 1] += height_expand

    def pred_flow(self, img, flow_net, device):
        with torch.no_grad():
            if img is None:
                print('image is none')
                self.flow = None
                return None

            if len(img.shape) == 2:
                img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)

            if self.pad_boder:
                height_expand = self.height_expand
                width_expand = self.width_expand
                pad_img = cv2.copyMakeBorder(
                    img,
                    height_expand,
                    height_expand,
                    width_expand,
                    width_expand,
                    cv2.BORDER_CONSTANT,
                    value=(127, 127, 127))
            else:
                height_expand = 0
                width_expand = 0
                pad_img = img.copy()

            canvas = np.zeros(
                shape=(pad_img.shape[0], pad_img.shape[1]), dtype=np.float32)

            self.human_joint_box = self.__joint_to_body_box()

            self.human_box = enlarge_box_tblr(
                self.human_joint_box, pad_img, ratio=0.25)
            human_box_height = self.human_box[1] - self.human_box[0]
            human_box_width = self.human_box[3] - self.human_box[2]

            self.leg_joint_box = self.__joint_to_leg_box()
            self.leg_box = enlarge_box_tblr(
                self.leg_joint_box, pad_img, ratio=0.25)

            self.arm_joint_box = self.__joint_to_arm_box()
            self.arm_box = enlarge_box_tblr(
                self.arm_joint_box, pad_img, ratio=0.1)

            x_flows = []
            y_flows = []
            multi_bbox = []

            for scale in self.flow_scales:  # better for metric
                scale_value = float(scale.split('_')[-1])

                arm_box = copy.deepcopy(self.arm_box)

                if arm_box[0] is None:
                    arm_box = self.human_box

                arm_box_height = arm_box[1] - arm_box[0]
                arm_box_width = arm_box[3] - arm_box[2]

                roi_bbox = None

                if arm_box_width < human_box_width * 0.1 or arm_box_height < human_box_height * 0.1:
                    roi_bbox = self.human_box
                else:
                    arm_box = enlarge_box_tblr(
                        arm_box, pad_img, ratio=scale_value)
                    if scale == 'upper_0.2':
                        arm_box[0] = min(arm_box[0], int(self.joints[0][1]))
                    if scale.startswith('upper'):
                        roi_bbox = [
                            max(self.human_box[0], arm_box[0]),
                            min(self.human_box[1], arm_box[1]),
                            max(self.human_box[2], arm_box[2]),
                            min(self.human_box[3], arm_box[3])
                        ]
                        if roi_bbox[1] - roi_bbox[0] < 1 or roi_bbox[
                                3] - roi_bbox[2] < 1:
                            continue

                    elif scale.startswith('lower'):
                        roi_bbox = [
                            max(self.human_box[0], self.leg_box[0]),
                            min(self.human_box[1], self.leg_box[1]),
                            max(self.human_box[2], self.leg_box[2]),
                            min(self.human_box[3], self.leg_box[3])
                        ]

                        if roi_bbox[1] - roi_bbox[0] < 1 or roi_bbox[
                                3] - roi_bbox[2] < 1:
                            continue

                skel_map, roi_bbox = gen_skeleton_map(
                    self.joints, 'depth', input_roi_box=roi_bbox)

                if roi_bbox is None:
                    continue

                if skel_map.dtype != np.float32:
                    skel_map = skel_map.astype(np.float32)

                skel_map -= 1.0  # [0, 2] -> [-1, 1]

                multi_bbox.append(roi_bbox)

                roi_bbox_height = roi_bbox[1] - roi_bbox[0]
                roi_bbox_width = roi_bbox[3] - roi_bbox[2]

                assert skel_map.shape[0] == roi_bbox_height
                assert skel_map.shape[1] == roi_bbox_width
                roi_height_pad = roi_bbox_height // self.divider
                roi_width_pad = roi_bbox_width // self.divider
                paded_roi_h = roi_bbox_height + 2 * roi_height_pad
                paded_roi_w = roi_bbox_width + 2 * roi_width_pad

                roi_height_pad_joint = skel_map.shape[0] // self.divider
                roi_width_pad_joint = skel_map.shape[1] // self.divider
                skel_map = np.pad(
                    skel_map,
                    ((roi_height_pad_joint, roi_height_pad_joint),
                     (roi_width_pad_joint, roi_width_pad_joint), (0, 0)),
                    'constant',
                    constant_values=-1)

                skel_map_resized = cv2.resize(
                    skel_map, (self.network_input_W, self.network_input_H))

                skel_map_resized[skel_map_resized < 0] = -1.0
                skel_map_resized[skel_map_resized > -0.5] = 1.0
                skel_map_transformed = torch.from_numpy(
                    skel_map_resized.transpose((2, 0, 1)))

                roi_npy = pad_img[roi_bbox[0]:roi_bbox[1],
                                  roi_bbox[2]:roi_bbox[3], :].copy()
                if roi_npy.dtype != np.float32:
                    roi_npy = roi_npy.astype(np.float32)

                roi_npy = np.pad(roi_npy,
                                 ((roi_height_pad, roi_height_pad),
                                  (roi_width_pad, roi_width_pad), (0, 0)),
                                 'edge')

                roi_npy = roi_npy[:, :, ::-1]

                roi_npy = cv2.resize(
                    roi_npy, (self.network_input_W, self.network_input_H))

                roi_npy *= 1.0 / 255
                roi_npy -= 0.5
                roi_npy *= 2

                rgb_tensor = torch.from_numpy(roi_npy.transpose((2, 0, 1)))

                rgb_tensor = rgb_tensor.unsqueeze(0).to(device)
                skel_map_tensor = skel_map_transformed.unsqueeze(0).to(device)
                warped_img_val, flow_field_val = flow_net(
                    rgb_tensor, skel_map_tensor
                )  # inference, connectivity_mask [1, 12, 16, 16]
                flow_field_val = flow_field_val.detach().squeeze().cpu().numpy()

                flow_field_val = cv2.resize(
                    flow_field_val, (paded_roi_w, paded_roi_h),
                    interpolation=cv2.INTER_LINEAR)
                flow_field_val[..., 0] = flow_field_val[
                    ..., 0] * paded_roi_w * 0.5 * 2 * self.coeff
                flow_field_val[..., 1] = flow_field_val[
                    ..., 1] * paded_roi_h * 0.5 * 2 * self.coeff

                # remove pad areas
                flow_field_val = flow_field_val[
                    roi_height_pad:flow_field_val.shape[0] - roi_height_pad,
                    roi_width_pad:flow_field_val.shape[1] - roi_width_pad, :]

                diffuse_width = max(roi_bbox_width // 3, 1)
                diffuse_height = max(roi_bbox_height // 3, 1)
                assert roi_bbox_width == flow_field_val.shape[1]
                assert roi_bbox_height == flow_field_val.shape[0]

                origin_flow = np.zeros(
                    (pad_img.shape[0] + 2 * diffuse_height,
                     pad_img.shape[1] + 2 * diffuse_width, 2),
                    dtype=np.float32)

                flow_field_val = np.pad(flow_field_val,
                                        ((diffuse_height, diffuse_height),
                                         (diffuse_width, diffuse_width),
                                         (0, 0)), 'linear_ramp')

                origin_flow[roi_bbox[0]:roi_bbox[1] + 2 * diffuse_height,
                            roi_bbox[2]:roi_bbox[3]
                            + 2 * diffuse_width] = flow_field_val

                origin_flow = origin_flow[diffuse_height:-diffuse_height,
                                          diffuse_width:-diffuse_width, :]

                x_flows.append(origin_flow[..., 0])
                y_flows.append(origin_flow[..., 1])

            if len(x_flows) == 0:
                return {
                    'rDx': np.zeros(canvas.shape[:2], dtype=np.float32),
                    'rDy': np.zeros(canvas.shape[:2], dtype=np.float32),
                    'multi_bbox': multi_bbox,
                    'x_fusion_map': np.ones(canvas.shape[:2], dtype=np.float32),
                    'y_fusion_map': np.ones(canvas.shape[:2], dtype=np.float32)
                }
            else:
                origin_rDx, origin_rDy, x_fusion_map, y_fusion_map = \
                    self.blend_multiscale_flow(x_flows, y_flows, device=device)

                return {
                    'rDx': origin_rDx,
                    'rDy': origin_rDy,
                    'multi_bbox': multi_bbox,
                    'x_fusion_map': x_fusion_map,
                    'y_fusion_map': y_fusion_map
                }

    @staticmethod
    def blend_multiscale_flow(x_flows, y_flows, device=None):
        scale_num = len(x_flows)
        if scale_num == 1:
            return x_flows[0], y_flows[0], np.ones_like(
                x_flows[0]), np.ones_like(x_flows[0])

        origin_rDx = np.zeros((x_flows[0].shape[0], x_flows[0].shape[1]),
                              dtype=np.float32)
        origin_rDy = np.zeros((y_flows[0].shape[0], y_flows[0].shape[1]),
                              dtype=np.float32)

        x_fusion_map, x_acc_map = get_map_fusion_map_cuda(
            x_flows, 1, device=device)
        y_fusion_map, y_acc_map = get_map_fusion_map_cuda(
            y_flows, 1, device=device)

        x_flow_map = 1.0 / x_fusion_map
        y_flow_map = 1.0 / y_fusion_map

        all_acc_map = x_acc_map + y_acc_map
        all_acc_map = all_acc_map.astype(np.uint8)
        roi_box = get_mask_bbox(all_acc_map, threshold=1)

        if roi_box[0] is None or roi_box[1] - roi_box[0] <= 0 or roi_box[
                3] - roi_box[2] <= 0:
            roi_box = [0, x_flow_map.shape[0], 0, x_flow_map.shape[1]]

        roi_x_flow_map = x_flow_map[roi_box[0]:roi_box[1],
                                    roi_box[2]:roi_box[3]]
        roi_y_flow_map = y_flow_map[roi_box[0]:roi_box[1],
                                    roi_box[2]:roi_box[3]]

        roi_width = roi_x_flow_map.shape[1]
        roi_height = roi_x_flow_map.shape[0]

        roi_x_flow_map, scale = resize_on_long_side(roi_x_flow_map, 320)
        roi_y_flow_map, scale = resize_on_long_side(roi_y_flow_map, 320)

        roi_x_flow_map = cv2.blur(roi_x_flow_map, (55, 55))
        roi_y_flow_map = cv2.blur(roi_y_flow_map, (55, 55))

        roi_x_flow_map = cv2.resize(roi_x_flow_map, (roi_width, roi_height))
        roi_y_flow_map = cv2.resize(roi_y_flow_map, (roi_width, roi_height))

        x_flow_map[roi_box[0]:roi_box[1],
                   roi_box[2]:roi_box[3]] = roi_x_flow_map
        y_flow_map[roi_box[0]:roi_box[1],
                   roi_box[2]:roi_box[3]] = roi_y_flow_map

        for i in range(scale_num):
            origin_rDx += x_flows[i]
            origin_rDy += y_flows[i]

        origin_rDx *= x_flow_map
        origin_rDy *= y_flow_map

        return origin_rDx, origin_rDy, x_flow_map, y_flow_map

    def __joint_to_body_box(self):
        joint_left = int(np.min(self.joints, axis=0)[0])
        joint_right = int(np.max(self.joints, axis=0)[0])
        joint_top = int(np.min(self.joints, axis=0)[1])
        joint_bottom = int(np.max(self.joints, axis=0)[1])
        return [joint_top, joint_bottom, joint_left, joint_right]

    def __joint_to_leg_box(self):
        leg_joints = self.joints[8:, :]
        if np.max(leg_joints, axis=0)[2] < 0.05:
            return [0, 0, 0, 0]
        joint_left = int(np.min(leg_joints, axis=0)[0])
        joint_right = int(np.max(leg_joints, axis=0)[0])
        joint_top = int(np.min(leg_joints, axis=0)[1])
        joint_bottom = int(np.max(leg_joints, axis=0)[1])
        return [joint_top, joint_bottom, joint_left, joint_right]

    def __joint_to_arm_box(self):
        arm_joints = self.joints[2:8, :]
        if np.max(arm_joints, axis=0)[2] < 0.05:
            return [0, 0, 0, 0]
        joint_left = int(np.min(arm_joints, axis=0)[0])
        joint_right = int(np.max(arm_joints, axis=0)[0])
        joint_top = int(np.min(arm_joints, axis=0)[1])
        joint_bottom = int(np.max(arm_joints, axis=0)[1])
        return [joint_top, joint_bottom, joint_left, joint_right]
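A wiring sketch of how PersonInfo is driven; this is not runnable as-is, since joints must come from the Body pose estimator below and flow_net must be a loaded FlowGenerator:

import numpy as np

# joints: (num_persons, 18, 3) array from Body(); one person selected
person = PersonInfo(joints[0])
pred = person.pred_flow(img, flow_net, device)
flow = np.dstack((pred['rDx'], pred['rDy']))  # dense (H, W, 2) displacement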
modelscope/models/cv/image_body_reshaping/pose_estimator/body.py (new file, 272 lines)
@@ -0,0 +1,272 @@
# The implementation is based on openpose, available at https://github.com/Hzzone/pytorch-openpose.

import math

import cv2
import numpy as np
import torch
from scipy.ndimage import gaussian_filter

from .model import BodyposeModel
from .util import pad_rightdown_corner, transfer


class Body(object):

    def __init__(self, model_path, device):
        self.model = BodyposeModel().to(device)
        model_dict = transfer(self.model, torch.load(model_path))
        self.model.load_state_dict(model_dict)
        self.model.eval()

    def __call__(self, oriImg):
        scale_search = [0.5]
        boxsize = 368
        stride = 8
        padValue = 128
        thre1 = 0.1
        thre2 = 0.05
        bodyparts = 18
        multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search]
        heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 19))
        paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38))

        for m in range(len(multiplier)):
            scale = multiplier[m]
            imageToTest = cv2.resize(
                oriImg, (0, 0),
                fx=scale,
                fy=scale,
                interpolation=cv2.INTER_CUBIC)
            imageToTest_padded, pad = pad_rightdown_corner(
                imageToTest, stride, padValue)
            im = np.transpose(
                np.float32(imageToTest_padded[:, :, :, np.newaxis]),
                (3, 2, 0, 1)) / 256 - 0.5
            im = np.ascontiguousarray(im)

            data = torch.from_numpy(im).float()
            if torch.cuda.is_available():
                data = data.cuda()
            with torch.no_grad():
                Mconv7_stage6_L1, Mconv7_stage6_L2 = self.model(data)
            Mconv7_stage6_L1 = Mconv7_stage6_L1.cpu().numpy()
            Mconv7_stage6_L2 = Mconv7_stage6_L2.cpu().numpy()

            # extract outputs, resize, and remove padding
            heatmap = np.transpose(np.squeeze(Mconv7_stage6_L2),
                                   (1, 2, 0))  # output 1 is heatmaps
            heatmap = cv2.resize(
                heatmap, (0, 0),
                fx=stride,
                fy=stride,
                interpolation=cv2.INTER_CUBIC)
            heatmap = heatmap[:imageToTest_padded.shape[0]
                              - pad[2], :imageToTest_padded.shape[1]
                              - pad[3], :]
            heatmap = cv2.resize(
                heatmap, (oriImg.shape[1], oriImg.shape[0]),
                interpolation=cv2.INTER_CUBIC)

            paf = np.transpose(np.squeeze(Mconv7_stage6_L1),
                               (1, 2, 0))  # output 0 is PAFs
            paf = cv2.resize(
                paf, (0, 0),
                fx=stride,
                fy=stride,
                interpolation=cv2.INTER_CUBIC)
            paf = paf[:imageToTest_padded.shape[0]
                      - pad[2], :imageToTest_padded.shape[1] - pad[3], :]
            paf = cv2.resize(
                paf, (oriImg.shape[1], oriImg.shape[0]),
                interpolation=cv2.INTER_CUBIC)

            heatmap_avg += heatmap / len(multiplier)
            paf_avg += paf / len(multiplier)

        all_peaks = []
        peak_counter = 0

        for part in range(bodyparts):
            map_ori = heatmap_avg[:, :, part]
            one_heatmap = gaussian_filter(map_ori, sigma=3)

            map_left = np.zeros(one_heatmap.shape)
            map_left[1:, :] = one_heatmap[:-1, :]
            map_right = np.zeros(one_heatmap.shape)
            map_right[:-1, :] = one_heatmap[1:, :]
            map_up = np.zeros(one_heatmap.shape)
            map_up[:, 1:] = one_heatmap[:, :-1]
            map_down = np.zeros(one_heatmap.shape)
            map_down[:, :-1] = one_heatmap[:, 1:]

            peaks_binary = np.logical_and.reduce(
                (one_heatmap >= map_left, one_heatmap >= map_right,
                 one_heatmap >= map_up, one_heatmap >= map_down,
                 one_heatmap > thre1))
            peaks = list(
                zip(np.nonzero(peaks_binary)[1],
                    np.nonzero(peaks_binary)[0]))  # note reverse
            peaks_with_score = [x + (map_ori[x[1], x[0]], ) for x in peaks]
            peak_id = range(peak_counter, peak_counter + len(peaks))
            peaks_with_score_and_id = [
                peaks_with_score[i] + (peak_id[i], )
                for i in range(len(peak_id))
            ]

            all_peaks.append(peaks_with_score_and_id)
            peak_counter += len(peaks)

        # find connection in the specified sequence, center 29 is in the position 15
        limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9],
                   [9, 10], [10, 11], [2, 12], [12, 13], [13, 14], [2, 1],
                   [1, 15], [15, 17], [1, 16], [16, 18], [3, 17], [6, 18]]
        # the middle joints heatmap correspondence
        mapIdx = [[31, 32], [39, 40], [33, 34], [35, 36], [41, 42], [43, 44],
                  [19, 20], [21, 22], [23, 24], [25, 26], [27, 28], [29, 30],
                  [47, 48], [49, 50], [53, 54], [51, 52], [55, 56], [37, 38],
                  [45, 46]]

        connection_all = []
        special_k = []
        mid_num = 10

        for k in range(len(mapIdx)):
            score_mid = paf_avg[:, :, [x - 19 for x in mapIdx[k]]]
            candA = all_peaks[limbSeq[k][0] - 1]
            candB = all_peaks[limbSeq[k][1] - 1]
            nA = len(candA)
            nB = len(candB)
            if (nA != 0 and nB != 0):
                connection_candidate = []
                for i in range(nA):
                    for j in range(nB):
                        vec = np.subtract(candB[j][:2], candA[i][:2])
                        norm = math.sqrt(vec[0] * vec[0] + vec[1] * vec[1])
                        norm = max(0.001, norm)
                        vec = np.divide(vec, norm)

                        startend = list(
                            zip(
                                np.linspace(
                                    candA[i][0], candB[j][0], num=mid_num),
                                np.linspace(
                                    candA[i][1], candB[j][1], num=mid_num)))

                        vec_x = np.array([
                            score_mid[int(round(startend[item][1])),
                                      int(round(startend[item][0])), 0]
                            for item in range(len(startend))
                        ])
                        vec_y = np.array([
                            score_mid[int(round(startend[item][1])),
                                      int(round(startend[item][0])), 1]
                            for item in range(len(startend))
                        ])

                        score_midpts = np.multiply(
                            vec_x, vec[0]) + np.multiply(vec_y, vec[1])
                        temp1 = sum(score_midpts) / len(score_midpts)
                        temp2 = min(0.5 * oriImg.shape[0] / norm - 1, 0)
                        score_with_dist_prior = temp1 + temp2
                        criterion1 = len(np.nonzero(
                            score_midpts > thre2)[0]) > 0.8 * len(score_midpts)
                        criterion2 = score_with_dist_prior > 0
                        if criterion1 and criterion2:
                            connection_candidate.append([
                                i, j, score_with_dist_prior,
                                score_with_dist_prior + candA[i][2]
                                + candB[j][2]
                            ])

                connection_candidate = sorted(
                    connection_candidate, key=lambda x: x[2], reverse=True)
                connection = np.zeros((0, 5))
                for c in range(len(connection_candidate)):
                    i, j, s = connection_candidate[c][0:3]
                    if (i not in connection[:, 3]
                            and j not in connection[:, 4]):
                        connection = np.vstack(
                            [connection, [candA[i][3], candB[j][3], s, i, j]])
                        if (len(connection) >= min(nA, nB)):
                            break

                connection_all.append(connection)
            else:
                special_k.append(k)
                connection_all.append([])

        # last number in each row is the total parts number of that person
        # the second last number in each row is the score of the overall configuration
        subset = -1 * np.ones((0, 20))
        candidate = np.array(
            [item for sublist in all_peaks for item in sublist])

        for k in range(len(mapIdx)):
            if k not in special_k:
                partAs = connection_all[k][:, 0]
                partBs = connection_all[k][:, 1]
                indexA, indexB = np.array(limbSeq[k]) - 1

                for i in range(len(connection_all[k])):  # = 1:size(temp,1)
                    found = 0
                    subset_idx = [-1, -1]
                    for j in range(len(subset)):  # 1:size(subset,1):
                        if subset[j][indexA] == partAs[i] or subset[j][
                                indexB] == partBs[i]:
                            subset_idx[found] = j
                            found += 1

                    if found == 1:
                        j = subset_idx[0]
                        if subset[j][indexB] != partBs[i]:
                            subset[j][indexB] = partBs[i]
                            subset[j][-1] += 1
                            subset[j][-2] += candidate[
                                partBs[i].astype(int),
                                2] + connection_all[k][i][2]
                    elif found == 2:  # if found 2 and disjoint, merge them
                        j1, j2 = subset_idx
                        tmp1 = (subset[j1] >= 0).astype(int)
                        tmp2 = (subset[j2] >= 0).astype(int)
                        membership = (tmp1 + tmp2)[:-2]
                        if len(np.nonzero(membership == 2)[0]) == 0:  # merge
                            subset[j1][:-2] += (subset[j2][:-2] + 1)
                            subset[j1][-2:] += subset[j2][-2:]
                            subset[j1][-2] += connection_all[k][i][2]
                            subset = np.delete(subset, j2, 0)
                        else:  # as like found == 1
                            subset[j1][indexB] = partBs[i]
                            subset[j1][-1] += 1
                            subset[j1][-2] += candidate[
                                partBs[i].astype(int),
                                2] + connection_all[k][i][2]

                    # if find no partA in the subset, create a new subset
                    elif not found and k < 17:
                        row = -1 * np.ones(20)
                        row[indexA] = partAs[i]
                        row[indexB] = partBs[i]
                        row[-1] = 2
                        row[-2] = sum(
                            candidate[connection_all[k][i, :2].astype(int),
                                      2]) + connection_all[k][i][2]
                        subset = np.vstack([subset, row])
        # delete rows of subset in which few parts occur
        deleteIdx = []
        for i in range(len(subset)):
            if subset[i][-1] < 4 or subset[i][-2] / subset[i][-1] < 0.4:
                deleteIdx.append(i)
        subset = np.delete(subset, deleteIdx, axis=0)

        # subset: n*20 array, 0-17 is the index in candidate, 18 is the total score, 19 is the total parts
        # candidate: x, y, score, id
        count = subset.shape[0]
        joints = np.zeros(shape=(count, bodyparts, 3))

        for i in range(count):
            for j in range(bodyparts):
                joints[i, j, :3] = candidate[int(subset[i, j]), :3]
                confidence = 1.0 if subset[i, j] >= 0 else 0.0
                joints[i, j, 2] *= confidence
        return joints
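An illustrative run of the estimator on a BGR image; the checkpoint filename matches the one loaded by ImageBodyReshaping above:

import cv2
import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
body_estimation = Body('body_pose_model.pth', device)
img = cv2.imread('data/test/images/image_body_reshaping.jpg')
joints = body_estimation(img)  # (num_persons, 18, 3): x, y, confidence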
modelscope/models/cv/image_body_reshaping/pose_estimator/model.py (new file, 141 lines)
@@ -0,0 +1,141 @@
# The implementation is based on openpose, available at https://github.com/Hzzone/pytorch-openpose.

from collections import OrderedDict

import torch
import torch.nn as nn


def make_layers(block, no_relu_layers):
    layers = []
    for layer_name, v in block.items():
        if 'pool' in layer_name:
            layer = nn.MaxPool2d(kernel_size=v[0], stride=v[1], padding=v[2])
            layers.append((layer_name, layer))
        else:
            conv2d = nn.Conv2d(
                in_channels=v[0],
                out_channels=v[1],
                kernel_size=v[2],
                stride=v[3],
                padding=v[4])
            layers.append((layer_name, conv2d))
            if layer_name not in no_relu_layers:
                layers.append(('relu_' + layer_name, nn.ReLU(inplace=True)))

    return nn.Sequential(OrderedDict(layers))


class BodyposeModel(nn.Module):

    def __init__(self):
        super(BodyposeModel, self).__init__()

        # these layers have no relu layer
        no_relu_layers = [
            'conv5_5_CPM_L1', 'conv5_5_CPM_L2', 'Mconv7_stage2_L1',
            'Mconv7_stage2_L2', 'Mconv7_stage3_L1', 'Mconv7_stage3_L2',
            'Mconv7_stage4_L1', 'Mconv7_stage4_L2', 'Mconv7_stage5_L1',
            'Mconv7_stage5_L2', 'Mconv7_stage6_L1', 'Mconv7_stage6_L2'
        ]
        blocks = {}
        block0 = OrderedDict([('conv1_1', [3, 64, 3, 1, 1]),
                              ('conv1_2', [64, 64, 3, 1, 1]),
                              ('pool1_stage1', [2, 2, 0]),
                              ('conv2_1', [64, 128, 3, 1, 1]),
                              ('conv2_2', [128, 128, 3, 1, 1]),
                              ('pool2_stage1', [2, 2, 0]),
                              ('conv3_1', [128, 256, 3, 1, 1]),
                              ('conv3_2', [256, 256, 3, 1, 1]),
                              ('conv3_3', [256, 256, 3, 1, 1]),
                              ('conv3_4', [256, 256, 3, 1, 1]),
                              ('pool3_stage1', [2, 2, 0]),
                              ('conv4_1', [256, 512, 3, 1, 1]),
                              ('conv4_2', [512, 512, 3, 1, 1]),
                              ('conv4_3_CPM', [512, 256, 3, 1, 1]),
                              ('conv4_4_CPM', [256, 128, 3, 1, 1])])

        # Stage 1
        block1_1 = OrderedDict([('conv5_1_CPM_L1', [128, 128, 3, 1, 1]),
                                ('conv5_2_CPM_L1', [128, 128, 3, 1, 1]),
                                ('conv5_3_CPM_L1', [128, 128, 3, 1, 1]),
                                ('conv5_4_CPM_L1', [128, 512, 1, 1, 0]),
                                ('conv5_5_CPM_L1', [512, 38, 1, 1, 0])])

        block1_2 = OrderedDict([('conv5_1_CPM_L2', [128, 128, 3, 1, 1]),
                                ('conv5_2_CPM_L2', [128, 128, 3, 1, 1]),
                                ('conv5_3_CPM_L2', [128, 128, 3, 1, 1]),
                                ('conv5_4_CPM_L2', [128, 512, 1, 1, 0]),
                                ('conv5_5_CPM_L2', [512, 19, 1, 1, 0])])
        blocks['block1_1'] = block1_1
        blocks['block1_2'] = block1_2

        self.model0 = make_layers(block0, no_relu_layers)

        # Stages 2 - 6
        for i in range(2, 7):
            blocks['block%d_1' % i] = OrderedDict([
                ('Mconv1_stage%d_L1' % i, [185, 128, 7, 1, 3]),
                ('Mconv2_stage%d_L1' % i, [128, 128, 7, 1, 3]),
                ('Mconv3_stage%d_L1' % i, [128, 128, 7, 1, 3]),
                ('Mconv4_stage%d_L1' % i, [128, 128, 7, 1, 3]),
                ('Mconv5_stage%d_L1' % i, [128, 128, 7, 1, 3]),
                ('Mconv6_stage%d_L1' % i, [128, 128, 1, 1, 0]),
                ('Mconv7_stage%d_L1' % i, [128, 38, 1, 1, 0])
            ])

            blocks['block%d_2' % i] = OrderedDict([
                ('Mconv1_stage%d_L2' % i, [185, 128, 7, 1, 3]),
                ('Mconv2_stage%d_L2' % i, [128, 128, 7, 1, 3]),
                ('Mconv3_stage%d_L2' % i, [128, 128, 7, 1, 3]),
                ('Mconv4_stage%d_L2' % i, [128, 128, 7, 1, 3]),
                ('Mconv5_stage%d_L2' % i, [128, 128, 7, 1, 3]),
                ('Mconv6_stage%d_L2' % i, [128, 128, 1, 1, 0]),
                ('Mconv7_stage%d_L2' % i, [128, 19, 1, 1, 0])
            ])

        for k in blocks.keys():
            blocks[k] = make_layers(blocks[k], no_relu_layers)

        self.model1_1 = blocks['block1_1']
        self.model2_1 = blocks['block2_1']
        self.model3_1 = blocks['block3_1']
        self.model4_1 = blocks['block4_1']
        self.model5_1 = blocks['block5_1']
        self.model6_1 = blocks['block6_1']

        self.model1_2 = blocks['block1_2']
        self.model2_2 = blocks['block2_2']
        self.model3_2 = blocks['block3_2']
        self.model4_2 = blocks['block4_2']
        self.model5_2 = blocks['block5_2']
        self.model6_2 = blocks['block6_2']

    def forward(self, x):

        out1 = self.model0(x)

        out1_1 = self.model1_1(out1)
        out1_2 = self.model1_2(out1)
        out2 = torch.cat([out1_1, out1_2, out1], 1)

        out2_1 = self.model2_1(out2)
        out2_2 = self.model2_2(out2)
        out3 = torch.cat([out2_1, out2_2, out1], 1)

        out3_1 = self.model3_1(out3)
        out3_2 = self.model3_2(out3)
        out4 = torch.cat([out3_1, out3_2, out1], 1)

        out4_1 = self.model4_1(out4)
        out4_2 = self.model4_2(out4)
        out5 = torch.cat([out4_1, out4_2, out1], 1)

        out5_1 = self.model5_1(out5)
        out5_2 = self.model5_2(out5)
        out6 = torch.cat([out5_1, out5_2, out1], 1)

        out6_1 = self.model6_1(out6)
        out6_2 = self.model6_2(out6)

        return out6_1, out6_2
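A shape check for the two-branch CPM: branch L1 predicts 38-channel PAFs and branch L2 predicts 19-channel heatmaps, both at 1/8 of the input resolution:

import torch

model = BodyposeModel().eval()
with torch.no_grad():
    paf, heatmap = model(torch.randn(1, 3, 368, 368))
print(paf.shape, heatmap.shape)  # (1, 38, 46, 46), (1, 19, 46, 46)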
modelscope/models/cv/image_body_reshaping/pose_estimator/util.py (new file, 33 lines)
@@ -0,0 +1,33 @@
# The implementation is based on openpose, available at https://github.com/Hzzone/pytorch-openpose.
import numpy as np


def pad_rightdown_corner(img, stride, padValue):
    h = img.shape[0]
    w = img.shape[1]

    pad = 4 * [None]
    pad[0] = 0  # up
    pad[1] = 0  # left
    pad[2] = 0 if (h % stride == 0) else stride - (h % stride)  # down
    pad[3] = 0 if (w % stride == 0) else stride - (w % stride)  # right

    img_padded = img
    pad_up = np.tile(img_padded[0:1, :, :] * 0 + padValue, (pad[0], 1, 1))
    img_padded = np.concatenate((pad_up, img_padded), axis=0)
    pad_left = np.tile(img_padded[:, 0:1, :] * 0 + padValue, (1, pad[1], 1))
    img_padded = np.concatenate((pad_left, img_padded), axis=1)
    pad_down = np.tile(img_padded[-2:-1, :, :] * 0 + padValue, (pad[2], 1, 1))
    img_padded = np.concatenate((img_padded, pad_down), axis=0)
    pad_right = np.tile(img_padded[:, -2:-1, :] * 0 + padValue, (1, pad[3], 1))
    img_padded = np.concatenate((img_padded, pad_right), axis=1)

    return img_padded, pad


def transfer(model, model_weights):
    transfered_model_weights = {}
    for weights_name in model.state_dict().keys():
        transfered_model_weights[weights_name] = model_weights['.'.join(
            weights_name.split('.')[1:])]
    return transfered_model_weights
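A quick check of the padding helper, which pads only the bottom and right so both sides become multiples of the network stride:

import numpy as np

img = np.zeros((45, 61, 3), dtype=np.uint8)
padded, pad = pad_rightdown_corner(img, stride=8, padValue=128)
print(padded.shape, pad)  # (48, 64, 3) [0, 0, 3, 3]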
|
||||
507
modelscope/models/cv/image_body_reshaping/slim_utils.py
Normal file
507
modelscope/models/cv/image_body_reshaping/slim_utils.py
Normal file
@@ -0,0 +1,507 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
|
||||
import math
|
||||
import os
|
||||
import random
|
||||
|
||||
import cv2
|
||||
import numba
|
||||
import numpy as np
|
||||
import torch
|
||||
|
||||
|
||||
def resize_on_long_side(img, long_side=800):
|
||||
src_height = img.shape[0]
|
||||
src_width = img.shape[1]
|
||||
|
||||
if src_height > src_width:
|
||||
scale = long_side * 1.0 / src_height
|
||||
_img = cv2.resize(
|
||||
img, (int(src_width * scale), long_side),
|
||||
interpolation=cv2.INTER_LINEAR)
|
||||
else:
|
||||
scale = long_side * 1.0 / src_width
|
||||
_img = cv2.resize(
|
||||
img, (long_side, int(src_height * scale)),
|
||||
interpolation=cv2.INTER_LINEAR)
|
||||
|
||||
return _img, scale
|
||||
|
||||
|
||||
def point_in_box(pt, box):
|
||||
pt_x = pt[0]
|
||||
pt_y = pt[1]
|
||||
|
||||
if pt_x >= box[0] and pt_x <= box[0] + box[2] and pt_y >= box[
|
||||
1] and pt_y <= box[1] + box[3]:
|
||||
return True
|
||||
else:
|
||||
return False
|
||||
|
||||
|
||||
def enlarge_box_tblr(roi_bbox, mask, ratio=0.4, use_long_side=True):
|
||||
if roi_bbox is None or None in roi_bbox:
|
||||
return [None, None, None, None]
|
||||
|
||||
top = roi_bbox[0]
|
||||
bottom = roi_bbox[1]
|
||||
left = roi_bbox[2]
|
||||
right = roi_bbox[3]
|
||||
|
||||
roi_width = roi_bbox[3] - roi_bbox[2]
|
||||
roi_height = roi_bbox[1] - roi_bbox[0]
|
||||
right = left + roi_width
|
||||
bottom = top + roi_height
|
||||
|
||||
long_side = roi_width if roi_width > roi_height else roi_height
|
||||
|
||||
if use_long_side:
|
||||
new_left = left - int(long_side * ratio)
|
||||
else:
|
||||
new_left = left - int(roi_width * ratio)
|
||||
new_left = 1 if new_left < 0 else new_left
|
||||
|
||||
if use_long_side:
|
||||
new_top = top - int(long_side * ratio)
|
||||
else:
|
||||
new_top = top - int(roi_height * ratio)
|
||||
new_top = 1 if new_top < 0 else new_top
|
||||
|
||||
if use_long_side:
|
||||
new_right = right + int(long_side * ratio)
|
||||
else:
|
||||
new_right = right + int(roi_width * ratio)
|
||||
new_right = mask.shape[1] - 2 if new_right > mask.shape[1] else new_right
|
||||
|
||||
if use_long_side:
|
||||
new_bottom = bottom + int(long_side * ratio)
|
||||
else:
|
||||
new_bottom = bottom + int(roi_height * ratio)
|
||||
new_bottom = mask.shape[0] - 2 if new_bottom > mask.shape[0] else new_bottom
|
||||
|
||||
bbox = [new_top, new_bottom, new_left, new_right]
|
||||
return bbox
|
||||
|
||||
|
||||
def gen_PAF(image, joints):
|
||||
|
||||
assert joints.shape[0] == 18
|
||||
assert joints.shape[1] == 3
|
||||
|
||||
org_h = image.shape[0]
|
||||
org_w = image.shape[1]
|
||||
small_image, resize_scale = resize_on_long_side(image, 120)
|
||||
|
||||
joints[:, :2] = joints[:, :2] * resize_scale
|
||||
|
||||
joint_left = int(np.min(joints, axis=0)[0])
|
||||
joint_right = int(np.max(joints, axis=0)[0])
|
||||
joint_top = int(np.min(joints, axis=0)[1])
|
||||
joint_bottom = int(np.max(joints, axis=0)[1])
|
||||
|
||||
limb_width = min(
|
||||
abs(joint_right - joint_left), abs(joint_bottom - joint_top)) // 6
|
||||
|
||||
if limb_width % 2 == 0:
|
||||
limb_width += 1
|
||||
kernel_size = limb_width
|
||||
|
||||
part_orders = [(5, 11), (2, 8), (5, 6), (6, 7), (2, 3), (3, 4), (11, 12),
|
||||
(12, 13), (8, 9), (9, 10)]
|
||||
|
||||
map_list = []
|
||||
mask_list = []
|
||||
PAF_all = np.zeros(
|
||||
shape=(small_image.shape[0], small_image.shape[1], 2),
|
||||
dtype=np.float32)
|
||||
for c, pair in enumerate(part_orders):
|
||||
idx_a_name = pair[0]
|
||||
idx_b_name = pair[1]
|
||||
|
||||
jointa = joints[idx_a_name]
|
||||
jointb = joints[idx_b_name]
|
||||
|
||||
confidence_threshold = 0.05
|
||||
if jointa[2] > confidence_threshold and jointb[
|
||||
2] > confidence_threshold:
|
||||
canvas = np.zeros(
|
||||
shape=(small_image.shape[0], small_image.shape[1]),
|
||||
dtype=np.uint8)
|
||||
|
||||
canvas = cv2.line(canvas, (int(jointa[0]), int(jointa[1])),
|
||||
(int(jointb[0]), int(jointb[1])),
|
||||
(255, 255, 255), 5)
|
||||
|
||||
kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,
|
||||
(kernel_size, kernel_size))
|
||||
|
||||
canvas = cv2.dilate(canvas, kernel, iterations=1)
|
||||
canvas = cv2.GaussianBlur(canvas, (kernel_size, kernel_size), 0)
|
||||
canvas = canvas.astype(np.float32) / 255
|
||||
PAF = np.zeros(
|
||||
shape=(small_image.shape[0], small_image.shape[1], 2),
|
||||
dtype=np.float32)
|
||||
PAF[..., 0] = jointb[0] - jointa[0]
|
||||
PAF[..., 1] = jointb[1] - jointa[1]
|
||||
mag, ang = cv2.cartToPolar(PAF[..., 0], PAF[..., 1])
|
||||
PAF /= (np.dstack((mag, mag)) + 1e-5)
|
||||
|
||||
single_PAF = PAF * np.dstack((canvas, canvas))
|
||||
map_list.append(
|
||||
cv2.GaussianBlur(single_PAF,
|
||||
(kernel_size * 3, kernel_size * 3), 0))
|
||||
|
||||
mask_list.append(
|
||||
cv2.GaussianBlur(canvas.copy(),
|
||||
(kernel_size * 3, kernel_size * 3), 0))
|
||||
PAF_all = PAF_all * (1.0 - np.dstack(
|
||||
(canvas, canvas))) + single_PAF
|
||||
|
||||
PAF_all = cv2.GaussianBlur(PAF_all, (kernel_size * 3, kernel_size * 3), 0)
|
||||
PAF_all = cv2.resize(
|
||||
PAF_all, (org_w, org_h), interpolation=cv2.INTER_LINEAR)
|
||||
map_list.append(PAF_all)
|
||||
return PAF_all, map_list, mask_list
|
||||
|
||||
|
||||
def gen_skeleton_map(joints, stack_mode='column', input_roi_box=None):
|
||||
if type(joints) == list:
|
||||
joints = np.array(joints)
|
||||
assert stack_mode == 'column' or stack_mode == 'depth'
|
||||
|
||||
part_orders = [(2, 5), (5, 11), (2, 8), (8, 11), (5, 6), (6, 7), (2, 3),
|
||||
(3, 4), (11, 12), (12, 13), (8, 9), (9, 10)]
|
||||
|
||||
def link(img, a, b, color, line_width, scale=1.0, x_offset=0, y_offset=0):
|
||||
jointa = joints[a]
|
||||
jointb = joints[b]
|
||||
|
||||
temp1 = int((jointa[0] - x_offset) * scale)
|
||||
temp2 = int((jointa[1] - y_offset) * scale)
|
||||
temp3 = int((jointb[0] - x_offset) * scale)
|
||||
temp4 = int((jointb[1] - y_offset) * scale)
|
||||
|
||||
cv2.line(img, (temp1, temp2), (temp3, temp4), color, line_width)
|
||||
|
||||
roi_box = input_roi_box
|
||||
|
||||
roi_box_width = roi_box[3] - roi_box[2]
|
||||
roi_box_height = roi_box[1] - roi_box[0]
|
||||
short_side_length = min(roi_box_width, roi_box_height)
|
||||
line_width = short_side_length // 30
|
||||
|
||||
line_width = max(line_width, 2)
|
||||
|
||||
map_cube = np.zeros(
|
||||
shape=(roi_box_height, roi_box_width, len(part_orders) + 1),
|
||||
dtype=np.float32)
|
||||
|
||||
use_line_width = min(5, line_width)
|
||||
fx = use_line_width * 1.0 / line_width  # fx is at most 1
|
||||
|
||||
if fx < 0.99:
|
||||
map_cube = cv2.resize(map_cube, (0, 0), fx=fx, fy=fx)
|
||||
|
||||
for c, pair in enumerate(part_orders):
|
||||
tmp = map_cube[..., c].copy()
|
||||
link(
|
||||
tmp,
|
||||
pair[0],
|
||||
pair[1], (2.0, 2.0, 2.0),
|
||||
use_line_width,
|
||||
scale=fx,
|
||||
x_offset=roi_box[2],
|
||||
y_offset=roi_box[0])
|
||||
map_cube[..., c] = tmp
|
||||
|
||||
tmp = map_cube[..., -1].copy()
|
||||
link(
|
||||
tmp,
|
||||
pair[0],
|
||||
pair[1], (2.0, 2.0, 2.0),
|
||||
use_line_width,
|
||||
scale=fx,
|
||||
x_offset=roi_box[2],
|
||||
y_offset=roi_box[0])
|
||||
map_cube[..., -1] = tmp
|
||||
|
||||
map_cube = cv2.resize(map_cube, (roi_box_width, roi_box_height))
|
||||
|
||||
if stack_mode == 'depth':
|
||||
return map_cube, roi_box
|
||||
elif stack_mode == 'column':
|
||||
joint_maps = []
|
||||
for c in range(len(part_orders) + 1):
|
||||
joint_maps.append(map_cube[..., c])
|
||||
joint_map = np.column_stack(joint_maps)
|
||||
|
||||
return joint_map, roi_box
|
||||
|
||||
|
||||
def plot_one_box(x, img, color=None, label=None, line_thickness=None):
|
||||
tl = line_thickness or round(
|
||||
0.002 * (img.shape[0] + img.shape[1]) / 2) + 1 # line/font thickness
|
||||
color = color or [random.randint(0, 255) for _ in range(3)]
|
||||
c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3]))
|
||||
cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA)
|
||||
if label:
|
||||
tf = max(tl - 1, 1) # font thickness
|
||||
t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
|
||||
c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3
|
||||
cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA) # filled
|
||||
cv2.putText(
|
||||
img,
|
||||
label, (c1[0], c1[1] - 2),
|
||||
0,
|
||||
tl / 3, [225, 255, 255],
|
||||
thickness=tf,
|
||||
lineType=cv2.LINE_AA)
|
||||
|
||||
|
||||
def draw_line(im, points, color, stroke_size=2, closed=False):
|
||||
points = points.astype(np.int32)
|
||||
for i in range(len(points) - 1):
|
||||
cv2.line(im, tuple(points[i]), tuple(points[i + 1]), color,
|
||||
stroke_size)
|
||||
if closed:
|
||||
cv2.line(im, tuple(points[0]), tuple(points[-1]), color, stroke_size)
|
||||
|
||||
|
||||
def enlarged_bbox(bbox, img_width, img_height, enlarge_ratio=0.2):
|
||||
left = bbox[0]
|
||||
top = bbox[1]
|
||||
|
||||
right = bbox[2]
|
||||
bottom = bbox[3]
|
||||
|
||||
roi_width = right - left
|
||||
roi_height = bottom - top
|
||||
|
||||
new_left = left - int(roi_width * enlarge_ratio)
|
||||
new_left = 0 if new_left < 0 else new_left
|
||||
|
||||
new_top = top - int(roi_height * enlarge_ratio)
|
||||
new_top = 0 if new_top < 0 else new_top
|
||||
|
||||
new_right = right + int(roi_width * enlarge_ratio)
|
||||
new_right = img_width if new_right > img_width else new_right
|
||||
|
||||
new_bottom = bottom + int(roi_height * enlarge_ratio)
|
||||
new_bottom = img_height if new_bottom > img_height else new_bottom
|
||||
|
||||
bbox = [new_left, new_top, new_right, new_bottom]
|
||||
|
||||
bbox = [int(x) for x in bbox]
|
||||
|
||||
return bbox
|
||||
|
||||
|
||||
def get_map_fusion_map_cuda(map_list, threshold=1, device=torch.device('cpu')):
|
||||
map_list_cuda = [torch.from_numpy(x).to(device) for x in map_list]
|
||||
map_concat = torch.stack(tuple(map_list_cuda), dim=-1)
|
||||
|
||||
map_concat = torch.abs(map_concat)
|
||||
|
||||
map_concat[map_concat < threshold] = 0
|
||||
map_concat[map_concat > 1e-5] = 1.0
|
||||
|
||||
sum_map = torch.sum(map_concat, dim=2)
|
||||
a = torch.ones_like(sum_map)
|
||||
acc_map = torch.where(sum_map > 0, a * 2.0, torch.zeros_like(sum_map))
|
||||
|
||||
fusion_map = torch.where(sum_map < 0.5, a * 1.5, sum_map)
|
||||
|
||||
fusion_map = fusion_map.float()
|
||||
acc_map = acc_map.float()
|
||||
|
||||
fusion_map = fusion_map.cpu().numpy().astype(np.float32)
|
||||
acc_map = acc_map.cpu().numpy().astype(np.float32)
|
||||
|
||||
return fusion_map, acc_map
|
||||
|
||||
|
||||
def gen_border_shade(height, width, height_band, width_band):
|
||||
height_ratio = height_band * 1.0 / height
|
||||
width_ratio = width_band * 1.0 / width
|
||||
|
||||
_height_band = int(256 * height_ratio)
|
||||
_width_band = int(256 * width_ratio)
|
||||
|
||||
canvas = np.zeros((256, 256), dtype=np.float32)
|
||||
|
||||
canvas[_height_band // 2:-_height_band // 2,
|
||||
_width_band // 2:-_width_band // 2] = 1.0
|
||||
|
||||
canvas = cv2.blur(canvas, (_height_band, _width_band))
|
||||
|
||||
canvas = cv2.resize(canvas, (width, height))
|
||||
|
||||
return canvas
|
||||
|
||||
|
||||
def get_mask_bbox(mask, threshold=127):
|
||||
ret, mask = cv2.threshold(mask, threshold, 1, 0)
|
||||
|
||||
if cv2.countNonZero(mask) == 0:
|
||||
return [None, None, None, None]
|
||||
|
||||
col_acc = np.sum(mask, 0)
|
||||
row_acc = np.sum(mask, 1)
|
||||
|
||||
col_acc = col_acc.tolist()
|
||||
row_acc = row_acc.tolist()
|
||||
|
||||
for x in range(len(col_acc)):
|
||||
if col_acc[x] > 0:
|
||||
left = x
|
||||
break
|
||||
|
||||
for x in range(1, len(col_acc)):
|
||||
if col_acc[-x] > 0:
|
||||
right = len(col_acc) - x
|
||||
break
|
||||
|
||||
for x in range(len(row_acc)):
|
||||
if row_acc[x] > 0:
|
||||
top = x
|
||||
break
|
||||
|
||||
for x in range(1, len(row_acc)):
|
||||
if row_acc[-x] > 0:
|
||||
bottom = len(row_acc) - x
|
||||
break
|
||||
return [top, bottom, left, right]
|
||||
|
||||
|
||||
def visualize_flow(flow):
|
||||
h, w = flow.shape[:2]
|
||||
hsv = np.zeros((h, w, 3), np.uint8)
|
||||
mag, ang = cv2.cartToPolar(flow[..., 0], flow[..., 1])
|
||||
|
||||
hsv[..., 0] = ang * 180 / np.pi / 2
|
||||
hsv[..., 1] = cv2.normalize(mag, None, 0, 255, cv2.NORM_MINMAX)
|
||||
hsv[..., 2] = 255
|
||||
bgr = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)
|
||||
bgr = bgr * 1.0 / 255
|
||||
return bgr.astype(np.float32)
|
||||
|
||||
|
||||
def vis_joints(image, joints, color, show_text=True, confidence_threshold=0.1):
|
||||
|
||||
part_orders = [(2, 5), (5, 11), (2, 8), (8, 11), (5, 6), (6, 7), (2, 3),
|
||||
(3, 4), (11, 12), (12, 13), (8, 9), (9, 10)]
|
||||
|
||||
abandon_idxs = [0, 1, 14, 15, 16, 17]
|
||||
# draw joints
|
||||
for i, joint in enumerate(joints):
|
||||
if i in abandon_idxs:
|
||||
continue
|
||||
if joint[-1] > confidence_threshold:
|
||||
|
||||
cv2.circle(image, (int(joint[0]), int(joint[1])), 1, color, 2)
|
||||
if show_text:
|
||||
cv2.putText(image,
|
||||
str(i) + '[{:.2f}]'.format(joint[-1]),
|
||||
(int(joint[0]), int(joint[1])),
|
||||
cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
|
||||
# draw link
|
||||
for pair in part_orders:
|
||||
if joints[pair[0]][-1] > confidence_threshold and joints[
|
||||
pair[1]][-1] > confidence_threshold:
|
||||
cv2.line(image, (int(joints[pair[0]][0]), int(joints[pair[0]][1])),
|
||||
(int(joints[pair[1]][0]), int(joints[pair[1]][1])), color,
|
||||
2)
|
||||
return image
|
||||
|
||||
|
||||
def get_heatmap_cv(img, magn, max_flow_mag):
|
||||
min_flow_mag = .5
|
||||
cv_magn = np.clip(
|
||||
255 * (magn - min_flow_mag) / (max_flow_mag - min_flow_mag + 1e-7),
|
||||
a_min=0,
|
||||
a_max=255).astype(np.uint8)
|
||||
if img.dtype != np.uint8:
|
||||
img = (255 * img).astype(np.uint8)
|
||||
|
||||
heatmap_img = cv2.applyColorMap(cv_magn, cv2.COLORMAP_JET)
|
||||
heatmap_img = heatmap_img[..., ::-1]
|
||||
|
||||
h, w = magn.shape
|
||||
img_alpha = np.ones((h, w), dtype=np.double)[:, :, None]
|
||||
heatmap_alpha = np.clip(
|
||||
magn / (max_flow_mag + 1e-7), a_min=1e-7, a_max=1)[:, :, None]**.7
|
||||
heatmap_alpha[heatmap_alpha < .2] **= .5  # in-place; the bare expression was a no-op
|
||||
pm_hm = heatmap_img * heatmap_alpha
|
||||
pm_img = img * img_alpha
|
||||
cv_out = pm_hm + pm_img * (1 - heatmap_alpha)
|
||||
cv_out = np.clip(cv_out, a_min=0, a_max=255).astype(np.uint8)
|
||||
|
||||
return cv_out
|
||||
|
||||
|
||||
def save_heatmap_cv(img, flow, supression=2):
|
||||
|
||||
flow_magn = np.sqrt(flow[:, :, 0]**2 + flow[:, :, 1]**2)
|
||||
flow_magn -= supression
|
||||
flow_magn[flow_magn <= 0] = 0
|
||||
cv_out = get_heatmap_cv(img, flow_magn, np.max(flow_magn) * 1.3)
|
||||
return cv_out
|
||||
|
||||
|
||||
@numba.jit(nopython=True, parallel=False)
|
||||
def bilinear_interp(x, y, v11, v12, v21, v22):
|
||||
temp1 = (v11 * (1 - y) + v12 * y) * (1 - x)
|
||||
temp2 = (v21 * (1 - y) + v22 * y) * x
|
||||
result = temp1 + temp2
|
||||
return result
|
||||
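Two worked values for the kernel above (`x` and `y` are fractional offsets inside the unit cell spanned by the four corner values):

    # At the cell center the result is the mean of the four corners.
    assert bilinear_interp(0.5, 0.5, 0.0, 1.0, 1.0, 2.0) == 1.0
    # With x = 0 only the v11/v12 edge contributes.
    assert bilinear_interp(0.0, 0.25, 0.0, 1.0, 1.0, 2.0) == 0.25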
|
||||
|
||||
@numba.jit(nopython=True, parallel=False)
|
||||
def image_warp_grid1(rDx, rDy, oriImg, transRatio, width_expand,
|
||||
height_expand):
|
||||
srcW = oriImg.shape[1]
|
||||
srcH = oriImg.shape[0]
|
||||
|
||||
newImg = oriImg.copy()
|
||||
|
||||
for i in range(srcH):
|
||||
for j in range(srcW):
|
||||
_i = i
|
||||
_j = j
|
||||
|
||||
deltaX = rDx[_i, _j]
|
||||
deltaY = rDy[_i, _j]
|
||||
|
||||
nx = _j + deltaX * transRatio
|
||||
ny = _i + deltaY * transRatio
|
||||
|
||||
if nx >= srcW - width_expand - 1:
|
||||
if nx > srcW - 1:
|
||||
nx = srcW - 1
|
||||
|
||||
if ny >= srcH - height_expand - 1:
|
||||
if ny > srcH - 1:
|
||||
ny = srcH - 1
|
||||
|
||||
if nx < width_expand:
|
||||
if nx < 0:
|
||||
nx = 0
|
||||
|
||||
if ny < height_expand:
|
||||
if ny < 0:
|
||||
ny = 0
|
||||
|
||||
nxi = int(math.floor(nx))
|
||||
nyi = int(math.floor(ny))
|
||||
nxi1 = int(math.ceil(nx))
|
||||
nyi1 = int(math.ceil(ny))
|
||||
|
||||
for ll in range(3):
|
||||
newImg[_i, _j,
|
||||
ll] = bilinear_interp(ny - nyi, nx - nxi,
|
||||
oriImg[nyi, nxi,
|
||||
ll], oriImg[nyi, nxi1, ll],
|
||||
oriImg[nyi1, nxi,
|
||||
ll], oriImg[nyi1, nxi1,
|
||||
ll])
|
||||
return newImg
|
||||
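Note that `image_warp_grid1` is a backward warp: for every destination pixel (i, j) it samples the source image at (j + deltaX * transRatio, i + deltaY * transRatio) with bilinear interpolation, which avoids the holes a forward warp would leave.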
@@ -1,3 +1,6 @@
|
||||
# The implementation is adopted from Jingwen He,
|
||||
# made publicly available at https://github.com/hejingwenhejingwen/CSRNet
|
||||
|
||||
import functools
|
||||
import math
|
||||
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
import os.path as osp
|
||||
from copy import deepcopy
|
||||
from typing import Dict, Union
|
||||
|
||||
@@ -1,3 +1,8 @@
|
||||
# ------------------------------------------------------------------------
|
||||
# Modified from https://github.com/megvii-research/NAFNet/blob/main/basicsr/models/archs/NAFNet_arch.py
|
||||
# Copyright (c) 2022 megvii-model. All Rights Reserved.
|
||||
# ------------------------------------------------------------------------
|
||||
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
@@ -1,3 +1,8 @@
|
||||
# ------------------------------------------------------------------------
|
||||
# Modified from BasicSR (https://github.com/xinntao/BasicSR)
|
||||
# Copyright 2018-2020 BasicSR Authors
|
||||
# ------------------------------------------------------------------------
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
import os
|
||||
from copy import deepcopy
|
||||
from typing import Any, Dict, Union
|
||||
|
||||
import numpy as np
|
||||
import torch.cuda
|
||||
from torch.nn.parallel import DataParallel, DistributedDataParallel
|
||||
|
||||
@@ -77,13 +77,8 @@ class NAFNetForImageDenoise(TorchModel):
|
||||
def _evaluate_postprocess(self, input: Tensor,
|
||||
target: Tensor) -> Dict[str, list]:
|
||||
preds = self.model(input)
|
||||
preds = list(torch.split(preds, 1, 0))
|
||||
targets = list(torch.split(target, 1, 0))
|
||||
|
||||
        preds = [(pred.data * 255.).squeeze(0).permute(
            1, 2, 0).cpu().numpy().astype(np.uint8) for pred in preds]
        targets = [(target.data * 255.).squeeze(0).permute(
            1, 2, 0).cpu().numpy().astype(np.uint8) for target in targets]
|
||||
|
||||
return {'pred': preds, 'target': targets}
|
||||
|
||||
|
||||
@@ -4,11 +4,11 @@ from typing import TYPE_CHECKING
|
||||
from modelscope.utils.import_utils import LazyImportModule
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .image_denoise_dataset import PairedImageDataset
|
||||
from .model import FFTInpainting
|
||||
|
||||
else:
|
||||
_import_structure = {
|
||||
'image_denoise_dataset': ['PairedImageDataset'],
|
||||
'model': ['FFTInpainting'],
|
||||
}
|
||||
|
||||
import sys
|
||||
75
modelscope/models/cv/image_inpainting/base.py
Normal file
@@ -0,0 +1,75 @@
|
||||
"""
|
||||
Part of the implementation is borrowed and modified from LaMa, publicly available at
|
||||
https://github.com/saic-mdal/lama
|
||||
"""
|
||||
from typing import Dict, Tuple
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
from modelscope.utils.logger import get_logger
|
||||
from .modules.adversarial import NonSaturatingWithR1
|
||||
from .modules.ffc import FFCResNetGenerator
|
||||
from .modules.perceptual import ResNetPL
|
||||
from .modules.pix2pixhd import NLayerDiscriminator
|
||||
|
||||
LOGGER = get_logger()
|
||||
|
||||
|
||||
class BaseInpaintingTrainingModule(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
model_dir='',
|
||||
use_ddp=True,
|
||||
predict_only=False,
|
||||
visualize_each_iters=100,
|
||||
average_generator=False,
|
||||
generator_avg_beta=0.999,
|
||||
average_generator_start_step=30000,
|
||||
average_generator_period=10,
|
||||
store_discr_outputs_for_vis=False,
|
||||
**kwargs):
|
||||
super().__init__()
|
||||
LOGGER.info(
|
||||
f'BaseInpaintingTrainingModule init called, predict_only is {predict_only}'
|
||||
)
|
||||
|
||||
self.generator = FFCResNetGenerator()
|
||||
self.use_ddp = use_ddp
|
||||
|
||||
if not predict_only:
|
||||
self.discriminator = NLayerDiscriminator()
|
||||
self.adversarial_loss = NonSaturatingWithR1(
|
||||
weight=10,
|
||||
gp_coef=0.001,
|
||||
mask_as_fake_target=True,
|
||||
allow_scale_mask=True)
|
||||
|
||||
self.average_generator = average_generator
|
||||
self.generator_avg_beta = generator_avg_beta
|
||||
self.average_generator_start_step = average_generator_start_step
|
||||
self.average_generator_period = average_generator_period
|
||||
self.generator_average = None
|
||||
self.last_generator_averaging_step = -1
|
||||
self.store_discr_outputs_for_vis = store_discr_outputs_for_vis
|
||||
|
||||
self.loss_l1 = nn.L1Loss(reduction='none')
|
||||
|
||||
self.loss_resnet_pl = ResNetPL(weight=30, weights_path=model_dir)
|
||||
|
||||
self.visualize_each_iters = visualize_each_iters
|
||||
LOGGER.info('BaseInpaintingTrainingModule init done')
|
||||
|
||||
def forward(self, batch: Dict[str,
|
||||
torch.Tensor]) -> Dict[str, torch.Tensor]:
|
||||
"""Pass data through generator and obtain at leas 'predicted_image' and 'inpainted' keys"""
|
||||
raise NotImplementedError()
|
||||
|
||||
def generator_loss(self,
|
||||
batch) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
|
||||
raise NotImplementedError()
|
||||
|
||||
def discriminator_loss(
|
||||
self, batch) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
|
||||
raise NotImplementedError()
|
||||
210
modelscope/models/cv/image_inpainting/default.py
Normal file
@@ -0,0 +1,210 @@
|
||||
"""
|
||||
Part of the implementation is borrowed and modified from LaMa, publicly available at
|
||||
https://github.com/saic-mdal/lama
|
||||
"""
|
||||
import bisect
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
from modelscope.utils.logger import get_logger
|
||||
from .base import BaseInpaintingTrainingModule
|
||||
from .modules.feature_matching import feature_matching_loss, masked_l1_loss
|
||||
|
||||
LOGGER = get_logger()
|
||||
|
||||
|
||||
def set_requires_grad(module, value):
|
||||
for param in module.parameters():
|
||||
param.requires_grad = value
|
||||
|
||||
|
||||
def add_prefix_to_keys(dct, prefix):
|
||||
return {prefix + k: v for k, v in dct.items()}
|
||||
|
||||
|
||||
class LinearRamp:
|
||||
|
||||
def __init__(self, start_value=0, end_value=1, start_iter=-1, end_iter=0):
|
||||
self.start_value = start_value
|
||||
self.end_value = end_value
|
||||
self.start_iter = start_iter
|
||||
self.end_iter = end_iter
|
||||
|
||||
def __call__(self, i):
|
||||
if i < self.start_iter:
|
||||
return self.start_value
|
||||
if i >= self.end_iter:
|
||||
return self.end_value
|
||||
part = (i - self.start_iter) / (self.end_iter - self.start_iter)
|
||||
return self.start_value * (1 - part) + self.end_value * part
|
||||
|
||||
|
||||
class LadderRamp:
|
||||
|
||||
def __init__(self, start_iters, values):
|
||||
self.start_iters = start_iters
|
||||
self.values = values
|
||||
assert len(values) == len(start_iters) + 1, (len(values),
|
||||
len(start_iters))
|
||||
|
||||
def __call__(self, i):
|
||||
segment_i = bisect.bisect_right(self.start_iters, i)
|
||||
return self.values[segment_i]
|
||||
|
||||
|
||||
def get_ramp(kind='ladder', **kwargs):
|
||||
if kind == 'linear':
|
||||
return LinearRamp(**kwargs)
|
||||
if kind == 'ladder':
|
||||
return LadderRamp(**kwargs)
|
||||
raise ValueError(f'Unexpected ramp kind: {kind}')
|
||||
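A small sketch of the two schedules above (toy numbers): `LinearRamp` interpolates between two values over an iteration window, while `LadderRamp` steps through a list of values at the given start iterations.

    ramp = get_ramp(kind='linear', start_value=0.0, end_value=1.0,
                    start_iter=0, end_iter=100)
    assert ramp(50) == 0.5      # halfway through the window
    assert ramp(200) == 1.0     # clamped at end_value

    ladder = get_ramp(kind='ladder', start_iters=[1000, 2000],
                      values=[0.1, 0.5, 1.0])
    assert ladder(0) == 0.1 and ladder(1500) == 0.5 and ladder(2500) == 1.0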
|
||||
|
||||
class DefaultInpaintingTrainingModule(BaseInpaintingTrainingModule):
|
||||
|
||||
def __init__(self,
|
||||
model_dir='',
|
||||
predict_only=False,
|
||||
concat_mask=True,
|
||||
rescale_scheduler_kwargs=None,
|
||||
image_to_discriminator='predicted_image',
|
||||
add_noise_kwargs=None,
|
||||
noise_fill_hole=False,
|
||||
const_area_crop_kwargs=None,
|
||||
distance_weighter_kwargs=None,
|
||||
distance_weighted_mask_for_discr=False,
|
||||
fake_fakes_proba=0,
|
||||
fake_fakes_generator_kwargs=None,
|
||||
**kwargs):
|
||||
super().__init__(model_dir=model_dir, predict_only=predict_only)
|
||||
self.concat_mask = concat_mask
|
||||
self.rescale_size_getter = get_ramp(
|
||||
**rescale_scheduler_kwargs
|
||||
) if rescale_scheduler_kwargs is not None else None
|
||||
self.image_to_discriminator = image_to_discriminator
|
||||
self.add_noise_kwargs = add_noise_kwargs
|
||||
self.noise_fill_hole = noise_fill_hole
|
||||
self.const_area_crop_kwargs = const_area_crop_kwargs
|
||||
self.refine_mask_for_losses = None
|
||||
self.distance_weighted_mask_for_discr = distance_weighted_mask_for_discr
|
||||
|
||||
self.feature_matching_weight = 100
|
||||
self.losses_l1_weight_known = 10
|
||||
self.losses_l1_weight_missing = 0
|
||||
self.fake_fakes_proba = fake_fakes_proba
|
||||
|
||||
def forward(self, batch):
|
||||
img = batch['image']
|
||||
mask = batch['mask']
|
||||
|
||||
masked_img = img * (1 - mask)
|
||||
|
||||
if self.concat_mask:
|
||||
masked_img = torch.cat([masked_img, mask], dim=1)
|
||||
|
||||
batch['predicted_image'] = self.generator(masked_img)
|
||||
batch['inpainted'] = mask * batch['predicted_image'] + (
|
||||
1 - mask) * batch['image']
|
||||
|
||||
batch['mask_for_losses'] = mask
|
||||
|
||||
return batch
|
||||
|
||||
def generator_loss(self, batch):
|
||||
img = batch['image']
|
||||
predicted_img = batch[self.image_to_discriminator]
|
||||
original_mask = batch['mask']
|
||||
supervised_mask = batch['mask_for_losses']
|
||||
|
||||
# L1
|
||||
l1_value = masked_l1_loss(predicted_img, img, supervised_mask,
|
||||
self.losses_l1_weight_known,
|
||||
self.losses_l1_weight_missing)
|
||||
|
||||
total_loss = l1_value
|
||||
metrics = dict(gen_l1=l1_value)
|
||||
|
||||
# discriminator
|
||||
# adversarial_loss calls backward by itself
|
||||
mask_for_discr = supervised_mask if self.distance_weighted_mask_for_discr else original_mask
|
||||
self.adversarial_loss.pre_generator_step(
|
||||
real_batch=img,
|
||||
fake_batch=predicted_img,
|
||||
generator=self.generator,
|
||||
discriminator=self.discriminator)
|
||||
discr_real_pred, discr_real_features = self.discriminator(img)
|
||||
discr_fake_pred, discr_fake_features = self.discriminator(
|
||||
predicted_img)
|
||||
adv_gen_loss, adv_metrics = self.adversarial_loss.generator_loss(
|
||||
real_batch=img,
|
||||
fake_batch=predicted_img,
|
||||
discr_real_pred=discr_real_pred,
|
||||
discr_fake_pred=discr_fake_pred,
|
||||
mask=mask_for_discr)
|
||||
total_loss = total_loss + adv_gen_loss
|
||||
metrics['gen_adv'] = adv_gen_loss
|
||||
metrics.update(add_prefix_to_keys(adv_metrics, 'adv_'))
|
||||
|
||||
# feature matching
|
||||
if self.feature_matching_weight > 0:
|
||||
need_mask_in_fm = False
|
||||
mask_for_fm = supervised_mask if need_mask_in_fm else None
|
||||
fm_value = feature_matching_loss(
|
||||
discr_fake_features, discr_real_features,
|
||||
mask=mask_for_fm) * self.feature_matching_weight
|
||||
total_loss = total_loss + fm_value
|
||||
metrics['gen_fm'] = fm_value
|
||||
|
||||
if self.loss_resnet_pl is not None:
|
||||
resnet_pl_value = self.loss_resnet_pl(predicted_img, img)
|
||||
total_loss = total_loss + resnet_pl_value
|
||||
metrics['gen_resnet_pl'] = resnet_pl_value
|
||||
|
||||
return total_loss, metrics
|
||||
|
||||
def discriminator_loss(self, batch):
|
||||
total_loss = 0
|
||||
metrics = {}
|
||||
|
||||
predicted_img = batch[self.image_to_discriminator].detach()
|
||||
self.adversarial_loss.pre_discriminator_step(
|
||||
real_batch=batch['image'],
|
||||
fake_batch=predicted_img,
|
||||
generator=self.generator,
|
||||
discriminator=self.discriminator)
|
||||
discr_real_pred, discr_real_features = self.discriminator(
|
||||
batch['image'])
|
||||
discr_fake_pred, discr_fake_features = self.discriminator(
|
||||
predicted_img)
|
||||
adv_discr_loss, adv_metrics = self.adversarial_loss.discriminator_loss(
|
||||
real_batch=batch['image'],
|
||||
fake_batch=predicted_img,
|
||||
discr_real_pred=discr_real_pred,
|
||||
discr_fake_pred=discr_fake_pred,
|
||||
mask=batch['mask'])
|
||||
|
||||
total_loss = (total_loss + adv_discr_loss) * 0.1
|
||||
metrics['discr_adv'] = adv_discr_loss
|
||||
metrics.update(add_prefix_to_keys(adv_metrics, 'adv_'))
|
||||
|
||||
return total_loss, metrics
|
||||
|
||||
def _do_step(self, batch, optimizer_idx=None):
|
||||
if optimizer_idx == 0: # step for generator
|
||||
set_requires_grad(self.generator, True)
|
||||
set_requires_grad(self.discriminator, False)
|
||||
elif optimizer_idx == 1: # step for discriminator
|
||||
set_requires_grad(self.generator, False)
|
||||
set_requires_grad(self.discriminator, True)
|
||||
|
||||
batch = self(batch)
|
||||
total_loss = 0
|
||||
if optimizer_idx is None or optimizer_idx == 0: # step for generator
|
||||
total_loss, metrics = self.generator_loss(batch)
|
||||
|
||||
elif optimizer_idx is None or optimizer_idx == 1: # step for discriminator
|
||||
total_loss, metrics = self.discriminator_loss(batch)
|
||||
|
||||
result = dict(loss=total_loss)
|
||||
return result
|
||||
36
modelscope/models/cv/image_inpainting/model.py
Normal file
@@ -0,0 +1,36 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
import os
|
||||
from typing import Any, Dict, Optional, Union
|
||||
|
||||
import torch
|
||||
|
||||
from modelscope.metainfo import Models
|
||||
from modelscope.models.base.base_torch_model import TorchModel
|
||||
from modelscope.models.builder import MODELS
|
||||
from modelscope.utils.constant import ModelFile, Tasks
|
||||
from modelscope.utils.logger import get_logger
|
||||
|
||||
LOGGER = get_logger()
|
||||
|
||||
|
||||
@MODELS.register_module(
|
||||
Tasks.image_inpainting, module_name=Models.image_inpainting)
|
||||
class FFTInpainting(TorchModel):
|
||||
|
||||
def __init__(self, model_dir: str, **kwargs):
|
||||
super().__init__(model_dir, **kwargs)
|
||||
|
||||
from .default import DefaultInpaintingTrainingModule
|
||||
pretrained = kwargs.get('pretrained', True)
|
||||
predict_only = kwargs.get('predict_only', False)
|
||||
net = DefaultInpaintingTrainingModule(
|
||||
model_dir=model_dir, predict_only=predict_only)
|
||||
if pretrained:
|
||||
path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE)
|
||||
LOGGER.info(f'loading pretrained model from {path}')
|
||||
state = torch.load(path, map_location='cpu')
|
||||
net.load_state_dict(state, strict=False)
|
||||
self.model = net
|
||||
|
||||
def forward(self, inputs):
|
||||
return self.model(inputs)
|
||||
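For orientation, a hedged sketch of how a registered model like this is usually reached through the pipeline API; the model id and input keys below are assumptions for illustration, not taken from this diff:

    from modelscope.pipelines import pipeline
    from modelscope.utils.constant import Tasks

    # Model id and input keys are illustrative; check the actual model card.
    inpainting = pipeline(Tasks.image_inpainting,
                          model='damo/cv_fft_inpainting_lama')
    result = inpainting(input={'img': 'image.png', 'mask': 'mask.png'})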
@@ -0,0 +1,2 @@
|
||||
# Copyright (c) Alibaba, Inc. and its affiliates.
|
||||
from .base import ModelBuilder
|
||||
380
modelscope/models/cv/image_inpainting/modules/ade20k/base.py
Normal file
@@ -0,0 +1,380 @@
|
||||
"""
|
||||
Part of the implementation is borrowed and modified from LaMa, publicly available at
|
||||
https://github.com/saic-mdal/lama
|
||||
"""
|
||||
|
||||
import os
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from torch.nn.modules import BatchNorm2d
|
||||
|
||||
from . import resnet
|
||||
|
||||
NUM_CLASS = 150
|
||||
|
||||
|
||||
# Model Builder
|
||||
class ModelBuilder:
|
||||
# custom weights initialization
|
||||
@staticmethod
|
||||
def weights_init(m):
|
||||
classname = m.__class__.__name__
|
||||
if classname.find('Conv') != -1:
|
||||
nn.init.kaiming_normal_(m.weight.data)
|
||||
elif classname.find('BatchNorm') != -1:
|
||||
m.weight.data.fill_(1.)
|
||||
m.bias.data.fill_(1e-4)
|
||||
|
||||
@staticmethod
|
||||
def build_encoder(arch='resnet50dilated',
|
||||
fc_dim=512,
|
||||
weights='',
|
||||
model_dir=''):
|
||||
pretrained = True if len(weights) == 0 else False
|
||||
arch = arch.lower()
|
||||
if arch == 'resnet50dilated':
|
||||
orig_resnet = resnet.__dict__['resnet50'](
|
||||
pretrained=pretrained, model_dir=model_dir)
|
||||
net_encoder = ResnetDilated(orig_resnet, dilate_scale=8)
|
||||
elif arch == 'resnet50':
|
||||
orig_resnet = resnet.__dict__['resnet50'](
|
||||
pretrained=pretrained, model_dir=model_dir)
|
||||
net_encoder = Resnet(orig_resnet)
|
||||
else:
|
||||
raise Exception('Architecture undefined!')
|
||||
|
||||
# encoders are usually pretrained
|
||||
# net_encoder.apply(ModelBuilder.weights_init)
|
||||
if len(weights) > 0:
|
||||
print('Loading weights for net_encoder')
|
||||
net_encoder.load_state_dict(
|
||||
torch.load(weights, map_location=lambda storage, loc: storage),
|
||||
strict=False)
|
||||
return net_encoder
|
||||
|
||||
@staticmethod
|
||||
def build_decoder(arch='ppm_deepsup',
|
||||
fc_dim=512,
|
||||
num_class=NUM_CLASS,
|
||||
weights='',
|
||||
use_softmax=False,
|
||||
drop_last_conv=False):
|
||||
arch = arch.lower()
|
||||
if arch == 'ppm_deepsup':
|
||||
net_decoder = PPMDeepsup(
|
||||
num_class=num_class,
|
||||
fc_dim=fc_dim,
|
||||
use_softmax=use_softmax,
|
||||
drop_last_conv=drop_last_conv)
|
||||
elif arch == 'c1_deepsup':
|
||||
net_decoder = C1DeepSup(
|
||||
num_class=num_class,
|
||||
fc_dim=fc_dim,
|
||||
use_softmax=use_softmax,
|
||||
drop_last_conv=drop_last_conv)
|
||||
else:
|
||||
raise Exception('Architecture undefined!')
|
||||
|
||||
net_decoder.apply(ModelBuilder.weights_init)
|
||||
if len(weights) > 0:
|
||||
print('Loading weights for net_decoder')
|
||||
net_decoder.load_state_dict(
|
||||
torch.load(weights, map_location=lambda storage, loc: storage),
|
||||
strict=False)
|
||||
return net_decoder
|
||||
|
||||
@staticmethod
|
||||
def get_decoder(weights_path, arch_encoder, arch_decoder, fc_dim,
|
||||
drop_last_conv, *arts, **kwargs):
|
||||
path = os.path.join(
|
||||
weights_path, 'ade20k',
|
||||
f'ade20k-{arch_encoder}-{arch_decoder}/decoder_epoch_20.pth')
|
||||
return ModelBuilder.build_decoder(
|
||||
arch=arch_decoder,
|
||||
fc_dim=fc_dim,
|
||||
weights=path,
|
||||
use_softmax=True,
|
||||
drop_last_conv=drop_last_conv)
|
||||
|
||||
@staticmethod
|
||||
def get_encoder(weights_path, arch_encoder, arch_decoder, fc_dim,
|
||||
segmentation, *arts, **kwargs):
|
||||
if segmentation:
|
||||
path = os.path.join(
|
||||
weights_path, 'ade20k',
|
||||
f'ade20k-{arch_encoder}-{arch_decoder}/encoder_epoch_20.pth')
|
||||
else:
|
||||
path = ''
|
||||
return ModelBuilder.build_encoder(
|
||||
arch=arch_encoder,
|
||||
fc_dim=fc_dim,
|
||||
weights=path,
|
||||
model_dir=weights_path)
|
||||
|
||||
|
||||
def conv3x3_bn_relu(in_planes, out_planes, stride=1):
|
||||
return nn.Sequential(
|
||||
nn.Conv2d(
|
||||
in_planes,
|
||||
out_planes,
|
||||
kernel_size=3,
|
||||
stride=stride,
|
||||
padding=1,
|
||||
bias=False),
|
||||
BatchNorm2d(out_planes),
|
||||
nn.ReLU(inplace=True),
|
||||
)
|
||||
|
||||
|
||||
# pyramid pooling, deep supervision
|
||||
class PPMDeepsup(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
num_class=NUM_CLASS,
|
||||
fc_dim=4096,
|
||||
use_softmax=False,
|
||||
pool_scales=(1, 2, 3, 6),
|
||||
drop_last_conv=False):
|
||||
super().__init__()
|
||||
self.use_softmax = use_softmax
|
||||
self.drop_last_conv = drop_last_conv
|
||||
|
||||
self.ppm = []
|
||||
for scale in pool_scales:
|
||||
self.ppm.append(
|
||||
nn.Sequential(
|
||||
nn.AdaptiveAvgPool2d(scale),
|
||||
nn.Conv2d(fc_dim, 512, kernel_size=1, bias=False),
|
||||
BatchNorm2d(512), nn.ReLU(inplace=True)))
|
||||
self.ppm = nn.ModuleList(self.ppm)
|
||||
self.cbr_deepsup = conv3x3_bn_relu(fc_dim // 2, fc_dim // 4, 1)
|
||||
|
||||
self.conv_last = nn.Sequential(
|
||||
nn.Conv2d(
|
||||
fc_dim + len(pool_scales) * 512,
|
||||
512,
|
||||
kernel_size=3,
|
||||
padding=1,
|
||||
bias=False), BatchNorm2d(512), nn.ReLU(inplace=True),
|
||||
nn.Dropout2d(0.1), nn.Conv2d(512, num_class, kernel_size=1))
|
||||
self.conv_last_deepsup = nn.Conv2d(fc_dim // 4, num_class, 1, 1, 0)
|
||||
self.dropout_deepsup = nn.Dropout2d(0.1)
|
||||
|
||||
def forward(self, conv_out, segSize=None):
|
||||
conv5 = conv_out[-1]
|
||||
|
||||
input_size = conv5.size()
|
||||
ppm_out = [conv5]
|
||||
for pool_scale in self.ppm:
|
||||
ppm_out.append(
|
||||
nn.functional.interpolate(
|
||||
pool_scale(conv5), (input_size[2], input_size[3]),
|
||||
mode='bilinear',
|
||||
align_corners=False))
|
||||
ppm_out = torch.cat(ppm_out, 1)
|
||||
|
||||
if self.drop_last_conv:
|
||||
return ppm_out
|
||||
else:
|
||||
x = self.conv_last(ppm_out)
|
||||
|
||||
if self.use_softmax: # is True during inference
|
||||
x = nn.functional.interpolate(
|
||||
x, size=segSize, mode='bilinear', align_corners=False)
|
||||
x = nn.functional.softmax(x, dim=1)
|
||||
return x
|
||||
|
||||
# deep sup
|
||||
conv4 = conv_out[-2]
|
||||
_ = self.cbr_deepsup(conv4)
|
||||
_ = self.dropout_deepsup(_)
|
||||
_ = self.conv_last_deepsup(_)
|
||||
|
||||
x = nn.functional.log_softmax(x, dim=1)
|
||||
_ = nn.functional.log_softmax(_, dim=1)
|
||||
|
||||
return (x, _)
|
||||
|
||||
|
||||
class Resnet(nn.Module):
|
||||
|
||||
def __init__(self, orig_resnet):
|
||||
super(Resnet, self).__init__()
|
||||
|
||||
# take pretrained resnet, except AvgPool and FC
|
||||
self.conv1 = orig_resnet.conv1
|
||||
self.bn1 = orig_resnet.bn1
|
||||
self.relu1 = orig_resnet.relu1
|
||||
self.conv2 = orig_resnet.conv2
|
||||
self.bn2 = orig_resnet.bn2
|
||||
self.relu2 = orig_resnet.relu2
|
||||
self.conv3 = orig_resnet.conv3
|
||||
self.bn3 = orig_resnet.bn3
|
||||
self.relu3 = orig_resnet.relu3
|
||||
self.maxpool = orig_resnet.maxpool
|
||||
self.layer1 = orig_resnet.layer1
|
||||
self.layer2 = orig_resnet.layer2
|
||||
self.layer3 = orig_resnet.layer3
|
||||
self.layer4 = orig_resnet.layer4
|
||||
|
||||
def forward(self, x, return_feature_maps=False):
|
||||
conv_out = []
|
||||
|
||||
x = self.relu1(self.bn1(self.conv1(x)))
|
||||
x = self.relu2(self.bn2(self.conv2(x)))
|
||||
x = self.relu3(self.bn3(self.conv3(x)))
|
||||
x = self.maxpool(x)
|
||||
|
||||
x = self.layer1(x)
|
||||
conv_out.append(x)
|
||||
x = self.layer2(x)
|
||||
conv_out.append(x)
|
||||
x = self.layer3(x)
|
||||
conv_out.append(x)
|
||||
x = self.layer4(x)
|
||||
conv_out.append(x)
|
||||
|
||||
if return_feature_maps:
|
||||
return conv_out
|
||||
return [x]
|
||||
|
||||
|
||||
# Resnet Dilated
|
||||
class ResnetDilated(nn.Module):
|
||||
|
||||
def __init__(self, orig_resnet, dilate_scale=8):
|
||||
super().__init__()
|
||||
from functools import partial
|
||||
|
||||
if dilate_scale == 8:
|
||||
orig_resnet.layer3.apply(partial(self._nostride_dilate, dilate=2))
|
||||
orig_resnet.layer4.apply(partial(self._nostride_dilate, dilate=4))
|
||||
elif dilate_scale == 16:
|
||||
orig_resnet.layer4.apply(partial(self._nostride_dilate, dilate=2))
|
||||
|
||||
# take pretrained resnet, except AvgPool and FC
|
||||
self.conv1 = orig_resnet.conv1
|
||||
self.bn1 = orig_resnet.bn1
|
||||
self.relu1 = orig_resnet.relu1
|
||||
self.conv2 = orig_resnet.conv2
|
||||
self.bn2 = orig_resnet.bn2
|
||||
self.relu2 = orig_resnet.relu2
|
||||
self.conv3 = orig_resnet.conv3
|
||||
self.bn3 = orig_resnet.bn3
|
||||
self.relu3 = orig_resnet.relu3
|
||||
self.maxpool = orig_resnet.maxpool
|
||||
self.layer1 = orig_resnet.layer1
|
||||
self.layer2 = orig_resnet.layer2
|
||||
self.layer3 = orig_resnet.layer3
|
||||
self.layer4 = orig_resnet.layer4
|
||||
|
||||
def _nostride_dilate(self, m, dilate):
|
||||
classname = m.__class__.__name__
|
||||
if classname.find('Conv') != -1:
|
||||
# the convolution with stride
|
||||
if m.stride == (2, 2):
|
||||
m.stride = (1, 1)
|
||||
if m.kernel_size == (3, 3):
|
||||
m.dilation = (dilate // 2, dilate // 2)
|
||||
m.padding = (dilate // 2, dilate // 2)
|
||||
# other convolutions
|
||||
else:
|
||||
if m.kernel_size == (3, 3):
|
||||
m.dilation = (dilate, dilate)
|
||||
m.padding = (dilate, dilate)
|
||||
|
||||
def forward(self, x, return_feature_maps=False):
|
||||
conv_out = []
|
||||
|
||||
x = self.relu1(self.bn1(self.conv1(x)))
|
||||
x = self.relu2(self.bn2(self.conv2(x)))
|
||||
x = self.relu3(self.bn3(self.conv3(x)))
|
||||
x = self.maxpool(x)
|
||||
|
||||
x = self.layer1(x)
|
||||
conv_out.append(x)
|
||||
x = self.layer2(x)
|
||||
conv_out.append(x)
|
||||
x = self.layer3(x)
|
||||
conv_out.append(x)
|
||||
x = self.layer4(x)
|
||||
conv_out.append(x)
|
||||
|
||||
if return_feature_maps:
|
||||
return conv_out
|
||||
return [x]
|
||||
|
||||
|
||||
# last conv, deep supervision
|
||||
class C1DeepSup(nn.Module):
|
||||
|
||||
def __init__(self,
|
||||
num_class=150,
|
||||
fc_dim=2048,
|
||||
use_softmax=False,
|
||||
drop_last_conv=False):
|
||||
super(C1DeepSup, self).__init__()
|
||||
self.use_softmax = use_softmax
|
||||
self.drop_last_conv = drop_last_conv
|
||||
|
||||
self.cbr = conv3x3_bn_relu(fc_dim, fc_dim // 4, 1)
|
||||
self.cbr_deepsup = conv3x3_bn_relu(fc_dim // 2, fc_dim // 4, 1)
|
||||
|
||||
# last conv
|
||||
self.conv_last = nn.Conv2d(fc_dim // 4, num_class, 1, 1, 0)
|
||||
self.conv_last_deepsup = nn.Conv2d(fc_dim // 4, num_class, 1, 1, 0)
|
||||
|
||||
def forward(self, conv_out, segSize=None):
|
||||
conv5 = conv_out[-1]
|
||||
|
||||
x = self.cbr(conv5)
|
||||
|
||||
if self.drop_last_conv:
|
||||
return x
|
||||
else:
|
||||
x = self.conv_last(x)
|
||||
|
||||
if self.use_softmax: # is True during inference
|
||||
x = nn.functional.interpolate(
|
||||
x, size=segSize, mode='bilinear', align_corners=False)
|
||||
x = nn.functional.softmax(x, dim=1)
|
||||
return x
|
||||
|
||||
# deep sup
|
||||
conv4 = conv_out[-2]
|
||||
_ = self.cbr_deepsup(conv4)
|
||||
_ = self.conv_last_deepsup(_)
|
||||
|
||||
x = nn.functional.log_softmax(x, dim=1)
|
||||
_ = nn.functional.log_softmax(_, dim=1)
|
||||
|
||||
return (x, _)
|
||||
|
||||
|
||||
# last conv
|
||||
class C1(nn.Module):
|
||||
|
||||
def __init__(self, num_class=150, fc_dim=2048, use_softmax=False):
|
||||
super(C1, self).__init__()
|
||||
self.use_softmax = use_softmax
|
||||
|
||||
self.cbr = conv3x3_bn_relu(fc_dim, fc_dim // 4, 1)
|
||||
|
||||
# last conv
|
||||
self.conv_last = nn.Conv2d(fc_dim // 4, num_class, 1, 1, 0)
|
||||
|
||||
def forward(self, conv_out, segSize=None):
|
||||
conv5 = conv_out[-1]
|
||||
x = self.cbr(conv5)
|
||||
x = self.conv_last(x)
|
||||
|
||||
if self.use_softmax: # is True during inference
|
||||
x = nn.functional.interpolate(
|
||||
x, size=segSize, mode='bilinear', align_corners=False)
|
||||
x = nn.functional.softmax(x, dim=1)
|
||||
else:
|
||||
x = nn.functional.log_softmax(x, dim=1)
|
||||
|
||||
return x
|
||||
183
modelscope/models/cv/image_inpainting/modules/ade20k/resnet.py
Normal file
@@ -0,0 +1,183 @@
|
||||
"""
|
||||
Part of the implementation is borrowed and modified from LaMa, publicly available at
|
||||
https://github.com/saic-mdal/lama
|
||||
"""
|
||||
import math
|
||||
import os
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.nn import BatchNorm2d
|
||||
|
||||
__all__ = ['ResNet', 'resnet50']
|
||||
|
||||
|
||||
def conv3x3(in_planes, out_planes, stride=1):
|
||||
'3x3 convolution with padding'
|
||||
return nn.Conv2d(
|
||||
in_planes,
|
||||
out_planes,
|
||||
kernel_size=3,
|
||||
stride=stride,
|
||||
padding=1,
|
||||
bias=False)
|
||||
|
||||
|
||||
class BasicBlock(nn.Module):
|
||||
expansion = 1
|
||||
|
||||
def __init__(self, inplanes, planes, stride=1, downsample=None):
|
||||
super(BasicBlock, self).__init__()
|
||||
self.conv1 = conv3x3(inplanes, planes, stride)
|
||||
self.bn1 = BatchNorm2d(planes)
|
||||
self.relu = nn.ReLU(inplace=True)
|
||||
self.conv2 = conv3x3(planes, planes)
|
||||
self.bn2 = BatchNorm2d(planes)
|
||||
self.downsample = downsample
|
||||
self.stride = stride
|
||||
|
||||
def forward(self, x):
|
||||
residual = x
|
||||
|
||||
out = self.conv1(x)
|
||||
out = self.bn1(out)
|
||||
out = self.relu(out)
|
||||
|
||||
out = self.conv2(out)
|
||||
out = self.bn2(out)
|
||||
|
||||
if self.downsample is not None:
|
||||
residual = self.downsample(x)
|
||||
|
||||
out += residual
|
||||
out = self.relu(out)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
class Bottleneck(nn.Module):
|
||||
expansion = 4
|
||||
|
||||
def __init__(self, inplanes, planes, stride=1, downsample=None):
|
||||
super(Bottleneck, self).__init__()
|
||||
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
|
||||
self.bn1 = BatchNorm2d(planes)
|
||||
self.conv2 = nn.Conv2d(
|
||||
planes,
|
||||
planes,
|
||||
kernel_size=3,
|
||||
stride=stride,
|
||||
padding=1,
|
||||
bias=False)
|
||||
self.bn2 = BatchNorm2d(planes)
|
||||
self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
|
||||
self.bn3 = BatchNorm2d(planes * 4)
|
||||
self.relu = nn.ReLU(inplace=True)
|
||||
self.downsample = downsample
|
||||
self.stride = stride
|
||||
|
||||
def forward(self, x):
|
||||
residual = x
|
||||
|
||||
out = self.conv1(x)
|
||||
out = self.bn1(out)
|
||||
out = self.relu(out)
|
||||
|
||||
out = self.conv2(out)
|
||||
out = self.bn2(out)
|
||||
out = self.relu(out)
|
||||
|
||||
out = self.conv3(out)
|
||||
out = self.bn3(out)
|
||||
|
||||
if self.downsample is not None:
|
||||
residual = self.downsample(x)
|
||||
|
||||
out += residual
|
||||
out = self.relu(out)
|
||||
|
||||
return out
|
||||
|
||||
|
||||
class ResNet(nn.Module):
|
||||
|
||||
def __init__(self, block, layers, num_classes=1000):
|
||||
self.inplanes = 128
|
||||
super(ResNet, self).__init__()
|
||||
self.conv1 = conv3x3(3, 64, stride=2)
|
||||
self.bn1 = BatchNorm2d(64)
|
||||
self.relu1 = nn.ReLU(inplace=True)
|
||||
self.conv2 = conv3x3(64, 64)
|
||||
self.bn2 = BatchNorm2d(64)
|
||||
self.relu2 = nn.ReLU(inplace=True)
|
||||
self.conv3 = conv3x3(64, 128)
|
||||
self.bn3 = BatchNorm2d(128)
|
||||
self.relu3 = nn.ReLU(inplace=True)
|
||||
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
|
||||
|
||||
self.layer1 = self._make_layer(block, 64, layers[0])
|
||||
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
|
||||
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
|
||||
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
|
||||
self.avgpool = nn.AvgPool2d(7, stride=1)
|
||||
self.fc = nn.Linear(512 * block.expansion, num_classes)
|
||||
|
||||
for m in self.modules():
|
||||
if isinstance(m, nn.Conv2d):
|
||||
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
|
||||
m.weight.data.normal_(0, math.sqrt(2. / n))
|
||||
elif isinstance(m, BatchNorm2d):
|
||||
m.weight.data.fill_(1)
|
||||
m.bias.data.zero_()
|
||||
|
||||
def _make_layer(self, block, planes, blocks, stride=1):
|
||||
downsample = None
|
||||
if stride != 1 or self.inplanes != planes * block.expansion:
|
||||
downsample = nn.Sequential(
|
||||
nn.Conv2d(
|
||||
self.inplanes,
|
||||
planes * block.expansion,
|
||||
kernel_size=1,
|
||||
stride=stride,
|
||||
bias=False),
|
||||
BatchNorm2d(planes * block.expansion),
|
||||
)
|
||||
|
||||
layers = []
|
||||
layers.append(block(self.inplanes, planes, stride, downsample))
|
||||
self.inplanes = planes * block.expansion
|
||||
for i in range(1, blocks):
|
||||
layers.append(block(self.inplanes, planes))
|
||||
|
||||
return nn.Sequential(*layers)
|
||||
|
||||
def forward(self, x):
|
||||
x = self.relu1(self.bn1(self.conv1(x)))
|
||||
x = self.relu2(self.bn2(self.conv2(x)))
|
||||
x = self.relu3(self.bn3(self.conv3(x)))
|
||||
x = self.maxpool(x)
|
||||
|
||||
x = self.layer1(x)
|
||||
x = self.layer2(x)
|
||||
x = self.layer3(x)
|
||||
x = self.layer4(x)
|
||||
|
||||
x = self.avgpool(x)
|
||||
x = x.view(x.size(0), -1)
|
||||
x = self.fc(x)
|
||||
|
||||
return x
|
||||
|
||||
|
||||
def resnet50(pretrained=False, model_dir='', **kwargs):
|
||||
"""Constructs a ResNet-50 model.
|
||||
|
||||
Args:
|
||||
pretrained (bool): If True, returns a model pre-trained on ImageNet
|
||||
"""
|
||||
model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
|
||||
if pretrained:
|
||||
cached_file = os.path.join(model_dir, 'resnet50-imagenet.pth')
|
||||
model.load_state_dict(
|
||||
torch.load(cached_file, map_location='cpu'), strict=False)
|
||||
return model
|
||||
167
modelscope/models/cv/image_inpainting/modules/adversarial.py
Normal file
@@ -0,0 +1,167 @@
|
||||
"""
|
||||
Part of the implementation is borrowed and modified from LaMa, publicly available at
|
||||
https://github.com/saic-mdal/lama
|
||||
"""
|
||||
from typing import Dict, Optional, Tuple
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
class BaseAdversarialLoss:
|
||||
|
||||
def pre_generator_step(self, real_batch: torch.Tensor,
|
||||
fake_batch: torch.Tensor, generator: nn.Module,
|
||||
discriminator: nn.Module):
|
||||
"""
|
||||
Prepare for generator step
|
||||
:param real_batch: Tensor, a batch of real samples
|
||||
:param fake_batch: Tensor, a batch of samples produced by generator
|
||||
:param generator:
|
||||
:param discriminator:
|
||||
:return: None
|
||||
"""
|
||||
|
||||
def pre_discriminator_step(self, real_batch: torch.Tensor,
|
||||
fake_batch: torch.Tensor, generator: nn.Module,
|
||||
discriminator: nn.Module):
|
||||
"""
|
||||
Prepare for discriminator step
|
||||
:param real_batch: Tensor, a batch of real samples
|
||||
:param fake_batch: Tensor, a batch of samples produced by generator
|
||||
:param generator:
|
||||
:param discriminator:
|
||||
:return: None
|
||||
"""
|
||||
|
||||
def generator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor,
|
||||
discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor,
|
||||
mask: Optional[torch.Tensor] = None) \
|
||||
-> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
|
||||
"""
|
||||
Calculate generator loss
|
||||
:param real_batch: Tensor, a batch of real samples
|
||||
:param fake_batch: Tensor, a batch of samples produced by generator
|
||||
:param discr_real_pred: Tensor, discriminator output for real_batch
|
||||
:param discr_fake_pred: Tensor, discriminator output for fake_batch
|
||||
:param mask: Tensor, actual mask, which was at input of generator when making fake_batch
|
||||
:return: total generator loss along with some values that might be interesting to log
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def discriminator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor,
|
||||
discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor,
|
||||
mask: Optional[torch.Tensor] = None) \
|
||||
-> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
|
||||
"""
|
||||
Calculate discriminator loss and call .backward() on it
|
||||
:param real_batch: Tensor, a batch of real samples
|
||||
:param fake_batch: Tensor, a batch of samples produced by generator
|
||||
:param discr_real_pred: Tensor, discriminator output for real_batch
|
||||
:param discr_fake_pred: Tensor, discriminator output for fake_batch
|
||||
:param mask: Tensor, actual mask, which was at input of generator when making fake_batch
|
||||
:return: total discriminator loss along with some values that might be interesting to log
|
||||
"""
|
||||
raise NotImplementedError
|
||||
|
||||
def interpolate_mask(self, mask, shape):
|
||||
assert mask is not None
|
||||
assert self.allow_scale_mask or shape == mask.shape[-2:]
|
||||
if shape != mask.shape[-2:] and self.allow_scale_mask:
|
||||
if self.mask_scale_mode == 'maxpool':
|
||||
mask = F.adaptive_max_pool2d(mask, shape)
|
||||
else:
|
||||
mask = F.interpolate(
|
||||
mask, size=shape, mode=self.mask_scale_mode)
|
||||
return mask
|
||||
|
||||
|
||||
def make_r1_gp(discr_real_pred, real_batch):
|
||||
if torch.is_grad_enabled():
|
||||
grad_real = torch.autograd.grad(
|
||||
outputs=discr_real_pred.sum(),
|
||||
inputs=real_batch,
|
||||
create_graph=True)[0]
|
||||
grad_penalty = (grad_real.view(grad_real.shape[0],
|
||||
-1).norm(2, dim=1)**2).mean()
|
||||
else:
|
||||
grad_penalty = 0
|
||||
real_batch.requires_grad = False
|
||||
|
||||
return grad_penalty
|
||||
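For context: `make_r1_gp` implements the R1 gradient penalty of Mescheder et al. (2018), grad_penalty = E[ ||dD(x)/dx||^2 ] taken over real samples only. That is also why `pre_discriminator_step` in `NonSaturatingWithR1` below sets `real_batch.requires_grad = True`: without it, `torch.autograd.grad` could not differentiate the discriminator output with respect to the real batch.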
|
||||
|
||||
class NonSaturatingWithR1(BaseAdversarialLoss):
|
||||
|
||||
def __init__(self,
|
||||
gp_coef=5,
|
||||
weight=1,
|
||||
mask_as_fake_target=False,
|
||||
allow_scale_mask=False,
|
||||
mask_scale_mode='nearest',
|
||||
extra_mask_weight_for_gen=0,
|
||||
use_unmasked_for_gen=True,
|
||||
use_unmasked_for_discr=True):
|
||||
self.gp_coef = gp_coef
|
||||
self.weight = weight
|
||||
# use for discr => use for gen;
|
||||
# otherwise we teach only the discr to pay attention to very small difference
|
||||
assert use_unmasked_for_gen or (not use_unmasked_for_discr)
|
||||
# mask as target => use unmasked for discr:
|
||||
# if we don't care about unmasked regions at all
|
||||
# then it doesn't matter if the value of mask_as_fake_target is true or false
|
||||
assert use_unmasked_for_discr or (not mask_as_fake_target)
|
||||
self.use_unmasked_for_gen = use_unmasked_for_gen
|
||||
self.use_unmasked_for_discr = use_unmasked_for_discr
|
||||
self.mask_as_fake_target = mask_as_fake_target
|
||||
self.allow_scale_mask = allow_scale_mask
|
||||
self.mask_scale_mode = mask_scale_mode
|
||||
self.extra_mask_weight_for_gen = extra_mask_weight_for_gen
|
||||
|
||||
def generator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor,
|
||||
discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor,
|
||||
mask=None) \
|
||||
-> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
|
||||
fake_loss = F.softplus(-discr_fake_pred)
|
||||
if (self.mask_as_fake_target and self.extra_mask_weight_for_gen > 0) or \
|
||||
not self.use_unmasked_for_gen: # == if masked region should be treated differently
|
||||
mask = self.interpolate_mask(mask, discr_fake_pred.shape[-2:])
|
||||
if not self.use_unmasked_for_gen:
|
||||
fake_loss = fake_loss * mask
|
||||
else:
|
||||
pixel_weights = 1 + mask * self.extra_mask_weight_for_gen
|
||||
fake_loss = fake_loss * pixel_weights
|
||||
|
||||
return fake_loss.mean() * self.weight, dict()
|
||||
|
||||
def pre_discriminator_step(self, real_batch: torch.Tensor,
|
||||
fake_batch: torch.Tensor, generator: nn.Module,
|
||||
discriminator: nn.Module):
|
||||
real_batch.requires_grad = True
|
||||
|
||||
def discriminator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor,
|
||||
discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor,
|
||||
mask=None) \
|
||||
-> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
|
||||
|
||||
real_loss = F.softplus(-discr_real_pred)
|
||||
grad_penalty = make_r1_gp(discr_real_pred, real_batch) * self.gp_coef
|
||||
fake_loss = F.softplus(discr_fake_pred)
|
||||
|
||||
if not self.use_unmasked_for_discr or self.mask_as_fake_target:
|
||||
# == if masked region should be treated differently
|
||||
mask = self.interpolate_mask(mask, discr_fake_pred.shape[-2:])
|
||||
# use_unmasked_for_discr=False only makes sense for fakes;
|
||||
# for reals there is no difference between the two regions
|
||||
fake_loss = fake_loss * mask
|
||||
if self.mask_as_fake_target:
|
||||
fake_loss = fake_loss + (1
|
||||
- mask) * F.softplus(-discr_fake_pred)
|
||||
|
||||
sum_discr_loss = real_loss + grad_penalty + fake_loss
|
||||
metrics = dict(
|
||||
discr_real_out=discr_real_pred.mean(),
|
||||
discr_fake_out=discr_fake_pred.mean(),
|
||||
discr_real_gp=grad_penalty)
|
||||
return sum_discr_loss.mean(), metrics
|
||||
@@ -0,0 +1,45 @@
|
||||
"""
|
||||
Part of the implementation is borrowed and modified from LaMa, publicly available at
|
||||
https://github.com/saic-mdal/lama
|
||||
"""
|
||||
from typing import List
|
||||
|
||||
import torch
|
||||
import torch.nn.functional as F
|
||||
|
||||
|
||||
def masked_l2_loss(pred, target, mask, weight_known, weight_missing):
|
||||
per_pixel_l2 = F.mse_loss(pred, target, reduction='none')
|
||||
pixel_weights = mask * weight_missing + (1 - mask) * weight_known
|
||||
return (pixel_weights * per_pixel_l2).mean()
|
||||
|
||||
|
||||
def masked_l1_loss(pred, target, mask, weight_known, weight_missing):
|
||||
per_pixel_l1 = F.l1_loss(pred, target, reduction='none')
|
||||
pixel_weights = mask * weight_missing + (1 - mask) * weight_known
|
||||
return (pixel_weights * per_pixel_l1).mean()
|
||||
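A worked toy example of the weighting (with `weight_known=10` and `weight_missing=0`, the values `DefaultInpaintingTrainingModule` above uses, the L1 term supervises only the pixels outside the hole):

    import torch

    pred = torch.zeros(1, 3, 4, 4)
    target = torch.ones(1, 3, 4, 4)
    mask = torch.zeros(1, 1, 4, 4)
    mask[..., :2] = 1.0          # left half is the hole (mask == 1)

    # Only the known half contributes, at weight 10; the hole weighs 0,
    # so the mean over all pixels is 10 * 0.5 = 5.
    loss = masked_l1_loss(pred, target, mask, 10, 0)
    assert torch.isclose(loss, torch.tensor(5.0))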
|
||||
|
||||
def feature_matching_loss(fake_features: List[torch.Tensor],
|
||||
target_features: List[torch.Tensor],
|
||||
mask=None):
|
||||
if mask is None:
|
||||
res = torch.stack([
|
||||
F.mse_loss(fake_feat, target_feat)
|
||||
for fake_feat, target_feat in zip(fake_features, target_features)
|
||||
]).mean()
|
||||
else:
|
||||
res = 0
|
||||
norm = 0
|
||||
for fake_feat, target_feat in zip(fake_features, target_features):
|
||||
cur_mask = F.interpolate(
|
||||
mask,
|
||||
size=fake_feat.shape[-2:],
|
||||
mode='bilinear',
|
||||
align_corners=False)
|
||||
error_weights = 1 - cur_mask
|
||||
cur_val = ((fake_feat - target_feat).pow(2) * error_weights).mean()
|
||||
res = res + cur_val
|
||||
norm += 1
|
||||
res = res / norm
|
||||
return res
|
||||
588
modelscope/models/cv/image_inpainting/modules/ffc.py
Normal file
@@ -0,0 +1,588 @@
|
||||
"""
|
||||
Part of the implementation is borrowed and modified from LaMa, publicly available at
|
||||
https://github.com/saic-mdal/lama
|
||||
"""
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
import torch.nn.functional as F
|
||||
from kornia.geometry.transform import rotate
|
||||
|
||||
|
||||
def get_activation(kind='tanh'):
|
||||
if kind == 'tanh':
|
||||
return nn.Tanh()
|
||||
if kind == 'sigmoid':
|
||||
return nn.Sigmoid()
|
||||
if kind is False:
|
||||
return nn.Identity()
|
||||
raise ValueError(f'Unknown activation kind {kind}')
|
||||
|
||||
|
||||
class SELayer(nn.Module):
|
||||
|
||||
def __init__(self, channel, reduction=16):
|
||||
super(SELayer, self).__init__()
|
||||
self.avg_pool = nn.AdaptiveAvgPool2d(1)
|
||||
self.fc = nn.Sequential(
|
||||
nn.Linear(channel, channel // reduction, bias=False),
|
||||
nn.ReLU(inplace=True),
|
||||
nn.Linear(channel // reduction, channel, bias=False), nn.Sigmoid())
|
||||
|
||||
def forward(self, x):
|
||||
b, c, _, _ = x.size()
|
||||
y = self.avg_pool(x).view(b, c)
|
||||
y = self.fc(y).view(b, c, 1, 1)
|
||||
res = x * y.expand_as(x)
|
||||
return res
|
||||
|
||||
|
||||
class FourierUnit(nn.Module):

    def __init__(self,
                 in_channels,
                 out_channels,
                 groups=1,
                 spatial_scale_factor=None,
                 spatial_scale_mode='bilinear',
                 spectral_pos_encoding=False,
                 use_se=False,
                 se_kwargs=None,
                 ffc3d=False,
                 fft_norm='ortho'):
        # bn_layer not used
        super(FourierUnit, self).__init__()
        self.groups = groups

        self.conv_layer = torch.nn.Conv2d(
            in_channels=in_channels * 2 + (2 if spectral_pos_encoding else 0),
            out_channels=out_channels * 2,
            kernel_size=1,
            stride=1,
            padding=0,
            groups=self.groups,
            bias=False)
        self.bn = torch.nn.BatchNorm2d(out_channels * 2)
        self.relu = torch.nn.ReLU(inplace=True)

        # squeeze and excitation block
        self.use_se = use_se
        if use_se:
            if se_kwargs is None:
                se_kwargs = {}
            self.se = SELayer(self.conv_layer.in_channels, **se_kwargs)

        self.spatial_scale_factor = spatial_scale_factor
        self.spatial_scale_mode = spatial_scale_mode
        self.spectral_pos_encoding = spectral_pos_encoding
        self.ffc3d = ffc3d
        self.fft_norm = fft_norm

    def forward(self, x):
        batch = x.shape[0]

        if self.spatial_scale_factor is not None:
            orig_size = x.shape[-2:]
            x = F.interpolate(
                x,
                scale_factor=self.spatial_scale_factor,
                mode=self.spatial_scale_mode,
                align_corners=False)

        # (batch, c, h, w/2+1, 2)
        fft_dim = (-3, -2, -1) if self.ffc3d else (-2, -1)
        ffted = torch.fft.rfftn(x, dim=fft_dim, norm=self.fft_norm)
        ffted = torch.stack((ffted.real, ffted.imag), dim=-1)
        ffted = ffted.permute(0, 1, 4, 2, 3).contiguous()  # (batch, c, 2, h, w/2+1)
        ffted = ffted.view((
            batch,
            -1,
        ) + ffted.size()[3:])

        if self.spectral_pos_encoding:
            height, width = ffted.shape[-2:]
            coords_vert = torch.linspace(0, 1, height)[None, None, :, None].expand(
                batch, 1, height, width).to(ffted)
            coords_hor = torch.linspace(0, 1, width)[None, None, None, :].expand(
                batch, 1, height, width).to(ffted)
            ffted = torch.cat((coords_vert, coords_hor, ffted), dim=1)

        if self.use_se:
            ffted = self.se(ffted)

        ffted = self.conv_layer(ffted)  # (batch, c*2, h, w/2+1)
        ffted = self.relu(self.bn(ffted))

        ffted = ffted.view((
            batch,
            -1,
            2,
        ) + ffted.size()[2:]).permute(
            0, 1, 3, 4, 2).contiguous()  # (batch, c, t, h, w/2+1, 2)
        ffted = torch.complex(ffted[..., 0], ffted[..., 1])

        ifft_shape_slice = x.shape[-3:] if self.ffc3d else x.shape[-2:]
        output = torch.fft.irfftn(
            ffted, s=ifft_shape_slice, dim=fft_dim, norm=self.fft_norm)

        if self.spatial_scale_factor is not None:
            output = F.interpolate(
                output,
                size=orig_size,
                mode=self.spatial_scale_mode,
                align_corners=False)

        return output
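A shape check, outside the diff: FourierUnit is spatial-size preserving (rFFT, a 1x1 conv on stacked real/imaginary channels, then inverse rFFT back to the input size), which is what lets FFC mix it freely with the local convolution branch. Dimensions below are assumptions.

fu = FourierUnit(in_channels=32, out_channels=32)
x = torch.randn(1, 32, 64, 64)
y = fu(x)  # global receptive field in one layer
assert y.shape == x.shape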
class SpectralTransform(nn.Module):

    def __init__(self,
                 in_channels,
                 out_channels,
                 stride=1,
                 groups=1,
                 enable_lfu=True,
                 **fu_kwargs):
        # bn_layer not used
        super(SpectralTransform, self).__init__()
        self.enable_lfu = enable_lfu
        if stride == 2:
            self.downsample = nn.AvgPool2d(kernel_size=(2, 2), stride=2)
        else:
            self.downsample = nn.Identity()

        self.stride = stride
        self.conv1 = nn.Sequential(
            nn.Conv2d(
                in_channels,
                out_channels // 2,
                kernel_size=1,
                groups=groups,
                bias=False), nn.BatchNorm2d(out_channels // 2),
            nn.ReLU(inplace=True))
        self.fu = FourierUnit(out_channels // 2, out_channels // 2, groups,
                              **fu_kwargs)
        if self.enable_lfu:
            self.lfu = FourierUnit(out_channels // 2, out_channels // 2,
                                   groups)
        self.conv2 = torch.nn.Conv2d(
            out_channels // 2,
            out_channels,
            kernel_size=1,
            groups=groups,
            bias=False)

    def forward(self, x):

        x = self.downsample(x)
        x = self.conv1(x)
        output = self.fu(x)

        if self.enable_lfu:
            n, c, h, w = x.shape
            split_no = 2
            split_s = h // split_no
            xs = torch.cat(
                torch.split(x[:, :c // 4], split_s, dim=-2),
                dim=1).contiguous()
            xs = torch.cat(
                torch.split(xs, split_s, dim=-1), dim=1).contiguous()
            xs = self.lfu(xs)
            xs = xs.repeat(1, 1, split_no, split_no).contiguous()
        else:
            xs = 0

        output = self.conv2(x + output + xs)

        return output
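An illustrative trace, not part of the diff: the LFU branch takes the first c//4 channels, folds the 2x2 grid of h/2 x w/2 quadrants onto the channel axis (so (n, c//4, h, w) becomes (n, c, h/2, w/2)), runs a FourierUnit on the folded tensor, then tiles the result back to (n, c, h, w) before the three branches are summed. Sizes below are assumptions.

st = SpectralTransform(in_channels=32, out_channels=64, stride=1, enable_lfu=True)
x = torch.randn(1, 32, 64, 64)
y = st(x)  # conv1 -> FourierUnit (+ folded-quadrant LFU) -> conv2
assert y.shape == (1, 64, 64, 64)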
class LearnableSpatialTransformWrapper(nn.Module):

    def __init__(self,
                 impl,
                 pad_coef=0.5,
                 angle_init_range=80,
                 train_angle=True):
        super().__init__()
        self.impl = impl
        self.angle = torch.rand(1) * angle_init_range
        if train_angle:
            self.angle = nn.Parameter(self.angle, requires_grad=True)
        self.pad_coef = pad_coef

    def forward(self, x):
        if torch.is_tensor(x):
            return self.inverse_transform(self.impl(self.transform(x)), x)
        elif isinstance(x, tuple):
            x_trans = tuple(self.transform(elem) for elem in x)
            y_trans = self.impl(x_trans)
            return tuple(
                self.inverse_transform(elem, orig_x)
                for elem, orig_x in zip(y_trans, x))
        else:
            raise ValueError(f'Unexpected input type {type(x)}')

    def transform(self, x):
        height, width = x.shape[2:]
        pad_h, pad_w = int(height * self.pad_coef), int(width * self.pad_coef)
        x_padded = F.pad(x, [pad_w, pad_w, pad_h, pad_h], mode='reflect')
        x_padded_rotated = rotate(x_padded, angle=self.angle.to(x_padded))
        return x_padded_rotated

    def inverse_transform(self, y_padded_rotated, orig_x):
        height, width = orig_x.shape[2:]
        pad_h, pad_w = int(height * self.pad_coef), int(width * self.pad_coef)

        y_padded = rotate(
            y_padded_rotated, angle=-self.angle.to(y_padded_rotated))
        y_height, y_width = y_padded.shape[2:]
        y = y_padded[:, :, pad_h:y_height - pad_h, pad_w:y_width - pad_w]
        return y
class FFC(nn.Module):

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 ratio_gin,
                 ratio_gout,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=False,
                 enable_lfu=True,
                 padding_type='reflect',
                 gated=False,
                 **spectral_kwargs):
        super(FFC, self).__init__()

        assert stride == 1 or stride == 2, 'Stride should be 1 or 2.'
        self.stride = stride

        in_cg = int(in_channels * ratio_gin)
        in_cl = in_channels - in_cg
        out_cg = int(out_channels * ratio_gout)
        out_cl = out_channels - out_cg

        self.ratio_gin = ratio_gin
        self.ratio_gout = ratio_gout
        self.global_in_num = in_cg

        module = nn.Identity if in_cl == 0 or out_cl == 0 else nn.Conv2d
        self.convl2l = module(
            in_cl,
            out_cl,
            kernel_size,
            stride,
            padding,
            dilation,
            groups,
            bias,
            padding_mode=padding_type)
        module = nn.Identity if in_cl == 0 or out_cg == 0 else nn.Conv2d
        self.convl2g = module(
            in_cl,
            out_cg,
            kernel_size,
            stride,
            padding,
            dilation,
            groups,
            bias,
            padding_mode=padding_type)
        module = nn.Identity if in_cg == 0 or out_cl == 0 else nn.Conv2d
        self.convg2l = module(
            in_cg,
            out_cl,
            kernel_size,
            stride,
            padding,
            dilation,
            groups,
            bias,
            padding_mode=padding_type)
        module = nn.Identity if in_cg == 0 or out_cg == 0 else SpectralTransform
        self.convg2g = module(in_cg, out_cg, stride,
                              1 if groups == 1 else groups // 2, enable_lfu,
                              **spectral_kwargs)

        self.gated = gated
        module = nn.Identity if in_cg == 0 or out_cl == 0 or not self.gated else nn.Conv2d
        self.gate = module(in_channels, 2, 1)

    def forward(self, x):
        x_l, x_g = x if type(x) is tuple else (x, 0)
        out_xl, out_xg = 0, 0

        if self.gated:
            total_input_parts = [x_l]
            if torch.is_tensor(x_g):
                total_input_parts.append(x_g)
            total_input = torch.cat(total_input_parts, dim=1)

            gates = torch.sigmoid(self.gate(total_input))
            g2l_gate, l2g_gate = gates.chunk(2, dim=1)
        else:
            g2l_gate, l2g_gate = 1, 1

        if self.ratio_gout != 1:
            out_xl = self.convl2l(x_l) + self.convg2l(x_g) * g2l_gate
        if self.ratio_gout != 0:
            out_xg = self.convl2g(x_l) * l2g_gate + self.convg2g(x_g)

        return out_xl, out_xg
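A quick note on the channel split, with an assumed configuration: for in_channels=64 and ratio_gin=0.75, the global branch gets int(64 * 0.75) = 48 channels and the local branch the remaining 16, and the four cross-paths (l2l, l2g, g2l, g2g) route between them. A minimal forward sketch:

ffc = FFC(64, 64, kernel_size=3, ratio_gin=0.75, ratio_gout=0.75, padding=1)
x_l = torch.randn(1, 16, 64, 64)  # local part:  64 - int(64 * 0.75) = 16 channels
x_g = torch.randn(1, 48, 64, 64)  # global part: int(64 * 0.75) = 48 channels
out_l, out_g = ffc((x_l, x_g))    # shapes (1, 16, 64, 64) and (1, 48, 64, 64)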
class FFC_BN_ACT(nn.Module):

    def __init__(self,
                 in_channels,
                 out_channels,
                 kernel_size,
                 ratio_gin,
                 ratio_gout,
                 stride=1,
                 padding=0,
                 dilation=1,
                 groups=1,
                 bias=False,
                 norm_layer=nn.BatchNorm2d,
                 activation_layer=nn.Identity,
                 padding_type='reflect',
                 enable_lfu=True,
                 **kwargs):
        super(FFC_BN_ACT, self).__init__()
        self.ffc = FFC(
            in_channels,
            out_channels,
            kernel_size,
            ratio_gin,
            ratio_gout,
            stride,
            padding,
            dilation,
            groups,
            bias,
            enable_lfu,
            padding_type=padding_type,
            **kwargs)
        lnorm = nn.Identity if ratio_gout == 1 else norm_layer
        gnorm = nn.Identity if ratio_gout == 0 else norm_layer
        global_channels = int(out_channels * ratio_gout)
        self.bn_l = lnorm(out_channels - global_channels)
        self.bn_g = gnorm(global_channels)

        lact = nn.Identity if ratio_gout == 1 else activation_layer
        gact = nn.Identity if ratio_gout == 0 else activation_layer
        self.act_l = lact(inplace=True)
        self.act_g = gact(inplace=True)

    def forward(self, x):
        x_l, x_g = self.ffc(x)
        x_l = self.act_l(self.bn_l(x_l))
        x_g = self.act_g(self.bn_g(x_g))
        return x_l, x_g
class FFCResnetBlock(nn.Module):

    def __init__(self,
                 dim,
                 padding_type,
                 norm_layer,
                 activation_layer=nn.ReLU,
                 dilation=1,
                 spatial_transform_kwargs=None,
                 inline=False,
                 **conv_kwargs):
        super().__init__()
        self.conv1 = FFC_BN_ACT(
            dim,
            dim,
            kernel_size=3,
            padding=dilation,
            dilation=dilation,
            norm_layer=norm_layer,
            activation_layer=activation_layer,
            padding_type=padding_type,
            **conv_kwargs)
        self.conv2 = FFC_BN_ACT(
            dim,
            dim,
            kernel_size=3,
            padding=dilation,
            dilation=dilation,
            norm_layer=norm_layer,
            activation_layer=activation_layer,
            padding_type=padding_type,
            **conv_kwargs)
        if spatial_transform_kwargs is not None:
            self.conv1 = LearnableSpatialTransformWrapper(
                self.conv1, **spatial_transform_kwargs)
            self.conv2 = LearnableSpatialTransformWrapper(
                self.conv2, **spatial_transform_kwargs)
        self.inline = inline

    def forward(self, x):
        if self.inline:
            x_l, x_g = x[:, :-self.conv1.ffc.global_in_num], \
                x[:, -self.conv1.ffc.global_in_num:]
        else:
            x_l, x_g = x if type(x) is tuple else (x, 0)

        id_l, id_g = x_l, x_g

        x_l, x_g = self.conv1((x_l, x_g))
        x_l, x_g = self.conv2((x_l, x_g))

        x_l, x_g = id_l + x_l, id_g + x_g
        out = x_l, x_g
        if self.inline:
            out = torch.cat(out, dim=1)
        return out


class ConcatTupleLayer(nn.Module):

    def forward(self, x):
        assert isinstance(x, tuple)
        x_l, x_g = x
        assert torch.is_tensor(x_l) or torch.is_tensor(x_g)
        if not torch.is_tensor(x_g):
            return x_l
        return torch.cat(x, dim=1)
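Outside the diff, a hedged residual-block sketch (channel split as in the FFC example above; shapes assumed):

block = FFCResnetBlock(
    64, padding_type='reflect', norm_layer=nn.BatchNorm2d,
    ratio_gin=0.75, ratio_gout=0.75)
x_l, x_g = torch.randn(1, 16, 64, 64), torch.randn(1, 48, 64, 64)
y_l, y_g = block((x_l, x_g))             # residual: shapes preserved
merged = ConcatTupleLayer()((y_l, y_g))  # (1, 64, 64, 64)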
class FFCResNetGenerator(nn.Module):

    def __init__(self,
                 input_nc=4,
                 output_nc=3,
                 ngf=64,
                 n_downsampling=3,
                 n_blocks=18,
                 norm_layer=nn.BatchNorm2d,
                 padding_type='reflect',
                 activation_layer=nn.ReLU,
                 up_norm_layer=nn.BatchNorm2d,
                 up_activation=nn.ReLU(True),
                 init_conv_kwargs={
                     'ratio_gin': 0,
                     'ratio_gout': 0,
                     'enable_lfu': False
                 },
                 downsample_conv_kwargs={
                     'ratio_gin': 0,
                     'ratio_gout': 0,
                     'enable_lfu': False
                 },
                 resnet_conv_kwargs={
                     'ratio_gin': 0.75,
                     'ratio_gout': 0.75,
                     'enable_lfu': False
                 },
                 spatial_transform_layers=None,
                 spatial_transform_kwargs={},
                 add_out_act='sigmoid',
                 max_features=1024,
                 out_ffc=False,
                 out_ffc_kwargs={}):
        assert (n_blocks >= 0)
        super().__init__()

        model = [
            nn.ReflectionPad2d(3),
            FFC_BN_ACT(
                input_nc,
                ngf,
                kernel_size=7,
                padding=0,
                norm_layer=norm_layer,
                activation_layer=activation_layer,
                **init_conv_kwargs)
        ]

        # downsample
        for i in range(n_downsampling):
            mult = 2**i
            if i == n_downsampling - 1:
                cur_conv_kwargs = dict(downsample_conv_kwargs)
                cur_conv_kwargs['ratio_gout'] = resnet_conv_kwargs.get(
                    'ratio_gin', 0)
            else:
                cur_conv_kwargs = downsample_conv_kwargs
            model += [
                FFC_BN_ACT(
                    min(max_features, ngf * mult),
                    min(max_features, ngf * mult * 2),
                    kernel_size=3,
                    stride=2,
                    padding=1,
                    norm_layer=norm_layer,
                    activation_layer=activation_layer,
                    **cur_conv_kwargs)
            ]

        mult = 2**n_downsampling
        feats_num_bottleneck = min(max_features, ngf * mult)

        # resnet blocks
        for i in range(n_blocks):
            cur_resblock = FFCResnetBlock(
                feats_num_bottleneck,
                padding_type=padding_type,
                activation_layer=activation_layer,
                norm_layer=norm_layer,
                **resnet_conv_kwargs)
            if spatial_transform_layers is not None and i in spatial_transform_layers:
                cur_resblock = LearnableSpatialTransformWrapper(
                    cur_resblock, **spatial_transform_kwargs)
            model += [cur_resblock]

        model += [ConcatTupleLayer()]

        # upsample
        for i in range(n_downsampling):
            mult = 2**(n_downsampling - i)
            model += [
                nn.ConvTranspose2d(
                    min(max_features, ngf * mult),
                    min(max_features, int(ngf * mult / 2)),
                    kernel_size=3,
                    stride=2,
                    padding=1,
                    output_padding=1),
                up_norm_layer(min(max_features, int(ngf * mult / 2))),
                up_activation
            ]

        if out_ffc:
            model += [
                FFCResnetBlock(
                    ngf,
                    padding_type=padding_type,
                    activation_layer=activation_layer,
                    norm_layer=norm_layer,
                    inline=True,
                    **out_ffc_kwargs)
            ]

        model += [
            nn.ReflectionPad2d(3),
            nn.Conv2d(ngf, output_nc, kernel_size=7, padding=0)
        ]
        if add_out_act:
            model.append(
                get_activation('tanh' if add_out_act is True else add_out_act))
        self.model = nn.Sequential(*model)

    def forward(self, input):
        return self.model(input)
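An end-to-end sketch of the generator's expected interface, assuming the LaMa convention of a 4-channel input (masked image concatenated with the mask); tensor sizes are illustrative, not from the diff:

gen = FFCResNetGenerator(input_nc=4, output_nc=3)
image = torch.randn(1, 3, 256, 256)
mask = torch.zeros(1, 1, 256, 256)
mask[..., 96:160, 96:160] = 1.0
masked = torch.cat([image * (1 - mask), mask], dim=1)  # 4-channel input
with torch.no_grad():
    out = gen(masked)  # (1, 3, 256, 256), sigmoid-activated by default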
324
modelscope/models/cv/image_inpainting/modules/inception.py
Normal file
@@ -0,0 +1,324 @@
"""
Part of the implementation is borrowed and modified from LaMa, publicly available at
https://github.com/saic-mdal/lama
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models

from modelscope.utils.logger import get_logger

try:
    from torchvision.models.utils import load_state_dict_from_url
except ImportError:
    from torch.utils.model_zoo import load_url as load_state_dict_from_url

# Inception weights ported to Pytorch from
# http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz
FID_WEIGHTS_URL = 'https://github.com/mseitzer/pytorch-fid/releases/download/' \
                  'fid_weights/pt_inception-2015-12-05-6726825d.pth'

LOGGER = get_logger()


class InceptionV3(nn.Module):
    """Pretrained InceptionV3 network returning feature maps"""

    # Index of default block of inception to return,
    # corresponds to output of final average pooling
    DEFAULT_BLOCK_INDEX = 3

    # Maps feature dimensionality to their output blocks indices
    BLOCK_INDEX_BY_DIM = {
        64: 0,  # First max pooling features
        192: 1,  # Second max pooling features
        768: 2,  # Pre-aux classifier features
        2048: 3  # Final average pooling features
    }

    def __init__(self,
                 output_blocks=[DEFAULT_BLOCK_INDEX],
                 resize_input=True,
                 normalize_input=True,
                 requires_grad=False,
                 use_fid_inception=True):
        """Build pretrained InceptionV3

        Parameters
        ----------
        output_blocks : list of int
            Indices of blocks to return features of. Possible values are:
                - 0: corresponds to output of first max pooling
                - 1: corresponds to output of second max pooling
                - 2: corresponds to output which is fed to aux classifier
                - 3: corresponds to output of final average pooling
        resize_input : bool
            If true, bilinearly resizes input to width and height 299 before
            feeding input to model. As the network without fully connected
            layers is fully convolutional, it should be able to handle inputs
            of arbitrary size, so resizing might not be strictly needed
        normalize_input : bool
            If true, scales the input from range (0, 1) to the range the
            pretrained Inception network expects, namely (-1, 1)
        requires_grad : bool
            If true, parameters of the model require gradients. Possibly useful
            for finetuning the network
        use_fid_inception : bool
            If true, uses the pretrained Inception model used in Tensorflow's
            FID implementation. If false, uses the pretrained Inception model
            available in torchvision. The FID Inception model has different
            weights and a slightly different structure from torchvision's
            Inception model. If you want to compute FID scores, you are
            strongly advised to set this parameter to true to get comparable
            results.
        """
        super(InceptionV3, self).__init__()

        self.resize_input = resize_input
        self.normalize_input = normalize_input
        self.output_blocks = sorted(output_blocks)
        self.last_needed_block = max(output_blocks)

        assert self.last_needed_block <= 3, \
            'Last possible output block index is 3'

        self.blocks = nn.ModuleList()

        if use_fid_inception:
            inception = fid_inception_v3()
        else:
            inception = models.inception_v3(pretrained=True)

        # Block 0: input to maxpool1
        block0 = [
            inception.Conv2d_1a_3x3, inception.Conv2d_2a_3x3,
            inception.Conv2d_2b_3x3,
            nn.MaxPool2d(kernel_size=3, stride=2)
        ]
        self.blocks.append(nn.Sequential(*block0))

        # Block 1: maxpool1 to maxpool2
        if self.last_needed_block >= 1:
            block1 = [
                inception.Conv2d_3b_1x1, inception.Conv2d_4a_3x3,
                nn.MaxPool2d(kernel_size=3, stride=2)
            ]
            self.blocks.append(nn.Sequential(*block1))

        # Block 2: maxpool2 to aux classifier
        if self.last_needed_block >= 2:
            block2 = [
                inception.Mixed_5b,
                inception.Mixed_5c,
                inception.Mixed_5d,
                inception.Mixed_6a,
                inception.Mixed_6b,
                inception.Mixed_6c,
                inception.Mixed_6d,
                inception.Mixed_6e,
            ]
            self.blocks.append(nn.Sequential(*block2))

        # Block 3: aux classifier to final avgpool
        if self.last_needed_block >= 3:
            block3 = [
                inception.Mixed_7a, inception.Mixed_7b, inception.Mixed_7c,
                nn.AdaptiveAvgPool2d(output_size=(1, 1))
            ]
            self.blocks.append(nn.Sequential(*block3))

        for param in self.parameters():
            param.requires_grad = requires_grad

    def forward(self, inp):
        """Get Inception feature maps

        Parameters
        ----------
        inp : torch.autograd.Variable
            Input tensor of shape Bx3xHxW. Values are expected to be in
            range (0, 1)

        Returns
        -------
        List of torch.autograd.Variable, corresponding to the selected output
        block, sorted ascending by index
        """
        outp = []
        x = inp

        if self.resize_input:
            x = F.interpolate(
                x, size=(299, 299), mode='bilinear', align_corners=False)

        if self.normalize_input:
            x = 2 * x - 1  # Scale from range (0, 1) to range (-1, 1)

        for idx, block in enumerate(self.blocks):
            x = block(x)
            if idx in self.output_blocks:
                outp.append(x)

            if idx == self.last_needed_block:
                break

        return outp
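A hedged feature-extraction sketch (not part of the diff; downloading the FID weights requires network access):

model = InceptionV3(output_blocks=[3], use_fid_inception=True)
model.eval()
imgs = torch.rand(4, 3, 299, 299)        # values in (0, 1)
pooled = model(imgs)[0]                  # (4, 2048, 1, 1) final-avgpool features
pooled = pooled.squeeze(-1).squeeze(-1)  # (4, 2048), ready for FID statistics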
def fid_inception_v3():
    """Build pretrained Inception model for FID computation

    The Inception model for FID computation uses a different set of weights
    and has a slightly different structure than torchvision's Inception.

    This method first constructs torchvision's Inception and then patches the
    necessary parts that are different in the FID Inception model.
    """
    LOGGER.info('fid_inception_v3 called')
    inception = models.inception_v3(
        num_classes=1008, aux_logits=False, pretrained=False)
    LOGGER.info('models.inception_v3 done')
    inception.Mixed_5b = FIDInceptionA(192, pool_features=32)
    inception.Mixed_5c = FIDInceptionA(256, pool_features=64)
    inception.Mixed_5d = FIDInceptionA(288, pool_features=64)
    inception.Mixed_6b = FIDInceptionC(768, channels_7x7=128)
    inception.Mixed_6c = FIDInceptionC(768, channels_7x7=160)
    inception.Mixed_6d = FIDInceptionC(768, channels_7x7=160)
    inception.Mixed_6e = FIDInceptionC(768, channels_7x7=192)
    inception.Mixed_7b = FIDInceptionE_1(1280)
    inception.Mixed_7c = FIDInceptionE_2(2048)

    LOGGER.info('fid_inception_v3 patching done')

    state_dict = load_state_dict_from_url(FID_WEIGHTS_URL, progress=True)
    LOGGER.info('fid_inception_v3 weights downloaded')

    inception.load_state_dict(state_dict)
    LOGGER.info('fid_inception_v3 weights loaded into model')

    return inception
class FIDInceptionA(models.inception.InceptionA):
    """InceptionA block patched for FID computation"""

    def __init__(self, in_channels, pool_features):
        super(FIDInceptionA, self).__init__(in_channels, pool_features)

    def forward(self, x):
        branch1x1 = self.branch1x1(x)

        branch5x5 = self.branch5x5_1(x)
        branch5x5 = self.branch5x5_2(branch5x5)

        branch3x3dbl = self.branch3x3dbl_1(x)
        branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
        branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl)

        # Patch: Tensorflow's average pool does not use the padded zeros in
        # its average calculation
        branch_pool = F.avg_pool2d(
            x, kernel_size=3, stride=1, padding=1, count_include_pad=False)
        branch_pool = self.branch_pool(branch_pool)

        outputs = [branch1x1, branch5x5, branch3x3dbl, branch_pool]
        return torch.cat(outputs, 1)


class FIDInceptionC(models.inception.InceptionC):
    """InceptionC block patched for FID computation"""

    def __init__(self, in_channels, channels_7x7):
        super(FIDInceptionC, self).__init__(in_channels, channels_7x7)

    def forward(self, x):
        branch1x1 = self.branch1x1(x)

        branch7x7 = self.branch7x7_1(x)
        branch7x7 = self.branch7x7_2(branch7x7)
        branch7x7 = self.branch7x7_3(branch7x7)

        branch7x7dbl = self.branch7x7dbl_1(x)
        branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl)
        branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl)
        branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl)
        branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl)

        # Patch: Tensorflow's average pool does not use the padded zeros in
        # its average calculation
        branch_pool = F.avg_pool2d(
            x, kernel_size=3, stride=1, padding=1, count_include_pad=False)
        branch_pool = self.branch_pool(branch_pool)

        outputs = [branch1x1, branch7x7, branch7x7dbl, branch_pool]
        return torch.cat(outputs, 1)


class FIDInceptionE_1(models.inception.InceptionE):
    """First InceptionE block patched for FID computation"""

    def __init__(self, in_channels):
        super(FIDInceptionE_1, self).__init__(in_channels)

    def forward(self, x):
        branch1x1 = self.branch1x1(x)

        branch3x3 = self.branch3x3_1(x)
        branch3x3 = [
            self.branch3x3_2a(branch3x3),
            self.branch3x3_2b(branch3x3),
        ]
        branch3x3 = torch.cat(branch3x3, 1)

        branch3x3dbl = self.branch3x3dbl_1(x)
        branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
        branch3x3dbl = [
            self.branch3x3dbl_3a(branch3x3dbl),
            self.branch3x3dbl_3b(branch3x3dbl),
        ]
        branch3x3dbl = torch.cat(branch3x3dbl, 1)

        # Patch: Tensorflow's average pool does not use the padded zeros in
        # its average calculation
        branch_pool = F.avg_pool2d(
            x, kernel_size=3, stride=1, padding=1, count_include_pad=False)
        branch_pool = self.branch_pool(branch_pool)

        outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool]
        return torch.cat(outputs, 1)


class FIDInceptionE_2(models.inception.InceptionE):
    """Second InceptionE block patched for FID computation"""

    def __init__(self, in_channels):
        super(FIDInceptionE_2, self).__init__(in_channels)

    def forward(self, x):
        branch1x1 = self.branch1x1(x)

        branch3x3 = self.branch3x3_1(x)
        branch3x3 = [
            self.branch3x3_2a(branch3x3),
            self.branch3x3_2b(branch3x3),
        ]
        branch3x3 = torch.cat(branch3x3, 1)

        branch3x3dbl = self.branch3x3dbl_1(x)
        branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
        branch3x3dbl = [
            self.branch3x3dbl_3a(branch3x3dbl),
            self.branch3x3dbl_3b(branch3x3dbl),
        ]
        branch3x3dbl = torch.cat(branch3x3dbl, 1)

        # Patch: The FID Inception model uses max pooling instead of average
        # pooling. This is likely an error in this specific Inception
        # implementation, as other Inception models use average pooling here
        # (which matches the description in the paper).
        branch_pool = F.max_pool2d(x, kernel_size=3, stride=1, padding=1)
        branch_pool = self.branch_pool(branch_pool)

        outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool]
        return torch.cat(outputs, 1)
47
modelscope/models/cv/image_inpainting/modules/perceptual.py
Normal file
@@ -0,0 +1,47 @@
"""
Part of the implementation is borrowed and modified from LaMa, publicly available at
https://github.com/saic-mdal/lama
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision

from .ade20k import ModelBuilder

IMAGENET_MEAN = torch.FloatTensor([0.485, 0.456, 0.406])[None, :, None, None]
IMAGENET_STD = torch.FloatTensor([0.229, 0.224, 0.225])[None, :, None, None]


class ResNetPL(nn.Module):

    def __init__(self,
                 weight=1,
                 weights_path=None,
                 arch_encoder='resnet50dilated',
                 segmentation=True):
        super().__init__()
        self.impl = ModelBuilder.get_encoder(
            weights_path=weights_path,
            arch_encoder=arch_encoder,
            arch_decoder='ppm_deepsup',
            fc_dim=2048,
            segmentation=segmentation)
        self.impl.eval()
        for w in self.impl.parameters():
            w.requires_grad_(False)

        self.weight = weight

    def forward(self, pred, target):
        pred = (pred - IMAGENET_MEAN.to(pred)) / IMAGENET_STD.to(pred)
        target = (target - IMAGENET_MEAN.to(target)) / IMAGENET_STD.to(target)

        pred_feats = self.impl(pred, return_feature_maps=True)
        target_feats = self.impl(target, return_feature_maps=True)

        result = torch.stack([
            F.mse_loss(cur_pred, cur_target)
            for cur_pred, cur_target in zip(pred_feats, target_feats)
        ]).sum() * self.weight
        return result
75
modelscope/models/cv/image_inpainting/modules/pix2pixhd.py
Normal file
@@ -0,0 +1,75 @@
"""
The implementation is adopted from
https://github.com/NVIDIA/pix2pixHD/blob/master/models/networks.py
"""
import collections
import functools
import logging
from collections import defaultdict
from functools import partial

import numpy as np
import torch.nn as nn


# Defines the PatchGAN discriminator with the specified arguments.
class NLayerDiscriminator(nn.Module):

    def __init__(
        self,
        input_nc=3,
        ndf=64,
        n_layers=4,
        norm_layer=nn.BatchNorm2d,
    ):
        super().__init__()
        self.n_layers = n_layers

        kw = 4
        padw = int(np.ceil((kw - 1.0) / 2))
        sequence = [[
            nn.Conv2d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw),
            nn.LeakyReLU(0.2, True)
        ]]

        nf = ndf
        for n in range(1, n_layers):
            nf_prev = nf
            nf = min(nf * 2, 512)

            cur_model = []
            cur_model += [
                nn.Conv2d(nf_prev, nf, kernel_size=kw, stride=2, padding=padw),
                norm_layer(nf),
                nn.LeakyReLU(0.2, True)
            ]
            sequence.append(cur_model)

        nf_prev = nf
        nf = min(nf * 2, 512)

        cur_model = []
        cur_model += [
            nn.Conv2d(nf_prev, nf, kernel_size=kw, stride=1, padding=padw),
            norm_layer(nf),
            nn.LeakyReLU(0.2, True)
        ]
        sequence.append(cur_model)

        sequence += [[
            nn.Conv2d(nf, 1, kernel_size=kw, stride=1, padding=padw)
        ]]

        for n in range(len(sequence)):
            setattr(self, 'model' + str(n), nn.Sequential(*sequence[n]))

    def get_all_activations(self, x):
        res = [x]
        for n in range(self.n_layers + 2):
            model = getattr(self, 'model' + str(n))
            res.append(model(res[-1]))
        return res[1:]

    def forward(self, x):
        act = self.get_all_activations(x)
        return act[-1], act[:-1]
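A hedged sketch of the discriminator's two-part output (outside the diff; sizes assumed):

import torch

disc = NLayerDiscriminator(input_nc=3, n_layers=4)
img = torch.randn(1, 3, 256, 256)
logits, feats = disc(img)  # patch logits plus n_layers + 1 activations
# these intermediate activations are what feature_matching_loss above consumes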
393
modelscope/models/cv/image_inpainting/refinement.py
Normal file
@@ -0,0 +1,393 @@
'''
Part of the implementation is borrowed and modified from LaMa, publicly available at
https://github.com/saic-mdal/lama
'''
import cv2
import numpy as np
import torch
import torch.nn as nn
from kornia.filters import gaussian_blur2d
from kornia.geometry.transform import resize
from kornia.morphology import erosion
from torch.nn import functional as F
from torch.optim import SGD, Adam
from tqdm import tqdm

from .modules.ffc import FFCResnetBlock


def move_to_device(obj, device):
    if isinstance(obj, nn.Module):
        return obj.to(device)
    if torch.is_tensor(obj):
        return obj.to(device)
    if isinstance(obj, (tuple, list)):
        return [move_to_device(el, device) for el in obj]
    if isinstance(obj, dict):
        return {name: move_to_device(val, device) for name, val in obj.items()}
    raise ValueError(f'Unexpected type {type(obj)}')


def ceil_modulo(x, mod):
    if x % mod == 0:
        return x
    return (x // mod + 1) * mod


def pad_tensor_to_modulo(img, mod):
    batch_size, channels, height, width = img.shape
    out_height = ceil_modulo(height, mod)
    out_width = ceil_modulo(width, mod)
    return F.pad(
        img,
        pad=(0, out_width - width, 0, out_height - height),
        mode='reflect')
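A quick worked example of the padding helper (not part of the diff):

x = torch.randn(1, 3, 601, 960)
y = pad_tensor_to_modulo(x, 8)  # ceil_modulo(601, 8) == 608, so y is (1, 3, 608, 960)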
def _pyrdown(im: torch.Tensor, downsize: tuple = None):
    """downscale the image"""
    if downsize is None:
        downsize = (im.shape[2] // 2, im.shape[3] // 2)
    assert im.shape[
        1] == 3, 'Expected shape for the input to be (n,3,height,width)'
    im = gaussian_blur2d(im, kernel_size=(5, 5), sigma=(1.0, 1.0))
    im = F.interpolate(im, size=downsize, mode='bilinear', align_corners=False)
    return im


def _pyrdown_mask(mask: torch.Tensor,
                  downsize: tuple = None,
                  eps: float = 1e-8,
                  blur_mask: bool = True,
                  round_up: bool = True):
    """downscale the mask tensor

    Parameters
    ----------
    mask : torch.Tensor
        mask of size (B, 1, H, W)
    downsize : tuple, optional
        size to downscale to. If None, image is downscaled to half, by default None
    eps : float, optional
        threshold value for binarizing the mask, by default 1e-8
    blur_mask : bool, optional
        if True, apply gaussian filter before downscaling, by default True
    round_up : bool, optional
        if True, values above eps are marked 1, else, values below 1-eps are marked 0, by default True

    Returns
    -------
    torch.Tensor
        downscaled mask
    """

    if downsize is None:
        downsize = (mask.shape[2] // 2, mask.shape[3] // 2)
    assert mask.shape[
        1] == 1, 'Expected shape for the input to be (n,1,height,width)'
    if blur_mask is True:
        mask = gaussian_blur2d(mask, kernel_size=(5, 5), sigma=(1.0, 1.0))
        mask = F.interpolate(
            mask, size=downsize, mode='bilinear', align_corners=False)
    else:
        mask = F.interpolate(
            mask, size=downsize, mode='bilinear', align_corners=False)
    if round_up:
        mask[mask >= eps] = 1
        mask[mask < eps] = 0
    else:
        mask[mask >= 1.0 - eps] = 1
        mask[mask < 1.0 - eps] = 0
    return mask
def _erode_mask(mask: torch.Tensor,
                ekernel: torch.Tensor = None,
                eps: float = 1e-8):
    """erode the mask, and set gray pixels to 0"""
    if ekernel is not None:
        mask = erosion(mask, ekernel)
        mask[mask >= 1.0 - eps] = 1
        mask[mask < 1.0 - eps] = 0
    return mask


def _l1_loss(pred: torch.Tensor,
             pred_downscaled: torch.Tensor,
             ref: torch.Tensor,
             mask: torch.Tensor,
             mask_downscaled: torch.Tensor,
             image: torch.Tensor,
             on_pred: bool = True):
    """l1 loss on src pixels, and downscaled predictions if on_pred=True"""
    loss = torch.mean(torch.abs(pred[mask < 1e-8] - image[mask < 1e-8]))
    if on_pred:
        loss += torch.mean(
            torch.abs(pred_downscaled[mask_downscaled >= 1e-8]
                      - ref[mask_downscaled >= 1e-8]))
    return loss
def _infer(image: torch.Tensor,
           mask: torch.Tensor,
           forward_front: nn.Module,
           forward_rears: nn.Module,
           ref_lower_res: torch.Tensor,
           orig_shape: tuple,
           devices: list,
           scale_ind: int,
           n_iters: int = 15,
           lr: float = 0.002):
    """Performs inference with refinement at a given scale.

    Parameters
    ----------
    image : torch.Tensor
        input image to be inpainted, of size (1,3,H,W)
    mask : torch.Tensor
        input inpainting mask, of size (1,1,H,W)
    forward_front : nn.Module
        the front part of the inpainting network
    forward_rears : nn.Module
        the rear part of the inpainting network
    ref_lower_res : torch.Tensor
        the inpainting at previous scale, used as reference image
    orig_shape : tuple
        shape of the original input image before padding
    devices : list
        list of available devices
    scale_ind : int
        the scale index
    n_iters : int, optional
        number of iterations of refinement, by default 15
    lr : float, optional
        learning rate, by default 0.002

    Returns
    -------
    torch.Tensor
        inpainted image
    """
    masked_image = image * (1 - mask)
    masked_image = torch.cat([masked_image, mask], dim=1)

    mask = mask.repeat(1, 3, 1, 1)
    if ref_lower_res is not None:
        ref_lower_res = ref_lower_res.detach()
    with torch.no_grad():
        z1, z2 = forward_front(masked_image)
    # Inference
    mask = mask.to(devices[-1])
    ekernel = torch.from_numpy(
        cv2.getStructuringElement(cv2.MORPH_ELLIPSE,
                                  (15, 15)).astype(bool)).float()
    ekernel = ekernel.to(devices[-1])
    image = image.to(devices[-1])
    z1, z2 = z1.detach().to(devices[0]), z2.detach().to(devices[0])
    z1.requires_grad, z2.requires_grad = True, True

    optimizer = Adam([z1, z2], lr=lr)

    pbar = tqdm(range(n_iters), leave=False)
    for idi in pbar:
        optimizer.zero_grad()
        input_feat = (z1, z2)
        for idd, forward_rear in enumerate(forward_rears):
            output_feat = forward_rear(input_feat)
            if idd < len(devices) - 1:
                midz1, midz2 = output_feat
                midz1, midz2 = midz1.to(devices[idd + 1]), midz2.to(
                    devices[idd + 1])
                input_feat = (midz1, midz2)
            else:
                pred = output_feat

        if ref_lower_res is None:
            break
        losses = {}
        # scaled loss with downsampler
        pred_downscaled = _pyrdown(pred[:, :, :orig_shape[0], :orig_shape[1]])
        mask_downscaled = _pyrdown_mask(
            mask[:, :1, :orig_shape[0], :orig_shape[1]],
            blur_mask=False,
            round_up=False)
        mask_downscaled = _erode_mask(mask_downscaled, ekernel=ekernel)
        mask_downscaled = mask_downscaled.repeat(1, 3, 1, 1)
        losses['ms_l1'] = _l1_loss(
            pred,
            pred_downscaled,
            ref_lower_res,
            mask,
            mask_downscaled,
            image,
            on_pred=True)

        loss = sum(losses.values())
        pbar.set_description(
            'Refining scale {} using scale {} ...current loss: {:.4f}'.format(
                scale_ind + 1, scale_ind, loss.item()))
        if idi < n_iters - 1:
            loss.backward()
            optimizer.step()
            del pred_downscaled
            del loss
            del pred
    # "pred" is the prediction after Plug-n-Play module
    inpainted = mask * pred + (1 - mask) * image
    inpainted = inpainted.detach().cpu()
    return inpainted
def _get_image_mask_pyramid(batch: dict, min_side: int, max_scales: int,
                            px_budget: int):
    """Build the image mask pyramid

    Parameters
    ----------
    batch : dict
        batch containing image, mask, etc
    min_side : int
        minimum side length to limit the number of scales of the pyramid
    max_scales : int
        maximum number of scales allowed
    px_budget : int
        the product H*W cannot exceed this budget, because of resource constraints

    Returns
    -------
    tuple
        image-mask pyramid in the form of list of images and list of masks
    """

    assert batch['image'].shape[
        0] == 1, 'refiner works on only batches of size 1!'

    h, w = batch['unpad_to_size']
    h, w = h[0].item(), w[0].item()

    image = batch['image'][..., :h, :w]
    mask = batch['mask'][..., :h, :w]
    if h * w > px_budget:
        # resize
        ratio = np.sqrt(px_budget / float(h * w))
        h_orig, w_orig = h, w
        h, w = int(h * ratio), int(w * ratio)
        print(
            f'Original image too large for refinement! Resizing {(h_orig, w_orig)} to {(h, w)}...'
        )
        image = resize(
            image, (h, w), interpolation='bilinear', align_corners=False)
        mask = resize(
            mask, (h, w), interpolation='bilinear', align_corners=False)
        mask[mask > 1e-8] = 1
    breadth = min(h, w)
    n_scales = min(1 + int(round(max(0, np.log2(breadth / min_side)))),
                   max_scales)
    ls_images = []
    ls_masks = []

    ls_images.append(image)
    ls_masks.append(mask)

    for _ in range(n_scales - 1):
        image_p = _pyrdown(ls_images[-1])
        mask_p = _pyrdown_mask(ls_masks[-1])
        ls_images.append(image_p)
        ls_masks.append(mask_p)
    # reverse the lists because we want the lowest resolution image as index 0
    return ls_images[::-1], ls_masks[::-1]
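A worked example of the scale count (illustrative numbers):

# With min(h, w) == 2048 and min_side == 512:
# n_scales = min(1 + round(log2(2048 / 512)), max_scales) = min(3, max_scales),
# i.e. a three-level pyramid, with the coarsest resolution at index 0.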
def refine_predict(batch: dict, inpainter: nn.Module, gpu_ids: str,
                   modulo: int, n_iters: int, lr: float, min_side: int,
                   max_scales: int, px_budget: int):
    """Refines the inpainting of the network

    Parameters
    ----------
    batch : dict
        image-mask batch, currently we assume the batchsize to be 1
    inpainter : nn.Module
        the inpainting neural network
    gpu_ids : str
        the GPU ids of the machine to use. If only single GPU, use: "0,"
    modulo : int
        pad the image to ensure dimension % modulo == 0
    n_iters : int
        number of iterations of refinement for each scale
    lr : float
        learning rate
    min_side : int
        all sides of image on all scales should be >= min_side / sqrt(2)
    max_scales : int
        max number of downscaling scales for the image-mask pyramid
    px_budget : int
        pixels budget. Any image will be resized to satisfy height*width <= px_budget

    Returns
    -------
    torch.Tensor
        inpainted image of size (1,3,H,W)
    """
    inpainter = inpainter.model
    assert not inpainter.training
    assert not inpainter.add_noise_kwargs
    assert inpainter.concat_mask

    gpu_ids = [
        f'cuda:{gpuid}' for gpuid in gpu_ids.replace(' ', '').split(',')
        if gpuid.isdigit()
    ]
    n_resnet_blocks = 0
    first_resblock_ind = 0
    found_first_resblock = False
    for idl in range(len(inpainter.generator.model)):
        if isinstance(inpainter.generator.model[idl], FFCResnetBlock):
            n_resnet_blocks += 1
            found_first_resblock = True
        elif not found_first_resblock:
            first_resblock_ind += 1
    resblocks_per_gpu = n_resnet_blocks // len(gpu_ids)

    devices = [torch.device(gpu_id) for gpu_id in gpu_ids]

    # split the model into front, and rear parts
    forward_front = inpainter.generator.model[0:first_resblock_ind]
    forward_front.to(devices[0])
    forward_rears = []
    for idd in range(len(gpu_ids)):
        if idd < len(gpu_ids) - 1:
            forward_rears.append(
                inpainter.generator.model[first_resblock_ind
                                          + resblocks_per_gpu
                                          * (idd):first_resblock_ind
                                          + resblocks_per_gpu * (idd + 1)])
        else:
            forward_rears.append(
                inpainter.generator.model[first_resblock_ind
                                          + resblocks_per_gpu * (idd):])
        forward_rears[idd].to(devices[idd])

    ls_images, ls_masks = _get_image_mask_pyramid(batch, min_side, max_scales,
                                                  px_budget)
    image_inpainted = None

    for ids, (image, mask) in enumerate(zip(ls_images, ls_masks)):
        orig_shape = image.shape[2:]
        image = pad_tensor_to_modulo(image, modulo)
        mask = pad_tensor_to_modulo(mask, modulo)
        mask[mask >= 1e-8] = 1.0
        mask[mask < 1e-8] = 0.0
        image, mask = move_to_device(image, devices[0]), move_to_device(
            mask, devices[0])
        if image_inpainted is not None:
            image_inpainted = move_to_device(image_inpainted, devices[-1])
        image_inpainted = _infer(image, mask, forward_front, forward_rears,
                                 image_inpainted, orig_shape, devices, ids,
                                 n_iters, lr)
        image_inpainted = image_inpainted[:, :, :orig_shape[0], :orig_shape[1]]
        # detach everything to save resources
        image = image.detach().cpu()
        mask = mask.detach().cpu()

    return image_inpainted
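A hedged call sketch, not from the diff: `inpainting_model` stands for a loaded LaMa-style training module exposing `.model` with `generator`, `add_noise_kwargs` and `concat_mask` attributes, and `padded_image` / `padded_mask` are assumed CUDA-ready tensors.

batch = {
    'image': padded_image,  # (1, 3, H_pad, W_pad)
    'mask': padded_mask,    # (1, 1, H_pad, W_pad)
    'unpad_to_size': (torch.tensor([512]), torch.tensor([512])),
}
result = refine_predict(
    batch, inpainting_model, gpu_ids='0,', modulo=8, n_iters=15, lr=0.002,
    min_side=512, max_scales=3, px_budget=1800000)  # (1, 3, 512, 512) on CPU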
@@ -10,7 +10,7 @@ if TYPE_CHECKING:
 else:
     _import_structure = {
         'mmdet_model': ['DetectionModel'],
-        'yolox_pai': ['YOLOX']
+        'yolox_pai': ['YOLOX'],
     }

     import sys
@@ -9,6 +9,9 @@ from modelscope.utils.constant import Tasks
@MODELS.register_module(
    group_key=Tasks.image_object_detection, module_name=Models.yolox)
@MODELS.register_module(
    group_key=Tasks.image_object_detection,
    module_name=Models.image_object_detection_auto)
class YOLOX(EasyCVBaseModel, _YOLOX):

    def __init__(self, model_dir=None, *args, **kwargs):
@@ -5,9 +5,11 @@ from modelscope.utils.import_utils import LazyImportModule
if TYPE_CHECKING:
    from .realtime_detector import RealtimeDetector
    from .realtime_video_detector import RealtimeVideoDetector
else:
    _import_structure = {
        'realtime_detector': ['RealtimeDetector'],
        'realtime_video_detector': ['RealtimeVideoDetector'],
    }

    import sys
@@ -0,0 +1,117 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import argparse
import logging as logger
import os
import os.path as osp
import time

import cv2
import json
import torch
from tqdm import tqdm

from modelscope.metainfo import Models
from modelscope.models.base.base_torch_model import TorchModel
from modelscope.models.builder import MODELS
from modelscope.preprocessors import LoadImage
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile, Tasks
from .yolox.data.data_augment import ValTransform
from .yolox.exp import get_exp_by_name
from .yolox.utils import postprocess


@MODELS.register_module(
    group_key=Tasks.video_object_detection,
    module_name=Models.realtime_video_object_detection)
class RealtimeVideoDetector(TorchModel):

    def __init__(self, model_dir: str, *args, **kwargs):
        super().__init__(model_dir, *args, **kwargs)
        self.config = Config.from_file(
            os.path.join(self.model_dir, ModelFile.CONFIGURATION))

        # model type
        self.exp = get_exp_by_name(self.config.model_type)

        # build model
        self.model = self.exp.get_model()
        model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE)
        ckpt = torch.load(model_path, map_location='cpu')

        # load the model state dict
        self.model.load_state_dict(ckpt['model'])
        self.model.eval()

        # params setting
        self.exp.num_classes = self.config.num_classes
        self.confthre = self.config.conf_thr
        self.num_classes = self.exp.num_classes
        self.nmsthre = self.exp.nmsthre
        self.test_size = self.exp.test_size
        self.preproc = ValTransform(legacy=False)
        self.current_buffer = None
        self.label_mapping = self.config['labels']

    def inference(self, img):
        with torch.no_grad():
            outputs, self.current_buffer = self.model(
                img, buffer=self.current_buffer, mode='on_pipe')
        return outputs

    def forward(self, inputs):
        return self.inference_video(inputs)

    def preprocess(self, img):
        img = LoadImage.convert_to_ndarray(img)
        height, width = img.shape[:2]
        self.ratio = min(self.test_size[0] / img.shape[0],
                         self.test_size[1] / img.shape[1])

        img, _ = self.preproc(img, None, self.test_size)
        img = torch.from_numpy(img).unsqueeze(0)
        img = img.float()

        # Automatic video decoding and preprocessing are not supported by
        # Pipeline/Model; the preprocessed frame tensor is moved to the GPU
        # adaptively, following the device of the model parameters
        if next(self.model.parameters()).is_cuda:
            img = img.to(next(self.model.parameters()).device)
        return img

    def postprocess(self, input):
        outputs = postprocess(
            input,
            self.num_classes,
            self.confthre,
            self.nmsthre,
            class_agnostic=True)

        if len(outputs) == 1:
            bboxes = outputs[0][:, 0:4].cpu().numpy() / self.ratio
            scores = outputs[0][:, 5].cpu().numpy()
            labels = outputs[0][:, 6].cpu().int().numpy()
            pred_label_names = []
            for lab in labels:
                pred_label_names.append(self.label_mapping[lab])

        return bboxes, scores, pred_label_names

    def inference_video(self, v_path):
        outputs = []
        desc = 'Detecting video: {}'.format(v_path)
        for frame, result in tqdm(
                self.inference_video_iter(v_path), desc=desc):
            outputs.append(result)

        return outputs

    def inference_video_iter(self, v_path):
        capture = cv2.VideoCapture(v_path)
        while capture.isOpened():
            ret, frame = capture.read()
            if not ret:
                break
            output = self.preprocess(frame)
            output = self.inference(output)
            output = self.postprocess(output)
            yield frame, output
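A hedged usage sketch (paths are hypothetical; the model directory is assumed to contain the configuration file and torch checkpoint named above):

detector = RealtimeVideoDetector('/path/to/model_dir')
per_frame = detector.inference_video('/path/to/video.mp4')
bboxes, scores, label_names = per_frame[0]  # detections for the first frame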
@@ -13,6 +13,8 @@ def get_exp_by_name(exp_name):
        from .default import YoloXNanoExp as YoloXExp
    elif exp == 'yolox_tiny':
        from .default import YoloXTinyExp as YoloXExp
    elif exp == 'streamyolo':
        from .default import StreamYoloExp as YoloXExp
    else:
        pass
    return YoloXExp()
@@ -1,5 +1,5 @@
# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX

from .streamyolo import StreamYoloExp
from .yolox_nano import YoloXNanoExp
from .yolox_s import YoloXSExp
from .yolox_tiny import YoloXTinyExp
@@ -0,0 +1,43 @@
# The implementation is based on StreamYOLO, available at https://github.com/yancie-yjr/StreamYOLO
import os
import sys

import torch
import torch.nn as nn  # required by init_yolo below; absent from the extracted hunk

from ..yolox_base import Exp as YoloXExp


class StreamYoloExp(YoloXExp):

    def __init__(self):
        super(YoloXExp, self).__init__()
        self.depth = 1.0
        self.width = 1.0
        self.num_classes = 8
        self.test_size = (600, 960)
        self.test_conf = 0.3
        self.nmsthre = 0.65

    def get_model(self):
        from ...models import StreamYOLO, DFPPAFPN, TALHead

        def init_yolo(M):
            for m in M.modules():
                if isinstance(m, nn.BatchNorm2d):
                    m.eps = 1e-3
                    m.momentum = 0.03

        if getattr(self, 'model', None) is None:
            in_channels = [256, 512, 1024]
            backbone = DFPPAFPN(
                self.depth, self.width, in_channels=in_channels)
            head = TALHead(
                self.num_classes,
                self.width,
                in_channels=in_channels,
                gamma=1.0,
                ignore_thr=0.5,
                ignore_value=1.6)
            self.model = StreamYOLO(backbone, head)

        return self.model
@@ -1,5 +1,4 @@
# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX

import os
import random