Merge remote-tracking branch 'origin/master' into ofa/finetune

# Conflicts:
#	modelscope/metrics/__init__.py
行嗔
2022-10-18 16:41:21 +08:00
267 changed files with 16051 additions and 1215 deletions


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:e8d653a9a1ee49789c3df38e8da96af7118e0d8336d6ed12cd6458efa015071d
size 2327764


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:c589d77404ea17d4d24daeb8624dce7e1ac919dc75e6bed44ea9d116f0514150
size 68524


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:76bf84536edbaf192a8a699efc62ba2b06056bac12c426ecfcc2e003d91fbd32
size 53219


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:ecbc9d0827cfb92e93e7d75868b1724142685dc20d3b32023c3c657a7b688a9c
size 254845


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:d510ab26ddc58ffea882c8ef850c1f9bd4444772f2bce7ebea3e76944536c3ae
size 48909


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b2c1119e3d521cf2e583b1e85fc9c9afd1d44954b433135039a98050a730932d
size 1127557


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:46db348eae61448f1668ce282caec21375e96c3268d53da44aa67ec32cbf4fa5
size 2747938


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:709c1828ed2d56badf2f19a40194da9a5e5e6db2fb73ef55d047407f49bc7a15
size 27616


@@ -1,3 +0,0 @@
version https://git-lfs.github.com/spec/v1
oid sha256:379e11d7fc3734d3ec95afd0d86460b4653fbf4bb1f57f993610d6a6fd30fd3d
size 1702339


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:dec0fbb931cb609bf481e56b89cd2fbbab79839f22832c3bbe69a8fae2769cdd
size 167407


@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:9103ce2bc89212f67fb49ce70783b7667e376900d0f70fb8f5c4432eb74bc572
size 60801
oid sha256:33ecc221513559a042ff975a38cc16aa47674545bc349362722c774c83f8d90c
size 61239


@@ -1,3 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:2d4dee34c7e83b77db04fb2f0d1200bfd37c7c24954c58e185da5cb96445975c
size 60801
oid sha256:803c2e3ff7688abf0f83702b3904830a9f6f71e41e252de3c559354a9effefd1
size 61115


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:a49c9bc74a60860c360a4bf4509fe9db915279aaabd953f354f2c38e9be1e6cb
size 2924691


@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:f58df1d25590c158ae0a04b3999bd44b610cdaddb17d78afd84c34b3f00d4e87
size 4068783


@@ -76,7 +76,7 @@ RUN pip install --no-cache-dir --upgrade pip && \
ENV SHELL=/bin/bash
# install special package
RUN pip install --no-cache-dir mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 datasets==2.1.0 numpy==1.18.5 ipykernel fairseq
RUN pip install --no-cache-dir mmcls>=0.21.0 mmdet>=2.25.0 decord>=0.6.0 datasets==2.1.0 numpy==1.18.5 ipykernel fairseq fasttext https://modelscope.oss-cn-beijing.aliyuncs.com/releases/dependencies/xtcocotools-1.12-cp37-cp37m-linux_x86_64.whl
RUN if [ "$USE_GPU" = "True" ] ; then \
pip install --no-cache-dir dgl-cu113 dglgo -f https://data.dgl.ai/wheels/repo.html; \


@@ -24,20 +24,17 @@ from modelscope.utils.constant import (DEFAULT_DATASET_REVISION,
DownloadMode)
from modelscope.utils.logger import get_logger
from .errors import (InvalidParameter, NotExistError, RequestError,
datahub_raise_on_error, handle_http_response, is_ok,
raise_on_error)
from .utils.utils import (get_dataset_hub_endpoint, get_endpoint,
model_id_to_group_owner_name)
datahub_raise_on_error, handle_http_post_error,
handle_http_response, is_ok, raise_on_error)
from .utils.utils import get_endpoint, model_id_to_group_owner_name
logger = get_logger()
class HubApi:
def __init__(self, endpoint=None, dataset_endpoint=None):
def __init__(self, endpoint=None):
self.endpoint = endpoint if endpoint is not None else get_endpoint()
self.dataset_endpoint = dataset_endpoint if dataset_endpoint is not None else get_dataset_hub_endpoint(
)
def login(
self,
@@ -105,17 +102,15 @@ class HubApi:
path = f'{self.endpoint}/api/v1/models'
owner_or_group, name = model_id_to_group_owner_name(model_id)
r = requests.post(
path,
json={
'Path': owner_or_group,
'Name': name,
'ChineseName': chinese_name,
'Visibility': visibility, # server check
'License': license
},
cookies=cookies)
r.raise_for_status()
body = {
'Path': owner_or_group,
'Name': name,
'ChineseName': chinese_name,
'Visibility': visibility, # server check
'License': license
}
r = requests.post(path, json=body, cookies=cookies)
handle_http_post_error(r, path, body)
raise_on_error(r.json())
model_repo_url = f'{get_endpoint()}/{model_id}'
return model_repo_url
@@ -290,7 +285,7 @@ class HubApi:
return files
def list_datasets(self):
path = f'{self.dataset_endpoint}/api/v1/datasets'
path = f'{self.endpoint}/api/v1/datasets'
headers = None
params = {}
r = requests.get(path, params=params, headers=headers)
@@ -317,13 +312,13 @@ class HubApi:
cache_dir):
shutil.rmtree(cache_dir)
os.makedirs(cache_dir, exist_ok=True)
datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}'
datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}'
r = requests.get(datahub_url)
resp = r.json()
datahub_raise_on_error(datahub_url, resp)
dataset_id = resp['Data']['Id']
dataset_type = resp['Data']['Type']
datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={revision}'
datahub_url = f'{self.endpoint}/api/v1/datasets/{dataset_id}/repo/tree?Revision={revision}'
r = requests.get(datahub_url)
resp = r.json()
datahub_raise_on_error(datahub_url, resp)
@@ -341,7 +336,7 @@ class HubApi:
file_path = file_info['Path']
extension = os.path.splitext(file_path)[-1]
if extension in dataset_meta_format:
datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
f'Revision={revision}&FilePath={file_path}'
r = requests.get(datahub_url)
r.raise_for_status()
@@ -365,7 +360,7 @@ class HubApi:
namespace: str,
revision: Optional[str] = DEFAULT_DATASET_REVISION):
if file_name.endswith('.csv'):
file_name = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
file_name = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/repo?' \
f'Revision={revision}&FilePath={file_name}'
return file_name
@@ -374,7 +369,7 @@ class HubApi:
dataset_name: str,
namespace: str,
revision: Optional[str] = DEFAULT_DATASET_REVISION):
datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \
datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \
f'ststoken?Revision={revision}'
return self.datahub_remote_call(datahub_url)
@@ -385,7 +380,7 @@ class HubApi:
namespace: str,
revision: Optional[str] = DEFAULT_DATASET_REVISION):
datahub_url = f'{self.dataset_endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \
datahub_url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/' \
f'ststoken?Revision={revision}'
cookies = requests.utils.dict_from_cookiejar(cookies)
@@ -394,6 +389,19 @@ class HubApi:
raise_on_error(resp)
return resp['Data']
def list_oss_dataset_objects(self, dataset_name, namespace, max_limit,
is_recursive, is_filter_dir, revision,
cookies):
url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/oss/tree/?' \
f'MaxLimit={max_limit}&Revision={revision}&Recursive={is_recursive}&FilterDir={is_filter_dir}'
cookies = requests.utils.dict_from_cookiejar(cookies)
resp = requests.get(url=url, cookies=cookies)
resp = resp.json()
raise_on_error(resp)
resp = resp['Data']
return resp
def on_dataset_download(self, dataset_name: str, namespace: str) -> None:
url = f'{self.endpoint}/api/v1/datasets/{namespace}/{dataset_name}/download/increase'
r = requests.post(url)
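For reference, a minimal standalone sketch of the request pattern these hunks converge on: a POST with a JSON body whose failure is logged together with the URL and body before re-raising. The endpoint URL, group, name and license values below are placeholders, not taken from the diff.

# Illustrative sketch only; endpoint and body values are placeholders.
import requests

def create_model_sketch(endpoint='https://www.modelscope.cn', cookies=None):
    path = f'{endpoint}/api/v1/models'
    body = {
        'Path': 'my_group',                # owner or group (placeholder)
        'Name': 'my_model',                # model name (placeholder)
        'ChineseName': None,
        'Visibility': 5,                   # 5 = public (visibility convention used elsewhere in this commit)
        'License': 'Apache License 2.0',   # placeholder license string
    }
    r = requests.post(path, json=body, cookies=cookies)
    try:
        r.raise_for_status()               # the same check handle_http_post_error wraps
    except requests.exceptions.HTTPError:
        # the helper logs the failing URL and request body before re-raising
        print('Request %s with body: %s exception' % (path, body))
        raise
    return r.json()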


@@ -4,6 +4,10 @@ from http import HTTPStatus
from requests.exceptions import HTTPError
from modelscope.utils.logger import get_logger
logger = get_logger()
class NotExistError(Exception):
pass
@@ -45,15 +49,24 @@ def is_ok(rsp):
return rsp['Code'] == HTTPStatus.OK and rsp['Success']
def handle_http_post_error(response, url, request_body):
try:
response.raise_for_status()
except HTTPError as error:
logger.error('Request %s with body: %s exception' %
(url, request_body))
raise error
def handle_http_response(response, logger, cookies, model_id):
try:
response.raise_for_status()
except HTTPError:
except HTTPError as error:
if cookies is None: # code in [403] and
logger.error(
f'Authentication token does not exist, failed to access model {model_id} which may not exist or may be \
private. Please login first.')
raise
raise error
def raise_on_error(rsp):


@@ -1,6 +1,7 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
import re
import subprocess
from typing import List
from xmlrpc.client import Boolean
@@ -138,8 +139,8 @@ class GitCommandWrapper(metaclass=Singleton):
repo_base_dir, repo_name, user_name)
response = self._run_git_command(*config_user_name_args.split(' '))
logger.debug(response.stdout.decode('utf8'))
config_user_email_args = '-C %s/%s config user.name %s' % (
repo_base_dir, repo_name, user_name)
config_user_email_args = '-C %s/%s config user.email %s' % (
repo_base_dir, repo_name, user_email)
response = self._run_git_command(
*config_user_email_args.split(' '))
logger.debug(response.stdout.decode('utf8'))
@@ -177,6 +178,15 @@ class GitCommandWrapper(metaclass=Singleton):
cmds = ['-C', '%s' % repo_dir, 'checkout', '-b', revision]
return self._run_git_command(*cmds)
def get_remote_branches(self, repo_dir: str):
cmds = ['-C', '%s' % repo_dir, 'branch', '-r']
rsp = self._run_git_command(*cmds)
info = [
line.strip()
for line in rsp.stdout.decode('utf8').strip().split(os.linesep)
][1:]
return ['/'.join(line.split('/')[1:]) for line in info]
def pull(self, repo_dir: str):
cmds = ['-C', repo_dir, 'pull']
return self._run_git_command(*cmds)
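As an illustration of what the new get_remote_branches does with the raw `git branch -r` listing (drop the first entry, typically the origin/HEAD pointer, then strip the remote prefix), here is a standalone check on a made-up listing:

# Standalone illustration of the parsing in get_remote_branches; the listing text is made up.
import os

raw = os.linesep.join([
    '  origin/HEAD -> origin/master',
    '  origin/master',
    '  origin/ofa/finetune',
])
info = [line.strip() for line in raw.strip().split(os.linesep)][1:]
branches = ['/'.join(line.split('/')[1:]) for line in info]
print(branches)  # ['master', 'ofa/finetune']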

modelscope/hub/upload.py (new file, 117 lines)

@@ -0,0 +1,117 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import datetime
import os
import shutil
import tempfile
import uuid
from typing import Dict, Optional
from uuid import uuid4
from filelock import FileLock
from modelscope import __version__
from modelscope.hub.api import HubApi, ModelScopeConfig
from modelscope.hub.errors import InvalidParameter, NotLoginException
from modelscope.hub.git import GitCommandWrapper
from modelscope.hub.repository import Repository
from modelscope.utils.constant import DEFAULT_MODEL_REVISION, ModelFile
from modelscope.utils.logger import get_logger
logger = get_logger()
def upload_folder(model_id: str,
model_dir: str,
visibility: int = 0,
license: str = None,
chinese_name: Optional[str] = None,
commit_message: Optional[str] = None,
revision: Optional[str] = DEFAULT_MODEL_REVISION):
"""
Upload a model from a given directory to a given repository. A valid model directory
must contain a configuration.json file.
This function uploads the files in the given directory to the given repository. If the
repository does not exist on the remote, it is created automatically with the
given visibility, license and chinese_name parameters. If the revision does not
exist in the remote repository either, a new branch is created for it.
HubApi's login must be called with a valid token, obtainable from ModelScope's
website, before calling this function.
Args:
model_id (`str`):
The model id to be uploaded; the caller must have write permission for it.
model_dir(`str`):
The absolute path of the finetune result.
visibility(`int`, defaults to `0`):
Visibility of the newly created model (1 = private, 5 = public). If the model
does not exist on ModelScope, this function creates a new model with this
visibility and the parameter is required. You can ignore this parameter
if you are sure the model already exists.
license(`str`, defaults to `None`):
License of the newly created model (see License). If the model does not exist
on ModelScope, this function creates a new model with this license
and the parameter is required. You can ignore this parameter if you
are sure the model already exists.
chinese_name(`str`, *optional*, defaults to `None`):
Chinese name of the newly created model.
commit_message(`str`, *optional*, defaults to `None`):
Commit message of the push request.
revision (`str`, *optional*, default to DEFAULT_MODEL_REVISION):
Which branch to push to. If the branch does not exist, a new branch is
created and pushed to.
"""
if model_id is None:
raise InvalidParameter('model_id cannot be empty!')
if model_dir is None:
raise InvalidParameter('model_dir cannot be empty!')
if not os.path.exists(model_dir) or os.path.isfile(model_dir):
raise InvalidParameter('model_dir must be a valid directory.')
cfg_file = os.path.join(model_dir, ModelFile.CONFIGURATION)
if not os.path.exists(cfg_file):
raise ValueError(f'{model_dir} must contain a configuration.json.')
cookies = ModelScopeConfig.get_cookies()
if cookies is None:
raise NotLoginException('Must login before upload!')
files_to_save = os.listdir(model_dir)
api = HubApi()
try:
api.get_model(model_id=model_id)
except Exception:
if visibility is None or license is None:
raise InvalidParameter(
'visibility and license cannot be empty when creating a new repo'
)
logger.info('Create new model %s' % model_id)
api.create_model(
model_id=model_id,
visibility=visibility,
license=license,
chinese_name=chinese_name)
tmp_dir = tempfile.mkdtemp()
git_wrapper = GitCommandWrapper()
try:
repo = Repository(model_dir=tmp_dir, clone_from=model_id)
branches = git_wrapper.get_remote_branches(tmp_dir)
if revision not in branches:
logger.info('Create new branch %s' % revision)
git_wrapper.new_branch(tmp_dir, revision)
git_wrapper.checkout(tmp_dir, revision)
for f in files_to_save:
if f[0] != '.':
src = os.path.join(model_dir, f)
if os.path.isdir(src):
shutil.copytree(src, os.path.join(tmp_dir, f))
else:
shutil.copy(src, tmp_dir)
if not commit_message:
date = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
commit_message = '[automsg] push model %s to hub at %s' % (
model_id, date)
repo.push(commit_message=commit_message, branch=revision)
except Exception:
raise
finally:
shutil.rmtree(tmp_dir, ignore_errors=True)
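A hypothetical usage sketch of the new upload_folder API; the token, model id and local path are placeholders, and the login call is assumed to happen first, as the checks above require:

# Hypothetical usage; token, model id and directory are placeholders.
from modelscope.hub.api import HubApi
from modelscope.hub.upload import upload_folder

api = HubApi()
api.login('your-sdk-token')                        # must happen before upload_folder

upload_folder(
    model_id='my_group/my_finetuned_model',        # requires write permission
    model_dir='/path/to/finetune/output',          # must contain configuration.json
    visibility=5,                                  # only needed when the repo is new
    license='Apache License 2.0',                  # only needed when the repo is new
    commit_message='upload finetuned weights',
)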


@@ -4,8 +4,7 @@ import hashlib
import os
from typing import Optional
from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DATA_ENDPOINT,
DEFAULT_MODELSCOPE_DOMAIN,
from modelscope.hub.constants import (DEFAULT_MODELSCOPE_DOMAIN,
DEFAULT_MODELSCOPE_GROUP,
MODEL_ID_SEPARATOR,
MODELSCOPE_URL_SCHEME)
@@ -44,11 +43,6 @@ def get_endpoint():
return MODELSCOPE_URL_SCHEME + modelscope_domain
def get_dataset_hub_endpoint():
return os.environ.get('HUB_DATASET_ENDPOINT',
DEFAULT_MODELSCOPE_DATA_ENDPOINT)
def compute_hash(file_path):
BUFFER_SIZE = 1024 * 64 # 64k buffer size
sha256_hash = hashlib.sha256()


@@ -14,6 +14,7 @@ class Models(object):
# vision models
detection = 'detection'
realtime_object_detection = 'realtime-object-detection'
realtime_video_object_detection = 'realtime-video-object-detection'
scrfd = 'scrfd'
classification_model = 'ClassificationModel'
nafnet = 'nafnet'
@@ -27,11 +28,13 @@ class Models(object):
face_2d_keypoints = 'face-2d-keypoints'
panoptic_segmentation = 'swinL-panoptic-segmentation'
image_reid_person = 'passvitb'
image_inpainting = 'FFTInpainting'
video_summarization = 'pgl-video-summarization'
swinL_semantic_segmentation = 'swinL-semantic-segmentation'
vitadapter_semantic_segmentation = 'vitadapter-semantic-segmentation'
text_driven_segmentation = 'text-driven-segmentation'
resnet50_bert = 'resnet50-bert'
referring_video_object_segmentation = 'swinT-referring-video-object-segmentation'
fer = 'fer'
retinaface = 'retinaface'
shop_segmentation = 'shop-segmentation'
@@ -39,14 +42,18 @@ class Models(object):
mtcnn = 'mtcnn'
ulfd = 'ulfd'
video_inpainting = 'video-inpainting'
human_wholebody_keypoint = 'human-wholebody-keypoint'
hand_static = 'hand-static'
face_human_hand_detection = 'face-human-hand-detection'
face_emotion = 'face-emotion'
product_segmentation = 'product-segmentation'
image_body_reshaping = 'image-body-reshaping'
# EasyCV models
yolox = 'YOLOX'
segformer = 'Segformer'
hand_2d_keypoints = 'HRNet-Hand2D-Keypoints'
image_object_detection_auto = 'image-object-detection-auto'
# nlp models
bert = 'bert'
@@ -66,6 +73,7 @@ class Models(object):
gcnncrf = 'gcnn-crf'
bart = 'bart'
gpt3 = 'gpt3'
gpt_neo = 'gpt-neo'
plug = 'plug'
bert_for_ds = 'bert-for-document-segmentation'
ponet = 'ponet'
@@ -96,6 +104,7 @@ class TaskModels(object):
information_extraction = 'information-extraction'
fill_mask = 'fill-mask'
feature_extraction = 'feature-extraction'
text_generation = 'text-generation'
class Heads(object):
@@ -111,6 +120,8 @@ class Heads(object):
token_classification = 'token-classification'
# extraction
information_extraction = 'information-extraction'
# text gen
text_generation = 'text-generation'
class Pipelines(object):
@@ -144,6 +155,7 @@ class Pipelines(object):
salient_detection = 'u2net-salient-detection'
image_classification = 'image-classification'
face_detection = 'resnet-face-detection-scrfd10gkps'
card_detection = 'resnet-card-detection-scrfd34gkps'
ulfd_face_detection = 'manual-face-detection-ulfd'
facial_expression_recognition = 'vgg19-facial-expression-recognition-fer'
retina_face_detection = 'resnet50-face-detection-retinaface'
@@ -160,6 +172,7 @@ class Pipelines(object):
face_image_generation = 'gan-face-image-generation'
product_retrieval_embedding = 'resnet50-product-retrieval-embedding'
realtime_object_detection = 'cspnet_realtime-object-detection_yolox'
realtime_video_object_detection = 'cspnet_realtime-video-object-detection_streamyolo'
face_recognition = 'ir101-face-recognition-cfglint'
image_instance_segmentation = 'cascade-mask-rcnn-swin-image-instance-segmentation'
image2image_translation = 'image-to-image-translation'
@@ -168,6 +181,7 @@ class Pipelines(object):
ocr_recognition = 'convnextTiny-ocr-recognition'
image_portrait_enhancement = 'gpen-image-portrait-enhancement'
image_to_image_generation = 'image-to-image-generation'
image_object_detection_auto = 'yolox_image-object-detection-auto'
skin_retouching = 'unet-skin-retouching'
tinynas_classification = 'tinynas-classification'
tinynas_detection = 'tinynas-detection'
@@ -178,15 +192,19 @@ class Pipelines(object):
video_summarization = 'googlenet_pgl_video_summarization'
image_semantic_segmentation = 'image-semantic-segmentation'
image_reid_person = 'passvitb-image-reid-person'
image_inpainting = 'fft-inpainting'
text_driven_segmentation = 'text-driven-segmentation'
movie_scene_segmentation = 'resnet50-bert-movie-scene-segmentation'
shop_segmentation = 'shop-segmentation'
video_inpainting = 'video-inpainting'
human_wholebody_keypoint = 'hrnetw48_human-wholebody-keypoint_image'
pst_action_recognition = 'patchshift-action-recognition'
hand_static = 'hand-static'
face_human_hand_detection = 'face-human-hand-detection'
face_emotion = 'face-emotion'
product_segmentation = 'product-segmentation'
image_body_reshaping = 'flow-based-body-reshaping'
referring_video_object_segmentation = 'referring-video-object-segmentation'
# nlp tasks
automatic_post_editing = 'automatic-post-editing'
@@ -211,6 +229,7 @@ class Pipelines(object):
zero_shot_classification = 'zero-shot-classification'
text_error_correction = 'text-error-correction'
plug_generation = 'plug-generation'
gpt3_generation = 'gpt3-generation'
faq_question_answering = 'faq-question-answering'
conversational_text_to_sql = 'conversational-text-to-sql'
table_question_answering_pipeline = 'table-question-answering-pipeline'
@@ -219,6 +238,9 @@ class Pipelines(object):
relation_extraction = 'relation-extraction'
document_segmentation = 'document-segmentation'
feature_extraction = 'feature-extraction'
translation_en_to_de = 'translation_en_to_de' # keep it underscore
translation_en_to_ro = 'translation_en_to_ro' # keep it underscore
translation_en_to_fr = 'translation_en_to_fr' # keep it underscore
# audio tasks
sambert_hifigan_tts = 'sambert-hifigan-tts'
@@ -263,6 +285,9 @@ class Trainers(object):
image_portrait_enhancement = 'image-portrait-enhancement'
video_summarization = 'video-summarization'
movie_scene_segmentation = 'movie-scene-segmentation'
face_detection_scrfd = 'face-detection-scrfd'
card_detection_scrfd = 'card-detection-scrfd'
image_inpainting = 'image-inpainting'
# nlp trainers
bert_sentiment_analysis = 'bert-sentiment-analysis'
@@ -274,6 +299,7 @@ class Trainers(object):
# audio trainers
speech_frcrn_ans_cirm_16k = 'speech_frcrn_ans_cirm_16k'
speech_dfsmn_kws_char_farfield = 'speech_dfsmn_kws_char_farfield'
class Preprocessors(object):
@@ -302,6 +328,8 @@ class Preprocessors(object):
bert_seq_cls_tokenizer = 'bert-seq-cls-tokenizer'
text_gen_tokenizer = 'text-gen-tokenizer'
text2text_gen_preprocessor = 'text2text-gen-preprocessor'
text_gen_jieba_tokenizer = 'text-gen-jieba-tokenizer'
text2text_translate_preprocessor = 'text2text-translate-preprocessor'
token_cls_tokenizer = 'token-cls-tokenizer'
ner_tokenizer = 'ner-tokenizer'
nli_tokenizer = 'nli-tokenizer'
@@ -324,6 +352,7 @@ class Preprocessors(object):
re_tokenizer = 're-tokenizer'
document_segmentation = 'document-segmentation'
feature_extraction = 'feature-extraction'
sentence_piece = 'sentence-piece'
# audio preprocessor
linear_aec_fbank = 'linear-aec-fbank'
@@ -365,6 +394,8 @@ class Metrics(object):
video_summarization_metric = 'video-summarization-metric'
# metric for movie-scene-segmentation task
movie_scene_segmentation_metric = 'movie-scene-segmentation-metric'
# metric for inpainting task
image_inpainting_metric = 'image-inpainting-metric'
class Optimizers(object):
@@ -406,6 +437,9 @@ class Hooks(object):
IterTimerHook = 'IterTimerHook'
EvaluationHook = 'EvaluationHook'
# Compression
SparsityHook = 'SparsityHook'
class LR_Schedulers(object):
"""learning rate scheduler is defined here
@@ -421,6 +455,8 @@ class Datasets(object):
"""
ClsDataset = 'ClsDataset'
Face2dKeypointsDataset = 'Face2dKeypointsDataset'
HandCocoWholeBodyDataset = 'HandCocoWholeBodyDataset'
HumanWholeBodyKeypointDataset = 'HumanWholeBodyKeypointDataset'
SegDataset = 'SegDataset'
DetDataset = 'DetDataset'
DetImagesMixDataset = 'DetImagesMixDataset'


@@ -19,6 +19,7 @@ if TYPE_CHECKING:
from .movie_scene_segmentation_metric import MovieSceneSegmentationMetric
from .accuracy_metric import AccuracyMetric
from .bleu_metric import BleuMetric
from .image_inpainting_metric import ImageInpaintingMetric
else:
_import_structure = {
@@ -36,6 +37,7 @@ else:
'token_classification_metric': ['TokenClassificationMetric'],
'video_summarization_metric': ['VideoSummarizationMetric'],
'movie_scene_segmentation_metric': ['MovieSceneSegmentationMetric'],
'image_inpainting_metric': ['ImageInpaintingMetric'],
'accuracy_metric': ['AccuracyMetric'],
'bleu_metric': ['BleuMetric'],
}
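The TYPE_CHECKING / _import_structure pattern above defers the heavy metric imports until a name is actually requested. A minimal standalone sketch of the same idea using a module-level __getattr__ (PEP 562), with the stdlib json module standing in for a heavy metric module; this is not the helper the repo itself uses:

# Minimal sketch of deferred imports; 'json' stands in for a heavy metric module.
import importlib

_import_structure = {
    'json': ['JSONDecoder'],
}
_name_to_module = {
    name: mod for mod, names in _import_structure.items() for name in names
}

def __getattr__(name):  # PEP 562: called only when normal attribute lookup fails
    if name in _name_to_module:
        module = importlib.import_module(_name_to_module[name])
        return getattr(module, name)
    raise AttributeError(f'module {__name__!r} has no attribute {name!r}')

# In a package __init__, `from package import JSONDecoder` would now import json lazily.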


@@ -35,6 +35,8 @@ class AudioNoiseMetric(Metric):
total_loss = avg_loss + avg_amp + avg_phase + avg_sisnr
return {
'total_loss': total_loss.item(),
'avg_sisnr': avg_sisnr.item(),
# The model uses the negative of SI-SNR as a calculation shortcut;
# revert it in the evaluation result.
'avg_sisnr': -avg_sisnr.item(),
MetricKeys.AVERAGE_LOSS: avg_loss.item()
}


@@ -18,6 +18,7 @@ class MetricKeys(object):
SSIM = 'ssim'
AVERAGE_LOSS = 'avg_loss'
FScore = 'fscore'
FID = 'fid'
BLEU_1 = 'bleu-1'
BLEU_4 = 'bleu-4'
ROUGE_1 = 'rouge-1'
@@ -39,6 +40,7 @@ task_default_metrics = {
Tasks.image_captioning: [Metrics.text_gen_metric],
Tasks.visual_question_answering: [Metrics.text_gen_metric],
Tasks.movie_scene_segmentation: [Metrics.movie_scene_segmentation_metric],
Tasks.image_inpainting: [Metrics.image_inpainting_metric],
}


@@ -1,12 +1,16 @@
# ------------------------------------------------------------------------
# Copyright (c) Alibaba, Inc. and its affiliates.
# ------------------------------------------------------------------------
# modified from https://github.com/megvii-research/NAFNet/blob/main/basicsr/metrics/psnr_ssim.py
# ------------------------------------------------------------------------
from typing import Dict
import cv2
import numpy as np
from skimage.metrics import peak_signal_noise_ratio, structural_similarity
import torch
from modelscope.metainfo import Metrics
from modelscope.utils.registry import default_group
from modelscope.utils.tensor_utils import (torch_nested_detach,
torch_nested_numpify)
from .base import Metric
from .builder import METRICS, MetricKeys
@@ -20,26 +24,249 @@ class ImageDenoiseMetric(Metric):
label_name = 'target'
def __init__(self):
super(ImageDenoiseMetric, self).__init__()
self.preds = []
self.labels = []
def add(self, outputs: Dict, inputs: Dict):
ground_truths = outputs[ImageDenoiseMetric.label_name]
eval_results = outputs[ImageDenoiseMetric.pred_name]
self.preds.append(
torch_nested_numpify(torch_nested_detach(eval_results)))
self.labels.append(
torch_nested_numpify(torch_nested_detach(ground_truths)))
self.preds.append(eval_results)
self.labels.append(ground_truths)
def evaluate(self):
psnr_list, ssim_list = [], []
for (pred, label) in zip(self.preds, self.labels):
psnr_list.append(
peak_signal_noise_ratio(label[0], pred[0], data_range=255))
ssim_list.append(
structural_similarity(
label[0], pred[0], multichannel=True, data_range=255))
psnr_list.append(calculate_psnr(label[0], pred[0], crop_border=0))
ssim_list.append(calculate_ssim(label[0], pred[0], crop_border=0))
return {
MetricKeys.PSNR: np.mean(psnr_list),
MetricKeys.SSIM: np.mean(ssim_list)
}
def reorder_image(img, input_order='HWC'):
"""Reorder images to 'HWC' order.
If the input_order is (h, w), return (h, w, 1);
If the input_order is (c, h, w), return (h, w, c);
If the input_order is (h, w, c), return as it is.
Args:
img (ndarray): Input image.
input_order (str): Whether the input order is 'HWC' or 'CHW'.
If the input image shape is (h, w), input_order will not have
effects. Default: 'HWC'.
Returns:
ndarray: reordered image.
"""
if input_order not in ['HWC', 'CHW']:
raise ValueError(
f"Wrong input_order {input_order}. Supported input_orders are 'HWC' and 'CHW'"
)
if len(img.shape) == 2:
img = img[..., None]
if input_order == 'CHW':
img = img.transpose(1, 2, 0)
return img
def calculate_psnr(img1, img2, crop_border, input_order='HWC'):
"""Calculate PSNR (Peak Signal-to-Noise Ratio).
Ref: https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio
Args:
img1 (ndarray/tensor): Images with range [0, 255]/[0, 1].
img2 (ndarray/tensor): Images with range [0, 255]/[0, 1].
crop_border (int): Cropped pixels in each edge of an image. These
pixels are not involved in the PSNR calculation.
input_order (str): Whether the input order is 'HWC' or 'CHW'.
Default: 'HWC'.
Returns:
float: psnr result.
"""
assert img1.shape == img2.shape, (
f'Image shapes are different: {img1.shape}, {img2.shape}.')
if input_order not in ['HWC', 'CHW']:
raise ValueError(
f'Wrong input_order {input_order}. Supported input_orders are '
'"HWC" and "CHW"')
if type(img1) == torch.Tensor:
if len(img1.shape) == 4:
img1 = img1.squeeze(0)
img1 = img1.detach().cpu().numpy().transpose(1, 2, 0)
if type(img2) == torch.Tensor:
if len(img2.shape) == 4:
img2 = img2.squeeze(0)
img2 = img2.detach().cpu().numpy().transpose(1, 2, 0)
img1 = reorder_image(img1, input_order=input_order)
img2 = reorder_image(img2, input_order=input_order)
img1 = img1.astype(np.float64)
img2 = img2.astype(np.float64)
if crop_border != 0:
img1 = img1[crop_border:-crop_border, crop_border:-crop_border, ...]
img2 = img2[crop_border:-crop_border, crop_border:-crop_border, ...]
def _psnr(img1, img2):
mse = np.mean((img1 - img2)**2)
if mse == 0:
return float('inf')
max_value = 1. if img1.max() <= 1 else 255.
return 20. * np.log10(max_value / np.sqrt(mse))
return _psnr(img1, img2)
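A quick standalone check of the formula in _psnr above: for images in the [0, 255] range, PSNR = 20 * log10(255 / sqrt(MSE)).

# Toy PSNR check mirroring _psnr (2x2 images, range [0, 255]).
import numpy as np

img1 = np.array([[100., 110.], [120., 130.]])
img2 = np.array([[101., 111.], [121., 131.]])    # off by 1 everywhere -> MSE = 1
mse = np.mean((img1 - img2) ** 2)
psnr = 20. * np.log10(255. / np.sqrt(mse))
print(round(psnr, 2))  # 48.13, i.e. 20 * log10(255)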
def calculate_ssim(img1, img2, crop_border, input_order='HWC', ssim3d=True):
"""Calculate SSIM (structural similarity).
Ref:
Image quality assessment: From error visibility to structural similarity
The results are the same as that of the official released MATLAB code in
https://ece.uwaterloo.ca/~z70wang/research/ssim/.
For three-channel images, SSIM is calculated for each channel and then
averaged.
Args:
img1 (ndarray): Images with range [0, 255].
img2 (ndarray): Images with range [0, 255].
crop_border (int): Cropped pixels in each edge of an image. These
pixels are not involved in the SSIM calculation.
input_order (str): Whether the input order is 'HWC' or 'CHW'.
Default: 'HWC'.
ssim3d (bool): Whether to compute SSIM with a 3D Gaussian window over the image volume. Default: True.
Returns:
float: ssim result.
"""
assert img1.shape == img2.shape, (
f'Image shapes are different: {img1.shape}, {img2.shape}.')
if input_order not in ['HWC', 'CHW']:
raise ValueError(
f'Wrong input_order {input_order}. Supported input_orders are '
'"HWC" and "CHW"')
if type(img1) == torch.Tensor:
if len(img1.shape) == 4:
img1 = img1.squeeze(0)
img1 = img1.detach().cpu().numpy().transpose(1, 2, 0)
if type(img2) == torch.Tensor:
if len(img2.shape) == 4:
img2 = img2.squeeze(0)
img2 = img2.detach().cpu().numpy().transpose(1, 2, 0)
img1 = reorder_image(img1, input_order=input_order)
img2 = reorder_image(img2, input_order=input_order)
img1 = img1.astype(np.float64)
img2 = img2.astype(np.float64)
if crop_border != 0:
img1 = img1[crop_border:-crop_border, crop_border:-crop_border, ...]
img2 = img2[crop_border:-crop_border, crop_border:-crop_border, ...]
def _cal_ssim(img1, img2):
ssims = []
max_value = 1 if img1.max() <= 1 else 255
with torch.no_grad():
final_ssim = _ssim_3d(img1, img2, max_value) if ssim3d else _ssim(
img1, img2, max_value)
ssims.append(final_ssim)
return np.array(ssims).mean()
return _cal_ssim(img1, img2)
def _ssim(img, img2, max_value):
"""Calculate SSIM (structural similarity) for one channel images.
It is called by func:`calculate_ssim`.
Args:
img (ndarray): Images with range [0, 255] with order 'HWC'.
img2 (ndarray): Images with range [0, 255] with order 'HWC'.
Returns:
float: SSIM result.
"""
c1 = (0.01 * max_value)**2
c2 = (0.03 * max_value)**2
img = img.astype(np.float64)
img2 = img2.astype(np.float64)
kernel = cv2.getGaussianKernel(11, 1.5)
window = np.outer(kernel, kernel.transpose())
mu1 = cv2.filter2D(img, -1, window)[5:-5,
5:-5] # valid mode for window size 11
mu2 = cv2.filter2D(img2, -1, window)[5:-5, 5:-5]
mu1_sq = mu1**2
mu2_sq = mu2**2
mu1_mu2 = mu1 * mu2
sigma1_sq = cv2.filter2D(img**2, -1, window)[5:-5, 5:-5] - mu1_sq
sigma2_sq = cv2.filter2D(img2**2, -1, window)[5:-5, 5:-5] - mu2_sq
sigma12 = cv2.filter2D(img * img2, -1, window)[5:-5, 5:-5] - mu1_mu2
tmp1 = (2 * mu1_mu2 + c1) * (2 * sigma12 + c2)
tmp2 = (mu1_sq + mu2_sq + c1) * (sigma1_sq + sigma2_sq + c2)
ssim_map = tmp1 / tmp2
return ssim_map.mean()
def _3d_gaussian_calculator(img, conv3d):
out = conv3d(img.unsqueeze(0).unsqueeze(0)).squeeze(0).squeeze(0)
return out
def _generate_3d_gaussian_kernel():
kernel = cv2.getGaussianKernel(11, 1.5)
window = np.outer(kernel, kernel.transpose())
kernel_3 = cv2.getGaussianKernel(11, 1.5)
kernel = torch.tensor(np.stack([window * k for k in kernel_3], axis=0))
conv3d = torch.nn.Conv3d(
1,
1, (11, 11, 11),
stride=1,
padding=(5, 5, 5),
bias=False,
padding_mode='replicate')
conv3d.weight.requires_grad = False
conv3d.weight[0, 0, :, :, :] = kernel
return conv3d
def _ssim_3d(img1, img2, max_value):
assert len(img1.shape) == 3 and len(img2.shape) == 3
"""Calculate SSIM (structural similarity) for one channel images.
It is called by func:`calculate_ssim`.
Args:
img1 (ndarray): Images with range [0, 255]/[0, 1] with order 'HWC'.
img2 (ndarray): Images with range [0, 255]/[0, 1] with order 'HWC'.
Returns:
float: ssim result.
"""
C1 = (0.01 * max_value)**2
C2 = (0.03 * max_value)**2
img1 = img1.astype(np.float64)
img2 = img2.astype(np.float64)
kernel = _generate_3d_gaussian_kernel().cuda()
img1 = torch.tensor(img1).float().cuda()
img2 = torch.tensor(img2).float().cuda()
mu1 = _3d_gaussian_calculator(img1, kernel)
mu2 = _3d_gaussian_calculator(img2, kernel)
mu1_sq = mu1**2
mu2_sq = mu2**2
mu1_mu2 = mu1 * mu2
sigma1_sq = _3d_gaussian_calculator(img1**2, kernel) - mu1_sq
sigma2_sq = _3d_gaussian_calculator(img2**2, kernel) - mu2_sq
sigma12 = _3d_gaussian_calculator(img1 * img2, kernel) - mu1_mu2
tmp1 = (2 * mu1_mu2 + C1) * (2 * sigma12 + C2)
tmp2 = (mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2)
ssim_map = tmp1 / tmp2
return float(ssim_map.mean())
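As a sanity check of the windowed SSIM above, an image compared with itself should score exactly 1.0; a standalone sketch using the same 11x11 Gaussian window and the same c1/c2 constants for the [0, 255] range:

# Standalone SSIM self-comparison check (window and constants mirror _ssim above).
import cv2
import numpy as np

img = np.random.RandomState(0).uniform(0, 255, (64, 64)).astype(np.float64)
kernel = cv2.getGaussianKernel(11, 1.5)
window = np.outer(kernel, kernel.transpose())
c1, c2 = (0.01 * 255) ** 2, (0.03 * 255) ** 2

mu = cv2.filter2D(img, -1, window)[5:-5, 5:-5]
sigma_sq = cv2.filter2D(img ** 2, -1, window)[5:-5, 5:-5] - mu ** 2
# with img1 == img2, mu1 == mu2 and sigma12 == sigma1_sq == sigma2_sq
ssim_map = ((2 * mu * mu + c1) * (2 * sigma_sq + c2)) / \
           ((mu ** 2 + mu ** 2 + c1) * (sigma_sq + sigma_sq + c2))
print(float(ssim_map.mean()))  # 1.0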


@@ -0,0 +1,210 @@
"""
Part of the implementation is borrowed and modified from LaMa, publicly available at
https://github.com/saic-mdal/lama
"""
from typing import Dict
import numpy as np
import torch
import torch.nn.functional as F
from scipy import linalg
from modelscope.metainfo import Metrics
from modelscope.models.cv.image_inpainting.modules.inception import InceptionV3
from modelscope.utils.registry import default_group
from modelscope.utils.tensor_utils import (torch_nested_detach,
torch_nested_numpify)
from .base import Metric
from .builder import METRICS, MetricKeys
def fid_calculate_activation_statistics(act):
mu = np.mean(act, axis=0)
sigma = np.cov(act, rowvar=False)
return mu, sigma
def calculate_frechet_distance(activations_pred, activations_target, eps=1e-6):
mu1, sigma1 = fid_calculate_activation_statistics(activations_pred)
mu2, sigma2 = fid_calculate_activation_statistics(activations_target)
diff = mu1 - mu2
# Product might be almost singular
covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
if not np.isfinite(covmean).all():
offset = np.eye(sigma1.shape[0]) * eps
covmean = linalg.sqrtm((sigma1 + offset).dot(sigma2 + offset))
# Numerical error might give slight imaginary component
if np.iscomplexobj(covmean):
# if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-3):
if not np.allclose(np.diagonal(covmean).imag, 0, atol=1e-2):
m = np.max(np.abs(covmean.imag))
raise ValueError('Imaginary component {}'.format(m))
covmean = covmean.real
tr_covmean = np.trace(covmean)
return (diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2)
- 2 * tr_covmean)
class FIDScore(torch.nn.Module):
def __init__(self, dims=2048, eps=1e-6):
super().__init__()
if getattr(FIDScore, '_MODEL', None) is None:
block_idx = InceptionV3.BLOCK_INDEX_BY_DIM[dims]
FIDScore._MODEL = InceptionV3([block_idx]).eval()
self.model = FIDScore._MODEL
self.eps = eps
self.reset()
def forward(self, pred_batch, target_batch, mask=None):
activations_pred = self._get_activations(pred_batch)
activations_target = self._get_activations(target_batch)
self.activations_pred.append(activations_pred.detach().cpu())
self.activations_target.append(activations_target.detach().cpu())
def get_value(self):
activations_pred, activations_target = (self.activations_pred,
self.activations_target)
activations_pred = torch.cat(activations_pred).cpu().numpy()
activations_target = torch.cat(activations_target).cpu().numpy()
total_distance = calculate_frechet_distance(
activations_pred, activations_target, eps=self.eps)
self.reset()
return total_distance
def reset(self):
self.activations_pred = []
self.activations_target = []
def _get_activations(self, batch):
activations = self.model(batch)[0]
if activations.shape[2] != 1 or activations.shape[3] != 1:
assert False, \
'We should not have got here, because Inception always scales inputs to 299x299'
activations = activations.squeeze(-1).squeeze(-1)
return activations
class SSIM(torch.nn.Module):
"""SSIM. Modified from:
https://github.com/Po-Hsun-Su/pytorch-ssim/blob/master/pytorch_ssim/__init__.py
"""
def __init__(self, window_size=11, size_average=True):
super().__init__()
self.window_size = window_size
self.size_average = size_average
self.channel = 1
self.register_buffer('window',
self._create_window(window_size, self.channel))
def forward(self, img1, img2):
assert len(img1.shape) == 4
channel = img1.size()[1]
if channel == self.channel and self.window.data.type(
) == img1.data.type():
window = self.window
else:
window = self._create_window(self.window_size, channel)
window = window.type_as(img1)
self.window = window
self.channel = channel
return self._ssim(img1, img2, window, self.window_size, channel,
self.size_average)
def _gaussian(self, window_size, sigma):
gauss = torch.Tensor([
np.exp(-(x - (window_size // 2))**2 / float(2 * sigma**2))
for x in range(window_size)
])
return gauss / gauss.sum()
def _create_window(self, window_size, channel):
_1D_window = self._gaussian(window_size, 1.5).unsqueeze(1)
_2D_window = _1D_window.mm(
_1D_window.t()).float().unsqueeze(0).unsqueeze(0)
return _2D_window.expand(channel, 1, window_size,
window_size).contiguous()
def _ssim(self,
img1,
img2,
window,
window_size,
channel,
size_average=True):
mu1 = F.conv2d(
img1, window, padding=(window_size // 2), groups=channel)
mu2 = F.conv2d(
img2, window, padding=(window_size // 2), groups=channel)
mu1_sq = mu1.pow(2)
mu2_sq = mu2.pow(2)
mu1_mu2 = mu1 * mu2
sigma1_sq = F.conv2d(
img1 * img1, window, padding=(window_size // 2),
groups=channel) - mu1_sq
sigma2_sq = F.conv2d(
img2 * img2, window, padding=(window_size // 2),
groups=channel) - mu2_sq
sigma12 = F.conv2d(
img1 * img2, window, padding=(window_size // 2),
groups=channel) - mu1_mu2
C1 = 0.01**2
C2 = 0.03**2
ssim_map = ((2 * mu1_mu2 + C1) * (2 * sigma12 + C2)) / \
((mu1_sq + mu2_sq + C1) * (sigma1_sq + sigma2_sq + C2))
if size_average:
return ssim_map.mean()
return ssim_map.mean(1).mean(1).mean(1)
def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
missing_keys, unexpected_keys, error_msgs):
return
@METRICS.register_module(
group_key=default_group, module_name=Metrics.image_inpainting_metric)
class ImageInpaintingMetric(Metric):
"""The metric computation class for image inpainting classes.
"""
def __init__(self):
self.preds = []
self.targets = []
self.SSIM = SSIM(window_size=11, size_average=False).eval()
device = 'cuda' if torch.cuda.is_available() else 'cpu'
self.FID = FIDScore().to(device)
def add(self, outputs: Dict, inputs: Dict):
pred = outputs['inpainted']
target = inputs['image']
self.preds.append(torch_nested_detach(pred))
self.targets.append(torch_nested_detach(target))
def evaluate(self):
ssim_list = []
for (pred, target) in zip(self.preds, self.targets):
ssim_list.append(self.SSIM(pred, target))
self.FID(pred, target)
ssim_list = torch_nested_numpify(ssim_list)
fid = self.FID.get_value()
return {MetricKeys.SSIM: np.mean(ssim_list), MetricKeys.FID: fid}
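calculate_frechet_distance above reduces each set of activations to a mean and covariance and returns ||mu1 - mu2||^2 + Tr(sigma1 + sigma2 - 2 * sqrt(sigma1 @ sigma2)). A small standalone check on synthetic activations, with no Inception model involved:

# Frechet distance on synthetic activations; numbers are illustrative only.
import numpy as np
from scipy import linalg

rng = np.random.RandomState(0)
act_pred = rng.randn(500, 8)             # fake activations: 500 samples x 8 dims
act_target = rng.randn(500, 8) + 1.0     # same covariance, mean shifted by 1 per dim

mu1, sigma1 = act_pred.mean(axis=0), np.cov(act_pred, rowvar=False)
mu2, sigma2 = act_target.mean(axis=0), np.cov(act_target, rowvar=False)

diff = mu1 - mu2
covmean, _ = linalg.sqrtm(sigma1.dot(sigma2), disp=False)
fid = diff.dot(diff) + np.trace(sigma1) + np.trace(sigma2) - 2 * np.trace(covmean.real)
print(round(float(fid), 2))  # roughly 8: the squared mean shift of 1.0 in each of 8 dims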


@@ -1,3 +1,6 @@
# Part of the implementation is borrowed and modified from PGL-SUM,
# publicly available at https://github.com/e-apostolidis/PGL-SUM
from typing import Dict
import numpy as np


@@ -1,3 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from typing import Any, Dict


@@ -1,15 +1,14 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from typing import Dict
import torch
from typing import Dict, Optional
from modelscope.metainfo import Models
from modelscope.models import TorchModel
from modelscope.models.base import Tensor
from modelscope.models.builder import MODELS
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.audio.audio_utils import update_conf
from modelscope.utils.constant import Tasks
from .fsmn_sele_v2 import FSMNSeleNetV2
@@ -20,48 +19,38 @@ class FSMNSeleNetV2Decorator(TorchModel):
MODEL_TXT = 'model.txt'
SC_CONFIG = 'sound_connect.conf'
SC_CONF_ITEM_KWS_MODEL = '${kws_model}'
def __init__(self, model_dir: str, *args, **kwargs):
def __init__(self,
model_dir: str,
training: Optional[bool] = False,
*args,
**kwargs):
"""initialize the dfsmn model from the `model_dir` path.
Args:
model_dir (str): the model path.
"""
super().__init__(model_dir, *args, **kwargs)
sc_config_file = os.path.join(model_dir, self.SC_CONFIG)
model_txt_file = os.path.join(model_dir, self.MODEL_TXT)
model_bin_file = os.path.join(model_dir,
ModelFile.TORCH_MODEL_BIN_FILE)
self._model = None
if os.path.exists(model_bin_file):
kwargs.pop('device')
self._model = FSMNSeleNetV2(*args, **kwargs)
checkpoint = torch.load(model_bin_file)
self._model.load_state_dict(checkpoint, strict=False)
self._sc = None
if os.path.exists(model_txt_file):
with open(sc_config_file) as f:
lines = f.readlines()
with open(sc_config_file, 'w') as f:
for line in lines:
if self.SC_CONF_ITEM_KWS_MODEL in line:
line = line.replace(self.SC_CONF_ITEM_KWS_MODEL,
model_txt_file)
f.write(line)
import py_sound_connect
self._sc = py_sound_connect.SoundConnect(sc_config_file)
self.size_in = self._sc.bytesPerBlockIn()
self.size_out = self._sc.bytesPerBlockOut()
if self._model is None and self._sc is None:
raise Exception(
f'Invalid model directory! Neither {model_txt_file} nor {model_bin_file} exists.'
)
if training:
self.model = FSMNSeleNetV2(*args, **kwargs)
else:
sc_config_file = os.path.join(model_dir, self.SC_CONFIG)
model_txt_file = os.path.join(model_dir, self.MODEL_TXT)
self._sc = None
if os.path.exists(model_txt_file):
conf_dict = dict(mode=56542, kws_model=model_txt_file)
update_conf(sc_config_file, sc_config_file, conf_dict)
import py_sound_connect
self._sc = py_sound_connect.SoundConnect(sc_config_file)
self.size_in = self._sc.bytesPerBlockIn()
self.size_out = self._sc.bytesPerBlockOut()
else:
raise Exception(
f'Invalid model directory! Failed to load model file: {model_txt_file}.'
)
def forward(self, input: Dict[str, Tensor]) -> Dict[str, Tensor]:
...
return self.model.forward(input)
def forward_decode(self, data: bytes):
result = {'pcm': self._sc.process(data, self.size_out)}
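The placeholder substitution that the removed code did inline (rewriting ${kws_model} in sound_connect.conf) is now delegated to update_conf from modelscope.utils.audio.audio_utils. A standalone sketch of that kind of ${key} templating, assuming a plain text-replacement convention; this is not necessarily how update_conf itself is implemented:

# Sketch of ${key}-style config templating; file paths are placeholders.
def render_conf(src_path, dst_path, conf_dict):
    with open(src_path) as f:
        text = f.read()
    for key, value in conf_dict.items():
        text = text.replace('${%s}' % key, str(value))
    with open(dst_path, 'w') as f:
        f.write(text)

# e.g. render_conf('sound_connect.conf', 'sound_connect.conf',
#                  dict(mode=56542, kws_model='/path/to/model.txt'))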


@@ -1,3 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from typing import Any, Dict


@@ -4,14 +4,16 @@
from . import (action_recognition, animal_recognition, body_2d_keypoints,
body_3d_keypoints, cartoon, cmdssl_video_embedding,
crowd_counting, face_2d_keypoints, face_detection,
face_generation, image_classification, image_color_enhance,
image_colorization, image_denoise, image_instance_segmentation,
face_generation, human_wholebody_keypoint, image_classification,
image_color_enhance, image_colorization, image_denoise,
image_inpainting, image_instance_segmentation,
image_panoptic_segmentation, image_portrait_enhancement,
image_reid_person, image_semantic_segmentation,
image_to_image_generation, image_to_image_translation,
movie_scene_segmentation, object_detection,
product_retrieval_embedding, realtime_object_detection,
salient_detection, shop_segmentation, super_resolution,
referring_video_object_segmentation, salient_detection,
shop_segmentation, super_resolution,
video_single_object_tracking, video_summarization, virual_tryon)
# yapf: enable


@@ -1,3 +1,5 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from typing import Any, Dict, Optional, Union


@@ -1,10 +1,10 @@
# ------------------------------------------------------------------------------
# Copyright (c) Microsoft
# Licensed under the MIT License.
# Written by Bin Xiao (Bin.Xiao@microsoft.com)
# Modified by Ke Sun (sunk@mail.ustc.edu.cn)
# https://github.com/HRNet/HRNet-Image-Classification/blob/master/lib/models/cls_hrnet.py
# ------------------------------------------------------------------------------
"""
Copyright (c) Microsoft
Licensed under the MIT License.
Written by Bin Xiao (Bin.Xiao@microsoft.com)
Modified by Ke Sun (sunk@mail.ustc.edu.cn)
https://github.com/HRNet/HRNet-Image-Classification/blob/master/lib/models/cls_hrnet.py
"""
import functools
import logging


@@ -8,12 +8,14 @@ if TYPE_CHECKING:
from .mtcnn import MtcnnFaceDetector
from .retinaface import RetinaFaceDetection
from .ulfd_slim import UlfdFaceDetector
from .scrfd import ScrfdDetect
else:
_import_structure = {
'ulfd_slim': ['UlfdFaceDetector'],
'retinaface': ['RetinaFaceDetection'],
'mtcnn': ['MtcnnFaceDetector'],
'mogface': ['MogFaceDetector']
'mogface': ['MogFaceDetector'],
'scrfd': ['ScrfdDetect']
}
import sys


@@ -1,189 +0,0 @@
"""
The implementation here is modified based on insightface, originally MIT license and publicly avaialbe at
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/transforms.py
"""
import numpy as np
from mmdet.datasets.builder import PIPELINES
from numpy import random
@PIPELINES.register_module()
class RandomSquareCrop(object):
"""Random crop the image & bboxes, the cropped patches have minimum IoU
requirement with original image & bboxes, the IoU threshold is randomly
selected from min_ious.
Args:
min_ious (tuple): minimum IoU threshold for all intersections with
bounding boxes
min_crop_size (float): minimum crop's size (i.e. h,w := a*h, a*w,
where a >= min_crop_size).
Note:
The keys for bboxes, labels and masks should be paired. That is, \
`gt_bboxes` corresponds to `gt_labels` and `gt_masks`, and \
`gt_bboxes_ignore` to `gt_labels_ignore` and `gt_masks_ignore`.
"""
def __init__(self,
crop_ratio_range=None,
crop_choice=None,
bbox_clip_border=True):
self.crop_ratio_range = crop_ratio_range
self.crop_choice = crop_choice
self.bbox_clip_border = bbox_clip_border
assert (self.crop_ratio_range is None) ^ (self.crop_choice is None)
if self.crop_ratio_range is not None:
self.crop_ratio_min, self.crop_ratio_max = self.crop_ratio_range
self.bbox2label = {
'gt_bboxes': 'gt_labels',
'gt_bboxes_ignore': 'gt_labels_ignore'
}
self.bbox2mask = {
'gt_bboxes': 'gt_masks',
'gt_bboxes_ignore': 'gt_masks_ignore'
}
def __call__(self, results):
"""Call function to crop images and bounding boxes with minimum IoU
constraint.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Result dict with images and bounding boxes cropped, \
'img_shape' key is updated.
"""
if 'img_fields' in results:
assert results['img_fields'] == ['img'], \
'Only single img_fields is allowed'
img = results['img']
assert 'bbox_fields' in results
assert 'gt_bboxes' in results
boxes = results['gt_bboxes']
h, w, c = img.shape
scale_retry = 0
if self.crop_ratio_range is not None:
max_scale = self.crop_ratio_max
else:
max_scale = np.amax(self.crop_choice)
while True:
scale_retry += 1
if scale_retry == 1 or max_scale > 1.0:
if self.crop_ratio_range is not None:
scale = np.random.uniform(self.crop_ratio_min,
self.crop_ratio_max)
elif self.crop_choice is not None:
scale = np.random.choice(self.crop_choice)
else:
scale = scale * 1.2
for i in range(250):
short_side = min(w, h)
cw = int(scale * short_side)
ch = cw
# TODO +1
if w == cw:
left = 0
elif w > cw:
left = random.randint(0, w - cw)
else:
left = random.randint(w - cw, 0)
if h == ch:
top = 0
elif h > ch:
top = random.randint(0, h - ch)
else:
top = random.randint(h - ch, 0)
patch = np.array(
(int(left), int(top), int(left + cw), int(top + ch)),
dtype=np.int)
# center of boxes should inside the crop img
# only adjust boxes and instance masks when the gt is not empty
# adjust boxes
def is_center_of_bboxes_in_patch(boxes, patch):
# TODO >=
center = (boxes[:, :2] + boxes[:, 2:]) / 2
mask = \
((center[:, 0] > patch[0])
* (center[:, 1] > patch[1])
* (center[:, 0] < patch[2])
* (center[:, 1] < patch[3]))
return mask
mask = is_center_of_bboxes_in_patch(boxes, patch)
if not mask.any():
continue
for key in results.get('bbox_fields', []):
boxes = results[key].copy()
mask = is_center_of_bboxes_in_patch(boxes, patch)
boxes = boxes[mask]
if self.bbox_clip_border:
boxes[:, 2:] = boxes[:, 2:].clip(max=patch[2:])
boxes[:, :2] = boxes[:, :2].clip(min=patch[:2])
boxes -= np.tile(patch[:2], 2)
results[key] = boxes
# labels
label_key = self.bbox2label.get(key)
if label_key in results:
results[label_key] = results[label_key][mask]
# keypoints field
if key == 'gt_bboxes':
for kps_key in results.get('keypoints_fields', []):
keypointss = results[kps_key].copy()
keypointss = keypointss[mask, :, :]
if self.bbox_clip_border:
keypointss[:, :, :
2] = keypointss[:, :, :2].clip(
max=patch[2:])
keypointss[:, :, :
2] = keypointss[:, :, :2].clip(
min=patch[:2])
keypointss[:, :, 0] -= patch[0]
keypointss[:, :, 1] -= patch[1]
results[kps_key] = keypointss
# mask fields
mask_key = self.bbox2mask.get(key)
if mask_key in results:
results[mask_key] = results[mask_key][mask.nonzero()
[0]].crop(patch)
# adjust the img no matter whether the gt is empty before crop
rimg = np.ones((ch, cw, 3), dtype=img.dtype) * 128
patch_from = patch.copy()
patch_from[0] = max(0, patch_from[0])
patch_from[1] = max(0, patch_from[1])
patch_from[2] = min(img.shape[1], patch_from[2])
patch_from[3] = min(img.shape[0], patch_from[3])
patch_to = patch.copy()
patch_to[0] = max(0, patch_to[0] * -1)
patch_to[1] = max(0, patch_to[1] * -1)
patch_to[2] = patch_to[0] + (patch_from[2] - patch_from[0])
patch_to[3] = patch_to[1] + (patch_from[3] - patch_from[1])
rimg[patch_to[1]:patch_to[3],
patch_to[0]:patch_to[2], :] = img[
patch_from[1]:patch_from[3],
patch_from[0]:patch_from[2], :]
img = rimg
results['img'] = img
results['img_shape'] = img.shape
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(min_ious={self.min_iou}, '
repr_str += f'crop_size={self.crop_size})'
return repr_str


@@ -1,3 +1,5 @@
# The implementation is based on MogFace, available at
# https://github.com/damo-cv/MogFace
import os
import cv2


@@ -0,0 +1,2 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .scrfd_detect import ScrfdDetect


@@ -6,7 +6,7 @@ import numpy as np
import torch
def bbox2result(bboxes, labels, num_classes, kps=None):
def bbox2result(bboxes, labels, num_classes, kps=None, num_kps=5):
"""Convert detection results to a list of numpy arrays.
Args:
@@ -17,7 +17,7 @@ def bbox2result(bboxes, labels, num_classes, kps=None):
Returns:
list(ndarray): bbox results of each class
"""
bbox_len = 5 if kps is None else 5 + 10 # if has kps, add 10 kps into bbox
bbox_len = 5 if kps is None else 5 + num_kps * 2 # if kps are present, add num_kps*2 values per bbox
if bboxes.shape[0] == 0:
return [
np.zeros((0, bbox_len), dtype=np.float32)
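The change above generalizes the fixed 5-keypoint layout (5 + 10 columns per row) to num_kps keypoints, so each result row carries x1, y1, x2, y2, score plus num_kps * 2 keypoint coordinates. A tiny standalone illustration; the 4-point case is hypothetical:

# Per-row layout check: 5 bbox values + num_kps * 2 keypoint values.
import numpy as np

num_kps = 4                                             # hypothetical 4-corner detector
bboxes = np.array([[10., 20., 50., 60., 0.9]])          # x1, y1, x2, y2, score
kps = np.arange(num_kps * 2, dtype=np.float32).reshape(1, -1)

row = np.concatenate([bboxes, kps], axis=1)
print(row.shape[1], 5 + num_kps * 2)  # 13 13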


@@ -17,6 +17,7 @@ def multiclass_nms(multi_bboxes,
Args:
multi_bboxes (Tensor): shape (n, #class*4) or (n, 4)
multi_kps (Tensor): shape (n, #class*num_kps*2) or (n, num_kps*2)
multi_scores (Tensor): shape (n, #class), where the last column
contains scores of the background class, but this will be ignored.
score_thr (float): bbox threshold, bboxes with scores lower than it
@@ -36,16 +37,18 @@ def multiclass_nms(multi_bboxes,
num_classes = multi_scores.size(1) - 1
# exclude background category
kps = None
if multi_kps is not None:
num_kps = int((multi_kps.shape[1] / num_classes) / 2)
if multi_bboxes.shape[1] > 4:
bboxes = multi_bboxes.view(multi_scores.size(0), -1, 4)
if multi_kps is not None:
kps = multi_kps.view(multi_scores.size(0), -1, 10)
kps = multi_kps.view(multi_scores.size(0), -1, num_kps * 2)
else:
bboxes = multi_bboxes[:, None].expand(
multi_scores.size(0), num_classes, 4)
if multi_kps is not None:
kps = multi_kps[:, None].expand(
multi_scores.size(0), num_classes, 10)
multi_scores.size(0), num_classes, num_kps * 2)
scores = multi_scores[:, :-1]
if score_factors is not None:
@@ -56,7 +59,7 @@ def multiclass_nms(multi_bboxes,
bboxes = bboxes.reshape(-1, 4)
if kps is not None:
kps = kps.reshape(-1, 10)
kps = kps.reshape(-1, num_kps * 2)
scores = scores.reshape(-1)
labels = labels.reshape(-1)


@@ -2,6 +2,12 @@
The implementation here is modified based on insightface, originally MIT licensed and publicly available at
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines
"""
from .auto_augment import RotateV2
from .formating import DefaultFormatBundleV2
from .loading import LoadAnnotationsV2
from .transforms import RandomSquareCrop
__all__ = ['RandomSquareCrop']
__all__ = [
'RandomSquareCrop', 'LoadAnnotationsV2', 'RotateV2',
'DefaultFormatBundleV2'
]


@@ -0,0 +1,271 @@
"""
The implementation here is modified based on insightface, originally MIT licensed and publicly available at
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/auto_augment.py
"""
import copy
import cv2
import mmcv
import numpy as np
from mmdet.datasets.builder import PIPELINES
_MAX_LEVEL = 10
def level_to_value(level, max_value):
"""Map from level to values based on max_value."""
return (level / _MAX_LEVEL) * max_value
def random_negative(value, random_negative_prob):
"""Randomly negate value based on random_negative_prob."""
return -value if np.random.rand() < random_negative_prob else value
def bbox2fields():
"""The key correspondence from bboxes to labels, masks and
segmentations."""
bbox2label = {
'gt_bboxes': 'gt_labels',
'gt_bboxes_ignore': 'gt_labels_ignore'
}
bbox2mask = {
'gt_bboxes': 'gt_masks',
'gt_bboxes_ignore': 'gt_masks_ignore'
}
bbox2seg = {
'gt_bboxes': 'gt_semantic_seg',
}
return bbox2label, bbox2mask, bbox2seg
@PIPELINES.register_module()
class RotateV2(object):
"""Apply Rotate Transformation to image (and its corresponding bbox, mask,
segmentation).
Args:
level (int | float): The level should be in range (0,_MAX_LEVEL].
scale (int | float): Isotropic scale factor. Same in
``mmcv.imrotate``.
center (int | float | tuple[float]): Center point (w, h) of the
rotation in the source image. If None, the center of the
image will be used. Same in ``mmcv.imrotate``.
img_fill_val (int | float | tuple): The fill value for image border.
If float, the same value will be used for all the three
channels of the image. If tuple, it should have 3 elements (i.e.
equal to the number of channels of the image).
seg_ignore_label (int): The fill value used for segmentation map.
Note this value must equal ``ignore_label`` in ``semantic_head``
of the corresponding config. Default 255.
prob (float): The probability of performing the transformation; it
should be in the range 0 to 1.
max_rotate_angle (int | float): The maximum angle for the rotate
transformation.
random_negative_prob (float): The probability that turns the
offset negative.
"""
def __init__(self,
level,
scale=1,
center=None,
img_fill_val=128,
seg_ignore_label=255,
prob=0.5,
max_rotate_angle=30,
random_negative_prob=0.5):
assert isinstance(level, (int, float)), \
f'The level must be type int or float. got {type(level)}.'
assert 0 <= level <= _MAX_LEVEL, \
f'The level should be in range (0,{_MAX_LEVEL}]. got {level}.'
assert isinstance(scale, (int, float)), \
f'The scale must be type int or float. got type {type(scale)}.'
if isinstance(center, (int, float)):
center = (center, center)
elif isinstance(center, tuple):
assert len(center) == 2, 'center with type tuple must have '\
f'2 elements. got {len(center)} elements.'
else:
assert center is None, 'center must be None or type int, '\
f'float or tuple, got type {type(center)}.'
if isinstance(img_fill_val, (float, int)):
img_fill_val = tuple([float(img_fill_val)] * 3)
elif isinstance(img_fill_val, tuple):
assert len(img_fill_val) == 3, 'img_fill_val as tuple must '\
f'have 3 elements. got {len(img_fill_val)}.'
img_fill_val = tuple([float(val) for val in img_fill_val])
else:
raise ValueError(
'img_fill_val must be float or tuple with 3 elements.')
assert np.all([0 <= val <= 255 for val in img_fill_val]), \
'all elements of img_fill_val should between range [0,255]. '\
f'got {img_fill_val}.'
assert 0 <= prob <= 1.0, 'The probability should be in range [0,1]. '\
f'got {prob}.'
assert isinstance(max_rotate_angle, (int, float)), 'max_rotate_angle '\
f'should be type int or float. got type {type(max_rotate_angle)}.'
self.level = level
self.scale = scale
# Rotation angle in degrees. Positive values mean
# clockwise rotation.
self.angle = level_to_value(level, max_rotate_angle)
self.center = center
self.img_fill_val = img_fill_val
self.seg_ignore_label = seg_ignore_label
self.prob = prob
self.max_rotate_angle = max_rotate_angle
self.random_negative_prob = random_negative_prob
def _rotate_img(self, results, angle, center=None, scale=1.0):
"""Rotate the image.
Args:
results (dict): Result dict from loading pipeline.
angle (float): Rotation angle in degrees, positive values
mean clockwise rotation. Same in ``mmcv.imrotate``.
center (tuple[float], optional): Center point (w, h) of the
rotation. Same in ``mmcv.imrotate``.
scale (int | float): Isotropic scale factor. Same in
``mmcv.imrotate``.
"""
for key in results.get('img_fields', ['img']):
img = results[key].copy()
img_rotated = mmcv.imrotate(
img, angle, center, scale, border_value=self.img_fill_val)
results[key] = img_rotated.astype(img.dtype)
results['img_shape'] = results[key].shape
def _rotate_bboxes(self, results, rotate_matrix):
"""Rotate the bboxes."""
h, w, c = results['img_shape']
for key in results.get('bbox_fields', []):
min_x, min_y, max_x, max_y = np.split(
results[key], results[key].shape[-1], axis=-1)
coordinates = np.stack([[min_x, min_y], [max_x, min_y],
[min_x, max_y],
[max_x, max_y]]) # [4, 2, nb_bbox, 1]
# pad 1 to convert from format [x, y] to homogeneous
# coordinates format [x, y, 1]
coordinates = np.concatenate(
(coordinates,
np.ones((4, 1, coordinates.shape[2], 1), coordinates.dtype)),
axis=1) # [4, 3, nb_bbox, 1]
coordinates = coordinates.transpose(
(2, 0, 1, 3)) # [nb_bbox, 4, 3, 1]
rotated_coords = np.matmul(rotate_matrix,
coordinates) # [nb_bbox, 4, 2, 1]
rotated_coords = rotated_coords[..., 0] # [nb_bbox, 4, 2]
min_x, min_y = np.min(
rotated_coords[:, :, 0], axis=1), np.min(
rotated_coords[:, :, 1], axis=1)
max_x, max_y = np.max(
rotated_coords[:, :, 0], axis=1), np.max(
rotated_coords[:, :, 1], axis=1)
results[key] = np.stack([min_x, min_y, max_x, max_y],
axis=-1).astype(results[key].dtype)
def _rotate_keypoints90(self, results, angle):
"""Rotate the keypoints, only valid when angle in [-90,90,-180,180]"""
if angle not in [-90, 90, 180, -180
] or self.scale != 1 or self.center is not None:
return
for key in results.get('keypoints_fields', []):
k = results[key]
if angle == 90:
w, h, c = results['img'].shape
new = np.stack([h - k[..., 1], k[..., 0], k[..., 2]], axis=-1)
elif angle == -90:
w, h, c = results['img'].shape
new = np.stack([k[..., 1], w - k[..., 0], k[..., 2]], axis=-1)
else:
h, w, c = results['img'].shape
new = np.stack([w - k[..., 0], h - k[..., 1], k[..., 2]],
axis=-1)
# a keypoint is invalid if its third value is -1
kps_invalid = new[..., -1][:, -1] == -1
new[kps_invalid] = np.zeros(new.shape[1:]) - 1
results[key] = new
def _rotate_masks(self,
results,
angle,
center=None,
scale=1.0,
fill_val=0):
"""Rotate the masks."""
h, w, c = results['img_shape']
for key in results.get('mask_fields', []):
masks = results[key]
results[key] = masks.rotate((h, w), angle, center, scale, fill_val)
def _rotate_seg(self,
results,
angle,
center=None,
scale=1.0,
fill_val=255):
"""Rotate the segmentation map."""
for key in results.get('seg_fields', []):
seg = results[key].copy()
results[key] = mmcv.imrotate(
seg, angle, center, scale,
border_value=fill_val).astype(seg.dtype)
def _filter_invalid(self, results, min_bbox_size=0):
"""Filter bboxes and corresponding masks too small after rotate
augmentation."""
bbox2label, bbox2mask, _ = bbox2fields()
for key in results.get('bbox_fields', []):
bbox_w = results[key][:, 2] - results[key][:, 0]
bbox_h = results[key][:, 3] - results[key][:, 1]
valid_inds = (bbox_w > min_bbox_size) & (bbox_h > min_bbox_size)
valid_inds = np.nonzero(valid_inds)[0]
results[key] = results[key][valid_inds]
# label fields. e.g. gt_labels and gt_labels_ignore
label_key = bbox2label.get(key)
if label_key in results:
results[label_key] = results[label_key][valid_inds]
# mask fields, e.g. gt_masks and gt_masks_ignore
mask_key = bbox2mask.get(key)
if mask_key in results:
results[mask_key] = results[mask_key][valid_inds]
def __call__(self, results):
"""Call function to rotate images, bounding boxes, masks and semantic
segmentation maps.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Rotated results.
"""
if np.random.rand() > self.prob:
return results
h, w = results['img'].shape[:2]
center = self.center
if center is None:
center = ((w - 1) * 0.5, (h - 1) * 0.5)
angle = random_negative(self.angle, self.random_negative_prob)
self._rotate_img(results, angle, center, self.scale)
rotate_matrix = cv2.getRotationMatrix2D(center, -angle, self.scale)
self._rotate_bboxes(results, rotate_matrix)
self._rotate_keypoints90(results, angle)
self._rotate_masks(results, angle, center, self.scale, fill_val=0)
self._rotate_seg(
results, angle, center, self.scale, fill_val=self.seg_ignore_label)
self._filter_invalid(results)
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(level={self.level}, '
repr_str += f'scale={self.scale}, '
repr_str += f'center={self.center}, '
repr_str += f'img_fill_val={self.img_fill_val}, '
repr_str += f'seg_ignore_label={self.seg_ignore_label}, '
repr_str += f'prob={self.prob}, '
repr_str += f'max_rotate_angle={self.max_rotate_angle}, '
repr_str += f'random_negative_prob={self.random_negative_prob})'
return repr_str
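# Illustrative sketch: a pipeline entry for the rotate transform above. The registry
# name 'RotateV2' and _MAX_LEVEL == 10 are assumptions (neither appears in this hunk);
# the values are only examples.
rotate_cfg_example = dict(
    type='RotateV2',            # assumed registry name of the class above
    level=5,                    # angle = level / _MAX_LEVEL * max_rotate_angle = 15 deg if _MAX_LEVEL == 10
    img_fill_val=128,
    seg_ignore_label=255,
    prob=0.5,                   # apply the transform half of the time
    max_rotate_angle=30,
    random_negative_prob=0.5)   # negate the sampled angle half of the time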

View File

@@ -0,0 +1,113 @@
"""
The implementation here is modified based on insightface, originally under the MIT license and publicly available at
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/formating.py
"""
from collections.abc import Sequence
import mmcv
import numpy as np
import torch
from mmcv.parallel import DataContainer as DC
from mmdet.datasets.builder import PIPELINES
def to_tensor(data):
"""Convert objects of various python types to :obj:`torch.Tensor`.
Supported types are: :class:`numpy.ndarray`, :class:`torch.Tensor`,
:class:`Sequence`, :class:`int` and :class:`float`.
Args:
data (torch.Tensor | numpy.ndarray | Sequence | int | float): Data to
be converted.
"""
if isinstance(data, torch.Tensor):
return data
elif isinstance(data, np.ndarray):
return torch.from_numpy(data)
elif isinstance(data, Sequence) and not mmcv.is_str(data):
return torch.tensor(data)
elif isinstance(data, int):
return torch.LongTensor([data])
elif isinstance(data, float):
return torch.FloatTensor([data])
else:
raise TypeError(f'type {type(data)} cannot be converted to tensor.')
@PIPELINES.register_module()
class DefaultFormatBundleV2(object):
"""Default formatting bundle.
It simplifies the pipeline of formatting common fields, including "img",
"proposals", "gt_bboxes", "gt_labels", "gt_masks" and "gt_semantic_seg".
These fields are formatted as follows.
- img: (1)transpose, (2)to tensor, (3)to DataContainer (stack=True)
- proposals: (1)to tensor, (2)to DataContainer
- gt_bboxes: (1)to tensor, (2)to DataContainer
- gt_bboxes_ignore: (1)to tensor, (2)to DataContainer
- gt_labels: (1)to tensor, (2)to DataContainer
- gt_masks: (1)to tensor, (2)to DataContainer (cpu_only=True)
- gt_semantic_seg: (1)unsqueeze dim-0 (2)to tensor, \
(3)to DataContainer (stack=True)
"""
def __call__(self, results):
"""Call function to transform and format common fields in results.
Args:
results (dict): Result dict contains the data to convert.
Returns:
dict: The result dict contains the data that is formatted with \
default bundle.
"""
if 'img' in results:
img = results['img']
# add default meta keys
results = self._add_default_meta_keys(results)
if len(img.shape) < 3:
img = np.expand_dims(img, -1)
img = np.ascontiguousarray(img.transpose(2, 0, 1))
results['img'] = DC(to_tensor(img), stack=True)
for key in [
'proposals', 'gt_bboxes', 'gt_bboxes_ignore', 'gt_keypointss',
'gt_labels'
]:
if key not in results:
continue
results[key] = DC(to_tensor(results[key]))
if 'gt_masks' in results:
results['gt_masks'] = DC(results['gt_masks'], cpu_only=True)
if 'gt_semantic_seg' in results:
results['gt_semantic_seg'] = DC(
to_tensor(results['gt_semantic_seg'][None, ...]), stack=True)
return results
def _add_default_meta_keys(self, results):
"""Add default meta keys.
We set default meta keys including `pad_shape`, `scale_factor` and
`img_norm_cfg` to avoid the case where no `Resize`, `Normalize` and
`Pad` are implemented during the whole pipeline.
Args:
results (dict): Result dict contains the data to convert.
Returns:
results (dict): Updated result dict contains the data to convert.
"""
img = results['img']
results.setdefault('pad_shape', img.shape)
results.setdefault('scale_factor', 1.0)
num_channels = 1 if len(img.shape) < 3 else img.shape[2]
results.setdefault(
'img_norm_cfg',
dict(
mean=np.zeros(num_channels, dtype=np.float32),
std=np.ones(num_channels, dtype=np.float32),
to_rgb=False))
return results
def __repr__(self):
return self.__class__.__name__
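# Illustrative sketch (assumes mmcv/mmdet are installed): push a dummy HWC image
# through the bundle above and inspect the default meta keys and the CHW container.
if __name__ == '__main__':
    bundle = DefaultFormatBundleV2()
    demo = bundle({'img': np.zeros((32, 32, 3), dtype=np.uint8)})
    print(demo['img'].data.shape)                    # torch.Size([3, 32, 32])
    print(demo['pad_shape'], demo['scale_factor'])   # (32, 32, 3) 1.0
    print(demo['img_norm_cfg'])                      # zero mean / unit std defaults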

View File

@@ -0,0 +1,225 @@
"""
The implementation here is modified based on insightface, originally under the MIT license and publicly available at
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/loading.py
"""
import os.path as osp
import numpy as np
import pycocotools.mask as maskUtils
from mmdet.core import BitmapMasks, PolygonMasks
from mmdet.datasets.builder import PIPELINES
@PIPELINES.register_module()
class LoadAnnotationsV2(object):
"""Load mutiple types of annotations.
Args:
with_bbox (bool): Whether to parse and load the bbox annotation.
Default: True.
with_label (bool): Whether to parse and load the label annotation.
Default: True.
with_keypoints (bool): Whether to parse and load the keypoints annotation.
Default: False.
with_mask (bool): Whether to parse and load the mask annotation.
Default: False.
with_seg (bool): Whether to parse and load the semantic segmentation
annotation. Default: False.
poly2mask (bool): Whether to convert the instance masks from polygons
to bitmaps. Default: True.
file_client_args (dict): Arguments to instantiate a FileClient.
See :class:`mmcv.fileio.FileClient` for details.
Defaults to ``dict(backend='disk')``.
"""
def __init__(self,
with_bbox=True,
with_label=True,
with_keypoints=False,
with_mask=False,
with_seg=False,
poly2mask=True,
file_client_args=dict(backend='disk')):
self.with_bbox = with_bbox
self.with_label = with_label
self.with_keypoints = with_keypoints
self.with_mask = with_mask
self.with_seg = with_seg
self.poly2mask = poly2mask
self.file_client_args = file_client_args.copy()
self.file_client = None
def _load_bboxes(self, results):
"""Private function to load bounding box annotations.
Args:
results (dict): Result dict from :obj:`mmdet.CustomDataset`.
Returns:
dict: The dict contains loaded bounding box annotations.
"""
ann_info = results['ann_info']
results['gt_bboxes'] = ann_info['bboxes'].copy()
gt_bboxes_ignore = ann_info.get('bboxes_ignore', None)
if gt_bboxes_ignore is not None:
results['gt_bboxes_ignore'] = gt_bboxes_ignore.copy()
results['bbox_fields'].append('gt_bboxes_ignore')
results['bbox_fields'].append('gt_bboxes')
return results
def _load_keypoints(self, results):
"""Private function to load bounding box annotations.
Args:
results (dict): Result dict from :obj:`mmdet.CustomDataset`.
Returns:
dict: The dict contains loaded keypoint annotations.
"""
ann_info = results['ann_info']
results['gt_keypointss'] = ann_info['keypointss'].copy()
results['keypoints_fields'] = ['gt_keypointss']
return results
def _load_labels(self, results):
"""Private function to load label annotations.
Args:
results (dict): Result dict from :obj:`mmdet.CustomDataset`.
Returns:
dict: The dict contains loaded label annotations.
"""
results['gt_labels'] = results['ann_info']['labels'].copy()
return results
def _poly2mask(self, mask_ann, img_h, img_w):
"""Private function to convert masks represented with polygon to
bitmaps.
Args:
mask_ann (list | dict): Polygon mask annotation input.
img_h (int): The height of output mask.
img_w (int): The width of output mask.
Returns:
numpy.ndarray: The decoded bitmap mask of shape (img_h, img_w).
"""
if isinstance(mask_ann, list):
# polygon -- a single object might consist of multiple parts
# we merge all parts into one mask rle code
rles = maskUtils.frPyObjects(mask_ann, img_h, img_w)
rle = maskUtils.merge(rles)
elif isinstance(mask_ann['counts'], list):
# uncompressed RLE
rle = maskUtils.frPyObjects(mask_ann, img_h, img_w)
else:
# rle
rle = mask_ann
mask = maskUtils.decode(rle)
return mask
def process_polygons(self, polygons):
"""Convert polygons to list of ndarray and filter invalid polygons.
Args:
polygons (list[list]): Polygons of one instance.
Returns:
list[numpy.ndarray]: Processed polygons.
"""
polygons = [np.array(p) for p in polygons]
valid_polygons = []
for polygon in polygons:
if len(polygon) % 2 == 0 and len(polygon) >= 6:
valid_polygons.append(polygon)
return valid_polygons
def _load_masks(self, results):
"""Private function to load mask annotations.
Args:
results (dict): Result dict from :obj:`mmdet.CustomDataset`.
Returns:
dict: The dict contains loaded mask annotations.
If ``self.poly2mask`` is set ``True``, `gt_mask` will contain
:obj:`PolygonMasks`. Otherwise, :obj:`BitmapMasks` is used.
"""
h, w = results['img_info']['height'], results['img_info']['width']
gt_masks = results['ann_info']['masks']
if self.poly2mask:
gt_masks = BitmapMasks(
[self._poly2mask(mask, h, w) for mask in gt_masks], h, w)
else:
gt_masks = PolygonMasks(
[self.process_polygons(polygons) for polygons in gt_masks], h,
w)
results['gt_masks'] = gt_masks
results['mask_fields'].append('gt_masks')
return results
def _load_semantic_seg(self, results):
"""Private function to load semantic segmentation annotations.
Args:
results (dict): Result dict from :obj:`dataset`.
Returns:
dict: The dict contains loaded semantic segmentation annotations.
"""
import mmcv
if self.file_client is None:
self.file_client = mmcv.FileClient(**self.file_client_args)
filename = osp.join(results['seg_prefix'],
results['ann_info']['seg_map'])
img_bytes = self.file_client.get(filename)
results['gt_semantic_seg'] = mmcv.imfrombytes(
img_bytes, flag='unchanged').squeeze()
results['seg_fields'].append('gt_semantic_seg')
return results
def __call__(self, results):
"""Call function to load multiple types annotations.
Args:
results (dict): Result dict from :obj:`mmdet.CustomDataset`.
Returns:
dict: The dict contains loaded bounding box, label, mask and
semantic segmentation annotations.
"""
if self.with_bbox:
results = self._load_bboxes(results)
if results is None:
return None
if self.with_label:
results = self._load_labels(results)
if self.with_keypoints:
results = self._load_keypoints(results)
if self.with_mask:
results = self._load_masks(results)
if self.with_seg:
results = self._load_semantic_seg(results)
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(with_bbox={self.with_bbox}, '
repr_str += f'with_label={self.with_label}, '
repr_str += f'with_keypoints={self.with_keypoints}, '
repr_str += f'with_mask={self.with_mask}, '
repr_str += f'with_seg={self.with_seg}, '
repr_str += f'poly2mask={self.poly2mask}, '
repr_str += f'file_client_args={self.file_client_args})'
return repr_str
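# Illustrative sketch of the `results` schema the loader above expects
# (one instance with a bbox, a label and 5 keypoints; values are dummies).
if __name__ == '__main__':
    loader = LoadAnnotationsV2(with_bbox=True, with_label=True, with_keypoints=True)
    demo = loader({
        'ann_info': {
            'bboxes': np.array([[10., 10., 50., 60.]], dtype=np.float32),
            'labels': np.array([0], dtype=np.int64),
            'keypointss': np.zeros((1, 5, 3), dtype=np.float32),
        },
        'bbox_fields': [],
    })
    print(demo['bbox_fields'])       # ['gt_bboxes']
    print(demo['keypoints_fields'])  # ['gt_keypointss']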

View File

@@ -0,0 +1,737 @@
"""
The implementation here is modified based on insightface, originally under the MIT license and publicly available at
https://github.com/deepinsight/insightface/tree/master/detection/scrfd/mmdet/datasets/pipelines/transforms.py
"""
import mmcv
import numpy as np
from mmdet.core.evaluation.bbox_overlaps import bbox_overlaps
from mmdet.datasets.builder import PIPELINES
from numpy import random
@PIPELINES.register_module()
class ResizeV2(object):
"""Resize images & bbox & mask &kps.
This transform resizes the input image to some scale. Bboxes and masks are
then resized with the same scale factor. If the input dict contains the key
"scale", then the scale in the input dict is used, otherwise the specified
scale in the init method is used. If the input dict contains the key
"scale_factor" (if MultiScaleFlipAug does not give img_scale but
scale_factor), the actual scale will be computed by image shape and
scale_factor.
`img_scale` can either be a tuple (single-scale) or a list of tuple
(multi-scale). There are 3 multiscale modes:
- ``ratio_range is not None``: randomly sample a ratio from the ratio \
range and multiply it with the image scale.
- ``ratio_range is None`` and ``multiscale_mode == "range"``: randomly \
sample a scale from the multiscale range.
- ``ratio_range is None`` and ``multiscale_mode == "value"``: randomly \
sample a scale from multiple scales.
Args:
img_scale (tuple or list[tuple]): Images scales for resizing.
multiscale_mode (str): Either "range" or "value".
ratio_range (tuple[float]): (min_ratio, max_ratio)
keep_ratio (bool): Whether to keep the aspect ratio when resizing the
image.
bbox_clip_border (bool, optional): Whether clip the objects outside
the border of the image. Defaults to True.
backend (str): Image resize backend, choices are 'cv2' and 'pillow'.
These two backends generate slightly different results. Defaults
to 'cv2'.
override (bool, optional): Whether to override `scale` and
`scale_factor` so as to call resize twice. Default False. If True,
after the first resizing, the existing `scale` and `scale_factor`
will be ignored so the second resizing can be allowed.
This option is a work-around for multiple times of resize in DETR.
Defaults to False.
"""
def __init__(self,
img_scale=None,
multiscale_mode='range',
ratio_range=None,
keep_ratio=True,
bbox_clip_border=True,
backend='cv2',
override=False):
if img_scale is None:
self.img_scale = None
else:
if isinstance(img_scale, list):
self.img_scale = img_scale
else:
self.img_scale = [img_scale]
assert mmcv.is_list_of(self.img_scale, tuple)
if ratio_range is not None:
# mode 1: given a scale and a range of image ratio
assert len(self.img_scale) == 1
else:
# mode 2: given multiple scales or a range of scales
assert multiscale_mode in ['value', 'range']
self.backend = backend
self.multiscale_mode = multiscale_mode
self.ratio_range = ratio_range
self.keep_ratio = keep_ratio
# TODO: refactor the override option in Resize
self.override = override
self.bbox_clip_border = bbox_clip_border
@staticmethod
def random_select(img_scales):
"""Randomly select an img_scale from given candidates.
Args:
img_scales (list[tuple]): Images scales for selection.
Returns:
(tuple, int): Returns a tuple ``(img_scale, scale_idx)``, \
where ``img_scale`` is the selected image scale and \
``scale_idx`` is the selected index in the given candidates.
"""
assert mmcv.is_list_of(img_scales, tuple)
scale_idx = np.random.randint(len(img_scales))
img_scale = img_scales[scale_idx]
return img_scale, scale_idx
@staticmethod
def random_sample(img_scales):
"""Randomly sample an img_scale when ``multiscale_mode=='range'``.
Args:
img_scales (list[tuple]): Images scale range for sampling.
There must be two tuples in img_scales, which specify the lower
and upper bound of image scales.
Returns:
(tuple, None): Returns a tuple ``(img_scale, None)``, where \
``img_scale`` is sampled scale and None is just a placeholder \
to be consistent with :func:`random_select`.
"""
assert mmcv.is_list_of(img_scales, tuple) and len(img_scales) == 2
img_scale_long = [max(s) for s in img_scales]
img_scale_short = [min(s) for s in img_scales]
long_edge = np.random.randint(
min(img_scale_long),
max(img_scale_long) + 1)
short_edge = np.random.randint(
min(img_scale_short),
max(img_scale_short) + 1)
img_scale = (long_edge, short_edge)
return img_scale, None
@staticmethod
def random_sample_ratio(img_scale, ratio_range):
"""Randomly sample an img_scale when ``ratio_range`` is specified.
A ratio will be randomly sampled from the range specified by
``ratio_range``. Then it would be multiplied with ``img_scale`` to
generate sampled scale.
Args:
img_scale (tuple): Images scale base to multiply with ratio.
ratio_range (tuple[float]): The minimum and maximum ratio to scale
the ``img_scale``.
Returns:
(tuple, None): Returns a tuple ``(scale, None)``, where \
``scale`` is sampled ratio multiplied with ``img_scale`` and \
None is just a placeholder to be consistent with \
:func:`random_select`.
"""
assert isinstance(img_scale, tuple) and len(img_scale) == 2
min_ratio, max_ratio = ratio_range
assert min_ratio <= max_ratio
ratio = np.random.random_sample() * (max_ratio - min_ratio) + min_ratio
scale = int(img_scale[0] * ratio), int(img_scale[1] * ratio)
return scale, None
def _random_scale(self, results):
"""Randomly sample an img_scale according to ``ratio_range`` and
``multiscale_mode``.
If ``ratio_range`` is specified, a ratio will be sampled and be
multiplied with ``img_scale``.
If multiple scales are specified by ``img_scale``, a scale will be
sampled according to ``multiscale_mode``.
Otherwise, single scale will be used.
Args:
results (dict): Result dict from :obj:`dataset`.
Returns:
dict: Two new keys ``scale`` and ``scale_idx`` are added into \
``results``, which would be used by subsequent pipelines.
"""
if self.ratio_range is not None:
scale, scale_idx = self.random_sample_ratio(
self.img_scale[0], self.ratio_range)
elif len(self.img_scale) == 1:
scale, scale_idx = self.img_scale[0], 0
elif self.multiscale_mode == 'range':
scale, scale_idx = self.random_sample(self.img_scale)
elif self.multiscale_mode == 'value':
scale, scale_idx = self.random_select(self.img_scale)
else:
raise NotImplementedError
results['scale'] = scale
results['scale_idx'] = scale_idx
def _resize_img(self, results):
"""Resize images with ``results['scale']``."""
for key in results.get('img_fields', ['img']):
if self.keep_ratio:
img, scale_factor = mmcv.imrescale(
results[key],
results['scale'],
return_scale=True,
backend=self.backend)
# the w_scale and h_scale have a minor difference
# a real fix should be done in the mmcv.imrescale in the future
new_h, new_w = img.shape[:2]
h, w = results[key].shape[:2]
w_scale = new_w / w
h_scale = new_h / h
else:
img, w_scale, h_scale = mmcv.imresize(
results[key],
results['scale'],
return_scale=True,
backend=self.backend)
results[key] = img
scale_factor = np.array([w_scale, h_scale, w_scale, h_scale],
dtype=np.float32)
results['img_shape'] = img.shape
# in case that there is no padding
results['pad_shape'] = img.shape
results['scale_factor'] = scale_factor
results['keep_ratio'] = self.keep_ratio
def _resize_bboxes(self, results):
"""Resize bounding boxes with ``results['scale_factor']``."""
for key in results.get('bbox_fields', []):
bboxes = results[key] * results['scale_factor']
if self.bbox_clip_border:
img_shape = results['img_shape']
bboxes[:, 0::2] = np.clip(bboxes[:, 0::2], 0, img_shape[1])
bboxes[:, 1::2] = np.clip(bboxes[:, 1::2], 0, img_shape[0])
results[key] = bboxes
def _resize_keypoints(self, results):
"""Resize keypoints with ``results['scale_factor']``."""
for key in results.get('keypoints_fields', []):
keypointss = results[key].copy()
factors = results['scale_factor']
assert factors[0] == factors[2]
assert factors[1] == factors[3]
keypointss[:, :, 0] *= factors[0]
keypointss[:, :, 1] *= factors[1]
if self.bbox_clip_border:
img_shape = results['img_shape']
keypointss[:, :, 0] = np.clip(keypointss[:, :, 0], 0,
img_shape[1])
keypointss[:, :, 1] = np.clip(keypointss[:, :, 1], 0,
img_shape[0])
results[key] = keypointss
def _resize_masks(self, results):
"""Resize masks with ``results['scale']``"""
for key in results.get('mask_fields', []):
if results[key] is None:
continue
if self.keep_ratio:
results[key] = results[key].rescale(results['scale'])
else:
results[key] = results[key].resize(results['img_shape'][:2])
def _resize_seg(self, results):
"""Resize semantic segmentation map with ``results['scale']``."""
for key in results.get('seg_fields', []):
if self.keep_ratio:
gt_seg = mmcv.imrescale(
results[key],
results['scale'],
interpolation='nearest',
backend=self.backend)
else:
gt_seg = mmcv.imresize(
results[key],
results['scale'],
interpolation='nearest',
backend=self.backend)
results['gt_semantic_seg'] = gt_seg
def __call__(self, results):
"""Call function to resize images, bounding boxes, masks, semantic
segmentation map.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Resized results, 'img_shape', 'pad_shape', 'scale_factor', \
'keep_ratio' keys are added into result dict.
"""
if 'scale' not in results:
if 'scale_factor' in results:
img_shape = results['img'].shape[:2]
scale_factor = results['scale_factor']
assert isinstance(scale_factor, float)
results['scale'] = tuple(
[int(x * scale_factor) for x in img_shape][::-1])
else:
self._random_scale(results)
else:
if not self.override:
assert 'scale_factor' not in results, (
'scale and scale_factor cannot be both set.')
else:
results.pop('scale')
if 'scale_factor' in results:
results.pop('scale_factor')
self._random_scale(results)
self._resize_img(results)
self._resize_bboxes(results)
self._resize_keypoints(results)
self._resize_masks(results)
self._resize_seg(results)
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(img_scale={self.img_scale}, '
repr_str += f'multiscale_mode={self.multiscale_mode}, '
repr_str += f'ratio_range={self.ratio_range}, '
repr_str += f'keep_ratio={self.keep_ratio}, '
repr_str += f'bbox_clip_border={self.bbox_clip_border})'
return repr_str
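# Illustrative configs for the three multiscale modes described in the class
# docstring above (scale values are only examples):
resize_ratio_example = dict(type='ResizeV2', img_scale=(640, 640),
                            ratio_range=(0.5, 1.5), keep_ratio=True)   # mode 1
resize_range_example = dict(type='ResizeV2', img_scale=[(1280, 720), (1280, 960)],
                            multiscale_mode='range', keep_ratio=True)  # mode 2
resize_value_example = dict(type='ResizeV2', img_scale=[(480, 480), (640, 640)],
                            multiscale_mode='value', keep_ratio=True)  # mode 3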
@PIPELINES.register_module()
class RandomFlipV2(object):
"""Flip the image & bbox & mask & kps.
If the input dict contains the key "flip", then the flag will be used,
otherwise it will be randomly decided by a ratio specified in the init
method.
When random flip is enabled, ``flip_ratio``/``direction`` can either be a
float/string or tuple of float/string. There are 3 flip modes:
- ``flip_ratio`` is float, ``direction`` is string: the image will be
``direction``ly flipped with probability of ``flip_ratio`` .
E.g., ``flip_ratio=0.5``, ``direction='horizontal'``,
then image will be horizontally flipped with probability of 0.5.
- ``flip_ratio`` is float, ``direction`` is list of string: the image will
be ``direction[i]``ly flipped with probability of
``flip_ratio/len(direction)``.
E.g., ``flip_ratio=0.5``, ``direction=['horizontal', 'vertical']``,
then image will be horizontally flipped with probability of 0.25,
vertically with probability of 0.25.
- ``flip_ratio`` is list of float, ``direction`` is list of string:
given ``len(flip_ratio) == len(direction)``, the image will
be ``direction[i]``ly flipped with probability of ``flip_ratio[i]``.
E.g., ``flip_ratio=[0.3, 0.5]``, ``direction=['horizontal',
'vertical']``, then image will be horizontally flipped with probability
of 0.3, vertically with probability of 0.5
Args:
flip_ratio (float | list[float], optional): The flipping probability.
Default: None.
direction(str | list[str], optional): The flipping direction. Options
are 'horizontal', 'vertical', 'diagonal'. Default: 'horizontal'.
If input is a list, the length must equal ``flip_ratio``. Each
element in ``flip_ratio`` indicates the flip probability of
corresponding direction.
"""
def __init__(self, flip_ratio=None, direction='horizontal'):
if isinstance(flip_ratio, list):
assert mmcv.is_list_of(flip_ratio, float)
assert 0 <= sum(flip_ratio) <= 1
elif isinstance(flip_ratio, float):
assert 0 <= flip_ratio <= 1
elif flip_ratio is None:
pass
else:
raise ValueError('flip_ratio must be None, float, '
'or list of float')
self.flip_ratio = flip_ratio
valid_directions = ['horizontal', 'vertical', 'diagonal']
if isinstance(direction, str):
assert direction in valid_directions
elif isinstance(direction, list):
assert mmcv.is_list_of(direction, str)
assert set(direction).issubset(set(valid_directions))
else:
raise ValueError('direction must be either str or list of str')
self.direction = direction
if isinstance(flip_ratio, list):
assert len(self.flip_ratio) == len(self.direction)
self.count = 0
def bbox_flip(self, bboxes, img_shape, direction):
"""Flip bboxes horizontally.
Args:
bboxes (numpy.ndarray): Bounding boxes, shape (..., 4*k)
img_shape (tuple[int]): Image shape (height, width)
direction (str): Flip direction. Options are 'horizontal',
'vertical'.
Returns:
numpy.ndarray: Flipped bounding boxes.
"""
assert bboxes.shape[-1] % 4 == 0
flipped = bboxes.copy()
if direction == 'horizontal':
w = img_shape[1]
flipped[..., 0::4] = w - bboxes[..., 2::4]
flipped[..., 2::4] = w - bboxes[..., 0::4]
elif direction == 'vertical':
h = img_shape[0]
flipped[..., 1::4] = h - bboxes[..., 3::4]
flipped[..., 3::4] = h - bboxes[..., 1::4]
elif direction == 'diagonal':
w = img_shape[1]
h = img_shape[0]
flipped[..., 0::4] = w - bboxes[..., 2::4]
flipped[..., 1::4] = h - bboxes[..., 3::4]
flipped[..., 2::4] = w - bboxes[..., 0::4]
flipped[..., 3::4] = h - bboxes[..., 1::4]
else:
raise ValueError(f"Invalid flipping direction '{direction}'")
return flipped
def keypoints_flip(self, keypointss, img_shape, direction):
"""Flip keypoints horizontally."""
assert direction == 'horizontal'
assert keypointss.shape[-1] == 3
num_kps = keypointss.shape[1]
assert num_kps in [4, 5], f'Only Support num_kps=4 or 5, got:{num_kps}'
assert keypointss.ndim == 3
flipped = keypointss.copy()
if num_kps == 5:
flip_order = [1, 0, 2, 4, 3]
elif num_kps == 4:
flip_order = [3, 2, 1, 0]
for idx, a in enumerate(flip_order):
flipped[:, idx, :] = keypointss[:, a, :]
w = img_shape[1]
flipped[..., 0] = w - flipped[..., 0]
return flipped
def __call__(self, results):
"""Call function to flip bounding boxes, masks, semantic segmentation
maps.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Flipped results, 'flip', 'flip_direction' keys are added \
into result dict.
"""
if 'flip' not in results:
if isinstance(self.direction, list):
# None means non-flip
direction_list = self.direction + [None]
else:
# None means non-flip
direction_list = [self.direction, None]
if isinstance(self.flip_ratio, list):
non_flip_ratio = 1 - sum(self.flip_ratio)
flip_ratio_list = self.flip_ratio + [non_flip_ratio]
else:
non_flip_ratio = 1 - self.flip_ratio
# exclude non-flip
single_ratio = self.flip_ratio / (len(direction_list) - 1)
flip_ratio_list = [single_ratio] * (len(direction_list)
- 1) + [non_flip_ratio]
cur_dir = np.random.choice(direction_list, p=flip_ratio_list)
results['flip'] = cur_dir is not None
if 'flip_direction' not in results:
results['flip_direction'] = cur_dir
if results['flip']:
# flip image
for key in results.get('img_fields', ['img']):
results[key] = mmcv.imflip(
results[key], direction=results['flip_direction'])
# flip bboxes
for key in results.get('bbox_fields', []):
results[key] = self.bbox_flip(results[key],
results['img_shape'],
results['flip_direction'])
# flip kps
for key in results.get('keypoints_fields', []):
results[key] = self.keypoints_flip(results[key],
results['img_shape'],
results['flip_direction'])
# flip masks
for key in results.get('mask_fields', []):
results[key] = results[key].flip(results['flip_direction'])
# flip segs
for key in results.get('seg_fields', []):
results[key] = mmcv.imflip(
results[key], direction=results['flip_direction'])
return results
def __repr__(self):
return self.__class__.__name__ + f'(flip_ratio={self.flip_ratio})'
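# Illustrative configs for the three flip modes described in the class docstring
# above (probabilities are only examples; note keypoint flipping only supports
# the horizontal direction):
flip_single_example = dict(type='RandomFlipV2', flip_ratio=0.5, direction='horizontal')
flip_shared_example = dict(type='RandomFlipV2', flip_ratio=0.5,
                           direction=['horizontal', 'vertical'])    # 0.25 each
flip_paired_example = dict(type='RandomFlipV2', flip_ratio=[0.3, 0.5],
                           direction=['horizontal', 'vertical'])    # 0.3 and 0.5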
@PIPELINES.register_module()
class RandomSquareCrop(object):
"""Random crop the image & bboxes, the cropped patches have minimum IoU
requirement with original image & bboxes, the IoU threshold is randomly
selected from min_ious.
Args:
min_ious (tuple): minimum IoU threshold for all intersections with
bounding boxes
min_crop_size (float): minimum crop's size (i.e. h,w := a*h, a*w,
where a >= min_crop_size).
Note:
The keys for bboxes, labels and masks should be paired. That is, \
`gt_bboxes` corresponds to `gt_labels` and `gt_masks`, and \
`gt_bboxes_ignore` to `gt_labels_ignore` and `gt_masks_ignore`.
"""
def __init__(self,
crop_ratio_range=None,
crop_choice=None,
bbox_clip_border=True,
big_face_ratio=0,
big_face_crop_choice=None):
self.crop_ratio_range = crop_ratio_range
self.crop_choice = crop_choice
self.big_face_crop_choice = big_face_crop_choice
self.bbox_clip_border = bbox_clip_border
assert (self.crop_ratio_range is None) ^ (self.crop_choice is None)
if self.crop_ratio_range is not None:
self.crop_ratio_min, self.crop_ratio_max = self.crop_ratio_range
self.bbox2label = {
'gt_bboxes': 'gt_labels',
'gt_bboxes_ignore': 'gt_labels_ignore'
}
self.bbox2mask = {
'gt_bboxes': 'gt_masks',
'gt_bboxes_ignore': 'gt_masks_ignore'
}
assert big_face_ratio >= 0 and big_face_ratio <= 1.0
self.big_face_ratio = big_face_ratio
def __call__(self, results):
"""Call function to crop images and bounding boxes with minimum IoU
constraint.
Args:
results (dict): Result dict from loading pipeline.
Returns:
dict: Result dict with images and bounding boxes cropped, \
'img_shape' key is updated.
"""
if 'img_fields' in results:
assert results['img_fields'] == ['img'], \
'Only single img_fields is allowed'
img = results['img']
assert 'bbox_fields' in results
assert 'gt_bboxes' in results
# try augment big face images
find_bigface = False
if np.random.random() < self.big_face_ratio:
min_size = 100 # h and w
expand_ratio = 0.3 # expand ratio of the cropped face along both w and h
bbox = results['gt_bboxes'].copy()
lmks = results['gt_keypointss'].copy()
label = results['gt_labels'].copy()
# filter small faces
size_mask = ((bbox[:, 2] - bbox[:, 0]) > min_size) * (
(bbox[:, 3] - bbox[:, 1]) > min_size)
bbox = bbox[size_mask]
lmks = lmks[size_mask]
label = label[size_mask]
# randomly choose a face that has no overlap with others
if len(bbox) > 0:
overlaps = bbox_overlaps(bbox, bbox)
overlaps -= np.eye(overlaps.shape[0])
iou_mask = np.sum(overlaps, axis=1) == 0
bbox = bbox[iou_mask]
lmks = lmks[iou_mask]
label = label[iou_mask]
if len(bbox) > 0:
choice = np.random.randint(len(bbox))
bbox = bbox[choice]
lmks = lmks[choice]
label = [label[choice]]
w = bbox[2] - bbox[0]
h = bbox[3] - bbox[1]
x1 = bbox[0] - w * expand_ratio
x2 = bbox[2] + w * expand_ratio
y1 = bbox[1] - h * expand_ratio
y2 = bbox[3] + h * expand_ratio
x1, x2 = np.clip([x1, x2], 0, img.shape[1])
y1, y2 = np.clip([y1, y2], 0, img.shape[0])
bbox -= np.tile([x1, y1], 2)
lmks -= (x1, y1, 0)
find_bigface = True
img = img[int(y1):int(y2), int(x1):int(x2), :]
results['gt_bboxes'] = np.expand_dims(bbox, axis=0)
results['gt_keypointss'] = np.expand_dims(lmks, axis=0)
results['gt_labels'] = np.array(label)
results['img'] = img
boxes = results['gt_bboxes']
h, w, c = img.shape
if self.crop_ratio_range is not None:
max_scale = self.crop_ratio_max
else:
max_scale = np.amax(self.crop_choice)
scale_retry = 0
while True:
scale_retry += 1
if scale_retry == 1 or max_scale > 1.0:
if self.crop_ratio_range is not None:
scale = np.random.uniform(self.crop_ratio_min,
self.crop_ratio_max)
elif self.crop_choice is not None:
scale = np.random.choice(self.crop_choice)
else:
scale = scale * 1.2
if find_bigface:
# select a scale from big_face_crop_choice if in big_face mode
scale = np.random.choice(self.big_face_crop_choice)
for i in range(250):
long_side = max(w, h)
cw = int(scale * long_side)
ch = cw
# TODO +1
if w == cw:
left = 0
elif w > cw:
left = random.randint(0, w - cw)
else:
left = random.randint(w - cw, 0)
if h == ch:
top = 0
elif h > ch:
top = random.randint(0, h - ch)
else:
top = random.randint(h - ch, 0)
patch = np.array(
(int(left), int(top), int(left + cw), int(top + ch)),
dtype=np.int32)
# center of boxes should inside the crop img
# only adjust boxes and instance masks when the gt is not empty
# adjust boxes
def is_center_of_bboxes_in_patch(boxes, patch):
# TODO >=
center = (boxes[:, :2] + boxes[:, 2:]) / 2
mask = \
((center[:, 0] > patch[0])
* (center[:, 1] > patch[1])
* (center[:, 0] < patch[2])
* (center[:, 1] < patch[3]))
return mask
mask = is_center_of_bboxes_in_patch(boxes, patch)
if not mask.any():
continue
for key in results.get('bbox_fields', []):
boxes = results[key].copy()
mask = is_center_of_bboxes_in_patch(boxes, patch)
boxes = boxes[mask]
if self.bbox_clip_border:
boxes[:, 2:] = boxes[:, 2:].clip(max=patch[2:])
boxes[:, :2] = boxes[:, :2].clip(min=patch[:2])
boxes -= np.tile(patch[:2], 2)
results[key] = boxes
# labels
label_key = self.bbox2label.get(key)
if label_key in results:
results[label_key] = results[label_key][mask]
# keypoints field
if key == 'gt_bboxes':
for kps_key in results.get('keypoints_fields', []):
keypointss = results[kps_key].copy()
keypointss = keypointss[mask, :, :]
if self.bbox_clip_border:
keypointss[:, :, :
2] = keypointss[:, :, :2].clip(
max=patch[2:])
keypointss[:, :, :
2] = keypointss[:, :, :2].clip(
min=patch[:2])
keypointss[:, :, 0] -= patch[0]
keypointss[:, :, 1] -= patch[1]
results[kps_key] = keypointss
# mask fields
mask_key = self.bbox2mask.get(key)
if mask_key in results:
results[mask_key] = results[mask_key][mask.nonzero()
[0]].crop(patch)
# adjust the img no matter whether the gt is empty before crop
rimg = np.ones((ch, cw, 3), dtype=img.dtype) * 128
patch_from = patch.copy()
patch_from[0] = max(0, patch_from[0])
patch_from[1] = max(0, patch_from[1])
patch_from[2] = min(img.shape[1], patch_from[2])
patch_from[3] = min(img.shape[0], patch_from[3])
patch_to = patch.copy()
patch_to[0] = max(0, patch_to[0] * -1)
patch_to[1] = max(0, patch_to[1] * -1)
patch_to[2] = patch_to[0] + (patch_from[2] - patch_from[0])
patch_to[3] = patch_to[1] + (patch_from[3] - patch_from[1])
rimg[patch_to[1]:patch_to[3],
patch_to[0]:patch_to[2], :] = img[
patch_from[1]:patch_from[3],
patch_from[0]:patch_from[2], :]
img = rimg
results['img'] = img
results['img_shape'] = img.shape
return results
def __repr__(self):
repr_str = self.__class__.__name__
repr_str += f'(crop_ratio_range={self.crop_ratio_range}, '
repr_str += f'crop_choice={self.crop_choice}, '
repr_str += f'big_face_ratio={self.big_face_ratio})'
return repr_str
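# Illustrative SCRFD-style crop entry (scale values are only examples): each crop
# side is max(h, w) * scale drawn from crop_choice; 10% of the time the image is
# first zoomed onto a single large face and big_face_crop_choice is used instead.
square_crop_example = dict(
    type='RandomSquareCrop',
    crop_choice=[0.3, 0.45, 0.6, 0.8, 1.0],
    bbox_clip_border=True,
    big_face_ratio=0.1,
    big_face_crop_choice=[0.6, 0.8, 1.0])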

View File

@@ -13,7 +13,7 @@ class RetinaFaceDataset(CustomDataset):
CLASSES = ('FG', )
def __init__(self, min_size=None, **kwargs):
self.NK = 5
self.NK = kwargs.pop('num_kps', 5)
self.cat2label = {cat: i for i, cat in enumerate(self.CLASSES)}
self.min_size = min_size
self.gt_path = kwargs.get('gt_path')
@@ -33,7 +33,8 @@ class RetinaFaceDataset(CustomDataset):
if len(values) > 4:
if len(values) > 5:
kps = np.array(
values[4:19], dtype=np.float32).reshape((self.NK, 3))
values[4:4 + self.NK * 3], dtype=np.float32).reshape(
(self.NK, 3))
for li in range(kps.shape[0]):
if (kps[li, :] == -1).all():
kps[li][2] = 0.0 # weight = 0, ignore

View File

@@ -103,6 +103,7 @@ class SCRFDHead(AnchorHead):
scale_mode=1,
dw_conv=False,
use_kps=False,
num_kps=5,
loss_kps=dict(
type='SmoothL1Loss', beta=1.0 / 9.0, loss_weight=0.1),
**kwargs):
@@ -116,7 +117,7 @@ class SCRFDHead(AnchorHead):
self.scale_mode = scale_mode
self.use_dfl = True
self.dw_conv = dw_conv
self.NK = 5
self.NK = num_kps
self.extra_flops = 0.0
if loss_dfl is None or not loss_dfl:
self.use_dfl = False
@@ -323,8 +324,8 @@ class SCRFDHead(AnchorHead):
batch_size, -1, self.cls_out_channels).sigmoid()
bbox_pred = bbox_pred.permute(0, 2, 3,
1).reshape(batch_size, -1, 4)
kps_pred = kps_pred.permute(0, 2, 3, 1).reshape(batch_size, -1, 10)
kps_pred = kps_pred.permute(0, 2, 3,
1).reshape(batch_size, -1, self.NK * 2)
return cls_score, bbox_pred, kps_pred
def forward_train(self,
@@ -788,7 +789,7 @@ class SCRFDHead(AnchorHead):
if self.use_dfl:
kps_pred = self.integral(kps_pred) * stride[0]
else:
kps_pred = kps_pred.reshape((-1, 10)) * stride[0]
kps_pred = kps_pred.reshape((-1, self.NK * 2)) * stride[0]
nms_pre = cfg.get('nms_pre', -1)
if nms_pre > 0 and scores.shape[0] > nms_pre:
@@ -815,7 +816,7 @@ class SCRFDHead(AnchorHead):
mlvl_bboxes /= mlvl_bboxes.new_tensor(scale_factor)
if mlvl_kps is not None:
scale_factor2 = torch.tensor(
[scale_factor[0], scale_factor[1]] * 5)
[scale_factor[0], scale_factor[1]] * self.NK)
mlvl_kps /= scale_factor2.to(mlvl_kps.device)
mlvl_scores = torch.cat(mlvl_scores)

View File

@@ -54,7 +54,13 @@ class SCRFD(SingleStageDetector):
gt_bboxes_ignore)
return losses
def simple_test(self, img, img_metas, rescale=False):
def simple_test(self,
img,
img_metas,
rescale=False,
repeat_head=1,
output_kps_var=0,
output_results=1):
"""Test function without test time augmentation.
Args:
@@ -62,6 +68,9 @@ class SCRFD(SingleStageDetector):
img_metas (list[dict]): List of image information.
rescale (bool, optional): Whether to rescale the results.
Defaults to False.
repeat_head (int): number of times to repeat the head inference.
output_kps_var (int): whether to output the keypoint variance as a quality score.
output_results (int): 0: return nothing; 1: return bboxes; 2: return both bboxes and keypoints.
Returns:
list[list[np.ndarray]]: BBox results of each image and classes.
@@ -69,40 +78,71 @@ class SCRFD(SingleStageDetector):
corresponds to each class.
"""
x = self.extract_feat(img)
outs = self.bbox_head(x)
if torch.onnx.is_in_onnx_export():
print('single_stage.py in-onnx-export')
print(outs.__class__)
cls_score, bbox_pred, kps_pred = outs
for c in cls_score:
print(c.shape)
for c in bbox_pred:
print(c.shape)
if self.bbox_head.use_kps:
for c in kps_pred:
print(c.shape)
return (cls_score, bbox_pred, kps_pred)
else:
return (cls_score, bbox_pred)
bbox_list = self.bbox_head.get_bboxes(
*outs, img_metas, rescale=rescale)
assert repeat_head >= 1
kps_out0 = []
kps_out1 = []
kps_out2 = []
for i in range(repeat_head):
outs = self.bbox_head(x)
kps_out0 += [outs[2][0].detach().cpu().numpy()]
kps_out1 += [outs[2][1].detach().cpu().numpy()]
kps_out2 += [outs[2][2].detach().cpu().numpy()]
if output_kps_var:
var0 = np.var(np.vstack(kps_out0), axis=0).mean()
var1 = np.var(np.vstack(kps_out1), axis=0).mean()
var2 = np.var(np.vstack(kps_out2), axis=0).mean()
var = np.mean([var0, var1, var2])
else:
var = None
# return kps if use_kps
if len(bbox_list[0]) == 2:
bbox_results = [
bbox2result(det_bboxes, det_labels, self.bbox_head.num_classes)
for det_bboxes, det_labels in bbox_list
]
elif len(bbox_list[0]) == 3:
bbox_results = [
bbox2result(
det_bboxes,
det_labels,
self.bbox_head.num_classes,
kps=det_kps)
for det_bboxes, det_labels, det_kps in bbox_list
]
return bbox_results
if output_results > 0:
if torch.onnx.is_in_onnx_export():
print('single_stage.py in-onnx-export')
print(outs.__class__)
cls_score, bbox_pred, kps_pred = outs
for c in cls_score:
print(c.shape)
for c in bbox_pred:
print(c.shape)
if self.bbox_head.use_kps:
for c in kps_pred:
print(c.shape)
return (cls_score, bbox_pred, kps_pred)
else:
return (cls_score, bbox_pred)
bbox_list = self.bbox_head.get_bboxes(
*outs, img_metas, rescale=rescale)
# return kps if use_kps
if len(bbox_list[0]) == 2:
bbox_results = [
bbox2result(det_bboxes, det_labels,
self.bbox_head.num_classes)
for det_bboxes, det_labels in bbox_list
]
elif len(bbox_list[0]) == 3:
if output_results == 2:
bbox_results = [
bbox2result(
det_bboxes,
det_labels,
self.bbox_head.num_classes,
kps=det_kps,
num_kps=self.bbox_head.NK)
for det_bboxes, det_labels, det_kps in bbox_list
]
elif output_results == 1:
bbox_results = [
bbox2result(det_bboxes, det_labels,
self.bbox_head.num_classes)
for det_bboxes, det_labels, _ in bbox_list
]
else:
bbox_results = None
if var is not None:
return bbox_results, var
else:
return bbox_results
def feature_test(self, img):
x = self.extract_feat(img)

View File

@@ -0,0 +1,71 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os.path as osp
from copy import deepcopy
from typing import Any, Dict
import torch
from modelscope.metainfo import Models
from modelscope.models.base import TorchModel
from modelscope.models.builder import MODELS
from modelscope.outputs import OutputKeys
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger
logger = get_logger()
__all__ = ['ScrfdDetect']
@MODELS.register_module(Tasks.face_detection, module_name=Models.scrfd)
class ScrfdDetect(TorchModel):
def __init__(self, model_dir: str, *args, **kwargs):
"""initialize the face detection model from the `model_dir` path.
Args:
model_dir (str): the model path.
"""
super().__init__(model_dir, *args, **kwargs)
from mmcv import Config
from mmcv.parallel import MMDataParallel
from mmcv.runner import load_checkpoint
from mmdet.models import build_detector
from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets import RetinaFaceDataset
from modelscope.models.cv.face_detection.scrfd.mmdet_patch.datasets.pipelines import RandomSquareCrop
from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.backbones import ResNetV1e
from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.dense_heads import SCRFDHead
from modelscope.models.cv.face_detection.scrfd.mmdet_patch.models.detectors import SCRFD
cfg = Config.fromfile(osp.join(model_dir, 'mmcv_scrfd.py'))
ckpt_path = osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE)
cfg.model.test_cfg.score_thr = kwargs.get('score_thr', 0.3)
detector = build_detector(cfg.model)
logger.info(f'loading model from {ckpt_path}')
device = torch.device(
f'cuda:{0}' if torch.cuda.is_available() else 'cpu')
load_checkpoint(detector, ckpt_path, map_location=device)
detector = MMDataParallel(detector, device_ids=[0])
detector.eval()
self.detector = detector
logger.info('load model done')
def forward(self, input: Dict[str, Any]) -> Dict[str, Any]:
result = self.detector(
return_loss=False,
rescale=True,
img=[input['img'][0].unsqueeze(0)],
img_metas=[[dict(input['img_metas'][0].data)]],
output_results=2)
assert result is not None
result = result[0][0]
bboxes = result[:, :4].tolist()
kpss = result[:, 5:].tolist()
scores = result[:, 4].tolist()
return {
OutputKeys.SCORES: scores,
OutputKeys.BOXES: bboxes,
OutputKeys.KEYPOINTS: kpss
}
def postprocess(self, input: Dict[str, Any], **kwargs) -> Dict[str, Any]:
return input
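# Illustrative sketch of the per-row layout the slicing in `forward` implies:
# [x1, y1, x2, y2, score, k1_x, k1_y, ...]; with the default 5 keypoints the tail
# holds 10 values (this count follows self.bbox_head.NK * 2 elsewhere in the diff
# and is an assumption here).
if __name__ == '__main__':
    import numpy as np
    row = np.array([[10., 20., 110., 140., 0.92] + [0.0] * 10], dtype=np.float32)
    bboxes, scores, kpss = row[:, :4], row[:, 4], row[:, 5:]
    print(bboxes.tolist(), scores.tolist(), kpss.shape)  # ... (1, 10)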

View File

@@ -0,0 +1,20 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING
from modelscope.utils.import_utils import LazyImportModule
if TYPE_CHECKING:
from .hand_2d_keypoints import Hand2dKeyPoints
else:
_import_structure = {'hand_2d_keypoints': ['Hand2dKeyPoints']}
import sys
sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

View File

@@ -0,0 +1,16 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from easycv.models.pose import TopDown
from modelscope.metainfo import Models
from modelscope.models.builder import MODELS
from modelscope.models.cv.easycv_base import EasyCVBaseModel
from modelscope.utils.constant import Tasks
@MODELS.register_module(
group_key=Tasks.hand_2d_keypoints, module_name=Models.hand_2d_keypoints)
class Hand2dKeyPoints(EasyCVBaseModel, TopDown):
def __init__(self, model_dir=None, *args, **kwargs):
EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
TopDown.__init__(self, *args, **kwargs)

View File

@@ -0,0 +1,22 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING
from modelscope.utils.import_utils import LazyImportModule
if TYPE_CHECKING:
from .human_wholebody_keypoint import HumanWholeBodyKeypoint
else:
_import_structure = {
'human_wholebody_keypoint': ['HumanWholeBodyKeypoint']
}
import sys
sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

View File

@@ -0,0 +1,17 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from easycv.models.pose.top_down import TopDown
from modelscope.metainfo import Models
from modelscope.models.builder import MODELS
from modelscope.models.cv.easycv_base import EasyCVBaseModel
from modelscope.utils.constant import Tasks
@MODELS.register_module(
group_key=Tasks.human_wholebody_keypoint,
module_name=Models.human_wholebody_keypoint)
class HumanWholeBodyKeypoint(EasyCVBaseModel, TopDown):
def __init__(self, model_dir=None, *args, **kwargs):
EasyCVBaseModel.__init__(self, model_dir, args, kwargs)
TopDown.__init__(self, *args, **kwargs)

View File

@@ -0,0 +1,20 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from typing import TYPE_CHECKING
from modelscope.utils.import_utils import LazyImportModule
if TYPE_CHECKING:
from .image_body_reshaping import ImageBodyReshaping
else:
_import_structure = {'image_body_reshaping': ['ImageBodyReshaping']}
import sys
sys.modules[__name__] = LazyImportModule(
__name__,
globals()['__file__'],
_import_structure,
module_spec=__spec__,
extra_objects={},
)

View File

@@ -0,0 +1,128 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from typing import Any, Dict
import cv2
import numpy as np
import torch
from modelscope.metainfo import Models
from modelscope.models.base import Tensor, TorchModel
from modelscope.models.builder import MODELS
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger
from .model import FlowGenerator
from .person_info import PersonInfo
from .pose_estimator.body import Body
from .slim_utils import image_warp_grid1, resize_on_long_side
logger = get_logger()
__all__ = ['ImageBodyReshaping']
@MODELS.register_module(
Tasks.image_body_reshaping, module_name=Models.image_body_reshaping)
class ImageBodyReshaping(TorchModel):
def __init__(self, model_dir: str, *args, **kwargs):
"""initialize the image body reshaping model from the `model_dir` path.
Args:
model_dir (str): the model path.
"""
super().__init__(model_dir, *args, **kwargs)
if torch.cuda.is_available():
self.device = torch.device('cuda')
else:
self.device = torch.device('cpu')
self.degree = 1.0
self.reshape_model = FlowGenerator(n_channels=16).to(self.device)
model_path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE)
checkpoints = torch.load(model_path, map_location=torch.device('cpu'))
self.reshape_model.load_state_dict(
checkpoints['state_dict'], strict=True)
self.reshape_model.eval()
logger.info('load body reshaping model done')
pose_model_ckpt = os.path.join(model_dir, 'body_pose_model.pth')
self.pose_esti = Body(pose_model_ckpt, self.device)
logger.info('load pose model done')
def pred_joints(self, img):
if img is None:
return None
small_src, resize_scale = resize_on_long_side(img, 300)
body_joints = self.pose_esti(small_src)
if body_joints.shape[0] >= 1:
body_joints[:, :, :2] = body_joints[:, :, :2] / resize_scale
return body_joints
def pred_flow(self, img):
body_joints = self.pred_joints(img)
small_size = 1200
if img.shape[0] > small_size or img.shape[1] > small_size:
_img, _scale = resize_on_long_side(img, small_size)
body_joints[:, :, :2] = body_joints[:, :, :2] * _scale
else:
_img = img
# We only reshape one person
if body_joints.shape[0] < 1 or body_joints.shape[0] > 1:
return None
person = PersonInfo(body_joints[0])
with torch.no_grad():
person_pred = person.pred_flow(_img, self.reshape_model,
self.device)
flow = np.dstack((person_pred['rDx'], person_pred['rDy']))
scale = img.shape[0] * 1.0 / flow.shape[0]
flow = cv2.resize(flow, (img.shape[1], img.shape[0]))
flow *= scale
return flow
def warp(self, src_img, flow):
X_flow = flow[..., 0]
Y_flow = flow[..., 1]
X_flow = np.ascontiguousarray(X_flow)
Y_flow = np.ascontiguousarray(Y_flow)
pred = image_warp_grid1(X_flow, Y_flow, src_img, 1.0, 0, 0)
return pred
def inference(self, img):
img = img.cpu().numpy()
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
flow = self.pred_flow(img)
if flow is None:
return img
assert flow.shape[:2] == img.shape[:2]
mag, ang = cv2.cartToPolar(flow[..., 0] + 1e-8, flow[..., 1] + 1e-8)
mag -= 3
mag[mag <= 0] = 0
x, y = cv2.polarToCart(mag, ang, angleInDegrees=False)
flow = np.dstack((x, y))
flow *= self.degree
pred = self.warp(img, flow)
out_img = np.clip(pred, 0, 255)
logger.info('model inference done')
return out_img.astype(np.uint8)
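# Illustrative usage sketch; 'path/to/model_dir' and 'person.jpg' are placeholders
# for a downloaded model directory and a local photo.
if __name__ == '__main__':
    model = ImageBodyReshaping('path/to/model_dir')  # hypothetical local model dir
    out = model.inference(torch.from_numpy(cv2.imread('person.jpg')))
    cv2.imwrite('reshaped.jpg', cv2.cvtColor(out, cv2.COLOR_RGB2BGR))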

View File

@@ -0,0 +1,189 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import torch
import torch.nn as nn
import torch.nn.functional as F
class ConvLayer(nn.Module):
def __init__(self, in_ch, out_ch):
super(ConvLayer, self).__init__()
self.conv = nn.Sequential(
nn.ReflectionPad2d(1),
nn.Conv2d(in_ch, out_ch, kernel_size=3, padding=0),
nn.BatchNorm2d(out_ch), nn.ReLU(inplace=True))
def forward(self, x):
x = self.conv(x)
return x
class SASA(nn.Module):
def __init__(self, in_dim):
super(SASA, self).__init__()
self.chanel_in = in_dim
self.query_conv = nn.Conv2d(
in_channels=in_dim, out_channels=in_dim // 8, kernel_size=1)
self.key_conv = nn.Conv2d(
in_channels=in_dim, out_channels=in_dim // 8, kernel_size=1)
self.value_conv = nn.Conv2d(
in_channels=in_dim, out_channels=in_dim, kernel_size=1)
self.mag_conv = nn.Conv2d(
in_channels=5, out_channels=in_dim // 32, kernel_size=1)
self.gamma = nn.Parameter(torch.zeros(1))
self.softmax = nn.Softmax(dim=-1)
self.sigmoid = nn.Sigmoid()
def structure_encoder(self, paf_mag, target_height, target_width):
torso_mask = torch.sum(paf_mag[:, 1:3, :, :], dim=1, keepdim=True)
torso_mask = torch.clamp(torso_mask, 0, 1)
arms_mask = torch.sum(paf_mag[:, 4:8, :, :], dim=1, keepdim=True)
arms_mask = torch.clamp(arms_mask, 0, 1)
legs_mask = torch.sum(paf_mag[:, 8:12, :, :], dim=1, keepdim=True)
legs_mask = torch.clamp(legs_mask, 0, 1)
fg_mask = paf_mag[:, 12, :, :].unsqueeze(1)
bg_mask = 1 - fg_mask
Y = torch.cat((arms_mask, torso_mask, legs_mask, fg_mask, bg_mask),
dim=1)
Y = F.interpolate(Y, size=(target_height, target_width), mode='area')
return Y
def forward(self, X, PAF_mag):
"""extract self-attention features.
Args:
X : input feature maps( B x C x H x W)
PAF_mag : ( B x C x H x W), 1 denotes connectivity, 0 denotes non-connectivity
Returns:
out : self attention value + input feature
Y: structure encoding derived from ``PAF_mag``, shape (B x 5 x H x W)
"""
m_batchsize, C, height, width = X.size()
Y = self.structure_encoder(PAF_mag, height, width)
connectivity_mask_vec = self.mag_conv(Y).view(m_batchsize, -1,
width * height)
affinity = torch.bmm(
connectivity_mask_vec.permute(0, 2, 1), connectivity_mask_vec)
affinity_centered = affinity - torch.mean(affinity)
affinity_sigmoid = self.sigmoid(affinity_centered)
proj_query = self.query_conv(X).view(m_batchsize, -1,
width * height).permute(0, 2, 1)
proj_key = self.key_conv(X).view(m_batchsize, -1, width * height)
selfatten_map = torch.bmm(proj_query, proj_key)
selfatten_centered = selfatten_map - torch.mean(
selfatten_map) # centering
selfatten_sigmoid = self.sigmoid(selfatten_centered)
SASA_map = selfatten_sigmoid * affinity_sigmoid
proj_value = self.value_conv(X).view(m_batchsize, -1, width * height)
out = torch.bmm(proj_value, SASA_map.permute(0, 2, 1))
out = out.view(m_batchsize, C, height, width)
out = self.gamma * out + X
return out, Y
class FlowGenerator(nn.Module):
def __init__(self, n_channels, deep_supervision=False):
super(FlowGenerator, self).__init__()
self.deep_supervision = deep_supervision
self.Encoder = nn.Sequential(
ConvLayer(n_channels, 64),
ConvLayer(64, 64),
nn.MaxPool2d(2),
ConvLayer(64, 128),
ConvLayer(128, 128),
nn.MaxPool2d(2),
ConvLayer(128, 256),
ConvLayer(256, 256),
nn.MaxPool2d(2),
ConvLayer(256, 512),
ConvLayer(512, 512),
nn.MaxPool2d(2),
ConvLayer(512, 1024),
ConvLayer(1024, 1024),
ConvLayer(1024, 1024),
ConvLayer(1024, 1024),
ConvLayer(1024, 1024),
)
self.SASA = SASA(in_dim=1024)
self.Decoder = nn.Sequential(
ConvLayer(1024, 1024),
nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True),
ConvLayer(1024, 512),
ConvLayer(512, 512),
nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True),
ConvLayer(512, 256),
ConvLayer(256, 256),
ConvLayer(256, 128),
ConvLayer(128, 64),
ConvLayer(64, 32),
nn.Conv2d(32, 2, kernel_size=1, padding=0),
nn.Tanh(),
nn.Upsample(scale_factor=4, mode='bilinear', align_corners=True),
)
dilation_ksize = 17
self.dilation = torch.nn.MaxPool2d(
kernel_size=dilation_ksize,
stride=1,
padding=int((dilation_ksize - 1) / 2))
def warp(self, x, flow, mode='bilinear', padding_mode='zeros', coff=0.2):
n, c, h, w = x.size()
yv, xv = torch.meshgrid([torch.arange(h), torch.arange(w)])
xv = xv.float() / (w - 1) * 2.0 - 1
yv = yv.float() / (h - 1) * 2.0 - 1
grid = torch.cat((xv.unsqueeze(-1), yv.unsqueeze(-1)), -1).unsqueeze(0)
grid = grid.to(flow.device)
grid_x = grid + 2 * flow * coff
warp_x = F.grid_sample(x, grid_x, mode=mode, padding_mode=padding_mode)
return warp_x
def forward(self, img, skeleton_map, coef=0.2):
"""extract self-attention features.
Args:
img : input numpy image
skeleton_map : skeleton map of input image
coef: warp degree
Returns:
warp_x : warped image
flow: predicted flow
"""
img_concat = torch.cat((img, skeleton_map), dim=1)
X = self.Encoder(img_concat)
_, _, height, width = X.size()
# directly get PAF magnitude from skeleton maps via dilation
PAF_mag = self.dilation((skeleton_map + 1.0) * 0.5)
out, Y = self.SASA(X, PAF_mag)
flow = self.Decoder(out)
flow = flow.permute(0, 2, 3, 1) # [n, 2, h, w] ==> [n, h, w, 2]
warp_x = self.warp(img, flow, coff=coef)
warp_x = torch.clamp(warp_x, min=-1.0, max=1.0)
return warp_x, flow
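# Illustrative shape check for the generator above. It assumes the 16 input
# channels split as a 3-channel image plus a 13-channel skeleton map, which is
# what structure_encoder() indexes; the encoder downsamples 16x (four MaxPool2d)
# and the decoder upsamples back (x2, x2, x4), so the flow matches the input size.
if __name__ == '__main__':
    net = FlowGenerator(n_channels=16)
    img = torch.zeros(1, 3, 256, 256)     # image normalized to [-1, 1]
    skel = torch.zeros(1, 13, 256, 256)   # skeleton map, also in [-1, 1]
    with torch.no_grad():
        warped, flow = net(img, skel)
    print(warped.shape)  # torch.Size([1, 3, 256, 256])
    print(flow.shape)    # torch.Size([1, 256, 256, 2])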

View File

@@ -0,0 +1,339 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import copy
import cv2
import numpy as np
import torch
from .slim_utils import (enlarge_box_tblr, gen_skeleton_map,
get_map_fusion_map_cuda, get_mask_bbox,
resize_on_long_side)
class PersonInfo(object):
def __init__(self, joints):
self.joints = joints
self.flow = None
self.pad_boder = False
self.height_expand = 0
self.width_expand = 0
self.coeff = 0.2
self.network_input_W = 256
self.network_input_H = 256
self.divider = 20
self.flow_scales = ['upper_2']
def update_attribute(self, pad_boder, height_expand, width_expand):
self.pad_boder = pad_boder
self.height_expand = height_expand
self.width_expand = width_expand
if pad_boder:
self.joints[:, 0] += width_expand
self.joints[:, 1] += height_expand
def pred_flow(self, img, flow_net, device):
with torch.no_grad():
if img is None:
print('image is none')
self.flow = None
if len(img.shape) == 2:
img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
if self.pad_boder:
height_expand = self.height_expand
width_expand = self.width_expand
pad_img = cv2.copyMakeBorder(
img,
height_expand,
height_expand,
width_expand,
width_expand,
cv2.BORDER_CONSTANT,
value=(127, 127, 127))
else:
height_expand = 0
width_expand = 0
pad_img = img.copy()
canvas = np.zeros(
shape=(pad_img.shape[0], pad_img.shape[1]), dtype=np.float32)
self.human_joint_box = self.__joint_to_body_box()
self.human_box = enlarge_box_tblr(
self.human_joint_box, pad_img, ratio=0.25)
human_box_height = self.human_box[1] - self.human_box[0]
human_box_width = self.human_box[3] - self.human_box[2]
self.leg_joint_box = self.__joint_to_leg_box()
self.leg_box = enlarge_box_tblr(
self.leg_joint_box, pad_img, ratio=0.25)
self.arm_joint_box = self.__joint_to_arm_box()
self.arm_box = enlarge_box_tblr(
self.arm_joint_box, pad_img, ratio=0.1)
x_flows = []
y_flows = []
multi_bbox = []
for scale in self.flow_scales: # better for metric
scale_value = float(scale.split('_')[-1])
arm_box = copy.deepcopy(self.arm_box)
if arm_box[0] is None:
arm_box = self.human_box
arm_box_height = arm_box[1] - arm_box[0]
arm_box_width = arm_box[3] - arm_box[2]
roi_bbox = None
if arm_box_width < human_box_width * 0.1 or arm_box_height < human_box_height * 0.1:
roi_bbox = self.human_box
else:
arm_box = enlarge_box_tblr(
arm_box, pad_img, ratio=scale_value)
if scale == 'upper_0.2':
arm_box[0] = min(arm_box[0], int(self.joints[0][1]))
if scale.startswith('upper'):
roi_bbox = [
max(self.human_box[0], arm_box[0]),
min(self.human_box[1], arm_box[1]),
max(self.human_box[2], arm_box[2]),
min(self.human_box[3], arm_box[3])
]
if roi_bbox[1] - roi_bbox[0] < 1 or roi_bbox[
3] - roi_bbox[2] < 1:
continue
elif scale.startswith('lower'):
roi_bbox = [
max(self.human_box[0], self.leg_box[0]),
min(self.human_box[1], self.leg_box[1]),
max(self.human_box[2], self.leg_box[2]),
min(self.human_box[3], self.leg_box[3])
]
if roi_bbox[1] - roi_bbox[0] < 1 or roi_bbox[
3] - roi_bbox[2] < 1:
continue
skel_map, roi_bbox = gen_skeleton_map(
self.joints, 'depth', input_roi_box=roi_bbox)
if roi_bbox is None:
continue
if skel_map.dtype != np.float32:
skel_map = skel_map.astype(np.float32)
skel_map -= 1.0 # [0,2] ->[-1,1]
multi_bbox.append(roi_bbox)
roi_bbox_height = roi_bbox[1] - roi_bbox[0]
roi_bbox_width = roi_bbox[3] - roi_bbox[2]
assert skel_map.shape[0] == roi_bbox_height
assert skel_map.shape[1] == roi_bbox_width
roi_height_pad = roi_bbox_height // self.divider
roi_width_pad = roi_bbox_width // self.divider
paded_roi_h = roi_bbox_height + 2 * roi_height_pad
paded_roi_w = roi_bbox_width + 2 * roi_width_pad
roi_height_pad_joint = skel_map.shape[0] // self.divider
roi_width_pad_joint = skel_map.shape[1] // self.divider
skel_map = np.pad(
skel_map,
((roi_height_pad_joint, roi_height_pad_joint),
(roi_width_pad_joint, roi_width_pad_joint), (0, 0)),
'constant',
constant_values=-1)
skel_map_resized = cv2.resize(
skel_map, (self.network_input_W, self.network_input_H))
skel_map_resized[skel_map_resized < 0] = -1.0
skel_map_resized[skel_map_resized > -0.5] = 1.0
skel_map_transformed = torch.from_numpy(
skel_map_resized.transpose((2, 0, 1)))
roi_npy = pad_img[roi_bbox[0]:roi_bbox[1],
roi_bbox[2]:roi_bbox[3], :].copy()
if roi_npy.dtype != np.float32:
roi_npy = roi_npy.astype(np.float32)
roi_npy = np.pad(roi_npy,
((roi_height_pad, roi_height_pad),
(roi_width_pad, roi_width_pad), (0, 0)),
'edge')
roi_npy = roi_npy[:, :, ::-1]
roi_npy = cv2.resize(
roi_npy, (self.network_input_W, self.network_input_H))
roi_npy *= 1.0 / 255
roi_npy -= 0.5
roi_npy *= 2
rgb_tensor = torch.from_numpy(roi_npy.transpose((2, 0, 1)))
rgb_tensor = rgb_tensor.unsqueeze(0).to(device)
skel_map_tensor = skel_map_transformed.unsqueeze(0).to(device)
warped_img_val, flow_field_val = flow_net(
rgb_tensor, skel_map_tensor
) # inference, connectivity_mask [1,12,16,16]
flow_field_val = flow_field_val.detach().squeeze().cpu().numpy(
)
flow_field_val = cv2.resize(
flow_field_val, (paded_roi_w, paded_roi_h),
interpolation=cv2.INTER_LINEAR)
flow_field_val[..., 0] = flow_field_val[
..., 0] * paded_roi_w * 0.5 * 2 * self.coeff
flow_field_val[..., 1] = flow_field_val[
..., 1] * paded_roi_h * 0.5 * 2 * self.coeff
# remove pad areas
flow_field_val = flow_field_val[
roi_height_pad:flow_field_val.shape[0] - roi_height_pad,
roi_width_pad:flow_field_val.shape[1] - roi_width_pad, :]
diffuse_width = max(roi_bbox_width // 3, 1)
diffuse_height = max(roi_bbox_height // 3, 1)
assert roi_bbox_width == flow_field_val.shape[1]
assert roi_bbox_height == flow_field_val.shape[0]
origin_flow = np.zeros(
(pad_img.shape[0] + 2 * diffuse_height,
pad_img.shape[1] + 2 * diffuse_width, 2),
dtype=np.float32)
flow_field_val = np.pad(flow_field_val,
((diffuse_height, diffuse_height),
(diffuse_width, diffuse_width),
(0, 0)), 'linear_ramp')
origin_flow[roi_bbox[0]:roi_bbox[1] + 2 * diffuse_height,
roi_bbox[2]:roi_bbox[3]
+ 2 * diffuse_width] = flow_field_val
origin_flow = origin_flow[diffuse_height:-diffuse_height,
diffuse_width:-diffuse_width, :]
x_flows.append(origin_flow[..., 0])
y_flows.append(origin_flow[..., 1])
if len(x_flows) == 0:
return {
'rDx': np.zeros(canvas.shape[:2], dtype=np.float32),
'rDy': np.zeros(canvas.shape[:2], dtype=np.float32),
'multi_bbox': multi_bbox,
'x_fusion_map':
np.ones(canvas.shape[:2], dtype=np.float32),
'y_fusion_map':
np.ones(canvas.shape[:2], dtype=np.float32)
}
else:
origin_rDx, origin_rDy, x_fusion_map, y_fusion_map = self.blend_multiscale_flow(
x_flows, y_flows, device=device)
return {
'rDx': origin_rDx,
'rDy': origin_rDy,
'multi_bbox': multi_bbox,
'x_fusion_map': x_fusion_map,
'y_fusion_map': y_fusion_map
}
@staticmethod
def blend_multiscale_flow(x_flows, y_flows, device=None):
scale_num = len(x_flows)
if scale_num == 1:
return x_flows[0], y_flows[0], np.ones_like(
x_flows[0]), np.ones_like(x_flows[0])
origin_rDx = np.zeros((x_flows[0].shape[0], x_flows[0].shape[1]),
dtype=np.float32)
origin_rDy = np.zeros((y_flows[0].shape[0], y_flows[0].shape[1]),
dtype=np.float32)
x_fusion_map, x_acc_map = get_map_fusion_map_cuda(
x_flows, 1, device=device)
y_fusion_map, y_acc_map = get_map_fusion_map_cuda(
y_flows, 1, device=device)
x_flow_map = 1.0 / x_fusion_map
y_flow_map = 1.0 / y_fusion_map
all_acc_map = x_acc_map + y_acc_map
all_acc_map = all_acc_map.astype(np.uint8)
roi_box = get_mask_bbox(all_acc_map, threshold=1)
if roi_box[0] is None or roi_box[1] - roi_box[0] <= 0 or roi_box[
3] - roi_box[2] <= 0:
roi_box = [0, x_flow_map.shape[0], 0, x_flow_map.shape[1]]
roi_x_flow_map = x_flow_map[roi_box[0]:roi_box[1],
roi_box[2]:roi_box[3]]
roi_y_flow_map = y_flow_map[roi_box[0]:roi_box[1],
roi_box[2]:roi_box[3]]
roi_width = roi_x_flow_map.shape[1]
roi_height = roi_x_flow_map.shape[0]
roi_x_flow_map, scale = resize_on_long_side(roi_x_flow_map, 320)
roi_y_flow_map, scale = resize_on_long_side(roi_y_flow_map, 320)
roi_x_flow_map = cv2.blur(roi_x_flow_map, (55, 55))
roi_y_flow_map = cv2.blur(roi_y_flow_map, (55, 55))
roi_x_flow_map = cv2.resize(roi_x_flow_map, (roi_width, roi_height))
roi_y_flow_map = cv2.resize(roi_y_flow_map, (roi_width, roi_height))
x_flow_map[roi_box[0]:roi_box[1],
roi_box[2]:roi_box[3]] = roi_x_flow_map
y_flow_map[roi_box[0]:roi_box[1],
roi_box[2]:roi_box[3]] = roi_y_flow_map
for i in range(scale_num):
origin_rDx += x_flows[i]
origin_rDy += y_flows[i]
origin_rDx *= x_flow_map
origin_rDy *= y_flow_map
return origin_rDx, origin_rDy, x_flow_map, y_flow_map
def __joint_to_body_box(self):
joint_left = int(np.min(self.joints, axis=0)[0])
joint_right = int(np.max(self.joints, axis=0)[0])
joint_top = int(np.min(self.joints, axis=0)[1])
joint_bottom = int(np.max(self.joints, axis=0)[1])
return [joint_top, joint_bottom, joint_left, joint_right]
def __joint_to_leg_box(self):
leg_joints = self.joints[8:, :]
if np.max(leg_joints, axis=0)[2] < 0.05:
return [0, 0, 0, 0]
joint_left = int(np.min(leg_joints, axis=0)[0])
joint_right = int(np.max(leg_joints, axis=0)[0])
joint_top = int(np.min(leg_joints, axis=0)[1])
joint_bottom = int(np.max(leg_joints, axis=0)[1])
return [joint_top, joint_bottom, joint_left, joint_right]
def __joint_to_arm_box(self):
arm_joints = self.joints[2:8, :]
if np.max(arm_joints, axis=0)[2] < 0.05:
return [0, 0, 0, 0]
joint_left = int(np.min(arm_joints, axis=0)[0])
joint_right = int(np.max(arm_joints, axis=0)[0])
joint_top = int(np.min(arm_joints, axis=0)[1])
joint_bottom = int(np.max(arm_joints, axis=0)[1])
return [joint_top, joint_bottom, joint_left, joint_right]
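# Usage sketch (editor's note): PersonInfo expects an 18x3 OpenPose-style
# joint array ([x, y, confidence] per joint). `update_attribute` shifts the
# joints when the image is padded, and `pred_flow` needs the warping flow
# network defined elsewhere in this repo plus a torch device; `img`,
# `flow_net` and `device` below are placeholders.
#
#   person = PersonInfo(joints)                # joints: np.ndarray, shape (18, 3)
#   person.update_attribute(pad_boder=True, height_expand=64, width_expand=64)
#   result = person.pred_flow(img, flow_net, device)
#   rDx, rDy = result['rDx'], result['rDy']    # per-pixel displacement fields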

View File

@@ -0,0 +1,272 @@
# The implementation is based on openpose, available at https://github.com/Hzzone/pytorch-openpose.
import math
import cv2
import numpy as np
import torch
from scipy.ndimage.filters import gaussian_filter
from .model import BodyposeModel
from .util import pad_rightdown_corner, transfer
class Body(object):
def __init__(self, model_path, device):
self.model = BodyposeModel().to(device)
model_dict = transfer(self.model, torch.load(model_path))
self.model.load_state_dict(model_dict)
self.model.eval()
def __call__(self, oriImg):
scale_search = [0.5]
boxsize = 368
stride = 8
padValue = 128
thre1 = 0.1
thre2 = 0.05
bodyparts = 18
multiplier = [x * boxsize / oriImg.shape[0] for x in scale_search]
heatmap_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 19))
paf_avg = np.zeros((oriImg.shape[0], oriImg.shape[1], 38))
for m in range(len(multiplier)):
scale = multiplier[m]
imageToTest = cv2.resize(
oriImg, (0, 0),
fx=scale,
fy=scale,
interpolation=cv2.INTER_CUBIC)
imageToTest_padded, pad = pad_rightdown_corner(
imageToTest, stride, padValue)
im = np.transpose(
np.float32(imageToTest_padded[:, :, :, np.newaxis]),
(3, 2, 0, 1)) / 256 - 0.5
im = np.ascontiguousarray(im)
data = torch.from_numpy(im).float()
if torch.cuda.is_available():
data = data.cuda()
with torch.no_grad():
Mconv7_stage6_L1, Mconv7_stage6_L2 = self.model(data)
Mconv7_stage6_L1 = Mconv7_stage6_L1.cpu().numpy()
Mconv7_stage6_L2 = Mconv7_stage6_L2.cpu().numpy()
# extract outputs, resize, and remove padding
heatmap = np.transpose(np.squeeze(Mconv7_stage6_L2),
(1, 2, 0)) # output 1 is heatmaps
heatmap = cv2.resize(
heatmap, (0, 0),
fx=stride,
fy=stride,
interpolation=cv2.INTER_CUBIC)
heatmap = heatmap[:imageToTest_padded.shape[0]
- pad[2], :imageToTest_padded.shape[1]
- pad[3], :]
heatmap = cv2.resize(
heatmap, (oriImg.shape[1], oriImg.shape[0]),
interpolation=cv2.INTER_CUBIC)
paf = np.transpose(np.squeeze(Mconv7_stage6_L1),
(1, 2, 0)) # output 0 is PAFs
paf = cv2.resize(
paf, (0, 0),
fx=stride,
fy=stride,
interpolation=cv2.INTER_CUBIC)
paf = paf[:imageToTest_padded.shape[0]
- pad[2], :imageToTest_padded.shape[1] - pad[3], :]
paf = cv2.resize(
paf, (oriImg.shape[1], oriImg.shape[0]),
interpolation=cv2.INTER_CUBIC)
            heatmap_avg = heatmap_avg + heatmap / len(multiplier)
            paf_avg = paf_avg + paf / len(multiplier)
all_peaks = []
peak_counter = 0
for part in range(bodyparts):
map_ori = heatmap_avg[:, :, part]
one_heatmap = gaussian_filter(map_ori, sigma=3)
map_left = np.zeros(one_heatmap.shape)
map_left[1:, :] = one_heatmap[:-1, :]
map_right = np.zeros(one_heatmap.shape)
map_right[:-1, :] = one_heatmap[1:, :]
map_up = np.zeros(one_heatmap.shape)
map_up[:, 1:] = one_heatmap[:, :-1]
map_down = np.zeros(one_heatmap.shape)
map_down[:, :-1] = one_heatmap[:, 1:]
peaks_binary = np.logical_and.reduce(
(one_heatmap >= map_left, one_heatmap >= map_right,
one_heatmap >= map_up, one_heatmap >= map_down,
one_heatmap > thre1))
peaks = list(
zip(np.nonzero(peaks_binary)[1],
np.nonzero(peaks_binary)[0])) # note reverse
peaks_with_score = [x + (map_ori[x[1], x[0]], ) for x in peaks]
peak_id = range(peak_counter, peak_counter + len(peaks))
peaks_with_score_and_id = [
peaks_with_score[i] + (peak_id[i], )
for i in range(len(peak_id))
]
all_peaks.append(peaks_with_score_and_id)
peak_counter += len(peaks)
# find connection in the specified sequence, center 29 is in the position 15
limbSeq = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9],
[9, 10], [10, 11], [2, 12], [12, 13], [13, 14], [2, 1],
[1, 15], [15, 17], [1, 16], [16, 18], [3, 17], [6, 18]]
# the middle joints heatmap correpondence
mapIdx = [[31, 32], [39, 40], [33, 34], [35, 36], [41, 42], [43, 44],
[19, 20], [21, 22], [23, 24], [25, 26], [27, 28], [29, 30],
[47, 48], [49, 50], [53, 54], [51, 52], [55, 56], [37, 38],
[45, 46]]
connection_all = []
special_k = []
mid_num = 10
for k in range(len(mapIdx)):
score_mid = paf_avg[:, :, [x - 19 for x in mapIdx[k]]]
candA = all_peaks[limbSeq[k][0] - 1]
candB = all_peaks[limbSeq[k][1] - 1]
nA = len(candA)
nB = len(candB)
if (nA != 0 and nB != 0):
connection_candidate = []
for i in range(nA):
for j in range(nB):
vec = np.subtract(candB[j][:2], candA[i][:2])
norm = math.sqrt(vec[0] * vec[0] + vec[1] * vec[1])
norm = max(0.001, norm)
vec = np.divide(vec, norm)
startend = list(
zip(
np.linspace(
candA[i][0], candB[j][0], num=mid_num),
np.linspace(
candA[i][1], candB[j][1], num=mid_num)))
vec_x = np.array([
score_mid[int(round(startend[item][1])),
int(round(startend[item][0])), 0]
for item in range(len(startend))
])
vec_y = np.array([
score_mid[int(round(startend[item][1])),
int(round(startend[item][0])), 1]
for item in range(len(startend))
])
score_midpts = np.multiply(
vec_x, vec[0]) + np.multiply(vec_y, vec[1])
temp1 = sum(score_midpts) / len(score_midpts)
temp2 = min(0.5 * oriImg.shape[0] / norm - 1, 0)
score_with_dist_prior = temp1 + temp2
criterion1 = len(np.nonzero(
score_midpts > thre2)[0]) > 0.8 * len(score_midpts)
criterion2 = score_with_dist_prior > 0
if criterion1 and criterion2:
connection_candidate.append([
i, j, score_with_dist_prior,
score_with_dist_prior + candA[i][2]
+ candB[j][2]
])
connection_candidate = sorted(
connection_candidate, key=lambda x: x[2], reverse=True)
connection = np.zeros((0, 5))
for c in range(len(connection_candidate)):
i, j, s = connection_candidate[c][0:3]
if (i not in connection[:, 3]
and j not in connection[:, 4]):
connection = np.vstack(
[connection, [candA[i][3], candB[j][3], s, i, j]])
if (len(connection) >= min(nA, nB)):
break
connection_all.append(connection)
else:
special_k.append(k)
connection_all.append([])
# last number in each row is the total parts number of that person
# the second last number in each row is the score of the overall configuration
subset = -1 * np.ones((0, 20))
candidate = np.array(
[item for sublist in all_peaks for item in sublist])
for k in range(len(mapIdx)):
if k not in special_k:
partAs = connection_all[k][:, 0]
partBs = connection_all[k][:, 1]
indexA, indexB = np.array(limbSeq[k]) - 1
for i in range(len(connection_all[k])): # = 1:size(temp,1)
found = 0
subset_idx = [-1, -1]
for j in range(len(subset)): # 1:size(subset,1):
if subset[j][indexA] == partAs[i] or subset[j][
indexB] == partBs[i]:
subset_idx[found] = j
found += 1
if found == 1:
j = subset_idx[0]
if subset[j][indexB] != partBs[i]:
subset[j][indexB] = partBs[i]
subset[j][-1] += 1
subset[j][-2] += candidate[
partBs[i].astype(int),
2] + connection_all[k][i][2]
elif found == 2: # if found 2 and disjoint, merge them
j1, j2 = subset_idx
tmp1 = (subset[j1] >= 0).astype(int)
tmp2 = (subset[j2] >= 0).astype(int)
membership = (tmp1 + tmp2)[:-2]
if len(np.nonzero(membership == 2)[0]) == 0: # merge
subset[j1][:-2] += (subset[j2][:-2] + 1)
subset[j1][-2:] += subset[j2][-2:]
subset[j1][-2] += connection_all[k][i][2]
subset = np.delete(subset, j2, 0)
else: # as like found == 1
subset[j1][indexB] = partBs[i]
subset[j1][-1] += 1
subset[j1][-2] += candidate[
partBs[i].astype(int),
2] + connection_all[k][i][2]
# if find no partA in the subset, create a new subset
elif not found and k < 17:
row = -1 * np.ones(20)
row[indexA] = partAs[i]
row[indexB] = partBs[i]
row[-1] = 2
row[-2] = sum(
candidate[connection_all[k][i, :2].astype(int),
2]) + connection_all[k][i][2]
subset = np.vstack([subset, row])
# delete some rows of subset which has few parts occur
deleteIdx = []
for i in range(len(subset)):
if subset[i][-1] < 4 or subset[i][-2] / subset[i][-1] < 0.4:
deleteIdx.append(i)
subset = np.delete(subset, deleteIdx, axis=0)
# subset: n*20 array, 0-17 is the index in candidate, 18 is the total score, 19 is the total parts
# candidate: x, y, score, id
count = subset.shape[0]
joints = np.zeros(shape=(count, bodyparts, 3))
for i in range(count):
for j in range(bodyparts):
joints[i, j, :3] = candidate[int(subset[i, j]), :3]
confidence = 1.0 if subset[i, j] >= 0 else 0.0
joints[i, j, 2] *= confidence
return joints
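# Usage sketch (editor's note): the checkpoint path and image file below are
# placeholders. The estimator returns an array of shape (num_people, 18, 3),
# where each row of the last axis is (x, y, score) and the score is zeroed
# for joints that were not assigned to the person.
#
#   body_estimator = Body('body_pose_model.pth', torch.device('cpu'))
#   joints = body_estimator(cv2.imread('person.jpg'))
#   print(joints.shape)  # (num_people, 18, 3)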

View File

@@ -0,0 +1,141 @@
# The implementation is based on openpose, available at https://github.com/Hzzone/pytorch-openpose.
from collections import OrderedDict
import torch
import torch.nn as nn
def make_layers(block, no_relu_layers):
layers = []
for layer_name, v in block.items():
if 'pool' in layer_name:
layer = nn.MaxPool2d(kernel_size=v[0], stride=v[1], padding=v[2])
layers.append((layer_name, layer))
else:
conv2d = nn.Conv2d(
in_channels=v[0],
out_channels=v[1],
kernel_size=v[2],
stride=v[3],
padding=v[4])
layers.append((layer_name, conv2d))
if layer_name not in no_relu_layers:
layers.append(('relu_' + layer_name, nn.ReLU(inplace=True)))
return nn.Sequential(OrderedDict(layers))
class BodyposeModel(nn.Module):
def __init__(self):
super(BodyposeModel, self).__init__()
# these layers have no relu layer
no_relu_layers = [
'conv5_5_CPM_L1', 'conv5_5_CPM_L2', 'Mconv7_stage2_L1',
'Mconv7_stage2_L2', 'Mconv7_stage3_L1', 'Mconv7_stage3_L2',
'Mconv7_stage4_L1', 'Mconv7_stage4_L2', 'Mconv7_stage5_L1',
            'Mconv7_stage5_L2', 'Mconv7_stage6_L1', 'Mconv7_stage6_L2'
]
blocks = {}
block0 = OrderedDict([('conv1_1', [3, 64, 3, 1, 1]),
('conv1_2', [64, 64, 3, 1, 1]),
('pool1_stage1', [2, 2, 0]),
('conv2_1', [64, 128, 3, 1, 1]),
('conv2_2', [128, 128, 3, 1, 1]),
('pool2_stage1', [2, 2, 0]),
('conv3_1', [128, 256, 3, 1, 1]),
('conv3_2', [256, 256, 3, 1, 1]),
('conv3_3', [256, 256, 3, 1, 1]),
('conv3_4', [256, 256, 3, 1, 1]),
('pool3_stage1', [2, 2, 0]),
('conv4_1', [256, 512, 3, 1, 1]),
('conv4_2', [512, 512, 3, 1, 1]),
('conv4_3_CPM', [512, 256, 3, 1, 1]),
('conv4_4_CPM', [256, 128, 3, 1, 1])])
# Stage 1
block1_1 = OrderedDict([('conv5_1_CPM_L1', [128, 128, 3, 1, 1]),
('conv5_2_CPM_L1', [128, 128, 3, 1, 1]),
('conv5_3_CPM_L1', [128, 128, 3, 1, 1]),
('conv5_4_CPM_L1', [128, 512, 1, 1, 0]),
('conv5_5_CPM_L1', [512, 38, 1, 1, 0])])
block1_2 = OrderedDict([('conv5_1_CPM_L2', [128, 128, 3, 1, 1]),
('conv5_2_CPM_L2', [128, 128, 3, 1, 1]),
('conv5_3_CPM_L2', [128, 128, 3, 1, 1]),
('conv5_4_CPM_L2', [128, 512, 1, 1, 0]),
('conv5_5_CPM_L2', [512, 19, 1, 1, 0])])
blocks['block1_1'] = block1_1
blocks['block1_2'] = block1_2
self.model0 = make_layers(block0, no_relu_layers)
# Stages 2 - 6
for i in range(2, 7):
blocks['block%d_1' % i] = OrderedDict([
('Mconv1_stage%d_L1' % i, [185, 128, 7, 1, 3]),
('Mconv2_stage%d_L1' % i, [128, 128, 7, 1, 3]),
('Mconv3_stage%d_L1' % i, [128, 128, 7, 1, 3]),
('Mconv4_stage%d_L1' % i, [128, 128, 7, 1, 3]),
('Mconv5_stage%d_L1' % i, [128, 128, 7, 1, 3]),
('Mconv6_stage%d_L1' % i, [128, 128, 1, 1, 0]),
('Mconv7_stage%d_L1' % i, [128, 38, 1, 1, 0])
])
blocks['block%d_2' % i] = OrderedDict([
('Mconv1_stage%d_L2' % i, [185, 128, 7, 1, 3]),
('Mconv2_stage%d_L2' % i, [128, 128, 7, 1, 3]),
('Mconv3_stage%d_L2' % i, [128, 128, 7, 1, 3]),
('Mconv4_stage%d_L2' % i, [128, 128, 7, 1, 3]),
('Mconv5_stage%d_L2' % i, [128, 128, 7, 1, 3]),
('Mconv6_stage%d_L2' % i, [128, 128, 1, 1, 0]),
('Mconv7_stage%d_L2' % i, [128, 19, 1, 1, 0])
])
for k in blocks.keys():
blocks[k] = make_layers(blocks[k], no_relu_layers)
self.model1_1 = blocks['block1_1']
self.model2_1 = blocks['block2_1']
self.model3_1 = blocks['block3_1']
self.model4_1 = blocks['block4_1']
self.model5_1 = blocks['block5_1']
self.model6_1 = blocks['block6_1']
self.model1_2 = blocks['block1_2']
self.model2_2 = blocks['block2_2']
self.model3_2 = blocks['block3_2']
self.model4_2 = blocks['block4_2']
self.model5_2 = blocks['block5_2']
self.model6_2 = blocks['block6_2']
def forward(self, x):
out1 = self.model0(x)
out1_1 = self.model1_1(out1)
out1_2 = self.model1_2(out1)
out2 = torch.cat([out1_1, out1_2, out1], 1)
out2_1 = self.model2_1(out2)
out2_2 = self.model2_2(out2)
out3 = torch.cat([out2_1, out2_2, out1], 1)
out3_1 = self.model3_1(out3)
out3_2 = self.model3_2(out3)
out4 = torch.cat([out3_1, out3_2, out1], 1)
out4_1 = self.model4_1(out4)
out4_2 = self.model4_2(out4)
out5 = torch.cat([out4_1, out4_2, out1], 1)
out5_1 = self.model5_1(out5)
out5_2 = self.model5_2(out5)
out6 = torch.cat([out5_1, out5_2, out1], 1)
out6_1 = self.model6_1(out6)
out6_2 = self.model6_2(out6)
return out6_1, out6_2
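# A quick shape check (editor's sketch) for the two-branch CPM network above:
# for a 368x368 input the model returns 38 PAF channels (L1 branch) and 19
# heatmap channels (L2 branch) at 1/8 resolution, matching the stride-8
# assumption in the Body estimator.
if __name__ == '__main__':
    model = BodyposeModel().eval()
    with torch.no_grad():
        paf, heatmap = model(torch.randn(1, 3, 368, 368))
    print(paf.shape)      # torch.Size([1, 38, 46, 46])
    print(heatmap.shape)  # torch.Size([1, 19, 46, 46])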

View File

@@ -0,0 +1,33 @@
# The implementation is based on openpose, available at https://github.com/Hzzone/pytorch-openpose.
import numpy as np
def pad_rightdown_corner(img, stride, padValue):
h = img.shape[0]
w = img.shape[1]
pad = 4 * [None]
pad[0] = 0 # up
pad[1] = 0 # left
pad[2] = 0 if (h % stride == 0) else stride - (h % stride) # down
pad[3] = 0 if (w % stride == 0) else stride - (w % stride) # right
img_padded = img
pad_up = np.tile(img_padded[0:1, :, :] * 0 + padValue, (pad[0], 1, 1))
img_padded = np.concatenate((pad_up, img_padded), axis=0)
pad_left = np.tile(img_padded[:, 0:1, :] * 0 + padValue, (1, pad[1], 1))
img_padded = np.concatenate((pad_left, img_padded), axis=1)
pad_down = np.tile(img_padded[-2:-1, :, :] * 0 + padValue, (pad[2], 1, 1))
img_padded = np.concatenate((img_padded, pad_down), axis=0)
pad_right = np.tile(img_padded[:, -2:-1, :] * 0 + padValue, (1, pad[3], 1))
img_padded = np.concatenate((img_padded, pad_right), axis=1)
return img_padded, pad
def transfer(model, model_weights):
transfered_model_weights = {}
for weights_name in model.state_dict().keys():
transfered_model_weights[weights_name] = model_weights['.'.join(
weights_name.split('.')[1:])]
return transfered_model_weights
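# A small runnable sketch (editor's example) of the padding helper above: the
# image is padded on the bottom/right so that both sides become multiples of
# the network stride, and `pad` records how much to crop off afterwards.
if __name__ == '__main__':
    img = np.zeros((100, 150, 3), dtype=np.uint8)
    padded, pad = pad_rightdown_corner(img, stride=8, padValue=128)
    print(padded.shape, pad)  # (104, 152, 3) [0, 0, 4, 2]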

View File

@@ -0,0 +1,507 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import math
import os
import random
import cv2
import numba
import numpy as np
import torch
def resize_on_long_side(img, long_side=800):
src_height = img.shape[0]
src_width = img.shape[1]
if src_height > src_width:
scale = long_side * 1.0 / src_height
_img = cv2.resize(
img, (int(src_width * scale), long_side),
interpolation=cv2.INTER_LINEAR)
else:
scale = long_side * 1.0 / src_width
_img = cv2.resize(
img, (long_side, int(src_height * scale)),
interpolation=cv2.INTER_LINEAR)
return _img, scale
def point_in_box(pt, box):
pt_x = pt[0]
pt_y = pt[1]
if pt_x >= box[0] and pt_x <= box[0] + box[2] and pt_y >= box[
1] and pt_y <= box[1] + box[3]:
return True
else:
return False
def enlarge_box_tblr(roi_bbox, mask, ratio=0.4, use_long_side=True):
if roi_bbox is None or None in roi_bbox:
return [None, None, None, None]
top = roi_bbox[0]
bottom = roi_bbox[1]
left = roi_bbox[2]
right = roi_bbox[3]
roi_width = roi_bbox[3] - roi_bbox[2]
roi_height = roi_bbox[1] - roi_bbox[0]
right = left + roi_width
bottom = top + roi_height
long_side = roi_width if roi_width > roi_height else roi_height
if use_long_side:
new_left = left - int(long_side * ratio)
else:
new_left = left - int(roi_width * ratio)
new_left = 1 if new_left < 0 else new_left
if use_long_side:
new_top = top - int(long_side * ratio)
else:
new_top = top - int(roi_height * ratio)
new_top = 1 if new_top < 0 else new_top
if use_long_side:
new_right = right + int(long_side * ratio)
else:
new_right = right + int(roi_width * ratio)
new_right = mask.shape[1] - 2 if new_right > mask.shape[1] else new_right
if use_long_side:
new_bottom = bottom + int(long_side * ratio)
else:
new_bottom = bottom + int(roi_height * ratio)
new_bottom = mask.shape[0] - 2 if new_bottom > mask.shape[0] else new_bottom
bbox = [new_top, new_bottom, new_left, new_right]
return bbox
def gen_PAF(image, joints):
assert joints.shape[0] == 18
assert joints.shape[1] == 3
org_h = image.shape[0]
org_w = image.shape[1]
small_image, resize_scale = resize_on_long_side(image, 120)
joints[:, :2] = joints[:, :2] * resize_scale
joint_left = int(np.min(joints, axis=0)[0])
joint_right = int(np.max(joints, axis=0)[0])
joint_top = int(np.min(joints, axis=0)[1])
joint_bottom = int(np.max(joints, axis=0)[1])
limb_width = min(
abs(joint_right - joint_left), abs(joint_bottom - joint_top)) // 6
if limb_width % 2 == 0:
limb_width += 1
kernel_size = limb_width
part_orders = [(5, 11), (2, 8), (5, 6), (6, 7), (2, 3), (3, 4), (11, 12),
(12, 13), (8, 9), (9, 10)]
map_list = []
mask_list = []
PAF_all = np.zeros(
shape=(small_image.shape[0], small_image.shape[1], 2),
dtype=np.float32)
for c, pair in enumerate(part_orders):
idx_a_name = pair[0]
idx_b_name = pair[1]
jointa = joints[idx_a_name]
jointb = joints[idx_b_name]
confidence_threshold = 0.05
if jointa[2] > confidence_threshold and jointb[
2] > confidence_threshold:
canvas = np.zeros(
shape=(small_image.shape[0], small_image.shape[1]),
dtype=np.uint8)
canvas = cv2.line(canvas, (int(jointa[0]), int(jointa[1])),
(int(jointb[0]), int(jointb[1])),
(255, 255, 255), 5)
kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE,
(kernel_size, kernel_size))
canvas = cv2.dilate(canvas, kernel, 1)
canvas = cv2.GaussianBlur(canvas, (kernel_size, kernel_size), 0)
canvas = canvas.astype(np.float32) / 255
PAF = np.zeros(
shape=(small_image.shape[0], small_image.shape[1], 2),
dtype=np.float32)
PAF[..., 0] = jointb[0] - jointa[0]
PAF[..., 1] = jointb[1] - jointa[1]
mag, ang = cv2.cartToPolar(PAF[..., 0], PAF[..., 1])
PAF /= (np.dstack((mag, mag)) + 1e-5)
single_PAF = PAF * np.dstack((canvas, canvas))
map_list.append(
cv2.GaussianBlur(single_PAF,
(kernel_size * 3, kernel_size * 3), 0))
mask_list.append(
cv2.GaussianBlur(canvas.copy(),
(kernel_size * 3, kernel_size * 3), 0))
PAF_all = PAF_all * (1.0 - np.dstack(
(canvas, canvas))) + single_PAF
PAF_all = cv2.GaussianBlur(PAF_all, (kernel_size * 3, kernel_size * 3), 0)
PAF_all = cv2.resize(
PAF_all, (org_w, org_h), interpolation=cv2.INTER_LINEAR)
map_list.append(PAF_all)
return PAF_all, map_list, mask_list
def gen_skeleton_map(joints, stack_mode='column', input_roi_box=None):
if type(joints) == list:
joints = np.array(joints)
assert stack_mode == 'column' or stack_mode == 'depth'
part_orders = [(2, 5), (5, 11), (2, 8), (8, 11), (5, 6), (6, 7), (2, 3),
(3, 4), (11, 12), (12, 13), (8, 9), (9, 10)]
def link(img, a, b, color, line_width, scale=1.0, x_offset=0, y_offset=0):
jointa = joints[a]
jointb = joints[b]
temp1 = int((jointa[0] - x_offset) * scale)
temp2 = int((jointa[1] - y_offset) * scale)
temp3 = int((jointb[0] - x_offset) * scale)
temp4 = int((jointb[1] - y_offset) * scale)
cv2.line(img, (temp1, temp2), (temp3, temp4), color, line_width)
roi_box = input_roi_box
roi_box_width = roi_box[3] - roi_box[2]
roi_box_height = roi_box[1] - roi_box[0]
short_side_length = min(roi_box_width, roi_box_height)
line_width = short_side_length // 30
line_width = max(line_width, 2)
map_cube = np.zeros(
shape=(roi_box_height, roi_box_width, len(part_orders) + 1),
dtype=np.float32)
use_line_width = min(5, line_width)
    fx = use_line_width * 1.0 / line_width  # fx is capped at 1.0
if fx < 0.99:
map_cube = cv2.resize(map_cube, (0, 0), fx=fx, fy=fx)
for c, pair in enumerate(part_orders):
tmp = map_cube[..., c].copy()
link(
tmp,
pair[0],
pair[1], (2.0, 2.0, 2.0),
use_line_width,
scale=fx,
x_offset=roi_box[2],
y_offset=roi_box[0])
map_cube[..., c] = tmp
tmp = map_cube[..., -1].copy()
link(
tmp,
pair[0],
pair[1], (2.0, 2.0, 2.0),
use_line_width,
scale=fx,
x_offset=roi_box[2],
y_offset=roi_box[0])
map_cube[..., -1] = tmp
map_cube = cv2.resize(map_cube, (roi_box_width, roi_box_height))
if stack_mode == 'depth':
return map_cube, roi_box
elif stack_mode == 'column':
joint_maps = []
for c in range(len(part_orders) + 1):
joint_maps.append(map_cube[..., c])
joint_map = np.column_stack(joint_maps)
return joint_map, roi_box
def plot_one_box(x, img, color=None, label=None, line_thickness=None):
tl = line_thickness or round(
0.002 * (img.shape[0] + img.shape[1]) / 2) + 1 # line/font thickness
color = color or [random.randint(0, 255) for _ in range(3)]
c1, c2 = (int(x[0]), int(x[1])), (int(x[2]), int(x[3]))
cv2.rectangle(img, c1, c2, color, thickness=tl, lineType=cv2.LINE_AA)
if label:
tf = max(tl - 1, 1) # font thickness
t_size = cv2.getTextSize(label, 0, fontScale=tl / 3, thickness=tf)[0]
c2 = c1[0] + t_size[0], c1[1] - t_size[1] - 3
cv2.rectangle(img, c1, c2, color, -1, cv2.LINE_AA) # filled
cv2.putText(
img,
label, (c1[0], c1[1] - 2),
0,
tl / 3, [225, 255, 255],
thickness=tf,
lineType=cv2.LINE_AA)
def draw_line(im, points, color, stroke_size=2, closed=False):
points = points.astype(np.int32)
for i in range(len(points) - 1):
cv2.line(im, tuple(points[i]), tuple(points[i + 1]), color,
stroke_size)
if closed:
cv2.line(im, tuple(points[0]), tuple(points[-1]), color, stroke_size)
def enlarged_bbox(bbox, img_width, img_height, enlarge_ratio=0.2):
left = bbox[0]
top = bbox[1]
right = bbox[2]
bottom = bbox[3]
roi_width = right - left
roi_height = bottom - top
new_left = left - int(roi_width * enlarge_ratio)
new_left = 0 if new_left < 0 else new_left
new_top = top - int(roi_height * enlarge_ratio)
new_top = 0 if new_top < 0 else new_top
new_right = right + int(roi_width * enlarge_ratio)
new_right = img_width if new_right > img_width else new_right
new_bottom = bottom + int(roi_height * enlarge_ratio)
new_bottom = img_height if new_bottom > img_height else new_bottom
bbox = [new_left, new_top, new_right, new_bottom]
bbox = [int(x) for x in bbox]
return bbox
def get_map_fusion_map_cuda(map_list, threshold=1, device=torch.device('cpu')):
map_list_cuda = [torch.from_numpy(x).to(device) for x in map_list]
map_concat = torch.stack(tuple(map_list_cuda), dim=-1)
map_concat = torch.abs(map_concat)
map_concat[map_concat < threshold] = 0
map_concat[map_concat > 1e-5] = 1.0
sum_map = torch.sum(map_concat, dim=2)
a = torch.ones_like(sum_map)
acc_map = torch.where(sum_map > 0, a * 2.0, torch.zeros_like(sum_map))
fusion_map = torch.where(sum_map < 0.5, a * 1.5, sum_map)
fusion_map = fusion_map.float()
acc_map = acc_map.float()
fusion_map = fusion_map.cpu().numpy().astype(np.float32)
acc_map = acc_map.cpu().numpy().astype(np.float32)
return fusion_map, acc_map
def gen_border_shade(height, width, height_band, width_band):
height_ratio = height_band * 1.0 / height
width_ratio = width_band * 1.0 / width
_height_band = int(256 * height_ratio)
_width_band = int(256 * width_ratio)
canvas = np.zeros((256, 256), dtype=np.float32)
canvas[_height_band // 2:-_height_band // 2,
_width_band // 2:-_width_band // 2] = 1.0
canvas = cv2.blur(canvas, (_height_band, _width_band))
canvas = cv2.resize(canvas, (width, height))
return canvas
def get_mask_bbox(mask, threshold=127):
ret, mask = cv2.threshold(mask, threshold, 1, 0)
if cv2.countNonZero(mask) == 0:
return [None, None, None, None]
col_acc = np.sum(mask, 0)
row_acc = np.sum(mask, 1)
col_acc = col_acc.tolist()
row_acc = row_acc.tolist()
for x in range(len(col_acc)):
if col_acc[x] > 0:
left = x
break
for x in range(1, len(col_acc)):
if col_acc[-x] > 0:
right = len(col_acc) - x
break
for x in range(len(row_acc)):
if row_acc[x] > 0:
top = x
break
for x in range(1, len(row_acc)):
if row_acc[-x] > 0:
            bottom = len(row_acc) - x
break
return [top, bottom, left, right]
def visualize_flow(flow):
h, w = flow.shape[:2]
hsv = np.zeros((h, w, 3), np.uint8)
mag, ang = cv2.cartToPolar(flow[..., 0], flow[..., 1])
hsv[..., 0] = ang * 180 / np.pi / 2
hsv[..., 1] = cv2.normalize(mag, None, 0, 255, cv2.NORM_MINMAX)
hsv[..., 2] = 255
bgr = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)
bgr = bgr * 1.0 / 255
return bgr.astype(np.float32)
def vis_joints(image, joints, color, show_text=True, confidence_threshold=0.1):
part_orders = [(2, 5), (5, 11), (2, 8), (8, 11), (5, 6), (6, 7), (2, 3),
(3, 4), (11, 12), (12, 13), (8, 9), (9, 10)]
abandon_idxs = [0, 1, 14, 15, 16, 17]
# draw joints
for i, joint in enumerate(joints):
if i in abandon_idxs:
continue
if joint[-1] > confidence_threshold:
cv2.circle(image, (int(joint[0]), int(joint[1])), 1, color, 2)
if show_text:
cv2.putText(image,
str(i) + '[{:.2f}]'.format(joint[-1]),
(int(joint[0]), int(joint[1])),
cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
# draw link
for pair in part_orders:
if joints[pair[0]][-1] > confidence_threshold and joints[
pair[1]][-1] > confidence_threshold:
cv2.line(image, (int(joints[pair[0]][0]), int(joints[pair[0]][1])),
(int(joints[pair[1]][0]), int(joints[pair[1]][1])), color,
2)
return image
def get_heatmap_cv(img, magn, max_flow_mag):
min_flow_mag = .5
cv_magn = np.clip(
255 * (magn - min_flow_mag) / (max_flow_mag - min_flow_mag + 1e-7),
a_min=0,
a_max=255).astype(np.uint8)
if img.dtype != np.uint8:
img = (255 * img).astype(np.uint8)
heatmap_img = cv2.applyColorMap(cv_magn, cv2.COLORMAP_JET)
heatmap_img = heatmap_img[..., ::-1]
h, w = magn.shape
img_alpha = np.ones((h, w), dtype=np.double)[:, :, None]
heatmap_alpha = np.clip(
magn / (max_flow_mag + 1e-7), a_min=1e-7, a_max=1)[:, :, None]**.7
    heatmap_alpha[heatmap_alpha < .2] **= .5  # boost very faint alpha values
pm_hm = heatmap_img * heatmap_alpha
pm_img = img * img_alpha
cv_out = pm_hm + pm_img * (1 - heatmap_alpha)
cv_out = np.clip(cv_out, a_min=0, a_max=255).astype(np.uint8)
return cv_out
def save_heatmap_cv(img, flow, supression=2):
flow_magn = np.sqrt(flow[:, :, 0]**2 + flow[:, :, 1]**2)
flow_magn -= supression
flow_magn[flow_magn <= 0] = 0
cv_out = get_heatmap_cv(img, flow_magn, np.max(flow_magn) * 1.3)
return cv_out
@numba.jit(nopython=True, parallel=False)
def bilinear_interp(x, y, v11, v12, v21, v22):
temp1 = (v11 * (1 - y) + v12 * y) * (1 - x)
temp2 = (v21 * (1 - y) + v22 * y) * x
result = temp1 + temp2
return result
@numba.jit(nopython=True, parallel=False)
def image_warp_grid1(rDx, rDy, oriImg, transRatio, width_expand,
height_expand):
srcW = oriImg.shape[1]
srcH = oriImg.shape[0]
newImg = oriImg.copy()
for i in range(srcH):
for j in range(srcW):
_i = i
_j = j
deltaX = rDx[_i, _j]
deltaY = rDy[_i, _j]
nx = _j + deltaX * transRatio
ny = _i + deltaY * transRatio
if nx >= srcW - width_expand - 1:
if nx > srcW - 1:
nx = srcW - 1
if ny >= srcH - height_expand - 1:
if ny > srcH - 1:
ny = srcH - 1
if nx < width_expand:
if nx < 0:
nx = 0
if ny < height_expand:
if ny < 0:
ny = 0
nxi = int(math.floor(nx))
nyi = int(math.floor(ny))
nxi1 = int(math.ceil(nx))
nyi1 = int(math.ceil(ny))
for ll in range(3):
newImg[_i, _j,
ll] = bilinear_interp(ny - nyi, nx - nxi,
oriImg[nyi, nxi,
ll], oriImg[nyi, nxi1, ll],
oriImg[nyi1, nxi,
ll], oriImg[nyi1, nxi1,
ll])
return newImg
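# Runnable sanity checks (editor's sketch) for two helpers above. Boxes in
# this module follow the [top, bottom, left, right] convention, and
# image_warp_grid1 with an all-zero displacement field leaves the image
# unchanged (a float32 image is used for an exact comparison).
if __name__ == '__main__':
    mask = np.zeros((200, 300), dtype=np.uint8)
    print(enlarge_box_tblr([50, 100, 80, 160], mask, ratio=0.25))
    # -> [30, 120, 60, 180]: grown by 25% of the long side, clamped to the mask
    img = np.random.rand(64, 64, 3).astype(np.float32)
    zero = np.zeros((64, 64), dtype=np.float32)
    warped = image_warp_grid1(zero, zero, img, 1.0, 0, 0)
    assert np.array_equal(warped, img)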

View File

@@ -1,3 +1,6 @@
# The implementation is adopted from Jingwen He,
# made publicly available at https://github.com/hejingwenhejingwen/CSRNet
import functools
import math

View File

@@ -1,3 +1,4 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os.path as osp
from copy import deepcopy
from typing import Dict, Union

View File

@@ -1,3 +1,8 @@
# ------------------------------------------------------------------------
# Modified from https://github.com/megvii-research/NAFNet/blob/main/basicsr/models/archs/NAFNet_arch.py
# Copyright (c) 2022 megvii-model. All Rights Reserved.
# ------------------------------------------------------------------------
import numpy as np
import torch
import torch.nn as nn

View File

@@ -1,3 +1,8 @@
# ------------------------------------------------------------------------
# Modified from BasicSR (https://github.com/xinntao/BasicSR)
# Copyright 2018-2020 BasicSR Authors
# ------------------------------------------------------------------------
import torch
import torch.nn as nn

View File

@@ -1,8 +1,8 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from copy import deepcopy
from typing import Any, Dict, Union
import numpy as np
import torch.cuda
from torch.nn.parallel import DataParallel, DistributedDataParallel
@@ -77,13 +77,8 @@ class NAFNetForImageDenoise(TorchModel):
def _evaluate_postprocess(self, input: Tensor,
target: Tensor) -> Dict[str, list]:
preds = self.model(input)
preds = list(torch.split(preds, 1, 0))
targets = list(torch.split(target, 1, 0))
preds = [(pred.data * 255.).squeeze(0).permute(
1, 2, 0).cpu().numpy().astype(np.uint8) for pred in preds]
targets = [(target.data * 255.).squeeze(0).permute(
1, 2, 0).cpu().numpy().astype(np.uint8) for target in targets]
preds = list(torch.split(preds.clamp(0, 1), 1, 0))
targets = list(torch.split(target.clamp(0, 1), 1, 0))
return {'pred': preds, 'target': targets}

View File

@@ -4,11 +4,11 @@ from typing import TYPE_CHECKING
from modelscope.utils.import_utils import LazyImportModule
if TYPE_CHECKING:
from .image_denoise_dataset import PairedImageDataset
from .model import FFTInpainting
else:
_import_structure = {
'image_denoise_dataset': ['PairedImageDataset'],
'model': ['FFTInpainting'],
}
import sys

View File

@@ -0,0 +1,75 @@
"""
Part of the implementation is borrowed and modified from LaMa, publicly available at
https://github.com/saic-mdal/lama
"""
from typing import Dict, Tuple
import torch
import torch.nn as nn
import torch.nn.functional as F
from modelscope.utils.logger import get_logger
from .modules.adversarial import NonSaturatingWithR1
from .modules.ffc import FFCResNetGenerator
from .modules.perceptual import ResNetPL
from .modules.pix2pixhd import NLayerDiscriminator
LOGGER = get_logger()
class BaseInpaintingTrainingModule(nn.Module):
def __init__(self,
model_dir='',
use_ddp=True,
predict_only=False,
visualize_each_iters=100,
average_generator=False,
generator_avg_beta=0.999,
average_generator_start_step=30000,
average_generator_period=10,
store_discr_outputs_for_vis=False,
**kwargs):
super().__init__()
LOGGER.info(
f'BaseInpaintingTrainingModule init called, predict_only is {predict_only}'
)
self.generator = FFCResNetGenerator()
self.use_ddp = use_ddp
if not predict_only:
self.discriminator = NLayerDiscriminator()
self.adversarial_loss = NonSaturatingWithR1(
weight=10,
gp_coef=0.001,
mask_as_fake_target=True,
allow_scale_mask=True)
self.average_generator = average_generator
self.generator_avg_beta = generator_avg_beta
self.average_generator_start_step = average_generator_start_step
self.average_generator_period = average_generator_period
self.generator_average = None
self.last_generator_averaging_step = -1
self.store_discr_outputs_for_vis = store_discr_outputs_for_vis
self.loss_l1 = nn.L1Loss(reduction='none')
self.loss_resnet_pl = ResNetPL(weight=30, weights_path=model_dir)
self.visualize_each_iters = visualize_each_iters
LOGGER.info('BaseInpaintingTrainingModule init done')
def forward(self, batch: Dict[str,
torch.Tensor]) -> Dict[str, torch.Tensor]:
"""Pass data through generator and obtain at leas 'predicted_image' and 'inpainted' keys"""
raise NotImplementedError()
def generator_loss(self,
batch) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
raise NotImplementedError()
def discriminator_loss(
self, batch) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
raise NotImplementedError()

View File

@@ -0,0 +1,210 @@
"""
Part of the implementation is borrowed and modified from LaMa, publicly available at
https://github.com/saic-mdal/lama
"""
import bisect
import torch
import torch.nn.functional as F
from modelscope.utils.logger import get_logger
from .base import BaseInpaintingTrainingModule
from .modules.feature_matching import feature_matching_loss, masked_l1_loss
LOGGER = get_logger()
def set_requires_grad(module, value):
for param in module.parameters():
param.requires_grad = value
def add_prefix_to_keys(dct, prefix):
return {prefix + k: v for k, v in dct.items()}
class LinearRamp:
def __init__(self, start_value=0, end_value=1, start_iter=-1, end_iter=0):
self.start_value = start_value
self.end_value = end_value
self.start_iter = start_iter
self.end_iter = end_iter
def __call__(self, i):
if i < self.start_iter:
return self.start_value
if i >= self.end_iter:
return self.end_value
part = (i - self.start_iter) / (self.end_iter - self.start_iter)
return self.start_value * (1 - part) + self.end_value * part
class LadderRamp:
def __init__(self, start_iters, values):
self.start_iters = start_iters
self.values = values
assert len(values) == len(start_iters) + 1, (len(values),
len(start_iters))
def __call__(self, i):
segment_i = bisect.bisect_right(self.start_iters, i)
return self.values[segment_i]
def get_ramp(kind='ladder', **kwargs):
if kind == 'linear':
return LinearRamp(**kwargs)
if kind == 'ladder':
return LadderRamp(**kwargs)
raise ValueError(f'Unexpected ramp kind: {kind}')
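# A short sketch (editor's note) of the ramp schedules above: LinearRamp
# interpolates between two values over an iteration range, while LadderRamp
# is a step function driven by the `start_iters` breakpoints.
#
#   >>> ramp = get_ramp(kind='linear', start_value=0, end_value=1,
#   ...                 start_iter=100, end_iter=200)
#   >>> ramp(50), ramp(150), ramp(300)
#   (0, 0.5, 1)
#   >>> get_ramp(kind='ladder', start_iters=[1000], values=[0.1, 1.0])(500)
#   0.1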
class DefaultInpaintingTrainingModule(BaseInpaintingTrainingModule):
def __init__(self,
model_dir='',
predict_only=False,
concat_mask=True,
rescale_scheduler_kwargs=None,
image_to_discriminator='predicted_image',
add_noise_kwargs=None,
noise_fill_hole=False,
const_area_crop_kwargs=None,
distance_weighter_kwargs=None,
distance_weighted_mask_for_discr=False,
fake_fakes_proba=0,
fake_fakes_generator_kwargs=None,
**kwargs):
super().__init__(model_dir=model_dir, predict_only=predict_only)
self.concat_mask = concat_mask
self.rescale_size_getter = get_ramp(
**rescale_scheduler_kwargs
) if rescale_scheduler_kwargs is not None else None
self.image_to_discriminator = image_to_discriminator
self.add_noise_kwargs = add_noise_kwargs
self.noise_fill_hole = noise_fill_hole
self.const_area_crop_kwargs = const_area_crop_kwargs
self.refine_mask_for_losses = None
self.distance_weighted_mask_for_discr = distance_weighted_mask_for_discr
self.feature_matching_weight = 100
self.losses_l1_weight_known = 10
self.losses_l1_weight_missing = 0
self.fake_fakes_proba = fake_fakes_proba
def forward(self, batch):
img = batch['image']
mask = batch['mask']
masked_img = img * (1 - mask)
if self.concat_mask:
masked_img = torch.cat([masked_img, mask], dim=1)
batch['predicted_image'] = self.generator(masked_img)
batch['inpainted'] = mask * batch['predicted_image'] + (
1 - mask) * batch['image']
batch['mask_for_losses'] = mask
return batch
def generator_loss(self, batch):
img = batch['image']
predicted_img = batch[self.image_to_discriminator]
original_mask = batch['mask']
supervised_mask = batch['mask_for_losses']
# L1
l1_value = masked_l1_loss(predicted_img, img, supervised_mask,
self.losses_l1_weight_known,
self.losses_l1_weight_missing)
total_loss = l1_value
metrics = dict(gen_l1=l1_value)
# discriminator
# adversarial_loss calls backward by itself
mask_for_discr = supervised_mask if self.distance_weighted_mask_for_discr else original_mask
self.adversarial_loss.pre_generator_step(
real_batch=img,
fake_batch=predicted_img,
generator=self.generator,
discriminator=self.discriminator)
discr_real_pred, discr_real_features = self.discriminator(img)
discr_fake_pred, discr_fake_features = self.discriminator(
predicted_img)
adv_gen_loss, adv_metrics = self.adversarial_loss.generator_loss(
real_batch=img,
fake_batch=predicted_img,
discr_real_pred=discr_real_pred,
discr_fake_pred=discr_fake_pred,
mask=mask_for_discr)
total_loss = total_loss + adv_gen_loss
metrics['gen_adv'] = adv_gen_loss
metrics.update(add_prefix_to_keys(adv_metrics, 'adv_'))
# feature matching
if self.feature_matching_weight > 0:
need_mask_in_fm = False
mask_for_fm = supervised_mask if need_mask_in_fm else None
fm_value = feature_matching_loss(
discr_fake_features, discr_real_features,
mask=mask_for_fm) * self.feature_matching_weight
total_loss = total_loss + fm_value
metrics['gen_fm'] = fm_value
if self.loss_resnet_pl is not None:
resnet_pl_value = self.loss_resnet_pl(predicted_img, img)
total_loss = total_loss + resnet_pl_value
metrics['gen_resnet_pl'] = resnet_pl_value
return total_loss, metrics
def discriminator_loss(self, batch):
total_loss = 0
metrics = {}
predicted_img = batch[self.image_to_discriminator].detach()
self.adversarial_loss.pre_discriminator_step(
real_batch=batch['image'],
fake_batch=predicted_img,
generator=self.generator,
discriminator=self.discriminator)
discr_real_pred, discr_real_features = self.discriminator(
batch['image'])
discr_fake_pred, discr_fake_features = self.discriminator(
predicted_img)
adv_discr_loss, adv_metrics = self.adversarial_loss.discriminator_loss(
real_batch=batch['image'],
fake_batch=predicted_img,
discr_real_pred=discr_real_pred,
discr_fake_pred=discr_fake_pred,
mask=batch['mask'])
total_loss = (total_loss + adv_discr_loss) * 0.1
metrics['discr_adv'] = adv_discr_loss
metrics.update(add_prefix_to_keys(adv_metrics, 'adv_'))
return total_loss, metrics
def _do_step(self, batch, optimizer_idx=None):
if optimizer_idx == 0: # step for generator
set_requires_grad(self.generator, True)
set_requires_grad(self.discriminator, False)
elif optimizer_idx == 1: # step for discriminator
set_requires_grad(self.generator, False)
set_requires_grad(self.discriminator, True)
batch = self(batch)
total_loss = 0
if optimizer_idx is None or optimizer_idx == 0: # step for generator
total_loss, metrics = self.generator_loss(batch)
elif optimizer_idx is None or optimizer_idx == 1: # step for discriminator
total_loss, metrics = self.discriminator_loss(batch)
result = dict(loss=total_loss)
return result
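# Usage sketch (editor's note, all inputs are placeholders): during training
# the module consumes a batch dict with an 'image' tensor and a binary 'mask'
# (1 marks the hole region), fills in 'predicted_image' / 'inpainted' in
# forward(), and the generator / discriminator losses are computed on
# alternate optimizer steps.
#
#   module = DefaultInpaintingTrainingModule(model_dir='<model_dir>')
#   batch = {'image': images, 'mask': masks}      # both NCHW tensors
#   batch = module(batch)
#   gen_loss, gen_metrics = module.generator_loss(batch)
#   disc_loss, disc_metrics = module.discriminator_loss(batch)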

View File

@@ -0,0 +1,36 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import os
from typing import Any, Dict, Optional, Union
import torch
from modelscope.metainfo import Models
from modelscope.models.base.base_torch_model import TorchModel
from modelscope.models.builder import MODELS
from modelscope.utils.constant import ModelFile, Tasks
from modelscope.utils.logger import get_logger
LOGGER = get_logger()
@MODELS.register_module(
Tasks.image_inpainting, module_name=Models.image_inpainting)
class FFTInpainting(TorchModel):
def __init__(self, model_dir: str, **kwargs):
super().__init__(model_dir, **kwargs)
from .default import DefaultInpaintingTrainingModule
pretrained = kwargs.get('pretrained', True)
predict_only = kwargs.get('predict_only', False)
net = DefaultInpaintingTrainingModule(
model_dir=model_dir, predict_only=predict_only)
if pretrained:
path = os.path.join(model_dir, ModelFile.TORCH_MODEL_FILE)
LOGGER.info(f'loading pretrained model from {path}')
state = torch.load(path, map_location='cpu')
net.load_state_dict(state, strict=False)
self.model = net
def forward(self, inputs):
return self.model(inputs)
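# Construction sketch (editor's note): the model is registered for the image
# inpainting task and is normally built from a downloaded model directory;
# `<model_dir>` below is a placeholder for a directory containing the torch
# checkpoint.
#
#   model = FFTInpainting('<model_dir>', pretrained=True, predict_only=True)
#   batch = model({'image': image_tensor, 'mask': mask_tensor})
#   result = batch['inpainted']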

View File

@@ -0,0 +1,2 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
from .base import ModelBuilder

View File

@@ -0,0 +1,380 @@
"""
Part of the implementation is borrowed and modified from LaMa, publicly available at
https://github.com/saic-mdal/lama
"""
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.modules import BatchNorm2d
from . import resnet
NUM_CLASS = 150
# Model Builder
class ModelBuilder:
# custom weights initialization
@staticmethod
def weights_init(m):
classname = m.__class__.__name__
if classname.find('Conv') != -1:
nn.init.kaiming_normal_(m.weight.data)
elif classname.find('BatchNorm') != -1:
m.weight.data.fill_(1.)
m.bias.data.fill_(1e-4)
@staticmethod
def build_encoder(arch='resnet50dilated',
fc_dim=512,
weights='',
model_dir=''):
pretrained = True if len(weights) == 0 else False
arch = arch.lower()
if arch == 'resnet50dilated':
orig_resnet = resnet.__dict__['resnet50'](
pretrained=pretrained, model_dir=model_dir)
net_encoder = ResnetDilated(orig_resnet, dilate_scale=8)
elif arch == 'resnet50':
orig_resnet = resnet.__dict__['resnet50'](
pretrained=pretrained, model_dir=model_dir)
net_encoder = Resnet(orig_resnet)
else:
raise Exception('Architecture undefined!')
# encoders are usually pretrained
# net_encoder.apply(ModelBuilder.weights_init)
if len(weights) > 0:
print('Loading weights for net_encoder')
net_encoder.load_state_dict(
torch.load(weights, map_location=lambda storage, loc: storage),
strict=False)
return net_encoder
@staticmethod
def build_decoder(arch='ppm_deepsup',
fc_dim=512,
num_class=NUM_CLASS,
weights='',
use_softmax=False,
drop_last_conv=False):
arch = arch.lower()
if arch == 'ppm_deepsup':
net_decoder = PPMDeepsup(
num_class=num_class,
fc_dim=fc_dim,
use_softmax=use_softmax,
drop_last_conv=drop_last_conv)
elif arch == 'c1_deepsup':
net_decoder = C1DeepSup(
num_class=num_class,
fc_dim=fc_dim,
use_softmax=use_softmax,
drop_last_conv=drop_last_conv)
else:
raise Exception('Architecture undefined!')
net_decoder.apply(ModelBuilder.weights_init)
if len(weights) > 0:
print('Loading weights for net_decoder')
net_decoder.load_state_dict(
torch.load(weights, map_location=lambda storage, loc: storage),
strict=False)
return net_decoder
@staticmethod
def get_decoder(weights_path, arch_encoder, arch_decoder, fc_dim,
                    drop_last_conv, *args, **kwargs):
path = os.path.join(
weights_path, 'ade20k',
f'ade20k-{arch_encoder}-{arch_decoder}/decoder_epoch_20.pth')
return ModelBuilder.build_decoder(
arch=arch_decoder,
fc_dim=fc_dim,
weights=path,
use_softmax=True,
drop_last_conv=drop_last_conv)
@staticmethod
def get_encoder(weights_path, arch_encoder, arch_decoder, fc_dim,
                    segmentation, *args, **kwargs):
if segmentation:
path = os.path.join(
weights_path, 'ade20k',
f'ade20k-{arch_encoder}-{arch_decoder}/encoder_epoch_20.pth')
else:
path = ''
return ModelBuilder.build_encoder(
arch=arch_encoder,
fc_dim=fc_dim,
weights=path,
model_dir=weights_path)
def conv3x3_bn_relu(in_planes, out_planes, stride=1):
return nn.Sequential(
nn.Conv2d(
in_planes,
out_planes,
kernel_size=3,
stride=stride,
padding=1,
bias=False),
BatchNorm2d(out_planes),
nn.ReLU(inplace=True),
)
# pyramid pooling, deep supervision
class PPMDeepsup(nn.Module):
def __init__(self,
num_class=NUM_CLASS,
fc_dim=4096,
use_softmax=False,
pool_scales=(1, 2, 3, 6),
drop_last_conv=False):
super().__init__()
self.use_softmax = use_softmax
self.drop_last_conv = drop_last_conv
self.ppm = []
for scale in pool_scales:
self.ppm.append(
nn.Sequential(
nn.AdaptiveAvgPool2d(scale),
nn.Conv2d(fc_dim, 512, kernel_size=1, bias=False),
BatchNorm2d(512), nn.ReLU(inplace=True)))
self.ppm = nn.ModuleList(self.ppm)
self.cbr_deepsup = conv3x3_bn_relu(fc_dim // 2, fc_dim // 4, 1)
self.conv_last = nn.Sequential(
nn.Conv2d(
fc_dim + len(pool_scales) * 512,
512,
kernel_size=3,
padding=1,
bias=False), BatchNorm2d(512), nn.ReLU(inplace=True),
nn.Dropout2d(0.1), nn.Conv2d(512, num_class, kernel_size=1))
self.conv_last_deepsup = nn.Conv2d(fc_dim // 4, num_class, 1, 1, 0)
self.dropout_deepsup = nn.Dropout2d(0.1)
def forward(self, conv_out, segSize=None):
conv5 = conv_out[-1]
input_size = conv5.size()
ppm_out = [conv5]
for pool_scale in self.ppm:
ppm_out.append(
nn.functional.interpolate(
pool_scale(conv5), (input_size[2], input_size[3]),
mode='bilinear',
align_corners=False))
ppm_out = torch.cat(ppm_out, 1)
if self.drop_last_conv:
return ppm_out
else:
x = self.conv_last(ppm_out)
if self.use_softmax: # is True during inference
x = nn.functional.interpolate(
x, size=segSize, mode='bilinear', align_corners=False)
x = nn.functional.softmax(x, dim=1)
return x
# deep sup
conv4 = conv_out[-2]
_ = self.cbr_deepsup(conv4)
_ = self.dropout_deepsup(_)
_ = self.conv_last_deepsup(_)
x = nn.functional.log_softmax(x, dim=1)
_ = nn.functional.log_softmax(_, dim=1)
return (x, _)
class Resnet(nn.Module):
def __init__(self, orig_resnet):
super(Resnet, self).__init__()
# take pretrained resnet, except AvgPool and FC
self.conv1 = orig_resnet.conv1
self.bn1 = orig_resnet.bn1
self.relu1 = orig_resnet.relu1
self.conv2 = orig_resnet.conv2
self.bn2 = orig_resnet.bn2
self.relu2 = orig_resnet.relu2
self.conv3 = orig_resnet.conv3
self.bn3 = orig_resnet.bn3
self.relu3 = orig_resnet.relu3
self.maxpool = orig_resnet.maxpool
self.layer1 = orig_resnet.layer1
self.layer2 = orig_resnet.layer2
self.layer3 = orig_resnet.layer3
self.layer4 = orig_resnet.layer4
def forward(self, x, return_feature_maps=False):
conv_out = []
x = self.relu1(self.bn1(self.conv1(x)))
x = self.relu2(self.bn2(self.conv2(x)))
x = self.relu3(self.bn3(self.conv3(x)))
x = self.maxpool(x)
x = self.layer1(x)
conv_out.append(x)
x = self.layer2(x)
conv_out.append(x)
x = self.layer3(x)
conv_out.append(x)
x = self.layer4(x)
conv_out.append(x)
if return_feature_maps:
return conv_out
return [x]
# Resnet Dilated
class ResnetDilated(nn.Module):
def __init__(self, orig_resnet, dilate_scale=8):
super().__init__()
from functools import partial
if dilate_scale == 8:
orig_resnet.layer3.apply(partial(self._nostride_dilate, dilate=2))
orig_resnet.layer4.apply(partial(self._nostride_dilate, dilate=4))
elif dilate_scale == 16:
orig_resnet.layer4.apply(partial(self._nostride_dilate, dilate=2))
# take pretrained resnet, except AvgPool and FC
self.conv1 = orig_resnet.conv1
self.bn1 = orig_resnet.bn1
self.relu1 = orig_resnet.relu1
self.conv2 = orig_resnet.conv2
self.bn2 = orig_resnet.bn2
self.relu2 = orig_resnet.relu2
self.conv3 = orig_resnet.conv3
self.bn3 = orig_resnet.bn3
self.relu3 = orig_resnet.relu3
self.maxpool = orig_resnet.maxpool
self.layer1 = orig_resnet.layer1
self.layer2 = orig_resnet.layer2
self.layer3 = orig_resnet.layer3
self.layer4 = orig_resnet.layer4
def _nostride_dilate(self, m, dilate):
classname = m.__class__.__name__
if classname.find('Conv') != -1:
# the convolution with stride
if m.stride == (2, 2):
m.stride = (1, 1)
if m.kernel_size == (3, 3):
m.dilation = (dilate // 2, dilate // 2)
m.padding = (dilate // 2, dilate // 2)
            # other convolutions
else:
if m.kernel_size == (3, 3):
m.dilation = (dilate, dilate)
m.padding = (dilate, dilate)
def forward(self, x, return_feature_maps=False):
conv_out = []
x = self.relu1(self.bn1(self.conv1(x)))
x = self.relu2(self.bn2(self.conv2(x)))
x = self.relu3(self.bn3(self.conv3(x)))
x = self.maxpool(x)
x = self.layer1(x)
conv_out.append(x)
x = self.layer2(x)
conv_out.append(x)
x = self.layer3(x)
conv_out.append(x)
x = self.layer4(x)
conv_out.append(x)
if return_feature_maps:
return conv_out
return [x]
# last conv, deep supervision
class C1DeepSup(nn.Module):
def __init__(self,
num_class=150,
fc_dim=2048,
use_softmax=False,
drop_last_conv=False):
super(C1DeepSup, self).__init__()
self.use_softmax = use_softmax
self.drop_last_conv = drop_last_conv
self.cbr = conv3x3_bn_relu(fc_dim, fc_dim // 4, 1)
self.cbr_deepsup = conv3x3_bn_relu(fc_dim // 2, fc_dim // 4, 1)
# last conv
self.conv_last = nn.Conv2d(fc_dim // 4, num_class, 1, 1, 0)
self.conv_last_deepsup = nn.Conv2d(fc_dim // 4, num_class, 1, 1, 0)
def forward(self, conv_out, segSize=None):
conv5 = conv_out[-1]
x = self.cbr(conv5)
if self.drop_last_conv:
return x
else:
x = self.conv_last(x)
if self.use_softmax: # is True during inference
x = nn.functional.interpolate(
x, size=segSize, mode='bilinear', align_corners=False)
x = nn.functional.softmax(x, dim=1)
return x
# deep sup
conv4 = conv_out[-2]
_ = self.cbr_deepsup(conv4)
_ = self.conv_last_deepsup(_)
x = nn.functional.log_softmax(x, dim=1)
_ = nn.functional.log_softmax(_, dim=1)
return (x, _)
# last conv
class C1(nn.Module):
def __init__(self, num_class=150, fc_dim=2048, use_softmax=False):
super(C1, self).__init__()
self.use_softmax = use_softmax
self.cbr = conv3x3_bn_relu(fc_dim, fc_dim // 4, 1)
# last conv
self.conv_last = nn.Conv2d(fc_dim // 4, num_class, 1, 1, 0)
def forward(self, conv_out, segSize=None):
conv5 = conv_out[-1]
x = self.cbr(conv5)
x = self.conv_last(x)
if self.use_softmax: # is True during inference
x = nn.functional.interpolate(
x, size=segSize, mode='bilinear', align_corners=False)
x = nn.functional.softmax(x, dim=1)
else:
x = nn.functional.log_softmax(x, dim=1)
return x
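# A small sketch (editor's note) of how the builder above is typically used,
# e.g. by the ResNet-based perceptual loss in this repo: an ADE20K-style
# dilated ResNet-50 encoder; `<weights_dir>` is a placeholder for the
# directory holding the pretrained weights.
#
#   encoder = ModelBuilder.get_encoder(
#       weights_path='<weights_dir>', arch_encoder='resnet50dilated',
#       arch_decoder='ppm_deepsup', fc_dim=2048, segmentation=False)
#   feats = encoder(image_tensor, return_feature_maps=True)  # list of 4 maps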

View File

@@ -0,0 +1,183 @@
"""
Part of the implementation is borrowed and modified from LaMa, publicly available at
https://github.com/saic-mdal/lama
"""
import math
import os
import torch
import torch.nn as nn
from torch.nn import BatchNorm2d
__all__ = ['ResNet', 'resnet50']
def conv3x3(in_planes, out_planes, stride=1):
'3x3 convolution with padding'
return nn.Conv2d(
in_planes,
out_planes,
kernel_size=3,
stride=stride,
padding=1,
bias=False)
class BasicBlock(nn.Module):
expansion = 1
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(BasicBlock, self).__init__()
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn1 = BatchNorm2d(planes)
self.relu = nn.ReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.bn2 = BatchNorm2d(planes)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class Bottleneck(nn.Module):
expansion = 4
def __init__(self, inplanes, planes, stride=1, downsample=None):
super(Bottleneck, self).__init__()
self.conv1 = nn.Conv2d(inplanes, planes, kernel_size=1, bias=False)
self.bn1 = BatchNorm2d(planes)
self.conv2 = nn.Conv2d(
planes,
planes,
kernel_size=3,
stride=stride,
padding=1,
bias=False)
self.bn2 = BatchNorm2d(planes)
self.conv3 = nn.Conv2d(planes, planes * 4, kernel_size=1, bias=False)
self.bn3 = BatchNorm2d(planes * 4)
self.relu = nn.ReLU(inplace=True)
self.downsample = downsample
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn1(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn2(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn3(out)
if self.downsample is not None:
residual = self.downsample(x)
out += residual
out = self.relu(out)
return out
class ResNet(nn.Module):
def __init__(self, block, layers, num_classes=1000):
self.inplanes = 128
super(ResNet, self).__init__()
self.conv1 = conv3x3(3, 64, stride=2)
self.bn1 = BatchNorm2d(64)
self.relu1 = nn.ReLU(inplace=True)
self.conv2 = conv3x3(64, 64)
self.bn2 = BatchNorm2d(64)
self.relu2 = nn.ReLU(inplace=True)
self.conv3 = conv3x3(64, 128)
self.bn3 = BatchNorm2d(128)
self.relu3 = nn.ReLU(inplace=True)
self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
self.layer1 = self._make_layer(block, 64, layers[0])
self.layer2 = self._make_layer(block, 128, layers[1], stride=2)
self.layer3 = self._make_layer(block, 256, layers[2], stride=2)
self.layer4 = self._make_layer(block, 512, layers[3], stride=2)
self.avgpool = nn.AvgPool2d(7, stride=1)
self.fc = nn.Linear(512 * block.expansion, num_classes)
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
def _make_layer(self, block, planes, blocks, stride=1):
downsample = None
if stride != 1 or self.inplanes != planes * block.expansion:
downsample = nn.Sequential(
nn.Conv2d(
self.inplanes,
planes * block.expansion,
kernel_size=1,
stride=stride,
bias=False),
BatchNorm2d(planes * block.expansion),
)
layers = []
layers.append(block(self.inplanes, planes, stride, downsample))
self.inplanes = planes * block.expansion
for i in range(1, blocks):
layers.append(block(self.inplanes, planes))
return nn.Sequential(*layers)
def forward(self, x):
x = self.relu1(self.bn1(self.conv1(x)))
x = self.relu2(self.bn2(self.conv2(x)))
x = self.relu3(self.bn3(self.conv3(x)))
x = self.maxpool(x)
x = self.layer1(x)
x = self.layer2(x)
x = self.layer3(x)
x = self.layer4(x)
x = self.avgpool(x)
x = x.view(x.size(0), -1)
x = self.fc(x)
return x
def resnet50(pretrained=False, model_dir='', **kwargs):
"""Constructs a ResNet-50 model.
Args:
pretrained (bool): If True, returns a model pre-trained on ImageNet
"""
model = ResNet(Bottleneck, [3, 4, 6, 3], **kwargs)
if pretrained:
cached_file = os.path.join(model_dir, 'resnet50-imagenet.pth')
model.load_state_dict(
torch.load(cached_file, map_location='cpu'), strict=False)
return model
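A quick shape check for this deep-stem ResNet-50 variant (not part of the diff). The fixed AvgPool2d(7) assumes 224x224 inputs; pretrained=False avoids needing the checkpoint file.
import torch
model = resnet50(pretrained=False)     # skip loading resnet50-imagenet.pth
x = torch.randn(2, 3, 224, 224)        # AvgPool2d(7) expects 224x224 inputs
logits = model(x)
print(logits.shape)                    # -> torch.Size([2, 1000])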

View File

@@ -0,0 +1,167 @@
"""
Part of the implementation is borrowed and modified from LaMa, publicly available at
https://github.com/saic-mdal/lama
"""
from typing import Dict, Optional, Tuple
import torch
import torch.nn as nn
import torch.nn.functional as F
class BaseAdversarialLoss:
def pre_generator_step(self, real_batch: torch.Tensor,
fake_batch: torch.Tensor, generator: nn.Module,
discriminator: nn.Module):
"""
Prepare for generator step
:param real_batch: Tensor, a batch of real samples
:param fake_batch: Tensor, a batch of samples produced by generator
:param generator:
:param discriminator:
:return: None
"""
def pre_discriminator_step(self, real_batch: torch.Tensor,
fake_batch: torch.Tensor, generator: nn.Module,
discriminator: nn.Module):
"""
Prepare for discriminator step
:param real_batch: Tensor, a batch of real samples
:param fake_batch: Tensor, a batch of samples produced by generator
:param generator:
:param discriminator:
:return: None
"""
def generator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor,
discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor,
mask: Optional[torch.Tensor] = None) \
-> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
"""
Calculate generator loss
:param real_batch: Tensor, a batch of real samples
:param fake_batch: Tensor, a batch of samples produced by generator
:param discr_real_pred: Tensor, discriminator output for real_batch
:param discr_fake_pred: Tensor, discriminator output for fake_batch
:param mask: Tensor, actual mask, which was at input of generator when making fake_batch
:return: total generator loss along with some values that might be interesting to log
"""
raise NotImplementedError
def discriminator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor,
discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor,
mask: Optional[torch.Tensor] = None) \
-> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
"""
Calculate discriminator loss (the caller is responsible for backpropagation)
:param real_batch: Tensor, a batch of real samples
:param fake_batch: Tensor, a batch of samples produced by generator
:param discr_real_pred: Tensor, discriminator output for real_batch
:param discr_fake_pred: Tensor, discriminator output for fake_batch
:param mask: Tensor, actual mask, which was at input of generator when making fake_batch
:return: total discriminator loss along with some values that might be interesting to log
"""
raise NotImplementedError
def interpolate_mask(self, mask, shape):
assert mask is not None
assert self.allow_scale_mask or shape == mask.shape[-2:]
if shape != mask.shape[-2:] and self.allow_scale_mask:
if self.mask_scale_mode == 'maxpool':
mask = F.adaptive_max_pool2d(mask, shape)
else:
mask = F.interpolate(
mask, size=shape, mode=self.mask_scale_mode)
return mask
def make_r1_gp(discr_real_pred, real_batch):
if torch.is_grad_enabled():
grad_real = torch.autograd.grad(
outputs=discr_real_pred.sum(),
inputs=real_batch,
create_graph=True)[0]
grad_penalty = (grad_real.view(grad_real.shape[0],
-1).norm(2, dim=1)**2).mean()
else:
grad_penalty = 0
real_batch.requires_grad = False
return grad_penalty
class NonSaturatingWithR1(BaseAdversarialLoss):
def __init__(self,
gp_coef=5,
weight=1,
mask_as_fake_target=False,
allow_scale_mask=False,
mask_scale_mode='nearest',
extra_mask_weight_for_gen=0,
use_unmasked_for_gen=True,
use_unmasked_for_discr=True):
self.gp_coef = gp_coef
self.weight = weight
# use for discr => use for gen;
# otherwise we teach only the discr to pay attention to very small difference
assert use_unmasked_for_gen or (not use_unmasked_for_discr)
# mask as target => use unmasked for discr:
# if we don't care about unmasked regions at all
# then it doesn't matter if the value of mask_as_fake_target is true or false
assert use_unmasked_for_discr or (not mask_as_fake_target)
self.use_unmasked_for_gen = use_unmasked_for_gen
self.use_unmasked_for_discr = use_unmasked_for_discr
self.mask_as_fake_target = mask_as_fake_target
self.allow_scale_mask = allow_scale_mask
self.mask_scale_mode = mask_scale_mode
self.extra_mask_weight_for_gen = extra_mask_weight_for_gen
def generator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor,
discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor,
mask=None) \
-> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
fake_loss = F.softplus(-discr_fake_pred)
if (self.mask_as_fake_target and self.extra_mask_weight_for_gen > 0) or \
not self.use_unmasked_for_gen: # == if masked region should be treated differently
mask = self.interpolate_mask(mask, discr_fake_pred.shape[-2:])
if not self.use_unmasked_for_gen:
fake_loss = fake_loss * mask
else:
pixel_weights = 1 + mask * self.extra_mask_weight_for_gen
fake_loss = fake_loss * pixel_weights
return fake_loss.mean() * self.weight, dict()
def pre_discriminator_step(self, real_batch: torch.Tensor,
fake_batch: torch.Tensor, generator: nn.Module,
discriminator: nn.Module):
real_batch.requires_grad = True
def discriminator_loss(self, real_batch: torch.Tensor, fake_batch: torch.Tensor,
discr_real_pred: torch.Tensor, discr_fake_pred: torch.Tensor,
mask=None) \
-> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
real_loss = F.softplus(-discr_real_pred)
grad_penalty = make_r1_gp(discr_real_pred, real_batch) * self.gp_coef
fake_loss = F.softplus(discr_fake_pred)
if not self.use_unmasked_for_discr or self.mask_as_fake_target:
# == if masked region should be treated differently
mask = self.interpolate_mask(mask, discr_fake_pred.shape[-2:])
# use_unmasked_for_discr=False only makes sense for fakes;
# for reals there is no difference between the two regions
fake_loss = fake_loss * mask
if self.mask_as_fake_target:
fake_loss = fake_loss + (1
- mask) * F.softplus(-discr_fake_pred)
sum_discr_loss = real_loss + grad_penalty + fake_loss
metrics = dict(
discr_real_out=discr_real_pred.mean(),
discr_fake_out=discr_fake_pred.mean(),
discr_real_gp=grad_penalty)
return sum_discr_loss.mean(), metrics
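A hedged sketch of how this adversarial loss might be exercised (not part of the diff). The 1x1 convolution stands in for the real discriminator purely to show the call order: pre_discriminator_step before the discriminator forward, then the two loss methods.
import torch
import torch.nn as nn
disc = nn.Conv2d(3, 1, kernel_size=1)          # toy stand-in discriminator
adv = NonSaturatingWithR1(gp_coef=5, weight=1)
real = torch.rand(2, 3, 64, 64)
fake = torch.rand(2, 3, 64, 64)
adv.pre_discriminator_step(real, fake, generator=None, discriminator=disc)
d_loss, metrics = adv.discriminator_loss(real, fake, disc(real), disc(fake))
g_loss, _ = adv.generator_loss(real, fake, disc(real), disc(fake))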

View File

@@ -0,0 +1,45 @@
"""
Part of the implementation is borrowed and modified from LaMa, publicly available at
https://github.com/saic-mdal/lama
"""
from typing import List
import torch
import torch.nn.functional as F
def masked_l2_loss(pred, target, mask, weight_known, weight_missing):
per_pixel_l2 = F.mse_loss(pred, target, reduction='none')
pixel_weights = mask * weight_missing + (1 - mask) * weight_known
return (pixel_weights * per_pixel_l2).mean()
def masked_l1_loss(pred, target, mask, weight_known, weight_missing):
per_pixel_l1 = F.l1_loss(pred, target, reduction='none')
pixel_weights = mask * weight_missing + (1 - mask) * weight_known
return (pixel_weights * per_pixel_l1).mean()
def feature_matching_loss(fake_features: List[torch.Tensor],
target_features: List[torch.Tensor],
mask=None):
if mask is None:
res = torch.stack([
F.mse_loss(fake_feat, target_feat)
for fake_feat, target_feat in zip(fake_features, target_features)
]).mean()
else:
res = 0
norm = 0
for fake_feat, target_feat in zip(fake_features, target_features):
cur_mask = F.interpolate(
mask,
size=fake_feat.shape[-2:],
mode='bilinear',
align_corners=False)
error_weights = 1 - cur_mask
cur_val = ((fake_feat - target_feat).pow(2) * error_weights).mean()
res = res + cur_val
norm += 1
res = res / norm
return res
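A small usage sketch for the masked and feature-matching losses above (not part of the diff); all shapes are illustrative, and the mask is 1 where pixels are missing.
import torch
pred = torch.rand(1, 3, 64, 64)
target = torch.rand(1, 3, 64, 64)
mask = (torch.rand(1, 1, 64, 64) > 0.7).float()        # 1 = missing pixel
l1 = masked_l1_loss(pred, target, mask, weight_known=10, weight_missing=0)
fake_feats = [torch.rand(1, 8, 32, 32), torch.rand(1, 16, 16, 16)]
real_feats = [torch.rand(1, 8, 32, 32), torch.rand(1, 16, 16, 16)]
fm = feature_matching_loss(fake_feats, real_feats, mask=mask)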

View File

@@ -0,0 +1,588 @@
"""
Part of the implementation is borrowed and modified from LaMa, publicly available at
https://github.com/saic-mdal/lama
"""
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from kornia.geometry.transform import rotate
def get_activation(kind='tanh'):
if kind == 'tanh':
return nn.Tanh()
if kind == 'sigmoid':
return nn.Sigmoid()
if kind is False:
return nn.Identity()
raise ValueError(f'Unknown activation kind {kind}')
class SELayer(nn.Module):
def __init__(self, channel, reduction=16):
super(SELayer, self).__init__()
self.avg_pool = nn.AdaptiveAvgPool2d(1)
self.fc = nn.Sequential(
nn.Linear(channel, channel // reduction, bias=False),
nn.ReLU(inplace=True),
nn.Linear(channel // reduction, channel, bias=False), nn.Sigmoid())
def forward(self, x):
b, c, _, _ = x.size()
y = self.avg_pool(x).view(b, c)
y = self.fc(y).view(b, c, 1, 1)
res = x * y.expand_as(x)
return res
class FourierUnit(nn.Module):
def __init__(self,
in_channels,
out_channels,
groups=1,
spatial_scale_factor=None,
spatial_scale_mode='bilinear',
spectral_pos_encoding=False,
use_se=False,
se_kwargs=None,
ffc3d=False,
fft_norm='ortho'):
# bn_layer not used
super(FourierUnit, self).__init__()
self.groups = groups
self.conv_layer = torch.nn.Conv2d(
in_channels=in_channels * 2 + (2 if spectral_pos_encoding else 0),
out_channels=out_channels * 2,
kernel_size=1,
stride=1,
padding=0,
groups=self.groups,
bias=False)
self.bn = torch.nn.BatchNorm2d(out_channels * 2)
self.relu = torch.nn.ReLU(inplace=True)
# squeeze and excitation block
self.use_se = use_se
if use_se:
if se_kwargs is None:
se_kwargs = {}
self.se = SELayer(self.conv_layer.in_channels, **se_kwargs)
self.spatial_scale_factor = spatial_scale_factor
self.spatial_scale_mode = spatial_scale_mode
self.spectral_pos_encoding = spectral_pos_encoding
self.ffc3d = ffc3d
self.fft_norm = fft_norm
def forward(self, x):
batch = x.shape[0]
if self.spatial_scale_factor is not None:
orig_size = x.shape[-2:]
x = F.interpolate(
x,
scale_factor=self.spatial_scale_factor,
mode=self.spatial_scale_mode,
align_corners=False)
# (batch, c, h, w/2+1, 2)
fft_dim = (-3, -2, -1) if self.ffc3d else (-2, -1)
ffted = torch.fft.rfftn(x, dim=fft_dim, norm=self.fft_norm)
ffted = torch.stack((ffted.real, ffted.imag), dim=-1)
ffted = ffted.permute(0, 1, 4, 2,
3).contiguous() # (batch, c, 2, h, w/2+1)
ffted = ffted.view((
batch,
-1,
) + ffted.size()[3:])
if self.spectral_pos_encoding:
height, width = ffted.shape[-2:]
coords_vert = torch.linspace(0, 1,
height)[None, None, :, None].expand(
batch, 1, height, width).to(ffted)
coords_hor = torch.linspace(0, 1,
width)[None, None, None, :].expand(
batch, 1, height, width).to(ffted)
ffted = torch.cat((coords_vert, coords_hor, ffted), dim=1)
if self.use_se:
ffted = self.se(ffted)
ffted = self.conv_layer(ffted) # (batch, c*2, h, w/2+1)
ffted = self.relu(self.bn(ffted))
ffted = ffted.view((
batch,
-1,
2,
) + ffted.size()[2:]).permute(
0, 1, 3, 4, 2).contiguous() # (batch,c, t, h, w/2+1, 2)
ffted = torch.complex(ffted[..., 0], ffted[..., 1])
ifft_shape_slice = x.shape[-3:] if self.ffc3d else x.shape[-2:]
output = torch.fft.irfftn(
ffted, s=ifft_shape_slice, dim=fft_dim, norm=self.fft_norm)
if self.spatial_scale_factor is not None:
output = F.interpolate(
output,
size=orig_size,
mode=self.spatial_scale_mode,
align_corners=False)
return output
class SpectralTransform(nn.Module):
def __init__(self,
in_channels,
out_channels,
stride=1,
groups=1,
enable_lfu=True,
**fu_kwargs):
# bn_layer not used
super(SpectralTransform, self).__init__()
self.enable_lfu = enable_lfu
if stride == 2:
self.downsample = nn.AvgPool2d(kernel_size=(2, 2), stride=2)
else:
self.downsample = nn.Identity()
self.stride = stride
self.conv1 = nn.Sequential(
nn.Conv2d(
in_channels,
out_channels // 2,
kernel_size=1,
groups=groups,
bias=False), nn.BatchNorm2d(out_channels // 2),
nn.ReLU(inplace=True))
self.fu = FourierUnit(out_channels // 2, out_channels // 2, groups,
**fu_kwargs)
if self.enable_lfu:
self.lfu = FourierUnit(out_channels // 2, out_channels // 2,
groups)
self.conv2 = torch.nn.Conv2d(
out_channels // 2,
out_channels,
kernel_size=1,
groups=groups,
bias=False)
def forward(self, x):
x = self.downsample(x)
x = self.conv1(x)
output = self.fu(x)
if self.enable_lfu:
n, c, h, w = x.shape
split_no = 2
split_s = h // split_no
xs = torch.cat(
torch.split(x[:, :c // 4], split_s, dim=-2),
dim=1).contiguous()
xs = torch.cat(
torch.split(xs, split_s, dim=-1), dim=1).contiguous()
xs = self.lfu(xs)
xs = xs.repeat(1, 1, split_no, split_no).contiguous()
else:
xs = 0
output = self.conv2(x + output + xs)
return output
class LearnableSpatialTransformWrapper(nn.Module):
def __init__(self,
impl,
pad_coef=0.5,
angle_init_range=80,
train_angle=True):
super().__init__()
self.impl = impl
self.angle = torch.rand(1) * angle_init_range
if train_angle:
self.angle = nn.Parameter(self.angle, requires_grad=True)
self.pad_coef = pad_coef
def forward(self, x):
if torch.is_tensor(x):
return self.inverse_transform(self.impl(self.transform(x)), x)
elif isinstance(x, tuple):
x_trans = tuple(self.transform(elem) for elem in x)
y_trans = self.impl(x_trans)
return tuple(
self.inverse_transform(elem, orig_x)
for elem, orig_x in zip(y_trans, x))
else:
raise ValueError(f'Unexpected input type {type(x)}')
def transform(self, x):
height, width = x.shape[2:]
pad_h, pad_w = int(height * self.pad_coef), int(width * self.pad_coef)
x_padded = F.pad(x, [pad_w, pad_w, pad_h, pad_h], mode='reflect')
x_padded_rotated = rotate(x_padded, angle=self.angle.to(x_padded))
return x_padded_rotated
def inverse_transform(self, y_padded_rotated, orig_x):
height, width = orig_x.shape[2:]
pad_h, pad_w = int(height * self.pad_coef), int(width * self.pad_coef)
y_padded = rotate(
y_padded_rotated, angle=-self.angle.to(y_padded_rotated))
y_height, y_width = y_padded.shape[2:]
y = y_padded[:, :, pad_h:y_height - pad_h, pad_w:y_width - pad_w]
return y
class FFC(nn.Module):
def __init__(self,
in_channels,
out_channels,
kernel_size,
ratio_gin,
ratio_gout,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=False,
enable_lfu=True,
padding_type='reflect',
gated=False,
**spectral_kwargs):
super(FFC, self).__init__()
assert stride == 1 or stride == 2, 'Stride should be 1 or 2.'
self.stride = stride
in_cg = int(in_channels * ratio_gin)
in_cl = in_channels - in_cg
out_cg = int(out_channels * ratio_gout)
out_cl = out_channels - out_cg
self.ratio_gin = ratio_gin
self.ratio_gout = ratio_gout
self.global_in_num = in_cg
module = nn.Identity if in_cl == 0 or out_cl == 0 else nn.Conv2d
self.convl2l = module(
in_cl,
out_cl,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
padding_mode=padding_type)
module = nn.Identity if in_cl == 0 or out_cg == 0 else nn.Conv2d
self.convl2g = module(
in_cl,
out_cg,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
padding_mode=padding_type)
module = nn.Identity if in_cg == 0 or out_cl == 0 else nn.Conv2d
self.convg2l = module(
in_cg,
out_cl,
kernel_size,
stride,
padding,
dilation,
groups,
bias,
padding_mode=padding_type)
module = nn.Identity if in_cg == 0 or out_cg == 0 else SpectralTransform
self.convg2g = module(in_cg, out_cg, stride,
1 if groups == 1 else groups // 2, enable_lfu,
**spectral_kwargs)
self.gated = gated
module = nn.Identity if in_cg == 0 or out_cl == 0 or not self.gated else nn.Conv2d
self.gate = module(in_channels, 2, 1)
def forward(self, x):
x_l, x_g = x if type(x) is tuple else (x, 0)
out_xl, out_xg = 0, 0
if self.gated:
total_input_parts = [x_l]
if torch.is_tensor(x_g):
total_input_parts.append(x_g)
total_input = torch.cat(total_input_parts, dim=1)
gates = torch.sigmoid(self.gate(total_input))
g2l_gate, l2g_gate = gates.chunk(2, dim=1)
else:
g2l_gate, l2g_gate = 1, 1
if self.ratio_gout != 1:
out_xl = self.convl2l(x_l) + self.convg2l(x_g) * g2l_gate
if self.ratio_gout != 0:
out_xg = self.convl2g(x_l) * l2g_gate + self.convg2g(x_g)
return out_xl, out_xg
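A shape check for the FFC block above (not part of the diff). With ratio_gin=0 the whole input is treated as the local branch, so a plain tensor can be fed; with a non-zero ratio_gin the input must be a (local, global) tuple.
import torch
ffc = FFC(in_channels=64, out_channels=64, kernel_size=3,
          ratio_gin=0, ratio_gout=0.5, padding=1)
x = torch.randn(1, 64, 32, 32)
out_l, out_g = ffc(x)                  # local and global output branches
print(out_l.shape, out_g.shape)        # -> (1, 32, 32, 32) each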
class FFC_BN_ACT(nn.Module):
def __init__(self,
in_channels,
out_channels,
kernel_size,
ratio_gin,
ratio_gout,
stride=1,
padding=0,
dilation=1,
groups=1,
bias=False,
norm_layer=nn.BatchNorm2d,
activation_layer=nn.Identity,
padding_type='reflect',
enable_lfu=True,
**kwargs):
super(FFC_BN_ACT, self).__init__()
self.ffc = FFC(
in_channels,
out_channels,
kernel_size,
ratio_gin,
ratio_gout,
stride,
padding,
dilation,
groups,
bias,
enable_lfu,
padding_type=padding_type,
**kwargs)
lnorm = nn.Identity if ratio_gout == 1 else norm_layer
gnorm = nn.Identity if ratio_gout == 0 else norm_layer
global_channels = int(out_channels * ratio_gout)
self.bn_l = lnorm(out_channels - global_channels)
self.bn_g = gnorm(global_channels)
lact = nn.Identity if ratio_gout == 1 else activation_layer
gact = nn.Identity if ratio_gout == 0 else activation_layer
self.act_l = lact(inplace=True)
self.act_g = gact(inplace=True)
def forward(self, x):
x_l, x_g = self.ffc(x)
x_l = self.act_l(self.bn_l(x_l))
x_g = self.act_g(self.bn_g(x_g))
return x_l, x_g
class FFCResnetBlock(nn.Module):
def __init__(self,
dim,
padding_type,
norm_layer,
activation_layer=nn.ReLU,
dilation=1,
spatial_transform_kwargs=None,
inline=False,
**conv_kwargs):
super().__init__()
self.conv1 = FFC_BN_ACT(
dim,
dim,
kernel_size=3,
padding=dilation,
dilation=dilation,
norm_layer=norm_layer,
activation_layer=activation_layer,
padding_type=padding_type,
**conv_kwargs)
self.conv2 = FFC_BN_ACT(
dim,
dim,
kernel_size=3,
padding=dilation,
dilation=dilation,
norm_layer=norm_layer,
activation_layer=activation_layer,
padding_type=padding_type,
**conv_kwargs)
if spatial_transform_kwargs is not None:
self.conv1 = LearnableSpatialTransformWrapper(
self.conv1, **spatial_transform_kwargs)
self.conv2 = LearnableSpatialTransformWrapper(
self.conv2, **spatial_transform_kwargs)
self.inline = inline
def forward(self, x):
if self.inline:
x_l, x_g = x[:, :-self.conv1.ffc.
global_in_num], x[:, -self.conv1.ffc.global_in_num:]
else:
x_l, x_g = x if type(x) is tuple else (x, 0)
id_l, id_g = x_l, x_g
x_l, x_g = self.conv1((x_l, x_g))
x_l, x_g = self.conv2((x_l, x_g))
x_l, x_g = id_l + x_l, id_g + x_g
out = x_l, x_g
if self.inline:
out = torch.cat(out, dim=1)
return out
class ConcatTupleLayer(nn.Module):
def forward(self, x):
assert isinstance(x, tuple)
x_l, x_g = x
assert torch.is_tensor(x_l) or torch.is_tensor(x_g)
if not torch.is_tensor(x_g):
return x_l
return torch.cat(x, dim=1)
class FFCResNetGenerator(nn.Module):
def __init__(self,
input_nc=4,
output_nc=3,
ngf=64,
n_downsampling=3,
n_blocks=18,
norm_layer=nn.BatchNorm2d,
padding_type='reflect',
activation_layer=nn.ReLU,
up_norm_layer=nn.BatchNorm2d,
up_activation=nn.ReLU(True),
init_conv_kwargs={
'ratio_gin': 0,
'ratio_gout': 0,
'enable_lfu': False
},
downsample_conv_kwargs={
'ratio_gin': 0,
'ratio_gout': 0,
'enable_lfu': False
},
resnet_conv_kwargs={
'ratio_gin': 0.75,
'ratio_gout': 0.75,
'enable_lfu': False
},
spatial_transform_layers=None,
spatial_transform_kwargs={},
add_out_act='sigmoid',
max_features=1024,
out_ffc=False,
out_ffc_kwargs={}):
assert (n_blocks >= 0)
super().__init__()
model = [
nn.ReflectionPad2d(3),
FFC_BN_ACT(
input_nc,
ngf,
kernel_size=7,
padding=0,
norm_layer=norm_layer,
activation_layer=activation_layer,
**init_conv_kwargs)
]
# downsample
for i in range(n_downsampling):
mult = 2**i
if i == n_downsampling - 1:
cur_conv_kwargs = dict(downsample_conv_kwargs)
cur_conv_kwargs['ratio_gout'] = resnet_conv_kwargs.get(
'ratio_gin', 0)
else:
cur_conv_kwargs = downsample_conv_kwargs
model += [
FFC_BN_ACT(
min(max_features, ngf * mult),
min(max_features, ngf * mult * 2),
kernel_size=3,
stride=2,
padding=1,
norm_layer=norm_layer,
activation_layer=activation_layer,
**cur_conv_kwargs)
]
mult = 2**n_downsampling
feats_num_bottleneck = min(max_features, ngf * mult)
# resnet blocks
for i in range(n_blocks):
cur_resblock = FFCResnetBlock(
feats_num_bottleneck,
padding_type=padding_type,
activation_layer=activation_layer,
norm_layer=norm_layer,
**resnet_conv_kwargs)
if spatial_transform_layers is not None and i in spatial_transform_layers:
cur_resblock = LearnableSpatialTransformWrapper(
cur_resblock, **spatial_transform_kwargs)
model += [cur_resblock]
model += [ConcatTupleLayer()]
# upsample
for i in range(n_downsampling):
mult = 2**(n_downsampling - i)
model += [
nn.ConvTranspose2d(
min(max_features, ngf * mult),
min(max_features, int(ngf * mult / 2)),
kernel_size=3,
stride=2,
padding=1,
output_padding=1),
up_norm_layer(min(max_features, int(ngf * mult / 2))),
up_activation
]
if out_ffc:
model += [
FFCResnetBlock(
ngf,
padding_type=padding_type,
activation_layer=activation_layer,
norm_layer=norm_layer,
inline=True,
**out_ffc_kwargs)
]
model += [
nn.ReflectionPad2d(3),
nn.Conv2d(ngf, output_nc, kernel_size=7, padding=0)
]
if add_out_act:
model.append(
get_activation('tanh' if add_out_act is True else add_out_act))
self.model = nn.Sequential(*model)
def forward(self, input):
return self.model(input)
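A hedged smoke test for the generator (not part of the diff). n_blocks is reduced from the default 18 only to keep the check cheap; the 4-channel input is the masked image concatenated with its mask, and the spatial size must be divisible by 2**n_downsampling.
import torch
gen = FFCResNetGenerator(input_nc=4, output_nc=3, ngf=64,
                         n_downsampling=3, n_blocks=2)
img = torch.rand(1, 3, 256, 256)
mask = (torch.rand(1, 1, 256, 256) > 0.9).float()
inp = torch.cat([img * (1 - mask), mask], dim=1)   # masked image + mask
out = gen(inp)
print(out.shape)                                   # -> torch.Size([1, 3, 256, 256])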

View File

@@ -0,0 +1,324 @@
"""
Part of the implementation is borrowed and modified from LaMa, publicly available at
https://github.com/saic-mdal/lama
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
from torchvision import models
from modelscope.utils.logger import get_logger
try:
from torchvision.models.utils import load_state_dict_from_url
except ImportError:
from torch.utils.model_zoo import load_url as load_state_dict_from_url
# Inception weights ported to Pytorch from
# http://download.tensorflow.org/models/image/imagenet/inception-2015-12-05.tgz
FID_WEIGHTS_URL = 'https://github.com/mseitzer/pytorch-fid/releases/download/' \
'fid_weights/pt_inception-2015-12-05-6726825d.pth'
LOGGER = get_logger()
class InceptionV3(nn.Module):
"""Pretrained InceptionV3 network returning feature maps"""
# Index of default block of inception to return,
# corresponds to output of final average pooling
DEFAULT_BLOCK_INDEX = 3
# Maps feature dimensionality to their output blocks indices
BLOCK_INDEX_BY_DIM = {
64: 0, # First max pooling features
192: 1, # Second max pooling features
768: 2, # Pre-aux classifier features
2048: 3 # Final average pooling features
}
def __init__(self,
output_blocks=[DEFAULT_BLOCK_INDEX],
resize_input=True,
normalize_input=True,
requires_grad=False,
use_fid_inception=True):
"""Build pretrained InceptionV3
Parameters
----------
output_blocks : list of int
Indices of blocks to return features of. Possible values are:
- 0: corresponds to output of first max pooling
- 1: corresponds to output of second max pooling
- 2: corresponds to output which is fed to aux classifier
- 3: corresponds to output of final average pooling
resize_input : bool
If true, bilinearly resizes input to width and height 299 before
feeding input to model. As the network without fully connected
layers is fully convolutional, it should be able to handle inputs
of arbitrary size, so resizing might not be strictly needed
normalize_input : bool
If true, scales the input from range (0, 1) to the range the
pretrained Inception network expects, namely (-1, 1)
requires_grad : bool
If true, parameters of the model require gradients. Possibly useful
for finetuning the network
use_fid_inception : bool
If true, uses the pretrained Inception model used in Tensorflow's
FID implementation. If false, uses the pretrained Inception model
available in torchvision. The FID Inception model has different
weights and a slightly different structure from torchvision's
Inception model. If you want to compute FID scores, you are
strongly advised to set this parameter to true to get comparable
results.
"""
super(InceptionV3, self).__init__()
self.resize_input = resize_input
self.normalize_input = normalize_input
self.output_blocks = sorted(output_blocks)
self.last_needed_block = max(output_blocks)
assert self.last_needed_block <= 3, \
'Last possible output block index is 3'
self.blocks = nn.ModuleList()
if use_fid_inception:
inception = fid_inception_v3()
else:
inception = models.inception_v3(pretrained=True)
# Block 0: input to maxpool1
block0 = [
inception.Conv2d_1a_3x3, inception.Conv2d_2a_3x3,
inception.Conv2d_2b_3x3,
nn.MaxPool2d(kernel_size=3, stride=2)
]
self.blocks.append(nn.Sequential(*block0))
# Block 1: maxpool1 to maxpool2
if self.last_needed_block >= 1:
block1 = [
inception.Conv2d_3b_1x1, inception.Conv2d_4a_3x3,
nn.MaxPool2d(kernel_size=3, stride=2)
]
self.blocks.append(nn.Sequential(*block1))
# Block 2: maxpool2 to aux classifier
if self.last_needed_block >= 2:
block2 = [
inception.Mixed_5b,
inception.Mixed_5c,
inception.Mixed_5d,
inception.Mixed_6a,
inception.Mixed_6b,
inception.Mixed_6c,
inception.Mixed_6d,
inception.Mixed_6e,
]
self.blocks.append(nn.Sequential(*block2))
# Block 3: aux classifier to final avgpool
if self.last_needed_block >= 3:
block3 = [
inception.Mixed_7a, inception.Mixed_7b, inception.Mixed_7c,
nn.AdaptiveAvgPool2d(output_size=(1, 1))
]
self.blocks.append(nn.Sequential(*block3))
for param in self.parameters():
param.requires_grad = requires_grad
def forward(self, inp):
"""Get Inception feature maps
Parameters
----------
inp : torch.autograd.Variable
Input tensor of shape Bx3xHxW. Values are expected to be in
range (0, 1)
Returns
-------
List of torch.autograd.Variable, corresponding to the selected output
block, sorted ascending by index
"""
outp = []
x = inp
if self.resize_input:
x = F.interpolate(
x, size=(299, 299), mode='bilinear', align_corners=False)
if self.normalize_input:
x = 2 * x - 1 # Scale from range (0, 1) to range (-1, 1)
for idx, block in enumerate(self.blocks):
x = block(x)
if idx in self.output_blocks:
outp.append(x)
if idx == self.last_needed_block:
break
return outp
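A usage sketch for the feature extractor (not part of the diff); constructing it downloads pretrained weights, and inputs are expected in [0, 1].
import torch
extractor = InceptionV3(output_blocks=[3])      # 2048-d pool features for FID
extractor.eval()
imgs = torch.rand(4, 3, 299, 299)               # values already in [0, 1]
with torch.no_grad():
    feats = extractor(imgs)[0]
print(feats.shape)                              # -> torch.Size([4, 2048, 1, 1])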
def fid_inception_v3():
"""Build pretrained Inception model for FID computation
The Inception model for FID computation uses a different set of weights
and has a slightly different structure than torchvision's Inception.
This method first constructs torchvision's Inception and then patches the
necessary parts that are different in the FID Inception model.
"""
LOGGER.info('fid_inception_v3 called')
inception = models.inception_v3(
num_classes=1008, aux_logits=False, pretrained=False)
LOGGER.info('models.inception_v3 done')
inception.Mixed_5b = FIDInceptionA(192, pool_features=32)
inception.Mixed_5c = FIDInceptionA(256, pool_features=64)
inception.Mixed_5d = FIDInceptionA(288, pool_features=64)
inception.Mixed_6b = FIDInceptionC(768, channels_7x7=128)
inception.Mixed_6c = FIDInceptionC(768, channels_7x7=160)
inception.Mixed_6d = FIDInceptionC(768, channels_7x7=160)
inception.Mixed_6e = FIDInceptionC(768, channels_7x7=192)
inception.Mixed_7b = FIDInceptionE_1(1280)
inception.Mixed_7c = FIDInceptionE_2(2048)
LOGGER.info('fid_inception_v3 patching done')
state_dict = load_state_dict_from_url(FID_WEIGHTS_URL, progress=True)
LOGGER.info('fid_inception_v3 weights downloaded')
inception.load_state_dict(state_dict)
LOGGER.info('fid_inception_v3 weights loaded into model')
return inception
class FIDInceptionA(models.inception.InceptionA):
"""InceptionA block patched for FID computation"""
def __init__(self, in_channels, pool_features):
super(FIDInceptionA, self).__init__(in_channels, pool_features)
def forward(self, x):
branch1x1 = self.branch1x1(x)
branch5x5 = self.branch5x5_1(x)
branch5x5 = self.branch5x5_2(branch5x5)
branch3x3dbl = self.branch3x3dbl_1(x)
branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl)
# Patch: Tensorflow's average pool does not use the padded zeros in
# its average calculation
branch_pool = F.avg_pool2d(
x, kernel_size=3, stride=1, padding=1, count_include_pad=False)
branch_pool = self.branch_pool(branch_pool)
outputs = [branch1x1, branch5x5, branch3x3dbl, branch_pool]
return torch.cat(outputs, 1)
class FIDInceptionC(models.inception.InceptionC):
"""InceptionC block patched for FID computation"""
def __init__(self, in_channels, channels_7x7):
super(FIDInceptionC, self).__init__(in_channels, channels_7x7)
def forward(self, x):
branch1x1 = self.branch1x1(x)
branch7x7 = self.branch7x7_1(x)
branch7x7 = self.branch7x7_2(branch7x7)
branch7x7 = self.branch7x7_3(branch7x7)
branch7x7dbl = self.branch7x7dbl_1(x)
branch7x7dbl = self.branch7x7dbl_2(branch7x7dbl)
branch7x7dbl = self.branch7x7dbl_3(branch7x7dbl)
branch7x7dbl = self.branch7x7dbl_4(branch7x7dbl)
branch7x7dbl = self.branch7x7dbl_5(branch7x7dbl)
# Patch: Tensorflow's average pool does not use the padded zeros in
# its average calculation
branch_pool = F.avg_pool2d(
x, kernel_size=3, stride=1, padding=1, count_include_pad=False)
branch_pool = self.branch_pool(branch_pool)
outputs = [branch1x1, branch7x7, branch7x7dbl, branch_pool]
return torch.cat(outputs, 1)
class FIDInceptionE_1(models.inception.InceptionE):
"""First InceptionE block patched for FID computation"""
def __init__(self, in_channels):
super(FIDInceptionE_1, self).__init__(in_channels)
def forward(self, x):
branch1x1 = self.branch1x1(x)
branch3x3 = self.branch3x3_1(x)
branch3x3 = [
self.branch3x3_2a(branch3x3),
self.branch3x3_2b(branch3x3),
]
branch3x3 = torch.cat(branch3x3, 1)
branch3x3dbl = self.branch3x3dbl_1(x)
branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
branch3x3dbl = [
self.branch3x3dbl_3a(branch3x3dbl),
self.branch3x3dbl_3b(branch3x3dbl),
]
branch3x3dbl = torch.cat(branch3x3dbl, 1)
# Patch: Tensorflow's average pool does not use the padded zeros in
# its average calculation
branch_pool = F.avg_pool2d(
x, kernel_size=3, stride=1, padding=1, count_include_pad=False)
branch_pool = self.branch_pool(branch_pool)
outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool]
return torch.cat(outputs, 1)
class FIDInceptionE_2(models.inception.InceptionE):
"""Second InceptionE block patched for FID computation"""
def __init__(self, in_channels):
super(FIDInceptionE_2, self).__init__(in_channels)
def forward(self, x):
branch1x1 = self.branch1x1(x)
branch3x3 = self.branch3x3_1(x)
branch3x3 = [
self.branch3x3_2a(branch3x3),
self.branch3x3_2b(branch3x3),
]
branch3x3 = torch.cat(branch3x3, 1)
branch3x3dbl = self.branch3x3dbl_1(x)
branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
branch3x3dbl = [
self.branch3x3dbl_3a(branch3x3dbl),
self.branch3x3dbl_3b(branch3x3dbl),
]
branch3x3dbl = torch.cat(branch3x3dbl, 1)
# Patch: The FID Inception model uses max pooling instead of average
# pooling. This is likely an error in this specific Inception
# implementation, as other Inception models use average pooling here
# (which matches the description in the paper).
branch_pool = F.max_pool2d(x, kernel_size=3, stride=1, padding=1)
branch_pool = self.branch_pool(branch_pool)
outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool]
return torch.cat(outputs, 1)

View File

@@ -0,0 +1,47 @@
"""
Part of the implementation is borrowed and modified from LaMa, publicly available at
https://github.com/saic-mdal/lama
"""
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from .ade20k import ModelBuilder
IMAGENET_MEAN = torch.FloatTensor([0.485, 0.456, 0.406])[None, :, None, None]
IMAGENET_STD = torch.FloatTensor([0.229, 0.224, 0.225])[None, :, None, None]
class ResNetPL(nn.Module):
def __init__(self,
weight=1,
weights_path=None,
arch_encoder='resnet50dilated',
segmentation=True):
super().__init__()
self.impl = ModelBuilder.get_encoder(
weights_path=weights_path,
arch_encoder=arch_encoder,
arch_decoder='ppm_deepsup',
fc_dim=2048,
segmentation=segmentation)
self.impl.eval()
for w in self.impl.parameters():
w.requires_grad_(False)
self.weight = weight
def forward(self, pred, target):
pred = (pred - IMAGENET_MEAN.to(pred)) / IMAGENET_STD.to(pred)
target = (target - IMAGENET_MEAN.to(target)) / IMAGENET_STD.to(target)
pred_feats = self.impl(pred, return_feature_maps=True)
target_feats = self.impl(target, return_feature_maps=True)
result = torch.stack([
F.mse_loss(cur_pred, cur_target)
for cur_pred, cur_target in zip(pred_feats, target_feats)
]).sum() * self.weight
return result
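A hedged sketch of using the perceptual loss (not part of the diff). The weights path is a placeholder for wherever the ade20k encoder checkpoint lives; ModelBuilder.get_encoder is expected to load it.
import torch
# 'path/to/ade20k_encoder.pth' is a placeholder checkpoint path
perc = ResNetPL(weight=30, weights_path='path/to/ade20k_encoder.pth')
pred = torch.rand(1, 3, 256, 256)       # RGB in [0, 1]
target = torch.rand(1, 3, 256, 256)
loss = perc(pred, target)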

View File

@@ -0,0 +1,75 @@
"""
The implementation is adopted from
https://github.com/NVIDIA/pix2pixHD/blob/master/models/networks.py
"""
import collections
import functools
import logging
from collections import defaultdict
from functools import partial
import numpy as np
import torch.nn as nn
# Defines the PatchGAN discriminator with the specified arguments.
class NLayerDiscriminator(nn.Module):
def __init__(
self,
input_nc=3,
ndf=64,
n_layers=4,
norm_layer=nn.BatchNorm2d,
):
super().__init__()
self.n_layers = n_layers
kw = 4
padw = int(np.ceil((kw - 1.0) / 2))
sequence = [[
nn.Conv2d(input_nc, ndf, kernel_size=kw, stride=2, padding=padw),
nn.LeakyReLU(0.2, True)
]]
nf = ndf
for n in range(1, n_layers):
nf_prev = nf
nf = min(nf * 2, 512)
cur_model = []
cur_model += [
nn.Conv2d(nf_prev, nf, kernel_size=kw, stride=2, padding=padw),
norm_layer(nf),
nn.LeakyReLU(0.2, True)
]
sequence.append(cur_model)
nf_prev = nf
nf = min(nf * 2, 512)
cur_model = []
cur_model += [
nn.Conv2d(nf_prev, nf, kernel_size=kw, stride=1, padding=padw),
norm_layer(nf),
nn.LeakyReLU(0.2, True)
]
sequence.append(cur_model)
sequence += [[
nn.Conv2d(nf, 1, kernel_size=kw, stride=1, padding=padw)
]]
for n in range(len(sequence)):
setattr(self, 'model' + str(n), nn.Sequential(*sequence[n]))
def get_all_activations(self, x):
res = [x]
for n in range(self.n_layers + 2):
model = getattr(self, 'model' + str(n))
res.append(model(res[-1]))
return res[1:]
def forward(self, x):
act = self.get_all_activations(x)
return act[-1], act[:-1]
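A quick check of the discriminator interface (not part of the diff): the forward pass returns the final patch logits together with the intermediate activations used for feature matching.
import torch
disc = NLayerDiscriminator(input_nc=3, ndf=64, n_layers=4)
x = torch.rand(1, 3, 256, 256)
logits, feats = disc(x)          # patch logits + intermediate activations
print(logits.shape, len(feats))  # spatial map of logits, 5 feature maps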

View File

@@ -0,0 +1,393 @@
'''
Part of the implementation is borrowed and modified from LaMa, publicly available at
https://github.com/saic-mdal/lama
'''
import cv2
import numpy as np
import torch
import torch.nn as nn
from kornia.filters import gaussian_blur2d
from kornia.geometry.transform import resize
from kornia.morphology import erosion
from torch.nn import functional as F
from torch.optim import SGD, Adam
from tqdm import tqdm
from .modules.ffc import FFCResnetBlock
def move_to_device(obj, device):
if isinstance(obj, nn.Module):
return obj.to(device)
if torch.is_tensor(obj):
return obj.to(device)
if isinstance(obj, (tuple, list)):
return [move_to_device(el, device) for el in obj]
if isinstance(obj, dict):
return {name: move_to_device(val, device) for name, val in obj.items()}
raise ValueError(f'Unexpected type {type(obj)}')
def ceil_modulo(x, mod):
if x % mod == 0:
return x
return (x // mod + 1) * mod
def pad_tensor_to_modulo(img, mod):
batch_size, channels, height, width = img.shape
out_height = ceil_modulo(height, mod)
out_width = ceil_modulo(width, mod)
return F.pad(
img,
pad=(0, out_width - width, 0, out_height - height),
mode='reflect')
def _pyrdown(im: torch.Tensor, downsize: tuple = None):
"""downscale the image"""
if downsize is None:
downsize = (im.shape[2] // 2, im.shape[3] // 2)
assert im.shape[
1] == 3, 'Expected shape for the input to be (n,3,height,width)'
im = gaussian_blur2d(im, kernel_size=(5, 5), sigma=(1.0, 1.0))
im = F.interpolate(im, size=downsize, mode='bilinear', align_corners=False)
return im
def _pyrdown_mask(mask: torch.Tensor,
downsize: tuple = None,
eps: float = 1e-8,
blur_mask: bool = True,
round_up: bool = True):
"""downscale the mask tensor
Parameters
----------
mask : torch.Tensor
mask of size (B, 1, H, W)
downsize : tuple, optional
size to downscale to. If None, image is downscaled to half, by default None
eps : float, optional
threshold value for binarizing the mask, by default 1e-8
blur_mask : bool, optional
if True, apply gaussian filter before downscaling, by default True
round_up : bool, optional
if True, values >= eps are rounded up to 1; otherwise values < 1-eps are rounded down to 0, by default True
Returns
-------
torch.Tensor
downscaled mask
"""
if downsize is None:
downsize = (mask.shape[2] // 2, mask.shape[3] // 2)
assert mask.shape[
1] == 1, 'Expected shape for the input to be (n,1,height,width)'
if blur_mask is True:
mask = gaussian_blur2d(mask, kernel_size=(5, 5), sigma=(1.0, 1.0))
mask = F.interpolate(
mask, size=downsize, mode='bilinear', align_corners=False)
else:
mask = F.interpolate(
mask, size=downsize, mode='bilinear', align_corners=False)
if round_up:
mask[mask >= eps] = 1
mask[mask < eps] = 0
else:
mask[mask >= 1.0 - eps] = 1
mask[mask < 1.0 - eps] = 0
return mask
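A small example of halving a binary mask with the helper above (not part of the diff); the mask density is arbitrary.
import torch
mask = (torch.rand(1, 1, 64, 64) > 0.8).float()
half = _pyrdown_mask(mask, blur_mask=True, round_up=True)
print(half.shape)                # -> torch.Size([1, 1, 32, 32])
print(half.unique())             # values stay binary (0 and/or 1)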
def _erode_mask(mask: torch.Tensor,
ekernel: torch.Tensor = None,
eps: float = 1e-8):
"""erode the mask, and set gray pixels to 0"""
if ekernel is not None:
mask = erosion(mask, ekernel)
mask[mask >= 1.0 - eps] = 1
mask[mask < 1.0 - eps] = 0
return mask
def _l1_loss(pred: torch.Tensor,
pred_downscaled: torch.Tensor,
ref: torch.Tensor,
mask: torch.Tensor,
mask_downscaled: torch.Tensor,
image: torch.Tensor,
on_pred: bool = True):
"""l1 loss on src pixels, and downscaled predictions if on_pred=True"""
loss = torch.mean(torch.abs(pred[mask < 1e-8] - image[mask < 1e-8]))
if on_pred:
loss += torch.mean(
torch.abs(pred_downscaled[mask_downscaled >= 1e-8]
- ref[mask_downscaled >= 1e-8]))
return loss
def _infer(image: torch.Tensor,
mask: torch.Tensor,
forward_front: nn.Module,
forward_rears: nn.Module,
ref_lower_res: torch.Tensor,
orig_shape: tuple,
devices: list,
scale_ind: int,
n_iters: int = 15,
lr: float = 0.002):
"""Performs inference with refinement at a given scale.
Parameters
----------
image : torch.Tensor
input image to be inpainted, of size (1,3,H,W)
mask : torch.Tensor
input inpainting mask, of size (1,1,H,W)
forward_front : nn.Module
the front part of the inpainting network
forward_rears : nn.Module
the rear part of the inpainting network
ref_lower_res : torch.Tensor
the inpainting at previous scale, used as reference image
orig_shape : tuple
shape of the original input image before padding
devices : list
list of available devices
scale_ind : int
the scale index
n_iters : int, optional
number of iterations of refinement, by default 15
lr : float, optional
learning rate, by default 0.002
Returns
-------
torch.Tensor
inpainted image
"""
masked_image = image * (1 - mask)
masked_image = torch.cat([masked_image, mask], dim=1)
mask = mask.repeat(1, 3, 1, 1)
if ref_lower_res is not None:
ref_lower_res = ref_lower_res.detach()
with torch.no_grad():
z1, z2 = forward_front(masked_image)
# Inference
mask = mask.to(devices[-1])
ekernel = torch.from_numpy(
cv2.getStructuringElement(cv2.MORPH_ELLIPSE,
(15, 15)).astype(bool)).float()
ekernel = ekernel.to(devices[-1])
image = image.to(devices[-1])
z1, z2 = z1.detach().to(devices[0]), z2.detach().to(devices[0])
z1.requires_grad, z2.requires_grad = True, True
optimizer = Adam([z1, z2], lr=lr)
pbar = tqdm(range(n_iters), leave=False)
for idi in pbar:
optimizer.zero_grad()
input_feat = (z1, z2)
for idd, forward_rear in enumerate(forward_rears):
output_feat = forward_rear(input_feat)
if idd < len(devices) - 1:
midz1, midz2 = output_feat
midz1, midz2 = midz1.to(devices[idd + 1]), midz2.to(
devices[idd + 1])
input_feat = (midz1, midz2)
else:
pred = output_feat
if ref_lower_res is None:
break
losses = {}
# scaled loss with downsampler
pred_downscaled = _pyrdown(pred[:, :, :orig_shape[0], :orig_shape[1]])
mask_downscaled = _pyrdown_mask(
mask[:, :1, :orig_shape[0], :orig_shape[1]],
blur_mask=False,
round_up=False)
mask_downscaled = _erode_mask(mask_downscaled, ekernel=ekernel)
mask_downscaled = mask_downscaled.repeat(1, 3, 1, 1)
losses['ms_l1'] = _l1_loss(
pred,
pred_downscaled,
ref_lower_res,
mask,
mask_downscaled,
image,
on_pred=True)
loss = sum(losses.values())
pbar.set_description(
'Refining scale {} using scale {} ...current loss: {:.4f}'.format(
scale_ind + 1, scale_ind, loss.item()))
if idi < n_iters - 1:
loss.backward()
optimizer.step()
del pred_downscaled
del loss
del pred
# "pred" is the prediction after Plug-n-Play module
inpainted = mask * pred + (1 - mask) * image
inpainted = inpainted.detach().cpu()
return inpainted
def _get_image_mask_pyramid(batch: dict, min_side: int, max_scales: int,
px_budget: int):
"""Build the image mask pyramid
Parameters
----------
batch : dict
batch containing image, mask, etc
min_side : int
minimum side length to limit the number of scales of the pyramid
max_scales : int
maximum number of scales allowed
px_budget : int
the product H*W cannot exceed this budget, because of resource constraints
Returns
-------
tuple
image-mask pyramid in the form of list of images and list of masks
"""
assert batch['image'].shape[
0] == 1, 'refiner works only on batches of size 1!'
h, w = batch['unpad_to_size']
h, w = h[0].item(), w[0].item()
image = batch['image'][..., :h, :w]
mask = batch['mask'][..., :h, :w]
if h * w > px_budget:
# resize
ratio = np.sqrt(px_budget / float(h * w))
h_orig, w_orig = h, w
h, w = int(h * ratio), int(w * ratio)
print(
f'Original image too large for refinement! Resizing {(h_orig,w_orig)} to {(h,w)}...'
)
image = resize(
image, (h, w), interpolation='bilinear', align_corners=False)
mask = resize(
mask, (h, w), interpolation='bilinear', align_corners=False)
mask[mask > 1e-8] = 1
breadth = min(h, w)
n_scales = min(1 + int(round(max(0, np.log2(breadth / min_side)))),
max_scales)
ls_images = []
ls_masks = []
ls_images.append(image)
ls_masks.append(mask)
for _ in range(n_scales - 1):
image_p = _pyrdown(ls_images[-1])
mask_p = _pyrdown_mask(ls_masks[-1])
ls_images.append(image_p)
ls_masks.append(mask_p)
# reverse the lists because we want the lowest resolution image as index 0
return ls_images[::-1], ls_masks[::-1]
def refine_predict(batch: dict, inpainter: nn.Module, gpu_ids: str,
modulo: int, n_iters: int, lr: float, min_side: int,
max_scales: int, px_budget: int):
"""Refines the inpainting of the network
Parameters
----------
batch : dict
image-mask batch, currently we assume the batchsize to be 1
inpainter : nn.Module
the inpainting neural network
gpu_ids : str
the GPU ids of the machine to use. If only single GPU, use: "0,"
modulo : int
pad the image to ensure dimension % modulo == 0
n_iters : int
number of iterations of refinement for each scale
lr : float
learning rate
min_side : int
all sides of image on all scales should be >= min_side / sqrt(2)
max_scales : int
max number of downscaling scales for the image-mask pyramid
px_budget : int
pixels budget. Any image will be resized to satisfy height*width <= px_budget
Returns
-------
torch.Tensor
inpainted image of size (1,3,H,W)
"""
inpainter = inpainter.model
assert not inpainter.training
assert not inpainter.add_noise_kwargs
assert inpainter.concat_mask
gpu_ids = [
f'cuda:{gpuid}' for gpuid in gpu_ids.replace(' ', '').split(',')
if gpuid.isdigit()
]
n_resnet_blocks = 0
first_resblock_ind = 0
found_first_resblock = False
for idl in range(len(inpainter.generator.model)):
if isinstance(inpainter.generator.model[idl], FFCResnetBlock):
n_resnet_blocks += 1
found_first_resblock = True
elif not found_first_resblock:
first_resblock_ind += 1
resblocks_per_gpu = n_resnet_blocks // len(gpu_ids)
devices = [torch.device(gpu_id) for gpu_id in gpu_ids]
# split the model into front, and rear parts
forward_front = inpainter.generator.model[0:first_resblock_ind]
forward_front.to(devices[0])
forward_rears = []
for idd in range(len(gpu_ids)):
if idd < len(gpu_ids) - 1:
forward_rears.append(
inpainter.generator.model[first_resblock_ind
+ resblocks_per_gpu
* (idd):first_resblock_ind
+ resblocks_per_gpu * (idd + 1)])
else:
forward_rears.append(
inpainter.generator.model[first_resblock_ind
+ resblocks_per_gpu * (idd):])
forward_rears[idd].to(devices[idd])
ls_images, ls_masks = _get_image_mask_pyramid(batch, min_side, max_scales,
px_budget)
image_inpainted = None
for ids, (image, mask) in enumerate(zip(ls_images, ls_masks)):
orig_shape = image.shape[2:]
image = pad_tensor_to_modulo(image, modulo)
mask = pad_tensor_to_modulo(mask, modulo)
mask[mask >= 1e-8] = 1.0
mask[mask < 1e-8] = 0.0
image, mask = move_to_device(image, devices[0]), move_to_device(
mask, devices[0])
if image_inpainted is not None:
image_inpainted = move_to_device(image_inpainted, devices[-1])
image_inpainted = _infer(image, mask, forward_front, forward_rears,
image_inpainted, orig_shape, devices, ids,
n_iters, lr)
image_inpainted = image_inpainted[:, :, :orig_shape[0], :orig_shape[1]]
# detach everything to save resources
image = image.detach().cpu()
mask = mask.detach().cpu()
return image_inpainted

View File

@@ -10,7 +10,7 @@ if TYPE_CHECKING:
else:
_import_structure = {
'mmdet_model': ['DetectionModel'],
-    'yolox_pai': ['YOLOX']
+    'yolox_pai': ['YOLOX'],
}
import sys

View File

@@ -9,6 +9,9 @@ from modelscope.utils.constant import Tasks
@MODELS.register_module(
group_key=Tasks.image_object_detection, module_name=Models.yolox)
@MODELS.register_module(
group_key=Tasks.image_object_detection,
module_name=Models.image_object_detection_auto)
class YOLOX(EasyCVBaseModel, _YOLOX):
def __init__(self, model_dir=None, *args, **kwargs):

View File

@@ -5,9 +5,11 @@ from modelscope.utils.import_utils import LazyImportModule
if TYPE_CHECKING:
from .realtime_detector import RealtimeDetector
from .realtime_video_detector import RealtimeVideoDetector
else:
_import_structure = {
'realtime_detector': ['RealtimeDetector'],
'realtime_video_detector': ['RealtimeVideoDetector'],
}
import sys

View File

@@ -0,0 +1,117 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
import argparse
import logging as logger
import os
import os.path as osp
import time
import cv2
import json
import torch
from tqdm import tqdm
from modelscope.metainfo import Models
from modelscope.models.base.base_torch_model import TorchModel
from modelscope.models.builder import MODELS
from modelscope.preprocessors import LoadImage
from modelscope.utils.config import Config
from modelscope.utils.constant import ModelFile, Tasks
from .yolox.data.data_augment import ValTransform
from .yolox.exp import get_exp_by_name
from .yolox.utils import postprocess
@MODELS.register_module(
group_key=Tasks.video_object_detection,
module_name=Models.realtime_video_object_detection)
class RealtimeVideoDetector(TorchModel):
def __init__(self, model_dir: str, *args, **kwargs):
super().__init__(model_dir, *args, **kwargs)
self.config = Config.from_file(
os.path.join(self.model_dir, ModelFile.CONFIGURATION))
# model type
self.exp = get_exp_by_name(self.config.model_type)
# build model
self.model = self.exp.get_model()
model_path = osp.join(model_dir, ModelFile.TORCH_MODEL_BIN_FILE)
ckpt = torch.load(model_path, map_location='cpu')
# load the model state dict
self.model.load_state_dict(ckpt['model'])
self.model.eval()
# params setting
self.exp.num_classes = self.config.num_classes
self.confthre = self.config.conf_thr
self.num_classes = self.exp.num_classes
self.nmsthre = self.exp.nmsthre
self.test_size = self.exp.test_size
self.preproc = ValTransform(legacy=False)
self.current_buffer = None
self.label_mapping = self.config['labels']
def inference(self, img):
with torch.no_grad():
outputs, self.current_buffer = self.model(
img, buffer=self.current_buffer, mode='on_pipe')
return outputs
def forward(self, inputs):
return self.inference_video(inputs)
def preprocess(self, img):
img = LoadImage.convert_to_ndarray(img)
height, width = img.shape[:2]
self.ratio = min(self.test_size[0] / img.shape[0],
self.test_size[1] / img.shape[1])
img, _ = self.preproc(img, None, self.test_size)
img = torch.from_numpy(img).unsqueeze(0)
img = img.float()
# Automatic video decoding and preprocessing are not supported by Pipeline/Model,
# so the preprocessed frame tensor is moved to the model's device here when the model is on GPU
if next(self.model.parameters()).is_cuda:
img = img.to(next(self.model.parameters()).device)
return img
def postprocess(self, input):
outputs = postprocess(
input,
self.num_classes,
self.confthre,
self.nmsthre,
class_agnostic=True)
if len(outputs) == 1:
bboxes = outputs[0][:, 0:4].cpu().numpy() / self.ratio
scores = outputs[0][:, 5].cpu().numpy()
labels = outputs[0][:, 6].cpu().int().numpy()
pred_label_names = []
for lab in labels:
pred_label_names.append(self.label_mapping[lab])
return bboxes, scores, pred_label_names
def inference_video(self, v_path):
outputs = []
desc = 'Detecting video: {}'.format(v_path)
for frame, result in tqdm(
self.inference_video_iter(v_path), desc=desc):
outputs.append(result)
return outputs
def inference_video_iter(self, v_path):
capture = cv2.VideoCapture(v_path)
while capture.isOpened():
ret, frame = capture.read()
if not ret:
break
output = self.preprocess(frame)
output = self.inference(output)
output = self.postprocess(output)
yield frame, output
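A hedged usage sketch (not part of the diff). Both paths are hypothetical; the model directory must contain the configuration file and checkpoint that __init__ reads.
# hypothetical paths; model_dir must contain configuration.json and the
# checkpoint named by ModelFile.TORCH_MODEL_BIN_FILE
detector = RealtimeVideoDetector('path/to/streamyolo_model_dir')
results = detector('path/to/video.mp4')   # per-frame (bboxes, scores, label names)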

View File

@@ -13,6 +13,8 @@ def get_exp_by_name(exp_name):
from .default import YoloXNanoExp as YoloXExp
elif exp == 'yolox_tiny':
from .default import YoloXTinyExp as YoloXExp
elif exp == 'streamyolo':
from .default import StreamYoloExp as YoloXExp
else:
pass
return YoloXExp()

View File

@@ -1,5 +1,5 @@
# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
from .streamyolo import StreamYoloExp
from .yolox_nano import YoloXNanoExp
from .yolox_s import YoloXSExp
from .yolox_tiny import YoloXTinyExp

View File

@@ -0,0 +1,43 @@
# The implementation is based on StreamYOLO, available at https://github.com/yancie-yjr/StreamYOLO
import os
import sys
import torch
import torch.nn as nn
from ..yolox_base import Exp as YoloXExp
class StreamYoloExp(YoloXExp):
def __init__(self):
super(YoloXExp, self).__init__()
self.depth = 1.0
self.width = 1.0
self.num_classes = 8
self.test_size = (600, 960)
self.test_conf = 0.3
self.nmsthre = 0.65
def get_model(self):
from ...models import StreamYOLO, DFPPAFPN, TALHead
def init_yolo(M):
for m in M.modules():
if isinstance(m, nn.BatchNorm2d):
m.eps = 1e-3
m.momentum = 0.03
if getattr(self, 'model', None) is None:
in_channels = [256, 512, 1024]
backbone = DFPPAFPN(
self.depth, self.width, in_channels=in_channels)
head = TALHead(
self.num_classes,
self.width,
in_channels=in_channels,
gamma=1.0,
ignore_thr=0.5,
ignore_value=1.6)
self.model = StreamYOLO(backbone, head)
return self.model

View File

@@ -1,5 +1,4 @@
# The implementation is based on YOLOX, available at https://github.com/Megvii-BaseDetection/YOLOX
import os
import random

Some files were not shown because too many files have changed in this diff.