From 1625d3187309e7c9ae405907ec0e050dc946bd8f Mon Sep 17 00:00:00 2001 From: luminxu Date: Wed, 24 Nov 2021 13:31:18 +0800 Subject: [PATCH 01/33] add lstm_pm backbone --- mmpose/models/backbones/hrnet.py | 4 +- mmpose/models/backbones/lstm_pm.py | 420 +++++++++++++++++++++++++++++ 2 files changed, 422 insertions(+), 2 deletions(-) create mode 100644 mmpose/models/backbones/lstm_pm.py diff --git a/mmpose/models/backbones/hrnet.py b/mmpose/models/backbones/hrnet.py index 87dc8cef55..4cdaf946a2 100644 --- a/mmpose/models/backbones/hrnet.py +++ b/mmpose/models/backbones/hrnet.py @@ -215,8 +215,8 @@ def forward(self, x): class HRNet(nn.Module): """HRNet backbone. - `High-Resolution Representations for Labeling Pixels and Regions - `__ + `Deep High-Resolution Representation Learning for Human Pose Estimation + `__ Args: extra (dict): detailed configuration for each stage of HRNet. diff --git a/mmpose/models/backbones/lstm_pm.py b/mmpose/models/backbones/lstm_pm.py new file mode 100644 index 0000000000..29937c40bf --- /dev/null +++ b/mmpose/models/backbones/lstm_pm.py @@ -0,0 +1,420 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import copy + +import torch.nn as nn +from mmcv.cnn import (ConvModule, build_conv_layer, build_norm_layer,\ + constant_init, normal_init) +from torch.nn.modules.batchnorm import _BatchNorm + +from mmpose.utils import get_root_logger +from ..builder import BACKBONES +from .resnet import BasicBlock, Bottleneck, get_expansion +from .utils import load_checkpoint + + +class Init_LSTM(nn.Module): + """Initiate LSTM (Long Short-Term Memory). + + Args: + out_channels (int): Number of output channels. Default: 17. + stem_channels (int): Number of channels of stem features. Default: 32. + hidden_channels (int): Number of channels of hidden state. Default: 48. + """ + + def __init__(self, + out_channels=17, + stem_channels=32, + hidden_channels=48): + + self.conv_gx = build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=out_channels + stem_channels + 1, + out_channels=hidden_channels, + kernel_size=3, + stride=1, + padding=1, + bias=True) + self.conv_ix = build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=out_channels + stem_channels + 1, + out_channels=hidden_channels, + kernel_size=3, + stride=1, + padding=1, + bias=True) + self.conv_ox = build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=out_channels + stem_channels + 1, + out_channels=hidden_channels, + kernel_size=3, + stride=1, + padding=1, + bias=True) + + self.tanh = nn.Tanh() + self.sigmoid = nn.Sigmoid() + + def forward(self, x): + """Forward function.""" + gx = self.conv_gx(x) + ix = self.conv_ix(x) + ox = self.conv_ox(x) + + gx = self.tanh(gx) + ix = self.sigmoid(ix) + ox = self.sigmoid(ox) + + cell_1 = self.tanh(gx * ix) + hidden_1 = ox * cell_1 + return cell_1, hidden_1 + + +class LSTM(nn.Module): + """LSTM (Long Short-Term Memory) for LSTM Pose Mechine. + + Args: + out_channels (int): Number of output channels. Default: 17. + stem_channels (int): Number of channels of stem features. Default: 32. + hidden_channels (int): Number of channels of hidden state. Default: 48. 
+ """ + + def __init__(self, + out_channels=17, + stem_channels=32, + hidden_channels=48): + + self.conv_fx = build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=out_channels + stem_channels + 1, + out_channels=hidden_channels, + kernel_size=3, + stride=1, + padding=1, + bias=True) + self.conv_fh = build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=hidden_channels, + out_channels=hidden_channels, + kernel_size=3, + stride=1, + padding=1, + bias=False) + + self.conv_ix = build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=out_channels + stem_channels + 1, + out_channels=hidden_channels, + kernel_size=3, + stride=1, + padding=1, + bias=True) + self.conv_ih = build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=hidden_channels, + out_channels=hidden_channels, + kernel_size=3, + stride=1, + padding=1, + bias=False) + + self.conv_gx = build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=out_channels + stem_channels + 1, + out_channels=hidden_channels, + kernel_size=3, + stride=1, + padding=1, + bias=True) + self.conv_gh = build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=hidden_channels, + out_channels=hidden_channels, + kernel_size=3, + stride=1, + padding=1, + bias=False) + + self.conv_ox = build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=out_channels + stem_channels + 1, + out_channels=hidden_channels, + kernel_size=3, + stride=1, + padding=1, + bias=True) + self.conv_oh = build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=hidden_channels, + out_channels=hidden_channels, + kernel_size=3, + stride=1, + padding=1, + bias=False) + + self.tanh = nn.Tanh() + self.sigmoid = nn.Sigmoid() + + def forward(self, heatmap, feature, centermap, hidden_t_1, cell_t_1): + """Forward function.""" + x_t = torch.cat([heatmap, feature, centermap], dim=1) + + fx = self.conv_fx(x_t) + fh = self.conv_fh(hidden_t_1) + f_sum = fx + fh + f_t = self.sigmoid(f_sum) + + ix = self.conv_ix(x_t) + ih = self.conv_ih(hidden_t_1) + i_sum = ix + ih + i_t = self.sigmoid(i_sum) + + gx = self.conv_gx(x_t) + gh = self.conv_gh(hidden_t_1) + g_sum = gx + gh + g_t = self.tanh(g_sum) + + ox = self.conv_ox(x_t) + oh = self.conv_oh(hidden_t_1) + o_sum = ox + oh + o_t = self.sigmoid(o_sum) + + cell_t = f_t * cell_t_1 + i_t * g_t + hidden_t = o_t * self.tanh(cell_t) + + return cell_t, hidden_t + +@BACKBONES.register_module() +class LSTM_PM(nn.Module): + """LSTM Pose Mechine backbone. + + `LSTM Pose Machines + `__ + + Args: + in_channels (int): Number of input image channels. Default: 3. + out_channels (int): Number of output channels. Default: 17. + stem_channels (int): Number of channels of stem features. Default: 32. + hidden_channels (int): Number of channels of hidden state. Default: 48. + num_stages (int): Numerber of stages for propogation. Default: 9. + conv_cfg (dict | None): The config dict for conv layers. Default: None. + norm_cfg (dict | None): The config dict for norm layers. Default: None. + + TODO: check it after the format of inputs is decided. + Example: + >>> from mmpose.models import LSTM_PM + >>> import torch + >>> self = LSTM_PM(num_stages=3) + >>> self.eval() + >>> images = torch.rand(1, 21, 368, 368) + >>> centermap = torch.rand(1, 1, 368, 368) + >>> heatmaps = self.forward(images, centermap) + >>> for heatmap in heatmaps: + ... 
print(tuple(heatmap.shape)) + (1, 32, 46, 46) + (1, 32, 46, 46) + (1, 32, 46, 46) + (1, 32, 46, 46) + """ + + def __init__(self, + in_channels=3, + out_channels=17, + stem_channels=32, + hidden_channels=48, + num_stages=7, + conv_cfg=None, + norm_cfg=None): + super().__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.stem_channels = stem_channels + self.hidden_channels = hidden_channels + self.num_stages = num_stages + + self.conv_cfg = conv_cfg + self.norm_cfg = norm_cfg + + self.convnet1 = self._make_convnet1(self.in_channels) + self.convnet2 = self._make_convnet2(self.in_channels) + self.convnet3 = self._make_convnet3() + self.init_lstm = Init_LSTM(self.out_channels, self.stem_channels, self.hidden_channels) + self.lstm = LSTM(self.out_channels, self.stem_channels, self.hidden_channels) + + # TODO: May be generated in dataset as the last channel of target + self.pool_centermap = nn.AvgPool2d(kernel_size=9, stride=8) + + def _make_stem_layers(self, in_channels): + """Make stem layers.""" + layers = [] + layers.append( + ConvModule( + in_channels, + 128, + kernel_size=9, + stride=1, + padding=4, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=True)) + layers.append(nn.MaxPool2d(kernel_size=3, stride=2, padding=1)) + layers.append( + ConvModule( + 128, + 128, + kernel_size=9, + stride=1, + padding=4, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=True)) + layers.append(nn.MaxPool2d(kernel_size=3, stride=2, padding=1)) + layers.append( + ConvModule( + 128, + 128, + kernel_size=9, + stride=1, + padding=4, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=True)) + layers.append(nn.MaxPool2d(kernel_size=3, stride=2, padding=1)) + layers.append( + ConvModule( + 128, + self.stem_channels, + kernel_size=5, + stride=1, + padding=2, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=True)) + + return layers + + def _make_convnet1(self, in_channels): + """ConvNet1 for the initial image.""" + layers = self._make_stem_layers(in_channels) + layers.append( + ConvModule( + 32, + 512, + kernel_size=9, + stride=1, + padding=4, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=True)) + layers.append( + ConvModule( + 512, + 512, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=True)) + layers.append( + build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=512, + out_channels=self.out_channels, + kernel_size=1, + stride=1, + padding=0)) + + self.convnet1 = nn.Sequential(*layers) + + def _make_convnet2(self, in_channels): + """ConvNet2 for feature extraction.""" + layers = self._make_stem_layers(in_channels) + return nn.Sequential(*layers) + + def _make_convnet3(self): + """ConvNet3 for output.""" + layers = [] + layers.append( + ConvModule( + self.hidden_channels, + 128, + kernel_size=11, + stride=1, + padding=5, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=True)) + layers.append( + ConvModule( + 128, + 128, + kernel_size=11, + stride=1, + padding=5, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=True)) + layers.append( + ConvModule( + 128, + 128, + kernel_size=11, + stride=1, + padding=5, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=True)) + layers.append( + ConvModule( + 128, + 128, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + inplace=True)) + layers.append( + build_conv_layer( + cfg=dict(type='Conv2d'), + in_channels=128, + 
out_channels=self.out_channels, + kernel_size=1, + stride=1, + padding=0)) + + return nn.Sequential(*layers) + + def stage1(self, image, cmap): + """Forward function of the first stage.""" + initial_heatmap = self.convnet1(image) + feature = self.convnet2(image) + centermap = self.pool_centermap(cmap) + + x = torch.cat([initial_heatmap, feature, centermap], dim=1) + cell_1, hidden_1 = self.init_lstm(x) + heatmap = self.convnet3(hidden_1) + return initial_heatmap, heatmap, cell_1, hidden_1 + + def stage2(self, image, cmap, heatmap, cell_t_1, hidden_t_1): + """Forward function of the propagation stages.""" + features = self.convnet2(image) + centermap = self.pool_centermap(cmap) + cell_t, hidden_t = self.lstm(heatmap, features, centermap, hidden_t_1, cell_t_1) + current_heatmap = self.convnet3(hidden_t) + return current_heatmap, cell_t, hidden_t + + def forward(self, images, centermap): + """Forward function.""" + heatmaps = [] + + image = images[:, :self.in_channels, :, :] + initial_heatmap, heatmap, cell, hidden = self.stage1(image, centermap) + heatmaps.append(initial_heatmap) + heatmaps.append(heatmap) + + for i in range(1, self.num_stages): + image = images[:, self.in_channels * i: self.in_channels * (i + 1), :, :] + heatmap, cell, hidden = self.stage2(image, centermap, heatmap, cell, hidden) + heat_maps.append(heatmap) + return heatmaps From bf6dbb635f79b04eadb69df02d85c7232a7da552 Mon Sep 17 00:00:00 2001 From: luminxu Date: Wed, 24 Nov 2021 14:07:06 +0800 Subject: [PATCH 02/33] add backbone init --- mmpose/models/backbones/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mmpose/models/backbones/__init__.py b/mmpose/models/backbones/__init__.py index fb3bf1d01d..c6f14a62f0 100644 --- a/mmpose/models/backbones/__init__.py +++ b/mmpose/models/backbones/__init__.py @@ -4,6 +4,7 @@ from .hourglass import HourglassNet from .hourglass_ae import HourglassAENet from .hrnet import HRNet +from .lstm_pm import LSTM_PM from .litehrnet import LiteHRNet from .mobilenet_v2 import MobileNetV2 from .mobilenet_v3 import MobileNetV3 @@ -28,5 +29,5 @@ 'MobileNetV3', 'RegNet', 'ResNet', 'ResNetV1d', 'ResNeXt', 'SCNet', 'SEResNet', 'SEResNeXt', 'ShuffleNetV1', 'ShuffleNetV2', 'CPM', 'RSN', 'MSPN', 'ResNeSt', 'VGG', 'TCN', 'ViPNAS_ResNet', 'ViPNAS_MobileNetV3', - 'LiteHRNet' + 'LiteHRNet', 'LSTM_PM' ] From 38226cd7bcbecd4bb23a167ea8fa47f9e1277b40 Mon Sep 17 00:00:00 2001 From: luminxu Date: Wed, 8 Dec 2021 15:04:36 +0800 Subject: [PATCH 03/33] add init_weights function --- mmpose/models/backbones/lstm_pm.py | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/mmpose/models/backbones/lstm_pm.py b/mmpose/models/backbones/lstm_pm.py index 29937c40bf..ba1f2b57a4 100644 --- a/mmpose/models/backbones/lstm_pm.py +++ b/mmpose/models/backbones/lstm_pm.py @@ -1,15 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-import copy - +import torch import torch.nn as nn -from mmcv.cnn import (ConvModule, build_conv_layer, build_norm_layer,\ - constant_init, normal_init) +from mmcv.cnn import (ConvModule, build_conv_layer, constant_init, + kaiming_init) from torch.nn.modules.batchnorm import _BatchNorm -from mmpose.utils import get_root_logger from ..builder import BACKBONES -from .resnet import BasicBlock, Bottleneck, get_expansion -from .utils import load_checkpoint +from .base_backbone import BaseBackbone class Init_LSTM(nn.Module): @@ -184,7 +181,7 @@ def forward(self, heatmap, feature, centermap, hidden_t_1, cell_t_1): return cell_t, hidden_t @BACKBONES.register_module() -class LSTM_PM(nn.Module): +class LSTM_PM(BaseBackbone): """LSTM Pose Mechine backbone. `LSTM Pose Machines @@ -404,6 +401,16 @@ def stage2(self, image, cmap, heatmap, cell_t_1, hidden_t_1): current_heatmap = self.convnet3(hidden_t) return current_heatmap, cell_t, hidden_t + def init_weights(self, pretrained=None): + """Initialize the weights in backbone.""" + super().init_weights(pretrained) + if pretrained is None: + for m in self.modules(): + if isinstance(m, nn.Conv2d): + kaiming_init(m) + elif isinstance(m, (_BatchNorm, nn.GroupNorm)): + constant_init(m, 1) + def forward(self, images, centermap): """Forward function.""" heatmaps = [] @@ -416,5 +423,5 @@ def forward(self, images, centermap): for i in range(1, self.num_stages): image = images[:, self.in_channels * i: self.in_channels * (i + 1), :, :] heatmap, cell, hidden = self.stage2(image, centermap, heatmap, cell, hidden) - heat_maps.append(heatmap) + heatmaps.append(heatmap) return heatmaps From 3401989adfb4437c90157a24fdb05f2faad88be5 Mon Sep 17 00:00:00 2001 From: jin-s13 Date: Wed, 8 Dec 2021 16:37:01 +0800 Subject: [PATCH 04/33] format --- docs/faq.md | 8 +-- mmpose/models/backbones/__init__.py | 2 +- mmpose/models/backbones/lstm_pm.py | 91 ++++++++++++++--------------- 3 files changed, 50 insertions(+), 51 deletions(-) diff --git a/docs/faq.md b/docs/faq.md index 035e681a39..c4f818b9d4 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -36,12 +36,12 @@ If the contents here do not cover your issue, please create an issue using the [ ## Data - **How to convert my 2d keypoint dataset to coco-type?** - + You may refer to this conversion [tool](https://github.com/open-mmlab/mmpose/blob/master/tools/dataset/parse_macaquepose_dataset.py) to prepare your data. Here is an [example](https://github.com/open-mmlab/mmpose/blob/master/tests/data/macaque/test_macaque.json) of the coco-type json. - In the coco-type json, we need "categories", "annotations" and "images". "categories" contain some basic information of the dataset, e.g. class name and keypoint names. - "images" contain image-level information. We need "id", "file_name", "height", "width". Others are optional. - Note: (1) It is okay that "id"s are not continuous or not sorted (e.g. 1000, 40, 352, 333 ...). + In the coco-type json, we need "categories", "annotations" and "images". "categories" contain some basic information of the dataset, e.g. class name and keypoint names. + "images" contain image-level information. We need "id", "file_name", "height", "width". Others are optional. + Note: (1) It is okay that "id"s are not continuous or not sorted (e.g. 1000, 40, 352, 333 ...). "annotations" contain instance-level information. We need "image_id", "id", "keypoints", "num_keypoints", "bbox", "iscrowd", "area", "category_id". Others are optional. Note: (1) "num_keypoints" means the number of visible keypoints. 
(2) By default, please set "iscrowd: 0". (3) "area" can be calculated using the bbox (area = w * h) (4) Simply set "category_id: 1". (5) The "image_id" in "annotations" should match the "id" in "images". diff --git a/mmpose/models/backbones/__init__.py b/mmpose/models/backbones/__init__.py index c6f14a62f0..ab888b0b8c 100644 --- a/mmpose/models/backbones/__init__.py +++ b/mmpose/models/backbones/__init__.py @@ -4,8 +4,8 @@ from .hourglass import HourglassNet from .hourglass_ae import HourglassAENet from .hrnet import HRNet -from .lstm_pm import LSTM_PM from .litehrnet import LiteHRNet +from .lstm_pm import LSTM_PM from .mobilenet_v2 import MobileNetV2 from .mobilenet_v3 import MobileNetV3 from .mspn import MSPN diff --git a/mmpose/models/backbones/lstm_pm.py b/mmpose/models/backbones/lstm_pm.py index ba1f2b57a4..2c107de293 100644 --- a/mmpose/models/backbones/lstm_pm.py +++ b/mmpose/models/backbones/lstm_pm.py @@ -1,8 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import torch import torch.nn as nn -from mmcv.cnn import (ConvModule, build_conv_layer, constant_init, - kaiming_init) +from mmcv.cnn import ConvModule, build_conv_layer, constant_init, kaiming_init from torch.nn.modules.batchnorm import _BatchNorm from ..builder import BACKBONES @@ -18,10 +17,7 @@ class Init_LSTM(nn.Module): hidden_channels (int): Number of channels of hidden state. Default: 48. """ - def __init__(self, - out_channels=17, - stem_channels=32, - hidden_channels=48): + def __init__(self, out_channels=17, stem_channels=32, hidden_channels=48): self.conv_gx = build_conv_layer( cfg=dict(type='Conv2d'), @@ -67,7 +63,7 @@ def forward(self, x): class LSTM(nn.Module): - """LSTM (Long Short-Term Memory) for LSTM Pose Mechine. + """LSTM (Long Short-Term Memory) for LSTM Pose Machine. Args: out_channels (int): Number of output channels. Default: 17. @@ -75,10 +71,7 @@ class LSTM(nn.Module): hidden_channels (int): Number of channels of hidden state. Default: 48. """ - def __init__(self, - out_channels=17, - stem_channels=32, - hidden_channels=48): + def __init__(self, out_channels=17, stem_channels=32, hidden_channels=48): self.conv_fx = build_conv_layer( cfg=dict(type='Conv2d'), @@ -180,38 +173,39 @@ def forward(self, heatmap, feature, centermap, hidden_t_1, cell_t_1): return cell_t, hidden_t + @BACKBONES.register_module() class LSTM_PM(BaseBackbone): - """LSTM Pose Mechine backbone. - - `LSTM Pose Machines - `__ - - Args: - in_channels (int): Number of input image channels. Default: 3. - out_channels (int): Number of output channels. Default: 17. - stem_channels (int): Number of channels of stem features. Default: 32. - hidden_channels (int): Number of channels of hidden state. Default: 48. - num_stages (int): Numerber of stages for propogation. Default: 9. - conv_cfg (dict | None): The config dict for conv layers. Default: None. - norm_cfg (dict | None): The config dict for norm layers. Default: None. - - TODO: check it after the format of inputs is decided. - Example: - >>> from mmpose.models import LSTM_PM - >>> import torch - >>> self = LSTM_PM(num_stages=3) - >>> self.eval() - >>> images = torch.rand(1, 21, 368, 368) - >>> centermap = torch.rand(1, 1, 368, 368) - >>> heatmaps = self.forward(images, centermap) - >>> for heatmap in heatmaps: - ... print(tuple(heatmap.shape)) - (1, 32, 46, 46) - (1, 32, 46, 46) - (1, 32, 46, 46) - (1, 32, 46, 46) - """ + """LSTM Pose Machine backbone. + + `LSTM Pose Machines + `__ + + Args: + in_channels (int): Number of input image channels. Default: 3. 
+ out_channels (int): Number of output channels. Default: 17. + stem_channels (int): Number of channels of stem features. Default: 32. + hidden_channels (int): Number of channels of hidden state. Default: 48. + num_stages (int): Numerber of stages for propagation. Default: 9. + conv_cfg (dict | None): The config dict for conv layers. Default: None. + norm_cfg (dict | None): The config dict for norm layers. Default: None. + + TODO: check it after the format of inputs is decided. + Example: + >>> from mmpose.models import LSTM_PM + >>> import torch + >>> self = LSTM_PM(num_stages=3) + >>> self.eval() + >>> images = torch.rand(1, 21, 368, 368) + >>> centermap = torch.rand(1, 1, 368, 368) + >>> heatmaps = self.forward(images, centermap) + >>> for heatmap in heatmaps: + ... print(tuple(heatmap.shape)) + (1, 32, 46, 46) + (1, 32, 46, 46) + (1, 32, 46, 46) + (1, 32, 46, 46) + """ def __init__(self, in_channels=3, @@ -234,8 +228,10 @@ def __init__(self, self.convnet1 = self._make_convnet1(self.in_channels) self.convnet2 = self._make_convnet2(self.in_channels) self.convnet3 = self._make_convnet3() - self.init_lstm = Init_LSTM(self.out_channels, self.stem_channels, self.hidden_channels) - self.lstm = LSTM(self.out_channels, self.stem_channels, self.hidden_channels) + self.init_lstm = Init_LSTM(self.out_channels, self.stem_channels, + self.hidden_channels) + self.lstm = LSTM(self.out_channels, self.stem_channels, + self.hidden_channels) # TODO: May be generated in dataset as the last channel of target self.pool_centermap = nn.AvgPool2d(kernel_size=9, stride=8) @@ -397,7 +393,8 @@ def stage2(self, image, cmap, heatmap, cell_t_1, hidden_t_1): """Forward function of the propagation stages.""" features = self.convnet2(image) centermap = self.pool_centermap(cmap) - cell_t, hidden_t = self.lstm(heatmap, features, centermap, hidden_t_1, cell_t_1) + cell_t, hidden_t = self.lstm(heatmap, features, centermap, hidden_t_1, + cell_t_1) current_heatmap = self.convnet3(hidden_t) return current_heatmap, cell_t, hidden_t @@ -421,7 +418,9 @@ def forward(self, images, centermap): heatmaps.append(heatmap) for i in range(1, self.num_stages): - image = images[:, self.in_channels * i: self.in_channels * (i + 1), :, :] - heatmap, cell, hidden = self.stage2(image, centermap, heatmap, cell, hidden) + image = images[:, self.in_channels * i:self.in_channels * + (i + 1), :, :] + heatmap, cell, hidden = self.stage2(image, centermap, heatmap, + cell, hidden) heatmaps.append(heatmap) return heatmaps From 8cb55beab30774cb71b3e91510c4b205ca762d3f Mon Sep 17 00:00:00 2001 From: jin-s13 Date: Wed, 8 Dec 2021 16:53:26 +0800 Subject: [PATCH 05/33] refactor --- mmpose/models/backbones/lstm_pm.py | 145 ++++++++++------------------- 1 file changed, 50 insertions(+), 95 deletions(-) diff --git a/mmpose/models/backbones/lstm_pm.py b/mmpose/models/backbones/lstm_pm.py index 2c107de293..ed5f11d7a1 100644 --- a/mmpose/models/backbones/lstm_pm.py +++ b/mmpose/models/backbones/lstm_pm.py @@ -8,60 +8,6 @@ from .base_backbone import BaseBackbone -class Init_LSTM(nn.Module): - """Initiate LSTM (Long Short-Term Memory). - - Args: - out_channels (int): Number of output channels. Default: 17. - stem_channels (int): Number of channels of stem features. Default: 32. - hidden_channels (int): Number of channels of hidden state. Default: 48. 
- """ - - def __init__(self, out_channels=17, stem_channels=32, hidden_channels=48): - - self.conv_gx = build_conv_layer( - cfg=dict(type='Conv2d'), - in_channels=out_channels + stem_channels + 1, - out_channels=hidden_channels, - kernel_size=3, - stride=1, - padding=1, - bias=True) - self.conv_ix = build_conv_layer( - cfg=dict(type='Conv2d'), - in_channels=out_channels + stem_channels + 1, - out_channels=hidden_channels, - kernel_size=3, - stride=1, - padding=1, - bias=True) - self.conv_ox = build_conv_layer( - cfg=dict(type='Conv2d'), - in_channels=out_channels + stem_channels + 1, - out_channels=hidden_channels, - kernel_size=3, - stride=1, - padding=1, - bias=True) - - self.tanh = nn.Tanh() - self.sigmoid = nn.Sigmoid() - - def forward(self, x): - """Forward function.""" - gx = self.conv_gx(x) - ix = self.conv_ix(x) - ox = self.conv_ox(x) - - gx = self.tanh(gx) - ix = self.sigmoid(ix) - ox = self.sigmoid(ox) - - cell_1 = self.tanh(gx * ix) - hidden_1 = ox * cell_1 - return cell_1, hidden_1 - - class LSTM(nn.Module): """LSTM (Long Short-Term Memory) for LSTM Pose Machine. @@ -81,6 +27,7 @@ def __init__(self, out_channels=17, stem_channels=32, hidden_channels=48): stride=1, padding=1, bias=True) + self.conv_fh = build_conv_layer( cfg=dict(type='Conv2d'), in_channels=hidden_channels, @@ -98,6 +45,7 @@ def __init__(self, out_channels=17, stem_channels=32, hidden_channels=48): stride=1, padding=1, bias=True) + self.conv_ih = build_conv_layer( cfg=dict(type='Conv2d'), in_channels=hidden_channels, @@ -115,6 +63,7 @@ def __init__(self, out_channels=17, stem_channels=32, hidden_channels=48): stride=1, padding=1, bias=True) + self.conv_gh = build_conv_layer( cfg=dict(type='Conv2d'), in_channels=hidden_channels, @@ -132,6 +81,7 @@ def __init__(self, out_channels=17, stem_channels=32, hidden_channels=48): stride=1, padding=1, bias=True) + self.conv_oh = build_conv_layer( cfg=dict(type='Conv2d'), in_channels=hidden_channels, @@ -144,31 +94,46 @@ def __init__(self, out_channels=17, stem_channels=32, hidden_channels=48): self.tanh = nn.Tanh() self.sigmoid = nn.Sigmoid() - def forward(self, heatmap, feature, centermap, hidden_t_1, cell_t_1): + def init_forward(self, x): + """Forward function.""" + gx = self.conv_gx(x) + ix = self.conv_ix(x) + ox = self.conv_ox(x) + + gx = self.tanh(gx) + ix = self.sigmoid(ix) + ox = self.sigmoid(ox) + + cell_1 = self.tanh(gx * ix) + hidden_1 = ox * cell_1 + + return cell_1, hidden_1 + + def forward(self, heatmap, feature, centermap, hidden_t, cell_t): """Forward function.""" x_t = torch.cat([heatmap, feature, centermap], dim=1) fx = self.conv_fx(x_t) - fh = self.conv_fh(hidden_t_1) + fh = self.conv_fh(hidden_t) f_sum = fx + fh f_t = self.sigmoid(f_sum) ix = self.conv_ix(x_t) - ih = self.conv_ih(hidden_t_1) + ih = self.conv_ih(hidden_t) i_sum = ix + ih i_t = self.sigmoid(i_sum) gx = self.conv_gx(x_t) - gh = self.conv_gh(hidden_t_1) + gh = self.conv_gh(hidden_t) g_sum = gx + gh g_t = self.tanh(g_sum) ox = self.conv_ox(x_t) - oh = self.conv_oh(hidden_t_1) + oh = self.conv_oh(hidden_t) o_sum = ox + oh o_t = self.sigmoid(o_sum) - cell_t = f_t * cell_t_1 + i_t * g_t + cell_t = f_t * cell_t + i_t * g_t hidden_t = o_t * self.tanh(cell_t) return cell_t, hidden_t @@ -225,11 +190,9 @@ def __init__(self, self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg - self.convnet1 = self._make_convnet1(self.in_channels) - self.convnet2 = self._make_convnet2(self.in_channels) - self.convnet3 = self._make_convnet3() - self.init_lstm = Init_LSTM(self.out_channels, self.stem_channels, - 
self.hidden_channels) + self.conv1 = self._make_conv1(self.in_channels) + self.conv2 = self._make_conv2(self.in_channels) + self.conv3 = self._make_conv3() self.lstm = LSTM(self.out_channels, self.stem_channels, self.hidden_channels) @@ -285,8 +248,8 @@ def _make_stem_layers(self, in_channels): return layers - def _make_convnet1(self, in_channels): - """ConvNet1 for the initial image.""" + def _make_conv1(self, in_channels): + """Make conv1 for the initial image.""" layers = self._make_stem_layers(in_channels) layers.append( ConvModule( @@ -317,15 +280,15 @@ def _make_convnet1(self, in_channels): stride=1, padding=0)) - self.convnet1 = nn.Sequential(*layers) + self.conv1 = nn.Sequential(*layers) - def _make_convnet2(self, in_channels): - """ConvNet2 for feature extraction.""" + def _make_conv2(self, in_channels): + """Make conv2 for feature extraction.""" layers = self._make_stem_layers(in_channels) return nn.Sequential(*layers) - def _make_convnet3(self): - """ConvNet3 for output.""" + def _make_conv3(self): + """Make conv3 for output.""" layers = [] layers.append( ConvModule( @@ -378,26 +341,6 @@ def _make_convnet3(self): return nn.Sequential(*layers) - def stage1(self, image, cmap): - """Forward function of the first stage.""" - initial_heatmap = self.convnet1(image) - feature = self.convnet2(image) - centermap = self.pool_centermap(cmap) - - x = torch.cat([initial_heatmap, feature, centermap], dim=1) - cell_1, hidden_1 = self.init_lstm(x) - heatmap = self.convnet3(hidden_1) - return initial_heatmap, heatmap, cell_1, hidden_1 - - def stage2(self, image, cmap, heatmap, cell_t_1, hidden_t_1): - """Forward function of the propagation stages.""" - features = self.convnet2(image) - centermap = self.pool_centermap(cmap) - cell_t, hidden_t = self.lstm(heatmap, features, centermap, hidden_t_1, - cell_t_1) - current_heatmap = self.convnet3(hidden_t) - return current_heatmap, cell_t, hidden_t - def init_weights(self, pretrained=None): """Initialize the weights in backbone.""" super().init_weights(pretrained) @@ -413,14 +356,26 @@ def forward(self, images, centermap): heatmaps = [] image = images[:, :self.in_channels, :, :] - initial_heatmap, heatmap, cell, hidden = self.stage1(image, centermap) + # Stage1 + initial_heatmap = self.conv1(image) + feature = self.conv2(image) + centermap = self.pool_centermap(centermap) + + x = torch.cat([initial_heatmap, feature, centermap], dim=1) + cell, hidden = self.lstm.init_forward(x) + heatmap = self.conv3(hidden) + heatmaps.append(initial_heatmap) heatmaps.append(heatmap) for i in range(1, self.num_stages): image = images[:, self.in_channels * i:self.in_channels * (i + 1), :, :] - heatmap, cell, hidden = self.stage2(image, centermap, heatmap, - cell, hidden) + features = self.conv2(image) + centermap = self.pool_centermap(centermap) + cell, hidden = self.lstm(heatmap, features, centermap, hidden, + cell) + heatmap = self.conv3(hidden) + heatmaps.append(heatmap) return heatmaps From ed90d42d4952645f6d32892dfebffa7092fee212 Mon Sep 17 00:00:00 2001 From: luminxu Date: Wed, 8 Dec 2021 19:56:38 +0800 Subject: [PATCH 06/33] add lstm_pm detector --- mmpose/models/detectors/__init__.py | 3 +- mmpose/models/detectors/lstm_pm.py | 177 ++++++++++++++++++++++++++++ 2 files changed, 179 insertions(+), 1 deletion(-) create mode 100644 mmpose/models/detectors/lstm_pm.py diff --git a/mmpose/models/detectors/__init__.py b/mmpose/models/detectors/__init__.py index 07a1dfdb1a..b005f95af4 100644 --- a/mmpose/models/detectors/__init__.py +++ 
b/mmpose/models/detectors/__init__.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. from .associative_embedding import AssociativeEmbedding from .interhand_3d import Interhand3D +from .lstm_pm import LSTM_PM from .mesh import ParametricMesh from .multi_task import MultiTask from .pose_lifter import PoseLifter @@ -9,5 +10,5 @@ __all__ = [ 'TopDown', 'AssociativeEmbedding', 'ParametricMesh', 'MultiTask', - 'PoseLifter', 'Interhand3D', 'PoseWarper' + 'PoseLifter', 'Interhand3D', 'PoseWarper', 'LSTM_PM' ] diff --git a/mmpose/models/detectors/lstm_pm.py b/mmpose/models/detectors/lstm_pm.py new file mode 100644 index 0000000000..7d7578b1d2 --- /dev/null +++ b/mmpose/models/detectors/lstm_pm.py @@ -0,0 +1,177 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import warnings + +import numpy as np +import torch + +from ..builder import POSENETS +from .top_down import TopDown + +try: + from mmcv.runner import auto_fp16 +except ImportError: + warnings.warn('auto_fp16 from mmpose will be deprecated from v0.15.0' + 'Please install mmcv>=1.1.4') + from mmpose.core import auto_fp16 + + +@POSENETS.register_module() +class LSTM_PM(TopDown): + """Top-down pose detectors for LSTM Pose Mechine. + Paper ref: Luo, Yue, et al. "Lstm pose machines." Proceedings of the IEEE + conference on computer vision and pattern recognition (2018). + + <``https://arxiv.org/abs/1712.06316``> + + A child class of TopDown detector. + + Args: + backbone (dict): Backbone modules to extract features. + neck (dict): intermediate modules to transform features. + keypoint_head (dict): Keypoint head to process feature. + train_cfg (dict): Config for training. Default: None. + test_cfg (dict): Config for testing. Default: None. + pretrained (str): Path to the pretrained models. + loss_pose (None): Deprecated arguments. Please use + `loss_keypoint` for heads instead. + concat_tensors (bool): Whether to concat the tensors on the batch dim, + which can speed up, Default: True + """ + + def __init__(self, + backbone, + neck=None, + keypoint_head=None, + train_cfg=None, + test_cfg=None, + pretrained=None, + loss_pose=None): + super().__init__( + backbone=backbone, + neck=neck, + keypoint_head=keypoint_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + pretrained=pretrained, + loss_pose=loss_pose) + + @auto_fp16(apply_to=('img', )) + def forward(self, + img, + target=None, + target_weight=None, + img_metas=None, + return_loss=True, + return_heatmap=False, + **kwargs): + """Calls either forward_train or forward_test depending on whether + return_loss=True. Note this setting will change the expected inputs. + When `return_loss=True`, img and img_meta are single-nested (i.e. + Tensor and List[dict]), and when `resturn_loss=False`, img and img_meta + should be double nested (i.e. List[Tensor], List[List[dict]]), with + the outer list indicating test time augmentations. + + Note: + number of frames: F + batch_size: N + num_keypoints: K + num_img_channel: C (Default: 3) + img height: imgH + img width: imgW + heatmaps height: H + heatmaps weight: W + + Args: + imgs (list[Fxtorch.Tensor[NxCximgHximgW]]): multiple input frames + target (torch.Tensor[NxKxHxW]): Target heatmaps for one frame. + target_weight (torch.Tensor[NxKx1]): Weights across + different joint types. 
+ img_metas (list(dict)): Information about data augmentation + By default this includes: + - "image_file: paths to multiple video frames + - "center": center of the bbox + - "scale": scale of the bbox + - "rotation": rotation of the bbox + - "bbox_score": score of bbox + return_loss (bool): Option to `return loss`. `return loss=True` + for training, `return loss=False` for validation & test. + return_heatmap (bool) : Option to return heatmap. + + Returns: + dict|tuple: if `return loss` is true, then return losses. + Otherwise, return predicted poses, boxes, image paths + and heatmaps. + """ + if return_loss: + return self.forward_train(img, target, target_weight, img_metas, + **kwargs) + return self.forward_test( + img, img_metas, return_heatmap=return_heatmap, **kwargs) + + def forward_train(self, imgs, target, target_weight, img_metas, **kwargs): + """Defines the computation performed at every call when training.""" + # imgs (list[Fxtorch.Tensor[NxCximgHximgW]]): multiple input frames + assert imgs[0].size(0) == len(img_metas) + output = self.backbone(imgs) + if self.with_neck: + output = self.neck(output) + if self.with_keypoint: + output = self.keypoint_head(output) + + output = torch.cat(output, 0) + target = torch.cat(target, 0) + target_weight = torch.cat(target_weight, 0) + + # if return loss + losses = dict() + if self.with_keypoint: + keypoint_losses = self.keypoint_head.get_loss( + output, target, target_weight) + losses.update(keypoint_losses) + keypoint_accuracy = self.keypoint_head.get_accuracy( + output, target, target_weight) + losses.update(keypoint_accuracy) + + return losses + + def forward_test(self, imgs, img_metas, return_heatmap=False, **kwargs): + """Defines the computation performed at every call when testing.""" + # imgs (list[Fxtorch.Tensor[NxCximgHximgW]]): multiple input frames + assert imgs[0].size(0) == len(img_metas) + batch_size, _, img_height, img_width = imgs[0].shape + if batch_size > 1: + assert 'bbox_id' in img_metas[0] + + result = {} + + features = self.backbone(imgs) + if self.with_neck: + features = self.neck(features) + if self.with_keypoint: + output_heatmap = self.keypoint_head.inference_model( + features, flip_pairs=None) + + if self.test_cfg.get('flip_test', True): + imgs_flipped = [img.flip(3) for img in imgs] + features_flipped = self.backbone(imgs_flipped) + if self.with_neck: + features_flipped = self.neck(features_flipped) + if self.with_keypoint: + output_flipped_heatmap = self.keypoint_head.inference_model( + features_flipped, img_metas[0]['flip_pairs']) + output_heatmap = (output_heatmap + + output_flipped_heatmap) * 0.5 + + output_heatmap = torch.cat(output_heatmap, 0) + + if self.with_keypoint: + keypoint_result = self.keypoint_head.decode( + img_metas, output_heatmap, img_size=[img_width, img_height]) + result.update(keypoint_result) + + if not return_heatmap: + output_heatmap = None + + result['output_heatmap'] = output_heatmap + + return result From 0709ae6872e2fc51c7a5e6fe43edfa01fa767223 Mon Sep 17 00:00:00 2001 From: luminxu Date: Fri, 17 Dec 2021 13:09:41 +0800 Subject: [PATCH 07/33] rename detector --- mmpose/models/detectors/__init__.py | 4 ++-- mmpose/models/detectors/lstm_pm.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/mmpose/models/detectors/__init__.py b/mmpose/models/detectors/__init__.py index b005f95af4..6b1bb4c8f5 100644 --- a/mmpose/models/detectors/__init__.py +++ b/mmpose/models/detectors/__init__.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from .associative_embedding import AssociativeEmbedding from .interhand_3d import Interhand3D -from .lstm_pm import LSTM_PM +from .lstm_pm import LSTMPoseMachine from .mesh import ParametricMesh from .multi_task import MultiTask from .pose_lifter import PoseLifter @@ -10,5 +10,5 @@ __all__ = [ 'TopDown', 'AssociativeEmbedding', 'ParametricMesh', 'MultiTask', - 'PoseLifter', 'Interhand3D', 'PoseWarper', 'LSTM_PM' + 'PoseLifter', 'Interhand3D', 'PoseWarper', 'LSTMPoseMachine' ] diff --git a/mmpose/models/detectors/lstm_pm.py b/mmpose/models/detectors/lstm_pm.py index 7d7578b1d2..2d8483406c 100644 --- a/mmpose/models/detectors/lstm_pm.py +++ b/mmpose/models/detectors/lstm_pm.py @@ -16,8 +16,8 @@ @POSENETS.register_module() -class LSTM_PM(TopDown): - """Top-down pose detectors for LSTM Pose Mechine. +class LSTMPoseMachine(TopDown): + """Top-down pose detectors for LSTM Pose Machine. Paper ref: Luo, Yue, et al. "Lstm pose machines." Proceedings of the IEEE conference on computer vision and pattern recognition (2018). From 6457a460ec4198e59166a37b2ce2797b68962479 Mon Sep 17 00:00:00 2001 From: jin-s13 Date: Tue, 18 Jan 2022 20:56:05 +0800 Subject: [PATCH 08/33] update backbone --- mmpose/models/backbones/lstm_pm.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mmpose/models/backbones/lstm_pm.py b/mmpose/models/backbones/lstm_pm.py index ed5f11d7a1..789bbbb83f 100644 --- a/mmpose/models/backbones/lstm_pm.py +++ b/mmpose/models/backbones/lstm_pm.py @@ -192,7 +192,7 @@ def __init__(self, self.conv1 = self._make_conv1(self.in_channels) self.conv2 = self._make_conv2(self.in_channels) - self.conv3 = self._make_conv3() + self.conv3 = self._make_conv3(self.hidden_channels) self.lstm = LSTM(self.out_channels, self.stem_channels, self.hidden_channels) @@ -287,12 +287,12 @@ def _make_conv2(self, in_channels): layers = self._make_stem_layers(in_channels) return nn.Sequential(*layers) - def _make_conv3(self): + def _make_conv3(self, in_channels): """Make conv3 for output.""" layers = [] layers.append( ConvModule( - self.hidden_channels, + in_channels, 128, kernel_size=11, stride=1, From 05874b2f49d2edae40ccb9cf5e9c95cf0c95907e Mon Sep 17 00:00:00 2001 From: jin-s13 Date: Tue, 18 Jan 2022 22:02:48 +0800 Subject: [PATCH 09/33] add topdown_jhmdb_video_dataset --- .../top_down/topdown_jhmdb_video_dataset.py | 410 ++++++++++++++++++ .../topdown_posetrack18_video_dataset.py | 11 +- 2 files changed, 415 insertions(+), 6 deletions(-) create mode 100644 mmpose/datasets/datasets/top_down/topdown_jhmdb_video_dataset.py diff --git a/mmpose/datasets/datasets/top_down/topdown_jhmdb_video_dataset.py b/mmpose/datasets/datasets/top_down/topdown_jhmdb_video_dataset.py new file mode 100644 index 0000000000..12a522ccbf --- /dev/null +++ b/mmpose/datasets/datasets/top_down/topdown_jhmdb_video_dataset.py @@ -0,0 +1,410 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import os.path as osp +import warnings +from collections import OrderedDict + +import json_tricks as json +import numpy as np +from mmcv import Config + +from mmpose.core.evaluation.top_down_eval import keypoint_pck_accuracy +from ...builder import DATASETS +from ..base import Kpt2dSviewRgbVidTopDownDataset + + +@DATASETS.register_module() +class TopDownJhmdbVideoDataset(Kpt2dSviewRgbVidTopDownDataset): + """JhmdbDataset dataset for top-down pose estimation. 
+ + `Towards understanding action recognition + `__ + + The dataset loads raw features and apply specified transforms + to return a dict containing the image tensors and other information. + + sub-JHMDB keypoint indexes:: + 0: "neck", + 1: "belly", + 2: "head", + 3: "right_shoulder", + 4: "left_shoulder", + 5: "right_hip", + 6: "left_hip", + 7: "right_elbow", + 8: "left_elbow", + 9: "right_knee", + 10: "left_knee", + 11: "right_wrist", + 12: "left_wrist", + 13: "right_ankle", + 14: "left_ankle" + + Args: + ann_file (str): Path to the annotation file. + img_prefix (str): Path to a directory where images are held. + Default: None. + data_cfg (dict): config + pipeline (list[dict | callable]): A sequence of data transforms. + dataset_info (DatasetInfo): A class containing all dataset info. + test_mode (bool): Store True when building test or + validation dataset. Default: False. + """ + + def __init__(self, + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=None, + test_mode=False, + ph_fill_len=5): + + if dataset_info is None: + warnings.warn( + 'dataset_info is missing. ' + 'Check https://github.com/open-mmlab/mmpose/pull/663 ' + 'for details.', DeprecationWarning) + cfg = Config.fromfile('configs/_base_/datasets/jhmdb.py') + dataset_info = cfg._cfg_dict['dataset_info'] + + super().__init__( + ann_file, + img_prefix, + data_cfg, + pipeline, + dataset_info=dataset_info, + test_mode=test_mode) + + self.use_gt_bbox = data_cfg['use_gt_bbox'] + self.bbox_file = data_cfg['bbox_file'] + self.det_bbox_thr = data_cfg.get('det_bbox_thr', 0.0) + self.use_nms = data_cfg.get('use_nms', True) + self.soft_nms = data_cfg['soft_nms'] + self.nms_thr = data_cfg['nms_thr'] + self.oks_thr = data_cfg['oks_thr'] + self.vis_thr = data_cfg['vis_thr'] + self.frame_weight_train = data_cfg['frame_weight_train'] + self.frame_weight_test = data_cfg['frame_weight_test'] + self.frame_weight = self.frame_weight_test \ + if self.test_mode else self.frame_weight_train + + self.ph_fill_len = ph_fill_len + + # select the frame indices + self.frame_index_rand = data_cfg.get('frame_index_rand', True) + self.frame_index_range = data_cfg.get('frame_index_range', [-2, 2]) + self.num_adj_frames = data_cfg.get('num_adj_frames', 1) + self.frame_indices_train = data_cfg.get('frame_indices_train', None) + self.frame_indices_test = data_cfg.get('frame_indices_test', + [-2, -1, 0, 1, 2]) + + if self.frame_indices_train is not None: + self.frame_indices_train.sort() + self.frame_indices_test.sort() + + self.db = self._get_db() + + print(f'=> num_images: {self.num_images}') + print(f'=> load {len(self.db)} samples') + + def _get_db(self): + """Load dataset.""" + assert self.use_gt_bbox + gt_db = self._load_coco_keypoint_annotations() + return gt_db + + def _load_coco_keypoint_annotations(self): + """Ground truth bbox and keypoints.""" + gt_db = [] + for img_id in self.img_ids: + gt_db.extend(self._load_coco_keypoint_annotation_kernel(img_id)) + return gt_db + + def _load_coco_keypoint_annotation_kernel(self, img_id): + """load annotation from COCOAPI. 
+ + Note: + bbox:[x1, y1, w, h] + Args: + img_id: coco image id + Returns: + dict: db entry + """ + img_ann = self.coco.loadImgs(img_id)[0] + width = img_ann['width'] + height = img_ann['height'] + num_joints = self.ann_info['num_joints'] + + file_name = img_ann['file_name'] + nframes = int(img_ann['nframes']) + frame_id = int(img_ann['frame_id']) + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + # sanitize bboxes + valid_objs = [] + for obj in objs: + if 'bbox' not in obj: + continue + x, y, w, h = obj['bbox'] + # JHMDB uses matlab format, index is 1-based, + # we should first convert to 0-based index + x -= 1 + y -= 1 + x1 = max(0, x) + y1 = max(0, y) + x2 = min(width - 1, x1 + max(0, w - 1)) + y2 = min(height - 1, y1 + max(0, h - 1)) + if ('area' not in obj or obj['area'] > 0) and x2 > x1 and y2 > y1: + obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1] + valid_objs.append(obj) + objs = valid_objs + + bbox_id = 0 + rec = [] + for obj in objs: + if 'keypoints' not in obj: + continue + if max(obj['keypoints']) == 0: + continue + if 'num_keypoints' in obj and obj['num_keypoints'] == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + + # JHMDB uses matlab format, index is 1-based, + # we should first convert to 0-based index + joints_3d[:, :2] = keypoints[:, :2] - 1 + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + center, scale = self._xywh2cs(*obj['clean_bbox'][:4]) + + image_files = [] + cur_image_file = os.path.join(self.img_prefix, + self.id2name[img_id]) + image_files.append(cur_image_file) + + # "images/val/012834_mpii_test/000000.jpg" --> 0 + ref_idx = int(osp.splitext(osp.basename(file_name))[0]) + + # select the frame indices + if not self.test_mode and self.frame_indices_train is not None: + indices = self.frame_indices_train + elif not self.test_mode and self.frame_index_rand: + low, high = self.frame_index_range + indices = np.random.randint(low, high + 1, self.num_adj_frames) + else: + indices = self.frame_indices_test + + for index in indices: + if self.test_mode and index == 0: + continue + # the supporting frame index + support_idx = ref_idx + index + support_idx = np.clip(support_idx, 0, nframes - 1) + sup_image_file = osp.join( + osp.dirname(cur_image_file), + str(support_idx).zfill(self.ph_fill_len) + '.jpg') + + if osp.exists(sup_image_file): + image_files.append(sup_image_file) + else: + warnings.warn(f'{sup_image_file} does not exist, ' + f'use {cur_image_file} instead.') + image_files.append(cur_image_file) + rec.append({ + 'image_file': image_files, + 'center': center, + 'scale': scale, + 'bbox': obj['clean_bbox'][:4], + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox_score': 1, + 'bbox_id': f'{img_id}_{bbox_id:03}', + 'nframes': nframes, + 'frame_id': frame_id, + 'frame_weight': self.frame_weight + }) + bbox_id = bbox_id + 1 + + return rec + + def _write_keypoint_results(self, keypoints, res_file): + """Write results into a json file.""" + + with open(res_file, 'w') as f: + json.dump(keypoints, f, sort_keys=True, indent=4) + + def _report_metric(self, res_file, metrics, pck_thr=0.2): + """Keypoint evaluation. + + Args: + res_file (str): Json file stored prediction results. + metrics (str | list[str]): Metric to be performed. + Options: 'PCK', 'PCKh', 'AUC', 'EPE'. 
+ pck_thr (float): PCK threshold, default as 0.2. + pckh_thr (float): PCKh threshold, default as 0.7. + auc_nor (float): AUC normalization factor, default as 30 pixel. + + Returns: + List: Evaluation results for evaluation metric. + """ + info_str = [] + + with open(res_file, 'r') as fin: + preds = json.load(fin) + assert len(preds) == len(self.db) + + outputs = [] + gts = [] + masks = [] + threshold_bbox = [] + threshold_torso = [] + + for pred, item in zip(preds, self.db): + outputs.append(np.array(pred['keypoints'])[:, :-1]) + gts.append(np.array(item['joints_3d'])[:, :-1]) + masks.append((np.array(item['joints_3d_visible'])[:, 0]) > 0) + if 'PCK' in metrics: + bbox = np.array(item['bbox']) + bbox_thr = np.max(bbox[2:]) + threshold_bbox.append(np.array([bbox_thr, bbox_thr])) + + if 'tPCK' in metrics: + torso_thr = np.linalg.norm(item['joints_3d'][4, :2] - + item['joints_3d'][5, :2]) + if torso_thr < 1: + torso_thr = np.linalg.norm( + np.array(pred['keypoints'])[4, :2] - + np.array(pred['keypoints'])[5, :2]) + warnings.warn('Torso Size < 1.') + threshold_torso.append(np.array([torso_thr, torso_thr])) + + outputs = np.array(outputs) + gts = np.array(gts) + masks = np.array(masks) + threshold_bbox = np.array(threshold_bbox) + threshold_torso = np.array(threshold_torso) + + if 'PCK' in metrics: + pck_p, pck, _ = keypoint_pck_accuracy(outputs, gts, masks, pck_thr, + threshold_bbox) + + stats_names = [ + 'Head PCK', 'Sho PCK', 'Elb PCK', 'Wri PCK', 'Hip PCK', + 'Knee PCK', 'Ank PCK', 'Mean PCK' + ] + + stats = [ + pck_p[2], 0.5 * pck_p[3] + 0.5 * pck_p[4], + 0.5 * pck_p[7] + 0.5 * pck_p[8], + 0.5 * pck_p[11] + 0.5 * pck_p[12], + 0.5 * pck_p[5] + 0.5 * pck_p[6], + 0.5 * pck_p[9] + 0.5 * pck_p[10], + 0.5 * pck_p[13] + 0.5 * pck_p[14], pck + ] + + info_str.extend(list(zip(stats_names, stats))) + + if 'tPCK' in metrics: + pck_p, pck, _ = keypoint_pck_accuracy(outputs, gts, masks, pck_thr, + threshold_torso) + + stats_names = [ + 'Head tPCK', 'Sho tPCK', 'Elb tPCK', 'Wri tPCK', 'Hip tPCK', + 'Knee tPCK', 'Ank tPCK', 'Mean tPCK' + ] + + stats = [ + pck_p[2], 0.5 * pck_p[3] + 0.5 * pck_p[4], + 0.5 * pck_p[7] + 0.5 * pck_p[8], + 0.5 * pck_p[11] + 0.5 * pck_p[12], + 0.5 * pck_p[5] + 0.5 * pck_p[6], + 0.5 * pck_p[9] + 0.5 * pck_p[10], + 0.5 * pck_p[13] + 0.5 * pck_p[14], pck + ] + + info_str.extend(list(zip(stats_names, stats))) + + return info_str + + def evaluate(self, outputs, res_folder, metric='PCK', **kwargs): + """Evaluate onehand10k keypoint results. The pose prediction results + will be saved in `${res_folder}/result_keypoints.json`. + + Note: + batch_size: N + num_keypoints: K + heatmap height: H + heatmap width: W + + Args: + outputs (list(preds, boxes, image_path, output_heatmap)) + :preds (np.ndarray[N,K,3]): The first two dimensions are + coordinates, score is the third dimension of the array. + :boxes (np.ndarray[N,6]): [center[0], center[1], scale[0] + , scale[1],area, score] + :image_path (list[str]) + :output_heatmap (np.ndarray[N, K, H, W]): model outputs. + + res_folder (str): Path of directory to save the results. + metric (str | list[str]): Metric to be performed. + Options: 'PCK', 'tPCK'. + PCK means normalized by the bounding boxes, while tPCK + means normalized by the torso size. + + Returns: + dict: Evaluation results for evaluation metric. 
+ """ + metrics = metric if isinstance(metric, list) else [metric] + allowed_metrics = ['PCK', 'tPCK'] + for metric in metrics: + if metric not in allowed_metrics: + raise KeyError(f'metric {metric} is not supported') + + res_file = os.path.join(res_folder, 'result_keypoints.json') + + kpts = [] + + for output in outputs: + preds = output['preds'] + boxes = output['boxes'] + image_paths = output['image_paths'] + bbox_ids = output['bbox_ids'] + + # convert 0-based index to 1-based index, + # and get the first two dimensions. + preds[..., :2] += 1.0 + batch_size = len(image_paths) + for i in range(batch_size): + # TODO: It should be extended to include multi-frame cases + if not isinstance(image_paths[i], list): + image_id = self.name2id[image_paths[i] + [len(self.img_prefix):]] + else: + image_id = self.name2id[image_paths[i][0] + [len(self.img_prefix):]] + + kpts.append({ + 'keypoints': preds[i], + 'center': boxes[i][0:2], + 'scale': boxes[i][2:4], + 'area': boxes[i][4], + 'score': boxes[i][5], + 'image_id': image_id, + 'bbox_id': bbox_ids[i] + }) + kpts = self._sort_and_unique_bboxes(kpts) + + self._write_keypoint_results(kpts, res_file) + info_str = self._report_metric(res_file, metrics) + name_value = OrderedDict(info_str) + + return name_value diff --git a/mmpose/datasets/datasets/top_down/topdown_posetrack18_video_dataset.py b/mmpose/datasets/datasets/top_down/topdown_posetrack18_video_dataset.py index 43e1c4f1ff..95471a7856 100644 --- a/mmpose/datasets/datasets/top_down/topdown_posetrack18_video_dataset.py +++ b/mmpose/datasets/datasets/top_down/topdown_posetrack18_video_dataset.py @@ -187,9 +187,8 @@ def _load_coco_keypoint_annotation_kernel(self, img_id): self.id2name[img_id]) image_files.append(cur_image_file) - # "images/val/012834_mpii_test/000000.jpg" -->> "000000.jpg" - cur_image_name = file_name.split('/')[-1] - ref_idx = int(cur_image_name.replace('.jpg', '')) + # "images/val/012834_mpii_test/000000.jpg" --> 0 + ref_idx = int(osp.splitext(osp.basename(file_name))[0]) # select the frame indices if not self.test_mode and self.frame_indices_train is not None: @@ -206,11 +205,11 @@ def _load_coco_keypoint_annotation_kernel(self, img_id): # the supporting frame index support_idx = ref_idx + index support_idx = np.clip(support_idx, 0, nframes - 1) - sup_image_file = cur_image_file.replace( - cur_image_name, + sup_image_file = osp.join( + osp.dirname(cur_image_file), str(support_idx).zfill(self.ph_fill_len) + '.jpg') - if os.path.exists(sup_image_file): + if osp.exists(sup_image_file): image_files.append(sup_image_file) else: warnings.warn(f'{sup_image_file} does not exist, ' From 610762eeffefe1c38287a58ef11ed736197f135b Mon Sep 17 00:00:00 2001 From: luminxu Date: Thu, 20 Jan 2022 11:26:27 +0800 Subject: [PATCH 10/33] delete centermap in input --- mmpose/models/backbones/lstm_pm.py | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/mmpose/models/backbones/lstm_pm.py b/mmpose/models/backbones/lstm_pm.py index 789bbbb83f..0a0cc2d37e 100644 --- a/mmpose/models/backbones/lstm_pm.py +++ b/mmpose/models/backbones/lstm_pm.py @@ -109,9 +109,9 @@ def init_forward(self, x): return cell_1, hidden_1 - def forward(self, heatmap, feature, centermap, hidden_t, cell_t): + def forward(self, heatmap, feature, hidden_t, cell_t): """Forward function.""" - x_t = torch.cat([heatmap, feature, centermap], dim=1) + x_t = torch.cat([heatmap, feature], dim=1) fx = self.conv_fx(x_t) fh = self.conv_fh(hidden_t) @@ -161,9 +161,8 @@ class LSTM_PM(BaseBackbone): >>> 
import torch >>> self = LSTM_PM(num_stages=3) >>> self.eval() - >>> images = torch.rand(1, 21, 368, 368) - >>> centermap = torch.rand(1, 1, 368, 368) - >>> heatmaps = self.forward(images, centermap) + >>> images = torch.rand(1, 12, 368, 368) + >>> heatmaps = self.forward(images) >>> for heatmap in heatmaps: ... print(tuple(heatmap.shape)) (1, 32, 46, 46) @@ -196,9 +195,6 @@ def __init__(self, self.lstm = LSTM(self.out_channels, self.stem_channels, self.hidden_channels) - # TODO: May be generated in dataset as the last channel of target - self.pool_centermap = nn.AvgPool2d(kernel_size=9, stride=8) - def _make_stem_layers(self, in_channels): """Make stem layers.""" layers = [] @@ -351,30 +347,28 @@ def init_weights(self, pretrained=None): elif isinstance(m, (_BatchNorm, nn.GroupNorm)): constant_init(m, 1) - def forward(self, images, centermap): + def forward(self, images): """Forward function.""" heatmaps = [] - image = images[:, :self.in_channels, :, :] # Stage1 + image = images[:, :self.in_channels, :, :] initial_heatmap = self.conv1(image) feature = self.conv2(image) - centermap = self.pool_centermap(centermap) - x = torch.cat([initial_heatmap, feature, centermap], dim=1) + x = torch.cat([initial_heatmap, feature], dim=1) cell, hidden = self.lstm.init_forward(x) heatmap = self.conv3(hidden) heatmaps.append(initial_heatmap) heatmaps.append(heatmap) + # Stage2 for i in range(1, self.num_stages): image = images[:, self.in_channels * i:self.in_channels * (i + 1), :, :] features = self.conv2(image) - centermap = self.pool_centermap(centermap) - cell, hidden = self.lstm(heatmap, features, centermap, hidden, - cell) + cell, hidden = self.lstm(heatmap, features, hidden, cell) heatmap = self.conv3(hidden) heatmaps.append(heatmap) From 3986b98036444f3a13eb0519e2e0b18ff163a02b Mon Sep 17 00:00:00 2001 From: luminxu Date: Fri, 21 Jan 2022 00:07:05 +0800 Subject: [PATCH 11/33] modify jhmdb video dataset for lstm_pm --- .../top_down/topdown_jhmdb_video_dataset.py | 266 +++++++++--------- 1 file changed, 136 insertions(+), 130 deletions(-) diff --git a/mmpose/datasets/datasets/top_down/topdown_jhmdb_video_dataset.py b/mmpose/datasets/datasets/top_down/topdown_jhmdb_video_dataset.py index 12a522ccbf..5d65b6b054 100644 --- a/mmpose/datasets/datasets/top_down/topdown_jhmdb_video_dataset.py +++ b/mmpose/datasets/datasets/top_down/topdown_jhmdb_video_dataset.py @@ -80,29 +80,28 @@ def __init__(self, self.use_gt_bbox = data_cfg['use_gt_bbox'] self.bbox_file = data_cfg['bbox_file'] self.det_bbox_thr = data_cfg.get('det_bbox_thr', 0.0) - self.use_nms = data_cfg.get('use_nms', True) self.soft_nms = data_cfg['soft_nms'] self.nms_thr = data_cfg['nms_thr'] self.oks_thr = data_cfg['oks_thr'] self.vis_thr = data_cfg['vis_thr'] - self.frame_weight_train = data_cfg['frame_weight_train'] - self.frame_weight_test = data_cfg['frame_weight_test'] - self.frame_weight = self.frame_weight_test \ - if self.test_mode else self.frame_weight_train self.ph_fill_len = ph_fill_len # select the frame indices - self.frame_index_rand = data_cfg.get('frame_index_rand', True) - self.frame_index_range = data_cfg.get('frame_index_range', [-2, 2]) - self.num_adj_frames = data_cfg.get('num_adj_frames', 1) - self.frame_indices_train = data_cfg.get('frame_indices_train', None) - self.frame_indices_test = data_cfg.get('frame_indices_test', - [-2, -1, 0, 1, 2]) - - if self.frame_indices_train is not None: - self.frame_indices_train.sort() - self.frame_indices_test.sort() + frame_indices_train = data_cfg.get('frame_indices_train', + [0, 1, 2, 3, 
4]) + frame_indices_test = data_cfg.get('frame_indices_test', + [0, 1, 2, 3, 4]) + self.frame_indices = frame_indices_test if self.test_mode \ + else frame_indices_train + self.frame_indices.sort() + assert 0 in self.frame_indices + + frame_interval_train = data_cfg.get('frame_interval_train', 1) + frame_interval_test = data_cfg.get('frame_interval_test', 5) + self.frame_interval = frame_interval_train if self.test_mode \ + else frame_interval_test + assert self.frame_interval > 0 self.db = self._get_db() @@ -116,125 +115,132 @@ def _get_db(self): return gt_db def _load_coco_keypoint_annotations(self): - """Ground truth bbox and keypoints.""" - gt_db = [] - for img_id in self.img_ids: - gt_db.extend(self._load_coco_keypoint_annotation_kernel(img_id)) + """Load ground truth image annotations and group them into clips.""" + self._load_coco_keypoint_image_annotations() + gt_db = self._form_clip_annotations() return gt_db - def _load_coco_keypoint_annotation_kernel(self, img_id): - """load annotation from COCOAPI. + def _load_coco_keypoint_image_annotations(self): + """load image annotations from COCOAPI.""" + lookup_db = {} + image_db = [] - Note: - bbox:[x1, y1, w, h] - Args: - img_id: coco image id - Returns: - dict: db entry - """ - img_ann = self.coco.loadImgs(img_id)[0] - width = img_ann['width'] - height = img_ann['height'] - num_joints = self.ann_info['num_joints'] - - file_name = img_ann['file_name'] - nframes = int(img_ann['nframes']) - frame_id = int(img_ann['frame_id']) - - ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) - objs = self.coco.loadAnns(ann_ids) - - # sanitize bboxes - valid_objs = [] - for obj in objs: - if 'bbox' not in obj: - continue - x, y, w, h = obj['bbox'] - # JHMDB uses matlab format, index is 1-based, - # we should first convert to 0-based index - x -= 1 - y -= 1 - x1 = max(0, x) - y1 = max(0, y) - x2 = min(width - 1, x1 + max(0, w - 1)) - y2 = min(height - 1, y1 + max(0, h - 1)) - if ('area' not in obj or obj['area'] > 0) and x2 > x1 and y2 > y1: - obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1] - valid_objs.append(obj) - objs = valid_objs - - bbox_id = 0 - rec = [] - for obj in objs: - if 'keypoints' not in obj: - continue - if max(obj['keypoints']) == 0: - continue - if 'num_keypoints' in obj and obj['num_keypoints'] == 0: - continue - joints_3d = np.zeros((num_joints, 3), dtype=np.float32) - joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) - - keypoints = np.array(obj['keypoints']).reshape(-1, 3) - - # JHMDB uses matlab format, index is 1-based, - # we should first convert to 0-based index - joints_3d[:, :2] = keypoints[:, :2] - 1 - joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) - - center, scale = self._xywh2cs(*obj['clean_bbox'][:4]) - - image_files = [] - cur_image_file = os.path.join(self.img_prefix, - self.id2name[img_id]) - image_files.append(cur_image_file) - - # "images/val/012834_mpii_test/000000.jpg" --> 0 - ref_idx = int(osp.splitext(osp.basename(file_name))[0]) - - # select the frame indices - if not self.test_mode and self.frame_indices_train is not None: - indices = self.frame_indices_train - elif not self.test_mode and self.frame_index_rand: - low, high = self.frame_index_range - indices = np.random.randint(low, high + 1, self.num_adj_frames) - else: - indices = self.frame_indices_test - - for index in indices: - if self.test_mode and index == 0: + for img_id in self.img_ids: + img_ann = self.coco.loadImgs(img_id)[0] + width = img_ann['width'] + height = img_ann['height'] + num_joints = 
self.ann_info['num_joints'] + + nframes = int(img_ann['nframes']) + vid_id = int(img_ann['vid_id']) + frame_id = int(int(img_ann['frame_id']) % 10000) + + if vid_id not in lookup_db.keys(): + lookup_db[vid_id] = {} + lookup_db[vid_id]['nframes'] = nframes + if frame_id not in lookup_db[vid_id].keys(): + lookup_db[vid_id][frame_id] = {} + + ann_ids = self.coco.getAnnIds(imgIds=img_id, iscrowd=False) + objs = self.coco.loadAnns(ann_ids) + + # sanitize bboxes + valid_objs = [] + for obj in objs: + if 'bbox' not in obj: continue - # the supporting frame index - support_idx = ref_idx + index - support_idx = np.clip(support_idx, 0, nframes - 1) - sup_image_file = osp.join( - osp.dirname(cur_image_file), - str(support_idx).zfill(self.ph_fill_len) + '.jpg') - - if osp.exists(sup_image_file): - image_files.append(sup_image_file) - else: - warnings.warn(f'{sup_image_file} does not exist, ' - f'use {cur_image_file} instead.') - image_files.append(cur_image_file) - rec.append({ - 'image_file': image_files, - 'center': center, - 'scale': scale, - 'bbox': obj['clean_bbox'][:4], - 'rotation': 0, - 'joints_3d': joints_3d, - 'joints_3d_visible': joints_3d_visible, - 'dataset': self.dataset_name, - 'bbox_score': 1, - 'bbox_id': f'{img_id}_{bbox_id:03}', - 'nframes': nframes, - 'frame_id': frame_id, - 'frame_weight': self.frame_weight - }) - bbox_id = bbox_id + 1 - - return rec + x, y, w, h = obj['bbox'] + # JHMDB uses matlab format, index is 1-based, + # we should first convert to 0-based index + x -= 1 + y -= 1 + x1 = max(0, x) + y1 = max(0, y) + x2 = min(width - 1, x1 + max(0, w - 1)) + y2 = min(height - 1, y1 + max(0, h - 1)) + if ('area' not in obj or obj['area'] > 0) and x2 > x1 and y2 > y1: + obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1] + valid_objs.append(obj) + objs = valid_objs + + bbox_id = 0 + for obj in objs: + if 'keypoints' not in obj: + continue + if max(obj['keypoints']) == 0: + continue + if 'num_keypoints' in obj and obj['num_keypoints'] == 0: + continue + joints_3d = np.zeros((num_joints, 3), dtype=np.float32) + joints_3d_visible = np.zeros((num_joints, 3), dtype=np.float32) + + keypoints = np.array(obj['keypoints']).reshape(-1, 3) + + # JHMDB uses matlab format, index is 1-based, + # we should first convert to 0-based index + joints_3d[:, :2] = keypoints[:, :2] - 1 + joints_3d_visible[:, :2] = np.minimum(1, keypoints[:, 2:3]) + + center, scale = self._xywh2cs(*obj['clean_bbox'][:4]) + + image_file = os.path.join(self.img_prefix, self.id2name[img_id]) + track_id = obj['track_id'] + rec = { + 'image_file': image_file, + 'center': center, + 'scale': scale, + 'bbox': obj['clean_bbox'][:4], + 'rotation': 0, + 'joints_3d': joints_3d, + 'joints_3d_visible': joints_3d_visible, + 'dataset': self.dataset_name, + 'bbox_score': 1, + 'bbox_id': f'{img_id}_{bbox_id:03}', + 'nframes': nframes, + 'vid_id': vid_id, + 'frame_id': frame_id, + 'track_id': track_id + } + bbox_id = bbox_id + 1 + + rec_id = len(image_db) + lookup_db[vid_id][frame_id][track_id] = rec_id + image_db.append(rec) + + self.lookup_db = lookup_db + self.image_db = image_db + + def _form_clip_annotations(self): + """Group image annotations into clips.""" + gt_db = [] + + for vid_id, vid_info in self.lookup_db.items(): + nframes = vid_info['nframes'] + # frame_id start from 1 + cur_frame_id = 1 + for clip_id in range(int(np.ceil(nframes / self.frame_interval))): + cur_frame_id += clip_id * self.frame_interval + track_ids = vid_info[cur_frame_id].keys() + + for track_id in track_ids: + rec_list = [] + for index in self.frame_indices: + 
frame_id = np.clip(cur_frame_id + index, 1, nframes) + if track_id not in vid_info[frame_id]: + break + else: + rec_id = vid_info[frame_id][track_id] + rec = self.image_db[rec_id] + rec_list.append(rec) + + if len(rec_list) == len(self.frame_indices): + clip = {} + for key in rec_list[0].keys(): + clip[key] = [] + for rec in rec_list: + clip[key].append(rec[key]) + gt_db.append(clip) + return gt_db def _write_keypoint_results(self, keypoints, res_file): """Write results into a json file.""" From db3dcdadad931c1997f84fe44331f359e47716fb Mon Sep 17 00:00:00 2001 From: luminxu Date: Sun, 23 Jan 2022 17:15:44 +0800 Subject: [PATCH 12/33] pipeline support list input --- mmpose/datasets/pipelines/loading.py | 2 +- .../datasets/pipelines/top_down_transform.py | 193 +++++++++++++----- 2 files changed, 139 insertions(+), 56 deletions(-) diff --git a/mmpose/datasets/pipelines/loading.py b/mmpose/datasets/pipelines/loading.py index 24879158fb..6b1c3638f0 100644 --- a/mmpose/datasets/pipelines/loading.py +++ b/mmpose/datasets/pipelines/loading.py @@ -26,7 +26,7 @@ def __call__(self, results): """Loading image(s) from file.""" image_file = results['image_file'] - if isinstance(image_file, (list, tuple)): + if isinstance(image_file, list): imgs = [] for image in image_file: img = mmcv.imread(image, self.color_type, self.channel_order) diff --git a/mmpose/datasets/pipelines/top_down_transform.py b/mmpose/datasets/pipelines/top_down_transform.py index 0dcce9266d..0ad97ad89c 100644 --- a/mmpose/datasets/pipelines/top_down_transform.py +++ b/mmpose/datasets/pipelines/top_down_transform.py @@ -37,24 +37,39 @@ def __call__(self, results): flipped = False if np.random.rand() <= self.flip_prob: flipped = True - if not isinstance(img, list): + if isinstance(img, list): + img = [i[:, ::-1, :] for i in img] + else: img = img[:, ::-1, :] + + if isinstance(img, list): + if isinstance(joints_3d, list): + assert isinstance(joints_3d_visible, list) + assert isinstance(center, list) + joints_3d_new = [] + joints_3d_visible_new = [] + for j, j_vis, i in zip(joints_3d, joints_3d_visible, img): + j_new, j_vis_new = fliplr_joints( + j, j_vis, i.shape[1], + results['ann_info']['flip_pairs']) + joints_3d_new.append(j_new) + joints_3d_visible_new.append(j_vis_new) + for idx in range(len(img)): + center[idx][0] = img[idx].shape[1] - center[idx][0] - 1 + else: + joints_3d_new, joints_3d_visible_new = fliplr_joints( + joints_3d, joints_3d_visible, img[0].shape[1], + results['ann_info']['flip_pairs']) + center[0] = img[0].shape[1] - center[0] - 1 else: - img = [i[:, ::-1, :] for i in img] - if not isinstance(img, list): - joints_3d, joints_3d_visible = fliplr_joints( + joints_3d_new, joints_3d_visible_new = fliplr_joints( joints_3d, joints_3d_visible, img.shape[1], results['ann_info']['flip_pairs']) center[0] = img.shape[1] - center[0] - 1 - else: - joints_3d, joints_3d_visible = fliplr_joints( - joints_3d, joints_3d_visible, img[0].shape[1], - results['ann_info']['flip_pairs']) - center[0] = img[0].shape[1] - center[0] - 1 results['img'] = img - results['joints_3d'] = joints_3d - results['joints_3d_visible'] = joints_3d_visible + results['joints_3d'] = joints_3d_new + results['joints_3d_visible'] = joints_3d_visible_new results['center'] = center results['flipped'] = flipped @@ -128,15 +143,39 @@ def __call__(self, results): joints_3d = results['joints_3d'] joints_3d_visible = results['joints_3d_visible'] - if (np.sum(joints_3d_visible[:, 0]) > self.num_joints_half_body + if isinstance(joints_3d, list): + assert 
isinstance(joints_3d_visible, list) + num_visible_list = [] + for j_vis in joints_3d_visible: + num_visible_list.append(np.sum(j_vis[:, 0])) + num_visible = min(num_visible_list) + else: + num_visible = np.sum(joints_3d_visible[:, 0]) + + if (num_visible > self.num_joints_half_body and np.random.rand() < self.prob_half_body): - c_half_body, s_half_body = self.half_body_transform( - results['ann_info'], joints_3d, joints_3d_visible) + if isinstance(joints_3d, list): + center = [] + scale = [] + for j, j_vis in zip(joints_3d, joints_3d_visible): + c_half_body, s_half_body = self.half_body_transform( + results['ann_info'], j, j_vis) + if c_half_body is not None and s_half_body is not None: + center.append(c_half_body) + scale.append(s_half_body) + else: + break + if len(center) == len(joints_3d): + results['center'] = center + results['scale'] = scale - if c_half_body is not None and s_half_body is not None: - results['center'] = c_half_body - results['scale'] = s_half_body + else: + c_half_body, s_half_body = self.half_body_transform( + results['ann_info'], joints_3d, joints_3d_visible) + if c_half_body is not None and s_half_body is not None: + results['center'] = c_half_body + results['scale'] = s_half_body return results @@ -160,19 +199,30 @@ def __init__(self, rot_factor=40, scale_factor=0.5, rot_prob=0.6): def __call__(self, results): """Perform data augmentation with random scaling & rotating.""" - s = results['scale'] + scale = results['scale'] sf = self.scale_factor rf = self.rot_factor s_factor = np.clip(np.random.randn() * sf + 1, 1 - sf, 1 + sf) - s = s * s_factor + if isinstance(scale, list): + s_new = [] + for s in scale: + s_new.append(s * s_factor) + else: + s_new = scale * s_factor r_factor = np.clip(np.random.randn() * rf, -rf * 2, rf * 2) - r = r_factor if np.random.rand() <= self.rot_prob else 0 + r_factor = r_factor if np.random.rand() <= self.rot_prob else 0 + if isinstance(scale, list): + r_new = [] + for _ in scale: + r_new.append(r_factor) + else: + r_new = r_factor - results['scale'] = s - results['rotation'] = r + results['scale'] = s_new + results['rotation'] = r_new return results @@ -200,50 +250,83 @@ def __call__(self, results): img = results['img'] joints_3d = results['joints_3d'] joints_3d_visible = results['joints_3d_visible'] - c = results['center'] - s = results['scale'] - r = results['rotation'] + center = results['center'] + scale = results['scale'] + rot = results['rotation'] if self.use_udp: - trans = get_warp_matrix(r, c * 2.0, image_size - 1.0, s * 200.0) - if not isinstance(img, list): - img = cv2.warpAffine( - img, - trans, (int(image_size[0]), int(image_size[1])), - flags=cv2.INTER_LINEAR) - else: - img = [ - cv2.warpAffine( - i, + if isinstance(center, list): + assert isinstance(scale, list) + assert isinstance(rot, list) + assert isinstance(img, list) + assert isinstance(joints_3d, list) + for i, (c, s, r) in enumerate(center, scale, rot): + trans = get_warp_matrix(r, c * 2.0, image_size - 1.0, + s * 200.0) + img[i] = cv2.warpAffine( + img[i], trans, (int(image_size[0]), int(image_size[1])), - flags=cv2.INTER_LINEAR) for i in img - ] + flags=cv2.INTER_LINEAR) + joints_3d[i][:, 0:2] = \ + warp_affine_joints(joints_3d[i][:, 0:2].copy(), trans) - joints_3d[:, 0:2] = \ - warp_affine_joints(joints_3d[:, 0:2].copy(), trans) + else: + trans = get_warp_matrix(rot, center * 2.0, image_size - 1.0, + scale * 200.0) + if isinstance(img, list): + img = [ + cv2.warpAffine( + i, + trans, (int(image_size[0]), int(image_size[1])), + flags=cv2.INTER_LINEAR) for i 
in img + ] + else: + img = cv2.warpAffine( + img, + trans, (int(image_size[0]), int(image_size[1])), + flags=cv2.INTER_LINEAR) + joints_3d[:, 0:2] = \ + warp_affine_joints(joints_3d[:, 0:2].copy(), trans) else: - trans = get_affine_transform(c, s, r, image_size) - if not isinstance(img, list): - img = cv2.warpAffine( - img, - trans, (int(image_size[0]), int(image_size[1])), - flags=cv2.INTER_LINEAR) + if isinstance(center, list): + assert isinstance(scale, list) + assert isinstance(rot, list) + assert isinstance(img, list) + assert isinstance(joints_3d, list) + assert isinstance(joints_3d_visible, list) + for i, (c, s, r) in enumerate(center, scale, rot): + trans = get_affine_transform(c, s, r, image_size) + img[i] = cv2.warpAffine( + img[i], + trans, (int(image_size[0]), int(image_size[1])), + flags=cv2.INTER_LINEAR) + for j in range(results['ann_info']['num_joints']): + if joints_3d_visible[i][j, 0] > 0.0: + joints_3d[i][j, 0:2] = affine_transform( + joints_3d[i][j, 0:2], trans) + else: - img = [ - cv2.warpAffine( - i, + trans = get_affine_transform(center, scale, rot, image_size) + if isinstance(img, list): + img = [ + cv2.warpAffine( + i, + trans, (int(image_size[0]), int(image_size[1])), + flags=cv2.INTER_LINEAR) for i in img + ] + else: + img = cv2.warpAffine( + img, trans, (int(image_size[0]), int(image_size[1])), - flags=cv2.INTER_LINEAR) for i in img - ] - for i in range(results['ann_info']['num_joints']): - if joints_3d_visible[i, 0] > 0.0: - joints_3d[i, - 0:2] = affine_transform(joints_3d[i, 0:2], trans) + flags=cv2.INTER_LINEAR) + for i in range(results['ann_info']['num_joints']): + if joints_3d_visible[i, 0] > 0.0: + joints_3d[i, + 0:2] = affine_transform(joints_3d[i, 0:2], trans) results['img'] = img results['joints_3d'] = joints_3d - results['joints_3d_visible'] = joints_3d_visible return results From 4b7f82e8e0666ed576a6523e7c99199124f3e110 Mon Sep 17 00:00:00 2001 From: luminxu Date: Sun, 23 Jan 2022 18:48:01 +0800 Subject: [PATCH 13/33] target pipeline support list input --- .../datasets/pipelines/top_down_transform.py | 162 ++++++++++++++---- 1 file changed, 125 insertions(+), 37 deletions(-) diff --git a/mmpose/datasets/pipelines/top_down_transform.py b/mmpose/datasets/pipelines/top_down_transform.py index 0ad97ad89c..26566437d6 100644 --- a/mmpose/datasets/pipelines/top_down_transform.py +++ b/mmpose/datasets/pipelines/top_down_transform.py @@ -641,20 +641,50 @@ def __call__(self, results): num_joints = cfg['num_joints'] heatmap_size = cfg['heatmap_size'] - target = np.empty( - (0, num_joints, heatmap_size[1], heatmap_size[0]), - dtype=np.float32) - target_weight = np.empty((0, num_joints, 1), dtype=np.float32) - for i in range(num_sigmas): - target_i, target_weight_i = self._msra_generate_target( - cfg, joints_3d, joints_3d_visible, self.sigma[i]) - target = np.concatenate([target, target_i[None]], axis=0) - target_weight = np.concatenate( - [target_weight, target_weight_i[None]], axis=0) + if isinstance(joints_3d, list): + assert isinstance(joints_3d_visible, list) + target = [] + target_weight = [] + for j, j_vis in zip(joints_3d, joints_3d_visible): + t = np.empty( + (0, num_joints, heatmap_size[1], heatmap_size[0]), + dtype=np.float32) + t_weight = np.empty((0, num_joints, 1), dtype=np.float32) + for i in range(num_sigmas): + t_i, t_weight_i = self._msra_generate_target( + cfg, j, j_vis, self.sigma[i]) + t = np.concatenate([t, t_i[None]], axis=0) + t_weight = np.concatenate( + [t_weight, t_weight_i[None]], axis=0) + target.append(t) + 
target_weight.append(t_weight) + else: + target = np.empty( + (0, num_joints, heatmap_size[1], heatmap_size[0]), + dtype=np.float32) + target_weight = np.empty((0, num_joints, 1), dtype=np.float32) + for i in range(num_sigmas): + target_i, target_weight_i = self._msra_generate_target( + cfg, joints_3d, joints_3d_visible, self.sigma[i]) + target = np.concatenate([target, target_i[None]], axis=0) + target_weight = np.concatenate( + [target_weight, target_weight_i[None]], axis=0) + else: - target, target_weight = self._msra_generate_target( - results['ann_info'], joints_3d, joints_3d_visible, - self.sigma) + if isinstance(joints_3d, list): + assert isinstance(joints_3d_visible, list) + target = [] + target_weight = [] + for j, j_vis in zip(joints_3d, joints_3d_visible): + t, t_weight = self._msra_generate_target( + results['ann_info'], j, j_vis, + self.sigma) + target.append(t) + target_weight.append(t_weight) + else: + target, target_weight = self._msra_generate_target( + results['ann_info'], joints_3d, joints_3d_visible, + self.sigma) elif self.encoding == 'Megvii': if isinstance(self.kernel, list): @@ -663,18 +693,46 @@ def __call__(self, results): num_joints = cfg['num_joints'] W, H = cfg['heatmap_size'] - target = np.empty((0, num_joints, H, W), dtype=np.float32) - target_weight = np.empty((0, num_joints, 1), dtype=np.float32) - for i in range(num_kernels): - target_i, target_weight_i = self._megvii_generate_target( - cfg, joints_3d, joints_3d_visible, self.kernel[i]) - target = np.concatenate([target, target_i[None]], axis=0) - target_weight = np.concatenate( - [target_weight, target_weight_i[None]], axis=0) + if isinstance(joints_3d, list): + assert isinstance(joints_3d_visible, list) + target = [] + target_weight = [] + for j, j_vis in zip(joints_3d, joints_3d_visible): + t = np.empty((0, num_joints, H, W), dtype=np.float32) + t_weight = np.empty((0, num_joints, 1), dtype=np.float32) + for i in range(num_kernels): + t_i, t_weight_i = self._megvii_generate_target( + cfg, j, j_vis, self.kernel[i]) + t = np.concatenate([t, t_i[None]], axis=0) + t_weight = np.concatenate( + [t_weight, t_weight_i[None]], axis=0) + target.append(t) + target_weight.append(t_weight) + else: + target = np.empty((0, num_joints, H, W), dtype=np.float32) + target_weight = np.empty((0, num_joints, 1), dtype=np.float32) + for i in range(num_kernels): + target_i, target_weight_i = self._megvii_generate_target( + cfg, joints_3d, joints_3d_visible, self.kernel[i]) + target = np.concatenate([target, target_i[None]], axis=0) + target_weight = np.concatenate( + [target_weight, target_weight_i[None]], axis=0) + else: - target, target_weight = self._megvii_generate_target( - results['ann_info'], joints_3d, joints_3d_visible, - self.kernel) + if isinstance(joints_3d, list): + assert isinstance(joints_3d_visible, list) + target = [] + target_weight = [] + for j, j_vis in zip(joints_3d, joints_3d_visible): + t, t_weight = self._megvii_generate_target( + results['ann_info'], j, j_vis, + self.kernel) + target.append(t) + target_weight.append(t_weight) + else: + target, target_weight = self._megvii_generate_target( + results['ann_info'], joints_3d, joints_3d_visible, + self.kernel) elif self.encoding == 'UDP': if self.target_type.lower() == 'CombinedTarget'.lower(): @@ -692,20 +750,50 @@ def __call__(self, results): num_joints = cfg['num_joints'] W, H = cfg['heatmap_size'] - target = np.empty((0, channel_factor * num_joints, H, W), - dtype=np.float32) - target_weight = np.empty((0, num_joints, 1), dtype=np.float32) - for i in 
range(num_factors): - target_i, target_weight_i = self._udp_generate_target( - cfg, joints_3d, joints_3d_visible, factors[i], - self.target_type) - target = np.concatenate([target, target_i[None]], axis=0) - target_weight = np.concatenate( - [target_weight, target_weight_i[None]], axis=0) + if isinstance(joints_3d, list): + assert isinstance(joints_3d_visible, list) + target = [] + target_weight = [] + for j, j_vis in zip(joints_3d, joints_3d_visible): + t = np.empty((0, channel_factor * num_joints, H, W), + dtype=np.float32) + t_weight = np.empty((0, num_joints, 1), dtype=np.float32) + for i in range(num_factors): + t_i, t_weight_i = self._udp_generate_target( + cfg, j, j_vis, factors[i], + self.target_type) + t = np.concatenate([t, t_i[None]], axis=0) + t_weight = np.concatenate( + [t_weight, t_weight_i[None]], axis=0) + target.append(t) + target_weight.append(t_weight) + else: + target = np.empty((0, channel_factor * num_joints, H, W), + dtype=np.float32) + target_weight = np.empty((0, num_joints, 1), dtype=np.float32) + for i in range(num_factors): + target_i, target_weight_i = self._udp_generate_target( + cfg, joints_3d, joints_3d_visible, factors[i], + self.target_type) + target = np.concatenate([target, target_i[None]], axis=0) + target_weight = np.concatenate( + [target_weight, target_weight_i[None]], axis=0) + else: - target, target_weight = self._udp_generate_target( - results['ann_info'], joints_3d, joints_3d_visible, factors, - self.target_type) + if isinstance(joints_3d, list): + assert isinstance(joints_3d_visible, list) + target = [] + target_weight = [] + for j, j_vis in zip(joints_3d, joints_3d_visible): + t, t_weight = self._udp_generate_target( + results['ann_info'], j, j_vis, factors, + self.target_type) + target.append(t) + target_weight.append(t_weight) + else: + target, target_weight = self._udp_generate_target( + results['ann_info'], joints_3d, joints_3d_visible, factors, + self.target_type) else: raise ValueError( f'Encoding approach {self.encoding} is not supported!') From ea9edb88481b852c429ae43ab5cf92d08b26b31f Mon Sep 17 00:00:00 2001 From: luminxu Date: Tue, 25 Jan 2022 11:51:18 +0800 Subject: [PATCH 14/33] modify lstm_pm detector --- mmpose/models/detectors/lstm_pm.py | 35 ++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 12 deletions(-) diff --git a/mmpose/models/detectors/lstm_pm.py b/mmpose/models/detectors/lstm_pm.py index 2d8483406c..551e0949aa 100644 --- a/mmpose/models/detectors/lstm_pm.py +++ b/mmpose/models/detectors/lstm_pm.py @@ -3,6 +3,7 @@ import numpy as np import torch +import copy from ..builder import POSENETS from .top_down import TopDown @@ -118,10 +119,6 @@ def forward_train(self, imgs, target, target_weight, img_metas, **kwargs): if self.with_keypoint: output = self.keypoint_head(output) - output = torch.cat(output, 0) - target = torch.cat(target, 0) - target_weight = torch.cat(target_weight, 0) - # if return loss losses = dict() if self.with_keypoint: @@ -159,19 +156,33 @@ def forward_test(self, imgs, img_metas, return_heatmap=False, **kwargs): if self.with_keypoint: output_flipped_heatmap = self.keypoint_head.inference_model( features_flipped, img_metas[0]['flip_pairs']) - output_heatmap = (output_heatmap + - output_flipped_heatmap) * 0.5 - - output_heatmap = torch.cat(output_heatmap, 0) + for i in range(len(output_heatmap)): + output_heatmap[i] = (output_heatmap[i] + + output_flipped_heatmap[i]) * 0.5 if self.with_keypoint: - keypoint_result = self.keypoint_head.decode( - img_metas, output_heatmap, img_size=[img_width, 
img_height]) - result.update(keypoint_result) + meta_keys = ['image_file', 'center', 'scale', 'bbox_score', 'bbox_id'] + batch_size = len(img_metas) + num_frame = len(img_metas[0]['image_file']) + for f in range(num_frame): + test_metas = copy.deepcopy(img_metas) + for i in range(batch_size): + for key in meta_keys: + test_metas[i][key] = img_metas[i][key][f] + keypoint_result = self.keypoint_head.decode( + test_metas, output_heatmap[f], img_size=[img_width, img_height]) + + if result == {}: + result.update(keypoint_result) + else: + for key in result.keys(): + result[key] = np.concatenate((result[key], + keypoint_result[key]), axis=0) if not return_heatmap: output_heatmap = None - + else: + output_heatmap = np.concatenate(output_heatmap, axis=0) result['output_heatmap'] = output_heatmap return result From 69173891f1590f4c85d59aff52b7feb062df5d6b Mon Sep 17 00:00:00 2001 From: luminxu Date: Thu, 27 Jan 2022 15:06:29 +0800 Subject: [PATCH 15/33] stack target for multi_stage_head --- mmpose/models/detectors/lstm_pm.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mmpose/models/detectors/lstm_pm.py b/mmpose/models/detectors/lstm_pm.py index 551e0949aa..4fd3438687 100644 --- a/mmpose/models/detectors/lstm_pm.py +++ b/mmpose/models/detectors/lstm_pm.py @@ -122,6 +122,8 @@ def forward_train(self, imgs, target, target_weight, img_metas, **kwargs): # if return loss losses = dict() if self.with_keypoint: + target = torch.stack(target, dim=1) + target_weight = torch.stack(target_weight, dim=1) keypoint_losses = self.keypoint_head.get_loss( output, target, target_weight) losses.update(keypoint_losses) From 19ddf9d05ff1b205fd1872e913c4654789497b2f Mon Sep 17 00:00:00 2001 From: luminxu Date: Tue, 22 Mar 2022 17:33:41 +0800 Subject: [PATCH 16/33] add cfg --- .../lstm_pm_jhmdb_256x256_stage2.py | 149 ++++++++++++++++++ .../top_down/topdown_jhmdb_video_dataset.py | 8 +- .../datasets/pipelines/top_down_transform.py | 52 +++--- mmpose/models/backbones/lstm_pm.py | 4 +- mmpose/models/detectors/lstm_pm.py | 20 ++- 5 files changed, 197 insertions(+), 36 deletions(-) create mode 100644 configs/body/2d_kpt_sview_rgb_vid/lstm_pm/posetrack18/lstm_pm_jhmdb_256x256_stage2.py diff --git a/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/posetrack18/lstm_pm_jhmdb_256x256_stage2.py b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/posetrack18/lstm_pm_jhmdb_256x256_stage2.py new file mode 100644 index 0000000000..dc919855d0 --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/posetrack18/lstm_pm_jhmdb_256x256_stage2.py @@ -0,0 +1,149 @@ +_base_ = ['../../../../_base_/datasets/jhmdb.py'] +log_level = 'INFO' +load_from = None # noqa: E501 +resume_from = None +dist_params = dict(backend='nccl') +cudnn_benchmark = True +workflow = [('train', 1)] +checkpoint_config = dict(interval=1) +evaluation = dict(interval=1, metric=['PCK', 'tPCK'], save_best='Mean PCK') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[8, 15]) +total_epochs = 20 +log_config = dict( + interval=100, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=15, + dataset_joints=15, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], + ], + inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]) + +# model settings +model = dict( + type='LSTMPoseMachine', + pretrained=None, + 
backbone=dict( + type='LSTM_PM', + out_channels=channel_cfg['num_output_channels'], + num_stages=5, + ), + keypoint_head=dict( + type='TopdownHeatmapMultiStageHead', + num_stages=5, + num_deconv_layers=0, + extra=dict(final_conv_kernel=0, ), + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11)) + +data_cfg = dict( + image_size=[192, 256], + heatmap_size=[48, 64], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + use_nms=True, + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + use_gt_bbox=True, + det_bbox_thr=0.0, + frame_indices_train=[0, 1, 2, 3, 4], + frame_indices_test=[0, 1, 2, 3, 4], +) + +# take care of orders of the transforms +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=30, + scale_factor=0.25), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=2), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox', 'flip_pairs' + ]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox', 'flip_pairs' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/jhmdb' +data = dict( + samples_per_gpu=8, + workers_per_gpu=2, + val_dataloader=dict(samples_per_gpu=4), + test_dataloader=dict(samples_per_gpu=4), + train=dict( + type='TopDownJhmdbVideoDataset', + ann_file=f'{data_root}/annotations/Sub1_train.json', + img_prefix=f'{data_root}/', + data_cfg=data_cfg, + pipeline=train_pipeline, + dataset_info={{_base_.dataset_info}}), + val=dict( + type='TopDownJhmdbVideoDataset', + ann_file=f'{data_root}/annotations/Sub1_test.json', + img_prefix=f'{data_root}/images/', + data_cfg=data_cfg, + pipeline=val_pipeline, + dataset_info={{_base_.dataset_info}}), + test=dict( + type='TopDownJhmdbVideoDataset', + ann_file=f'{data_root}/annotations/Sub1_test.json', + img_prefix=f'{data_root}/', + data_cfg=data_cfg, + pipeline=test_pipeline, + dataset_info={{_base_.dataset_info}}), +) diff --git a/mmpose/datasets/datasets/top_down/topdown_jhmdb_video_dataset.py b/mmpose/datasets/datasets/top_down/topdown_jhmdb_video_dataset.py index 5d65b6b054..c0e037e114 100644 --- a/mmpose/datasets/datasets/top_down/topdown_jhmdb_video_dataset.py +++ b/mmpose/datasets/datasets/top_down/topdown_jhmdb_video_dataset.py @@ -1,6 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import os -import os.path as osp import warnings from collections import OrderedDict @@ -78,7 +77,6 @@ def __init__(self, test_mode=test_mode) self.use_gt_bbox = data_cfg['use_gt_bbox'] - self.bbox_file = data_cfg['bbox_file'] self.det_bbox_thr = data_cfg.get('det_bbox_thr', 0.0) self.soft_nms = data_cfg['soft_nms'] self.nms_thr = data_cfg['nms_thr'] @@ -158,7 +156,8 @@ def _load_coco_keypoint_image_annotations(self): y1 = max(0, y) x2 = min(width - 1, x1 + max(0, w - 1)) y2 = min(height - 1, y1 + max(0, h - 1)) - if ('area' not in obj or obj['area'] > 0) and x2 > x1 and y2 > y1: + if ('area' not in obj + or obj['area'] > 0) and x2 > x1 and y2 > y1: obj['clean_bbox'] = [x1, y1, x2 - x1, y2 - y1] valid_objs.append(obj) objs = valid_objs @@ -183,7 +182,8 @@ def _load_coco_keypoint_image_annotations(self): center, scale = self._xywh2cs(*obj['clean_bbox'][:4]) - image_file = os.path.join(self.img_prefix, self.id2name[img_id]) + image_file = os.path.join(self.img_prefix, + self.id2name[img_id]) track_id = obj['track_id'] rec = { 'image_file': image_file, diff --git a/mmpose/datasets/pipelines/top_down_transform.py b/mmpose/datasets/pipelines/top_down_transform.py index 26566437d6..93a9d34e58 100644 --- a/mmpose/datasets/pipelines/top_down_transform.py +++ b/mmpose/datasets/pipelines/top_down_transform.py @@ -322,8 +322,8 @@ def __call__(self, results): flags=cv2.INTER_LINEAR) for i in range(results['ann_info']['num_joints']): if joints_3d_visible[i, 0] > 0.0: - joints_3d[i, - 0:2] = affine_transform(joints_3d[i, 0:2], trans) + joints_3d[i, 0:2] = affine_transform( + joints_3d[i, 0:2], trans) results['img'] = img results['joints_3d'] = joints_3d @@ -649,7 +649,8 @@ def __call__(self, results): t = np.empty( (0, num_joints, heatmap_size[1], heatmap_size[0]), dtype=np.float32) - t_weight = np.empty((0, num_joints, 1), dtype=np.float32) + t_weight = np.empty((0, num_joints, 1), + dtype=np.float32) for i in range(num_sigmas): t_i, t_weight_i = self._msra_generate_target( cfg, j, j_vis, self.sigma[i]) @@ -662,11 +663,13 @@ def __call__(self, results): target = np.empty( (0, num_joints, heatmap_size[1], heatmap_size[0]), dtype=np.float32) - target_weight = np.empty((0, num_joints, 1), dtype=np.float32) + target_weight = np.empty((0, num_joints, 1), + dtype=np.float32) for i in range(num_sigmas): target_i, target_weight_i = self._msra_generate_target( cfg, joints_3d, joints_3d_visible, self.sigma[i]) - target = np.concatenate([target, target_i[None]], axis=0) + target = np.concatenate([target, target_i[None]], + axis=0) target_weight = np.concatenate( [target_weight, target_weight_i[None]], axis=0) @@ -677,8 +680,7 @@ def __call__(self, results): target_weight = [] for j, j_vis in zip(joints_3d, joints_3d_visible): t, t_weight = self._msra_generate_target( - results['ann_info'], j, j_vis, - self.sigma) + results['ann_info'], j, j_vis, self.sigma) target.append(t) target_weight.append(t_weight) else: @@ -699,7 +701,8 @@ def __call__(self, results): target_weight = [] for j, j_vis in zip(joints_3d, joints_3d_visible): t = np.empty((0, num_joints, H, W), dtype=np.float32) - t_weight = np.empty((0, num_joints, 1), dtype=np.float32) + t_weight = np.empty((0, num_joints, 1), + dtype=np.float32) for i in range(num_kernels): t_i, t_weight_i = self._megvii_generate_target( cfg, j, j_vis, self.kernel[i]) @@ -710,11 +713,15 @@ def __call__(self, results): target_weight.append(t_weight) else: target = np.empty((0, num_joints, H, W), dtype=np.float32) - target_weight = np.empty((0, num_joints, 1), dtype=np.float32) 
+ target_weight = np.empty((0, num_joints, 1), + dtype=np.float32) for i in range(num_kernels): - target_i, target_weight_i = self._megvii_generate_target( - cfg, joints_3d, joints_3d_visible, self.kernel[i]) - target = np.concatenate([target, target_i[None]], axis=0) + target_i, target_weight_i = \ + self._megvii_generate_target( + cfg, joints_3d, joints_3d_visible, + self.kernel[i]) + target = np.concatenate([target, target_i[None]], + axis=0) target_weight = np.concatenate( [target_weight, target_weight_i[None]], axis=0) @@ -725,8 +732,7 @@ def __call__(self, results): target_weight = [] for j, j_vis in zip(joints_3d, joints_3d_visible): t, t_weight = self._megvii_generate_target( - results['ann_info'], j, j_vis, - self.kernel) + results['ann_info'], j, j_vis, self.kernel) target.append(t) target_weight.append(t_weight) else: @@ -756,12 +762,12 @@ def __call__(self, results): target_weight = [] for j, j_vis in zip(joints_3d, joints_3d_visible): t = np.empty((0, channel_factor * num_joints, H, W), - dtype=np.float32) - t_weight = np.empty((0, num_joints, 1), dtype=np.float32) + dtype=np.float32) + t_weight = np.empty((0, num_joints, 1), + dtype=np.float32) for i in range(num_factors): t_i, t_weight_i = self._udp_generate_target( - cfg, j, j_vis, factors[i], - self.target_type) + cfg, j, j_vis, factors[i], self.target_type) t = np.concatenate([t, t_i[None]], axis=0) t_weight = np.concatenate( [t_weight, t_weight_i[None]], axis=0) @@ -770,12 +776,14 @@ def __call__(self, results): else: target = np.empty((0, channel_factor * num_joints, H, W), dtype=np.float32) - target_weight = np.empty((0, num_joints, 1), dtype=np.float32) + target_weight = np.empty((0, num_joints, 1), + dtype=np.float32) for i in range(num_factors): target_i, target_weight_i = self._udp_generate_target( cfg, joints_3d, joints_3d_visible, factors[i], self.target_type) - target = np.concatenate([target, target_i[None]], axis=0) + target = np.concatenate([target, target_i[None]], + axis=0) target_weight = np.concatenate( [target_weight, target_weight_i[None]], axis=0) @@ -792,8 +800,8 @@ def __call__(self, results): target_weight.append(t_weight) else: target, target_weight = self._udp_generate_target( - results['ann_info'], joints_3d, joints_3d_visible, factors, - self.target_type) + results['ann_info'], joints_3d, joints_3d_visible, + factors, self.target_type) else: raise ValueError( f'Encoding approach {self.encoding} is not supported!') diff --git a/mmpose/models/backbones/lstm_pm.py b/mmpose/models/backbones/lstm_pm.py index 0a0cc2d37e..e4d1b2b021 100644 --- a/mmpose/models/backbones/lstm_pm.py +++ b/mmpose/models/backbones/lstm_pm.py @@ -173,10 +173,10 @@ class LSTM_PM(BaseBackbone): def __init__(self, in_channels=3, - out_channels=17, + out_channels=15, stem_channels=32, hidden_channels=48, - num_stages=7, + num_stages=5, conv_cfg=None, norm_cfg=None): super().__init__() diff --git a/mmpose/models/detectors/lstm_pm.py b/mmpose/models/detectors/lstm_pm.py index 4fd3438687..21be4dee86 100644 --- a/mmpose/models/detectors/lstm_pm.py +++ b/mmpose/models/detectors/lstm_pm.py @@ -1,9 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. +import copy import warnings import numpy as np import torch -import copy from ..builder import POSENETS from .top_down import TopDown @@ -18,9 +18,9 @@ @POSENETS.register_module() class LSTMPoseMachine(TopDown): - """Top-down pose detectors for LSTM Pose Machine. - Paper ref: Luo, Yue, et al. "Lstm pose machines." 
Proceedings of the IEEE - conference on computer vision and pattern recognition (2018). + """Top-down pose detectors for LSTM Pose Machine. Paper ref: Luo, Yue, et + al. "Lstm pose machines." Proceedings of the IEEE conference on computer + vision and pattern recognition (2018). <``https://arxiv.org/abs/1712.06316``> @@ -163,7 +163,9 @@ def forward_test(self, imgs, img_metas, return_heatmap=False, **kwargs): output_flipped_heatmap[i]) * 0.5 if self.with_keypoint: - meta_keys = ['image_file', 'center', 'scale', 'bbox_score', 'bbox_id'] + meta_keys = [ + 'image_file', 'center', 'scale', 'bbox_score', 'bbox_id' + ] batch_size = len(img_metas) num_frame = len(img_metas[0]['image_file']) for f in range(num_frame): @@ -172,14 +174,16 @@ def forward_test(self, imgs, img_metas, return_heatmap=False, **kwargs): for key in meta_keys: test_metas[i][key] = img_metas[i][key][f] keypoint_result = self.keypoint_head.decode( - test_metas, output_heatmap[f], img_size=[img_width, img_height]) + test_metas, + output_heatmap[f], + img_size=[img_width, img_height]) if result == {}: result.update(keypoint_result) else: for key in result.keys(): - result[key] = np.concatenate((result[key], - keypoint_result[key]), axis=0) + result[key] = np.concatenate( + (result[key], keypoint_result[key]), axis=0) if not return_heatmap: output_heatmap = None From 873f5412239153476edf31eb21049097792abb96 Mon Sep 17 00:00:00 2001 From: luminxu Date: Wed, 23 Mar 2022 13:21:45 +0800 Subject: [PATCH 17/33] fix bug --- .../lstm_pm_jhmdb_256x256_stage2.py | 6 ++-- mmpose/datasets/datasets/__init__.py | 10 ++++--- .../kpt_2d_sview_rgb_vid_top_down_dataset.py | 13 -------- mmpose/datasets/datasets/top_down/__init__.py | 20 +++++-------- .../top_down/topdown_jhmdb_video_dataset.py | 7 +++-- .../datasets/pipelines/top_down_transform.py | 10 +++++-- mmpose/models/backbones/lstm_pm.py | 17 +++++------ mmpose/models/detectors/lstm_pm.py | 3 -- .../heads/topdown_heatmap_multi_stage_head.py | 30 +++++++++++++++---- 9 files changed, 61 insertions(+), 55 deletions(-) rename configs/body/2d_kpt_sview_rgb_vid/lstm_pm/{posetrack18 => jhmdb}/lstm_pm_jhmdb_256x256_stage2.py (97%) diff --git a/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/posetrack18/lstm_pm_jhmdb_256x256_stage2.py b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_256x256_stage2.py similarity index 97% rename from configs/body/2d_kpt_sview_rgb_vid/lstm_pm/posetrack18/lstm_pm_jhmdb_256x256_stage2.py rename to configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_256x256_stage2.py index dc919855d0..f3dd8e5d21 100644 --- a/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/posetrack18/lstm_pm_jhmdb_256x256_stage2.py +++ b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_256x256_stage2.py @@ -59,8 +59,8 @@ modulate_kernel=11)) data_cfg = dict( - image_size=[192, 256], - heatmap_size=[48, 64], + image_size=[256, 256], + heatmap_size=[32, 32], num_output_channels=channel_cfg['num_output_channels'], num_joints=channel_cfg['dataset_joints'], dataset_channel=channel_cfg['dataset_channel'], @@ -74,6 +74,8 @@ det_bbox_thr=0.0, frame_indices_train=[0, 1, 2, 3, 4], frame_indices_test=[0, 1, 2, 3, 4], + frame_interval_train=1, + frame_interval_test=5, ) # take care of orders of the transforms diff --git a/mmpose/datasets/datasets/__init__.py b/mmpose/datasets/datasets/__init__.py index 151c72a994..bf9e3815ad 100644 --- a/mmpose/datasets/datasets/__init__.py +++ b/mmpose/datasets/datasets/__init__.py @@ -19,9 +19,10 @@ from .top_down import (TopDownAicDataset, 
TopDownCocoDataset, TopDownCocoWholeBodyDataset, TopDownCrowdPoseDataset, TopDownH36MDataset, TopDownHalpeDataset, - TopDownJhmdbDataset, TopDownMhpDataset, - TopDownMpiiDataset, TopDownMpiiTrbDataset, - TopDownOCHumanDataset, TopDownPoseTrack18Dataset, + TopDownJhmdbDataset, TopDownJhmdbVideoDataset, + TopDownMhpDataset, TopDownMpiiDataset, + TopDownMpiiTrbDataset, TopDownOCHumanDataset, + TopDownPoseTrack18Dataset, TopDownPoseTrack18VideoDataset) __all__ = [ @@ -40,5 +41,6 @@ 'Body3DH36MDataset', 'AnimalHorse10Dataset', 'AnimalMacaqueDataset', 'AnimalFlyDataset', 'AnimalLocustDataset', 'AnimalZebraDataset', 'AnimalATRWDataset', 'AnimalPoseDataset', 'TopDownH36MDataset', - 'TopDownHalpeDataset', 'TopDownPoseTrack18VideoDataset' + 'TopDownHalpeDataset', 'TopDownPoseTrack18VideoDataset', + 'TopDownJhmdbVideoDataset' ] diff --git a/mmpose/datasets/datasets/base/kpt_2d_sview_rgb_vid_top_down_dataset.py b/mmpose/datasets/datasets/base/kpt_2d_sview_rgb_vid_top_down_dataset.py index 7f89443559..3994e7bb52 100644 --- a/mmpose/datasets/datasets/base/kpt_2d_sview_rgb_vid_top_down_dataset.py +++ b/mmpose/datasets/datasets/base/kpt_2d_sview_rgb_vid_top_down_dataset.py @@ -165,19 +165,6 @@ def evaluate(self, outputs, res_folder, metric, *args, **kwargs): def _write_keypoint_results(keypoint_results, gt_folder, pred_folder): """Write results into a json file.""" - @abstractmethod - def _do_keypoint_eval(self, gt_folder, pred_folder): - """Keypoint evaluation. - Args: - gt_folder (str): The folder of the json files storing - ground truth keypoint annotations. - pred_folder (str): The folder of the json files storing - prediction results. - - Returns: - List: Evaluation results for evaluation metric. - """ - def __len__(self): """Get the size of the dataset.""" return len(self.db) diff --git a/mmpose/datasets/datasets/top_down/__init__.py b/mmpose/datasets/datasets/top_down/__init__.py index cc5b46a8b1..8b49464106 100644 --- a/mmpose/datasets/datasets/top_down/__init__.py +++ b/mmpose/datasets/datasets/top_down/__init__.py @@ -6,6 +6,7 @@ from .topdown_h36m_dataset import TopDownH36MDataset from .topdown_halpe_dataset import TopDownHalpeDataset from .topdown_jhmdb_dataset import TopDownJhmdbDataset +from .topdown_jhmdb_video_dataset import TopDownJhmdbVideoDataset from .topdown_mhp_dataset import TopDownMhpDataset from .topdown_mpii_dataset import TopDownMpiiDataset from .topdown_mpii_trb_dataset import TopDownMpiiTrbDataset @@ -14,17 +15,10 @@ from .topdown_posetrack18_video_dataset import TopDownPoseTrack18VideoDataset __all__ = [ - 'TopDownAicDataset', - 'TopDownCocoDataset', - 'TopDownCocoWholeBodyDataset', - 'TopDownCrowdPoseDataset', - 'TopDownMpiiDataset', - 'TopDownMpiiTrbDataset', - 'TopDownOCHumanDataset', - 'TopDownPoseTrack18Dataset', - 'TopDownJhmdbDataset', - 'TopDownMhpDataset', - 'TopDownH36MDataset', - 'TopDownHalpeDataset', - 'TopDownPoseTrack18VideoDataset', + 'TopDownAicDataset', 'TopDownCocoDataset', 'TopDownCocoWholeBodyDataset', + 'TopDownCrowdPoseDataset', 'TopDownMpiiDataset', 'TopDownMpiiTrbDataset', + 'TopDownOCHumanDataset', 'TopDownPoseTrack18Dataset', + 'TopDownJhmdbDataset', 'TopDownMhpDataset', 'TopDownH36MDataset', + 'TopDownHalpeDataset', 'TopDownPoseTrack18VideoDataset', + 'TopDownJhmdbVideoDataset' ] diff --git a/mmpose/datasets/datasets/top_down/topdown_jhmdb_video_dataset.py b/mmpose/datasets/datasets/top_down/topdown_jhmdb_video_dataset.py index c0e037e114..d7938c48c8 100644 --- a/mmpose/datasets/datasets/top_down/topdown_jhmdb_video_dataset.py +++ 
b/mmpose/datasets/datasets/top_down/topdown_jhmdb_video_dataset.py @@ -97,8 +97,8 @@ def __init__(self, frame_interval_train = data_cfg.get('frame_interval_train', 1) frame_interval_test = data_cfg.get('frame_interval_test', 5) - self.frame_interval = frame_interval_train if self.test_mode \ - else frame_interval_test + self.frame_interval = frame_interval_test if self.test_mode \ + else frame_interval_train assert self.frame_interval > 0 self.db = self._get_db() @@ -219,7 +219,6 @@ def _form_clip_annotations(self): # frame_id start from 1 cur_frame_id = 1 for clip_id in range(int(np.ceil(nframes / self.frame_interval))): - cur_frame_id += clip_id * self.frame_interval track_ids = vid_info[cur_frame_id].keys() for track_id in track_ids: @@ -240,6 +239,8 @@ def _form_clip_annotations(self): for rec in rec_list: clip[key].append(rec[key]) gt_db.append(clip) + + cur_frame_id += self.frame_interval return gt_db def _write_keypoint_results(self, keypoints, res_file): diff --git a/mmpose/datasets/pipelines/top_down_transform.py b/mmpose/datasets/pipelines/top_down_transform.py index 93a9d34e58..8e46014b38 100644 --- a/mmpose/datasets/pipelines/top_down_transform.py +++ b/mmpose/datasets/pipelines/top_down_transform.py @@ -34,7 +34,6 @@ def __call__(self, results): # A flag indicating whether the image is flipped, # which can be used by child class. - flipped = False if np.random.rand() <= self.flip_prob: flipped = True if isinstance(img, list): @@ -67,6 +66,11 @@ def __call__(self, results): results['ann_info']['flip_pairs']) center[0] = img.shape[1] - center[0] - 1 + else: + joints_3d_new = joints_3d + joints_3d_visible_new = joints_3d_visible + flipped = False + results['img'] = img results['joints_3d'] = joints_3d_new results['joints_3d_visible'] = joints_3d_visible_new @@ -260,7 +264,7 @@ def __call__(self, results): assert isinstance(rot, list) assert isinstance(img, list) assert isinstance(joints_3d, list) - for i, (c, s, r) in enumerate(center, scale, rot): + for i, (c, s, r) in enumerate(zip(center, scale, rot)): trans = get_warp_matrix(r, c * 2.0, image_size - 1.0, s * 200.0) img[i] = cv2.warpAffine( @@ -295,7 +299,7 @@ def __call__(self, results): assert isinstance(img, list) assert isinstance(joints_3d, list) assert isinstance(joints_3d_visible, list) - for i, (c, s, r) in enumerate(center, scale, rot): + for i, (c, s, r) in enumerate(zip(center, scale, rot)): trans = get_affine_transform(c, s, r, image_size) img[i] = cv2.warpAffine( img[i], diff --git a/mmpose/models/backbones/lstm_pm.py b/mmpose/models/backbones/lstm_pm.py index e4d1b2b021..7cc8e785a1 100644 --- a/mmpose/models/backbones/lstm_pm.py +++ b/mmpose/models/backbones/lstm_pm.py @@ -18,10 +18,10 @@ class LSTM(nn.Module): """ def __init__(self, out_channels=17, stem_channels=32, hidden_channels=48): - + super().__init__() self.conv_fx = build_conv_layer( cfg=dict(type='Conv2d'), - in_channels=out_channels + stem_channels + 1, + in_channels=out_channels + stem_channels, out_channels=hidden_channels, kernel_size=3, stride=1, @@ -39,7 +39,7 @@ def __init__(self, out_channels=17, stem_channels=32, hidden_channels=48): self.conv_ix = build_conv_layer( cfg=dict(type='Conv2d'), - in_channels=out_channels + stem_channels + 1, + in_channels=out_channels + stem_channels, out_channels=hidden_channels, kernel_size=3, stride=1, @@ -57,7 +57,7 @@ def __init__(self, out_channels=17, stem_channels=32, hidden_channels=48): self.conv_gx = build_conv_layer( cfg=dict(type='Conv2d'), - in_channels=out_channels + stem_channels + 1, + 
in_channels=out_channels + stem_channels, out_channels=hidden_channels, kernel_size=3, stride=1, @@ -75,7 +75,7 @@ def __init__(self, out_channels=17, stem_channels=32, hidden_channels=48): self.conv_ox = build_conv_layer( cfg=dict(type='Conv2d'), - in_channels=out_channels + stem_channels + 1, + in_channels=out_channels + stem_channels, out_channels=hidden_channels, kernel_size=3, stride=1, @@ -276,7 +276,7 @@ def _make_conv1(self, in_channels): stride=1, padding=0)) - self.conv1 = nn.Sequential(*layers) + return nn.Sequential(*layers) def _make_conv2(self, in_channels): """Make conv2 for feature extraction.""" @@ -352,7 +352,7 @@ def forward(self, images): heatmaps = [] # Stage1 - image = images[:, :self.in_channels, :, :] + image = images[0] initial_heatmap = self.conv1(image) feature = self.conv2(image) @@ -365,8 +365,7 @@ def forward(self, images): # Stage2 for i in range(1, self.num_stages): - image = images[:, self.in_channels * i:self.in_channels * - (i + 1), :, :] + image = images[i] features = self.conv2(image) cell, hidden = self.lstm(heatmap, features, hidden, cell) heatmap = self.conv3(hidden) diff --git a/mmpose/models/detectors/lstm_pm.py b/mmpose/models/detectors/lstm_pm.py index 21be4dee86..46e17801e9 100644 --- a/mmpose/models/detectors/lstm_pm.py +++ b/mmpose/models/detectors/lstm_pm.py @@ -3,7 +3,6 @@ import warnings import numpy as np -import torch from ..builder import POSENETS from .top_down import TopDown @@ -122,8 +121,6 @@ def forward_train(self, imgs, target, target_weight, img_metas, **kwargs): # if return loss losses = dict() if self.with_keypoint: - target = torch.stack(target, dim=1) - target_weight = torch.stack(target_weight, dim=1) keypoint_losses = self.keypoint_head.get_loss( output, target, target_weight) losses.update(keypoint_losses) diff --git a/mmpose/models/heads/topdown_heatmap_multi_stage_head.py b/mmpose/models/heads/topdown_heatmap_multi_stage_head.py index f9410abb77..f351be0687 100644 --- a/mmpose/models/heads/topdown_heatmap_multi_stage_head.py +++ b/mmpose/models/heads/topdown_heatmap_multi_stage_head.py @@ -128,17 +128,31 @@ def get_loss(self, output, target, target_weight): losses = dict() assert isinstance(output, list) - assert target.dim() == 4 and target_weight.dim() == 3 if isinstance(self.loss, nn.Sequential): assert len(self.loss) == len(output) + identity_loss = False + else: + identity_loss = True + if isinstance(target, list): + assert isinstance(target_weight, list) + identity_target = False + else: + identity_target = True + for i in range(len(output)): - target_i = target - target_weight_i = target_weight - if isinstance(self.loss, nn.Sequential): - loss_func = self.loss[i] + if identity_target: + target_i = target + target_weight_i = target_weight else: + target_i = target[i] + target_weight_i = target_weight[i] + assert target_i.dim() == 4 and target_weight_i.dim() == 3 + + if identity_loss: loss_func = self.loss + else: + loss_func = self.loss[i] loss_i = loss_func(output[i], target_i, target_weight_i) if 'mse_loss' not in losses: losses['mse_loss'] = loss_i @@ -163,6 +177,12 @@ def get_accuracy(self, output, target, target_weight): Weights across different joint types. 
""" + assert isinstance(output, list) + if isinstance(target, list): + assert isinstance(target_weight, list) + target = target[-1] + target_weight = target_weight[-1] + accuracy = dict() if self.target_type == 'GaussianHeatmap': From cfc86bee095ada2be431e80f01962c195308484c Mon Sep 17 00:00:00 2001 From: luminxu Date: Wed, 23 Mar 2022 15:43:38 +0800 Subject: [PATCH 18/33] fix bug when testing --- ...y => lstm_pm_jhmdb_sub1_256x256_stage2.py} | 13 ++++--- .../top_down/topdown_jhmdb_video_dataset.py | 14 ++++++- mmpose/models/detectors/lstm_pm.py | 2 +- .../heads/topdown_heatmap_multi_stage_head.py | 37 ++++++++++++------- 4 files changed, 43 insertions(+), 23 deletions(-) rename configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/{lstm_pm_jhmdb_256x256_stage2.py => lstm_pm_jhmdb_sub1_256x256_stage2.py} (93%) diff --git a/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_256x256_stage2.py b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_256x256_stage2.py similarity index 93% rename from configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_256x256_stage2.py rename to configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_256x256_stage2.py index f3dd8e5d21..608ac8c4c8 100644 --- a/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_256x256_stage2.py +++ b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_256x256_stage2.py @@ -22,7 +22,7 @@ step=[8, 15]) total_epochs = 20 log_config = dict( - interval=100, + interval=50, hooks=[ dict(type='TextLoggerHook'), # dict(type='TensorboardLoggerHook') @@ -97,7 +97,7 @@ keys=['img', 'target', 'target_weight'], meta_keys=[ 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', - 'rotation', 'bbox', 'flip_pairs' + 'rotation', 'bbox_score', 'flip_pairs' ]), ] @@ -115,7 +115,8 @@ 'img', ], meta_keys=[ - 'image_file', 'center', 'scale', 'rotation', 'bbox', 'flip_pairs' + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' ]), ] @@ -123,10 +124,10 @@ data_root = 'data/jhmdb' data = dict( - samples_per_gpu=8, + samples_per_gpu=64, workers_per_gpu=2, - val_dataloader=dict(samples_per_gpu=4), - test_dataloader=dict(samples_per_gpu=4), + val_dataloader=dict(samples_per_gpu=32), + test_dataloader=dict(samples_per_gpu=32), train=dict( type='TopDownJhmdbVideoDataset', ann_file=f'{data_root}/annotations/Sub1_train.json', diff --git a/mmpose/datasets/datasets/top_down/topdown_jhmdb_video_dataset.py b/mmpose/datasets/datasets/top_down/topdown_jhmdb_video_dataset.py index d7938c48c8..7758536494 100644 --- a/mmpose/datasets/datasets/top_down/topdown_jhmdb_video_dataset.py +++ b/mmpose/datasets/datasets/top_down/topdown_jhmdb_video_dataset.py @@ -267,7 +267,7 @@ def _report_metric(self, res_file, metrics, pck_thr=0.2): with open(res_file, 'r') as fin: preds = json.load(fin) - assert len(preds) == len(self.db) + assert len(preds) == len(self.image_db) outputs = [] gts = [] @@ -275,7 +275,7 @@ def _report_metric(self, res_file, metrics, pck_thr=0.2): threshold_bbox = [] threshold_torso = [] - for pred, item in zip(preds, self.db): + for pred, item in zip(preds, self.image_db): outputs.append(np.array(pred['keypoints'])[:, :-1]) gts.append(np.array(item['joints_3d'])[:, :-1]) masks.append((np.array(item['joints_3d_visible'])[:, 0]) > 0) @@ -415,3 +415,13 @@ def evaluate(self, outputs, res_folder, metric='PCK', **kwargs): name_value = OrderedDict(info_str) return name_value + + def _sort_and_unique_bboxes(self, kpts, key='bbox_id'): + """sort kpts and remove the 
repeated ones.""" + kpts = sorted(kpts, key=lambda x: x[key]) + num = len(kpts) + for i in range(num - 1, 0, -1): + if kpts[i][key] == kpts[i - 1][key]: + del kpts[i] + + return kpts diff --git a/mmpose/models/detectors/lstm_pm.py b/mmpose/models/detectors/lstm_pm.py index 46e17801e9..11a27b02a7 100644 --- a/mmpose/models/detectors/lstm_pm.py +++ b/mmpose/models/detectors/lstm_pm.py @@ -145,7 +145,7 @@ def forward_test(self, imgs, img_metas, return_heatmap=False, **kwargs): features = self.neck(features) if self.with_keypoint: output_heatmap = self.keypoint_head.inference_model( - features, flip_pairs=None) + features, flip_pairs=None, return_last=False) if self.test_cfg.get('flip_test', True): imgs_flipped = [img.flip(3) for img in imgs] diff --git a/mmpose/models/heads/topdown_heatmap_multi_stage_head.py b/mmpose/models/heads/topdown_heatmap_multi_stage_head.py index f351be0687..b80eaebad1 100644 --- a/mmpose/models/heads/topdown_heatmap_multi_stage_head.py +++ b/mmpose/models/heads/topdown_heatmap_multi_stage_head.py @@ -208,7 +208,7 @@ def forward(self, x): out.append(y) return out - def inference_model(self, x, flip_pairs=None): + def inference_model(self, x, flip_pairs=None, return_last=True): """Inference function. Returns: @@ -218,24 +218,33 @@ def inference_model(self, x, flip_pairs=None): x (List[torch.Tensor[NxKxHxW]]): Input features. flip_pairs (None | list[tuple()): Pairs of keypoints which are mirrored. + return_last (bool): Choose to return the last output or + all the outputs. """ output = self.forward(x) assert isinstance(output, list) - output = output[-1] - if flip_pairs is not None: - # perform flip - output_heatmap = flip_back( - output.detach().cpu().numpy(), - flip_pairs, - target_type=self.target_type) - # feature is not aligned, shift flipped heatmap for higher accuracy - if self.test_cfg.get('shift_heatmap', False): - output_heatmap[:, :, :, 1:] = output_heatmap[:, :, :, :-1] - else: - output_heatmap = output.detach().cpu().numpy() + output_heatmaps = [] + for i in range(len(output)): + if flip_pairs is not None: + # perform flip + output_heatmap = flip_back( + output[i].detach().cpu().numpy(), + flip_pairs, + target_type=self.target_type) + # feature is not aligned, shift flipped heatmap + # for higher accuracy + if self.test_cfg.get('shift_heatmap', False): + output_heatmap[:, :, :, 1:] = output_heatmap[:, :, :, :-1] + output_heatmaps.append(output_heatmap) + else: + output_heatmap = output[i].detach().cpu().numpy() + output_heatmaps.append(output_heatmap) - return output_heatmap + if return_last: + output_heatmaps = output_heatmaps[-1] + + return output_heatmaps def _make_deconv_layer(self, num_layers, num_filters, num_kernels): """Make deconv layers.""" From 934ca68d5d944782b49ada5cc42cc149ad921a01 Mon Sep 17 00:00:00 2001 From: luminxu Date: Wed, 23 Mar 2022 15:55:52 +0800 Subject: [PATCH 19/33] adjust log interval --- .../lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_256x256_stage2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_256x256_stage2.py b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_256x256_stage2.py index 608ac8c4c8..9d9cd75ec1 100644 --- a/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_256x256_stage2.py +++ b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_256x256_stage2.py @@ -22,7 +22,7 @@ step=[8, 15]) total_epochs = 20 log_config = dict( - interval=50, + interval=10, hooks=[ dict(type='TextLoggerHook'), # 
dict(type='TensorboardLoggerHook') From 026fcd3be5503ccb42a8982f4ba29a3651dac457 Mon Sep 17 00:00:00 2001 From: luminxu Date: Wed, 27 Apr 2022 11:18:00 +0800 Subject: [PATCH 20/33] fix val img_prefix --- .../lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_256x256_stage2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_256x256_stage2.py b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_256x256_stage2.py index 9d9cd75ec1..a9c76d5ecf 100644 --- a/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_256x256_stage2.py +++ b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_256x256_stage2.py @@ -138,7 +138,7 @@ val=dict( type='TopDownJhmdbVideoDataset', ann_file=f'{data_root}/annotations/Sub1_test.json', - img_prefix=f'{data_root}/images/', + img_prefix=f'{data_root}/', data_cfg=data_cfg, pipeline=val_pipeline, dataset_info={{_base_.dataset_info}}), From 364acd3d4aac3788203671bd1f9d94b61e05aa51 Mon Sep 17 00:00:00 2001 From: luminxu Date: Wed, 27 Apr 2022 11:33:40 +0800 Subject: [PATCH 21/33] reduce batch size --- .../lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_256x256_stage2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_256x256_stage2.py b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_256x256_stage2.py index a9c76d5ecf..f1c1a0e3f0 100644 --- a/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_256x256_stage2.py +++ b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_256x256_stage2.py @@ -124,7 +124,7 @@ data_root = 'data/jhmdb' data = dict( - samples_per_gpu=64, + samples_per_gpu=32, workers_per_gpu=2, val_dataloader=dict(samples_per_gpu=32), test_dataloader=dict(samples_per_gpu=32), From f35aa949d850accc8c7c6a3e04b419311ef06810 Mon Sep 17 00:00:00 2001 From: luminxu Date: Thu, 28 Apr 2022 15:06:35 +0800 Subject: [PATCH 22/33] train stage1 mpii --- .../jhmdb/lstm_pm_mpii_sub1_256x256_stage1.py | 133 ++++++++++++++++++ mmpose/models/backbones/lstm_pm.py | 10 +- mmpose/models/detectors/lstm_pm.py | 62 ++++---- 3 files changed, 178 insertions(+), 27 deletions(-) create mode 100644 configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_mpii_sub1_256x256_stage1.py diff --git a/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_mpii_sub1_256x256_stage1.py b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_mpii_sub1_256x256_stage1.py new file mode 100644 index 0000000000..56e2ae1267 --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_mpii_sub1_256x256_stage1.py @@ -0,0 +1,133 @@ +_base_ = ['../../../../_base_/datasets/mpii.py'] +log_level = 'INFO' +load_from = None +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=10) +evaluation = dict(interval=10, metric='PCKh', save_best='PCKh') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +log_config = dict( + interval=50, hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=16, + dataset_joints=16, + dataset_channel=list(range(16)), + inference_channel=list(range(16))) + +# model settings +model = dict( + type='LSTMPoseMachine', + 
pretrained=None, + backbone=dict( + type='LSTM_PM', + out_channels=channel_cfg['num_output_channels'], + num_stages=6, + ), + keypoint_head=dict( + type='TopdownHeatmapMultiStageHead', + num_stages=6, + num_deconv_layers=0, + extra=dict(final_conv_kernel=0, ), + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11)) + +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[32, 32], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + use_gt_bbox=True, + bbox_file=None, +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=30, + scale_factor=0.25), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=2), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'flip_pairs' + ]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=['img'], + meta_keys=['image_file', 'center', 'scale', 'rotation', 'flip_pairs']), +] + +test_pipeline = val_pipeline + +data_root = 'data/mpii' +data = dict( + samples_per_gpu=32, + workers_per_gpu=2, + val_dataloader=dict(samples_per_gpu=32), + test_dataloader=dict(samples_per_gpu=32), + train=dict( + type='TopDownMpiiDataset', + ann_file=f'{data_root}/annotations/mpii_train.json', + img_prefix=f'{data_root}/images/', + data_cfg=data_cfg, + pipeline=train_pipeline, + dataset_info={{_base_.dataset_info}}), + val=dict( + type='TopDownMpiiDataset', + ann_file=f'{data_root}/annotations/mpii_val.json', + img_prefix=f'{data_root}/images/', + data_cfg=data_cfg, + pipeline=val_pipeline, + dataset_info={{_base_.dataset_info}}), + test=dict( + type='TopDownMpiiDataset', + ann_file=f'{data_root}/annotations/mpii_val.json', + img_prefix=f'{data_root}/images/', + data_cfg=data_cfg, + pipeline=test_pipeline, + dataset_info={{_base_.dataset_info}}), +) diff --git a/mmpose/models/backbones/lstm_pm.py b/mmpose/models/backbones/lstm_pm.py index 7cc8e785a1..e6cc195590 100644 --- a/mmpose/models/backbones/lstm_pm.py +++ b/mmpose/models/backbones/lstm_pm.py @@ -151,7 +151,7 @@ class LSTM_PM(BaseBackbone): out_channels (int): Number of output channels. Default: 17. stem_channels (int): Number of channels of stem features. Default: 32. hidden_channels (int): Number of channels of hidden state. Default: 48. - num_stages (int): Numerber of stages for propagation. Default: 9. + num_stages (int): Numerber of stages for propagation. Default: 5. conv_cfg (dict | None): The config dict for conv layers. Default: None. norm_cfg (dict | None): The config dict for norm layers. Default: None. 
@@ -347,8 +347,14 @@ def init_weights(self, pretrained=None): elif isinstance(m, (_BatchNorm, nn.GroupNorm)): constant_init(m, 1) - def forward(self, images): + def forward(self, input): """Forward function.""" + if isinstance(input, list): + assert len(input) == self.num_stages + images = input + else: + images = [input for _ in range(self.num_stages)] + heatmaps = [] # Stage1 diff --git a/mmpose/models/detectors/lstm_pm.py b/mmpose/models/detectors/lstm_pm.py index 11a27b02a7..943f6aefb4 100644 --- a/mmpose/models/detectors/lstm_pm.py +++ b/mmpose/models/detectors/lstm_pm.py @@ -110,8 +110,6 @@ def forward(self, def forward_train(self, imgs, target, target_weight, img_metas, **kwargs): """Defines the computation performed at every call when training.""" - # imgs (list[Fxtorch.Tensor[NxCximgHximgW]]): multiple input frames - assert imgs[0].size(0) == len(img_metas) output = self.backbone(imgs) if self.with_neck: output = self.neck(output) @@ -132,9 +130,14 @@ def forward_train(self, imgs, target, target_weight, img_metas, **kwargs): def forward_test(self, imgs, img_metas, return_heatmap=False, **kwargs): """Defines the computation performed at every call when testing.""" - # imgs (list[Fxtorch.Tensor[NxCximgHximgW]]): multiple input frames - assert imgs[0].size(0) == len(img_metas) - batch_size, _, img_height, img_width = imgs[0].shape + if isinstance(imgs, list): + # imgs (list[Fxtorch.Tensor[NxCximgHximgW]]): multiple input frames + assert imgs[0].size(0) == len(img_metas) + batch_size, _, img_height, img_width = imgs[0].shape + else: + assert imgs.size(0) == len(img_metas) + batch_size, _, img_height, img_width = imgs.shape + if batch_size > 1: assert 'bbox_id' in img_metas[0] @@ -148,7 +151,10 @@ def forward_test(self, imgs, img_metas, return_heatmap=False, **kwargs): features, flip_pairs=None, return_last=False) if self.test_cfg.get('flip_test', True): - imgs_flipped = [img.flip(3) for img in imgs] + if isinstance(imgs, list): + imgs_flipped = [img.flip(3) for img in imgs] + else: + imgs_flipped = imgs.flip(3) features_flipped = self.backbone(imgs_flipped) if self.with_neck: features_flipped = self.neck(features_flipped) @@ -160,27 +166,33 @@ def forward_test(self, imgs, img_metas, return_heatmap=False, **kwargs): output_flipped_heatmap[i]) * 0.5 if self.with_keypoint: - meta_keys = [ - 'image_file', 'center', 'scale', 'bbox_score', 'bbox_id' - ] - batch_size = len(img_metas) - num_frame = len(img_metas[0]['image_file']) - for f in range(num_frame): - test_metas = copy.deepcopy(img_metas) - for i in range(batch_size): - for key in meta_keys: - test_metas[i][key] = img_metas[i][key][f] + if isinstance(imgs, list): + meta_keys = [ + 'image_file', 'center', 'scale', 'bbox_score', 'bbox_id' + ] + num_frame = len(img_metas[0]['image_file']) + for f in range(num_frame): + test_metas = copy.deepcopy(img_metas) + for i in range(batch_size): + for key in meta_keys: + test_metas[i][key] = img_metas[i][key][f] + keypoint_result = self.keypoint_head.decode( + test_metas, + output_heatmap[f], + img_size=[img_width, img_height]) + + if result == {}: + result.update(keypoint_result) + else: + for key in result.keys(): + result[key] = np.concatenate( + (result[key], keypoint_result[key]), axis=0) + else: keypoint_result = self.keypoint_head.decode( - test_metas, - output_heatmap[f], + img_metas, + output_heatmap[-1], img_size=[img_width, img_height]) - - if result == {}: - result.update(keypoint_result) - else: - for key in result.keys(): - result[key] = np.concatenate( - (result[key], 
keypoint_result[key]), axis=0) + result.update(keypoint_result) if not return_heatmap: output_heatmap = None From 8decb7c652e4d7e07ea4af17013f9d06407a7ce9 Mon Sep 17 00:00:00 2001 From: luminxu Date: Thu, 28 Apr 2022 15:08:55 +0800 Subject: [PATCH 23/33] rename cfg --- ...mpii_sub1_256x256_stage1.py => lstm_pm_mpii_256x256_stage1.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/{lstm_pm_mpii_sub1_256x256_stage1.py => lstm_pm_mpii_256x256_stage1.py} (100%) diff --git a/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_mpii_sub1_256x256_stage1.py b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_mpii_256x256_stage1.py similarity index 100% rename from configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_mpii_sub1_256x256_stage1.py rename to configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_mpii_256x256_stage1.py From ed8a7e6f6f96fd22ac8b045e7750d838d8c35e82 Mon Sep 17 00:00:00 2001 From: luminxu Date: Thu, 28 Apr 2022 17:50:09 +0800 Subject: [PATCH 24/33] input 368 --- ...56x256_stage1.py => lstm_pm_mpii_368x368_stage1.py} | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) rename configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/{lstm_pm_mpii_256x256_stage1.py => lstm_pm_mpii_368x368_stage1.py} (95%) diff --git a/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_mpii_256x256_stage1.py b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_mpii_368x368_stage1.py similarity index 95% rename from configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_mpii_256x256_stage1.py rename to configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_mpii_368x368_stage1.py index 56e2ae1267..a6e38c2086 100644 --- a/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_mpii_256x256_stage1.py +++ b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_mpii_368x368_stage1.py @@ -55,8 +55,8 @@ modulate_kernel=11)) data_cfg = dict( - image_size=[256, 256], - heatmap_size=[32, 32], + image_size=[368, 368], + heatmap_size=[46, 46], num_output_channels=channel_cfg['num_output_channels'], num_joints=channel_cfg['dataset_joints'], dataset_channel=channel_cfg['dataset_channel'], @@ -105,10 +105,10 @@ data_root = 'data/mpii' data = dict( - samples_per_gpu=32, + samples_per_gpu=16, workers_per_gpu=2, - val_dataloader=dict(samples_per_gpu=32), - test_dataloader=dict(samples_per_gpu=32), + val_dataloader=dict(samples_per_gpu=16), + test_dataloader=dict(samples_per_gpu=16), train=dict( type='TopDownMpiiDataset', ann_file=f'{data_root}/annotations/mpii_train.json', From 51d4b800af47181041db3682f9542c5cb6e6fe05 Mon Sep 17 00:00:00 2001 From: luminxu Date: Fri, 6 May 2022 13:14:20 +0800 Subject: [PATCH 25/33] train jhmdb 368 stage2 --- .../lstm_pm_jhmdb_sub1_368x368_stage2.py | 152 ++++++++++++++++++ 1 file changed, 152 insertions(+) create mode 100644 configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_368x368_stage2.py diff --git a/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_368x368_stage2.py b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_368x368_stage2.py new file mode 100644 index 0000000000..7e51a1361c --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_368x368_stage2.py @@ -0,0 +1,152 @@ +_base_ = ['../../../../_base_/datasets/jhmdb.py'] +log_level = 'INFO' +load_from = 'work_dirs/lstm_pm_mpii_368x368_stage1/best_PCKh_epoch_210.pth' +resume_from = None +dist_params = dict(backend='nccl') +cudnn_benchmark = True 
+workflow = [('train', 1)] +checkpoint_config = dict(interval=1) +evaluation = dict(interval=1, metric=['PCK', 'tPCK'], save_best='Mean PCK') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[8, 15]) +total_epochs = 20 +log_config = dict( + interval=10, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=15, + dataset_joints=15, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], + ], + inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]) + +# model settings +model = dict( + type='LSTMPoseMachine', + pretrained=None, + backbone=dict( + type='LSTM_PM', + out_channels=channel_cfg['num_output_channels'], + num_stages=5, + ), + keypoint_head=dict( + type='TopdownHeatmapMultiStageHead', + num_stages=5, + num_deconv_layers=0, + extra=dict(final_conv_kernel=0, ), + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11)) + +data_cfg = dict( + image_size=[368, 368], + heatmap_size=[46, 46], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + use_nms=True, + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + use_gt_bbox=True, + det_bbox_thr=0.0, + frame_indices_train=[0, 1, 2, 3, 4], + frame_indices_test=[0, 1, 2, 3, 4], + frame_interval_train=1, + frame_interval_test=5, +) + +# take care of orders of the transforms +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=30, + scale_factor=0.25), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=2), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/jhmdb' +data = dict( + samples_per_gpu=32, + workers_per_gpu=2, + val_dataloader=dict(samples_per_gpu=32), + test_dataloader=dict(samples_per_gpu=32), + train=dict( + type='TopDownJhmdbVideoDataset', + ann_file=f'{data_root}/annotations/Sub1_train.json', + img_prefix=f'{data_root}/', + data_cfg=data_cfg, + pipeline=train_pipeline, + dataset_info={{_base_.dataset_info}}), + val=dict( + type='TopDownJhmdbVideoDataset', + ann_file=f'{data_root}/annotations/Sub1_test.json', + img_prefix=f'{data_root}/', + data_cfg=data_cfg, + pipeline=val_pipeline, + dataset_info={{_base_.dataset_info}}), + test=dict( + type='TopDownJhmdbVideoDataset', + ann_file=f'{data_root}/annotations/Sub1_test.json', + img_prefix=f'{data_root}/', + data_cfg=data_cfg, + 
pipeline=test_pipeline, + dataset_info={{_base_.dataset_info}}), +) From 60827385a36ff0f68a990a25a2b1498529812b46 Mon Sep 17 00:00:00 2001 From: luminxu Date: Fri, 6 May 2022 13:50:24 +0800 Subject: [PATCH 26/33] bs 16 --- .../lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_368x368_stage2.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_368x368_stage2.py b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_368x368_stage2.py index 7e51a1361c..06120498ad 100644 --- a/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_368x368_stage2.py +++ b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_368x368_stage2.py @@ -124,10 +124,10 @@ data_root = 'data/jhmdb' data = dict( - samples_per_gpu=32, + samples_per_gpu=16, workers_per_gpu=2, - val_dataloader=dict(samples_per_gpu=32), - test_dataloader=dict(samples_per_gpu=32), + val_dataloader=dict(samples_per_gpu=16), + test_dataloader=dict(samples_per_gpu=16), train=dict( type='TopDownJhmdbVideoDataset', ann_file=f'{data_root}/annotations/Sub1_train.json', From d869be1a6f78cd9f9d828167b995037d2623a68a Mon Sep 17 00:00:00 2001 From: luminxu Date: Fri, 6 May 2022 15:28:03 +0800 Subject: [PATCH 27/33] train 210 epoch --- ...lstm_pm_jhmdb_sub1_256x256_stage2_ep210.py | 152 ++++++++++++++++++ ...lstm_pm_jhmdb_sub1_368x368_stage2_ep210.py | 152 ++++++++++++++++++ 2 files changed, 304 insertions(+) create mode 100644 configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_256x256_stage2_ep210.py create mode 100644 configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_368x368_stage2_ep210.py diff --git a/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_256x256_stage2_ep210.py b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_256x256_stage2_ep210.py new file mode 100644 index 0000000000..2db3127531 --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_256x256_stage2_ep210.py @@ -0,0 +1,152 @@ +_base_ = ['../../../../_base_/datasets/jhmdb.py'] +log_level = 'INFO' +load_from = None # noqa: E501 +resume_from = None +dist_params = dict(backend='nccl') +cudnn_benchmark = True +workflow = [('train', 1)] +checkpoint_config = dict(interval=1) +evaluation = dict(interval=1, metric=['PCK', 'tPCK'], save_best='Mean PCK') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +log_config = dict( + interval=10, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=15, + dataset_joints=15, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], + ], + inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]) + +# model settings +model = dict( + type='LSTMPoseMachine', + pretrained=None, + backbone=dict( + type='LSTM_PM', + out_channels=channel_cfg['num_output_channels'], + num_stages=5, + ), + keypoint_head=dict( + type='TopdownHeatmapMultiStageHead', + num_stages=5, + num_deconv_layers=0, + extra=dict(final_conv_kernel=0, ), + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11)) + +data_cfg = dict( + image_size=[256, 256], + 
heatmap_size=[32, 32], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + use_nms=True, + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + use_gt_bbox=True, + det_bbox_thr=0.0, + frame_indices_train=[0, 1, 2, 3, 4], + frame_indices_test=[0, 1, 2, 3, 4], + frame_interval_train=1, + frame_interval_test=5, +) + +# take care of orders of the transforms +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=30, + scale_factor=0.25), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=2), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/jhmdb' +data = dict( + samples_per_gpu=32, + workers_per_gpu=2, + val_dataloader=dict(samples_per_gpu=32), + test_dataloader=dict(samples_per_gpu=32), + train=dict( + type='TopDownJhmdbVideoDataset', + ann_file=f'{data_root}/annotations/Sub1_train.json', + img_prefix=f'{data_root}/', + data_cfg=data_cfg, + pipeline=train_pipeline, + dataset_info={{_base_.dataset_info}}), + val=dict( + type='TopDownJhmdbVideoDataset', + ann_file=f'{data_root}/annotations/Sub1_test.json', + img_prefix=f'{data_root}/', + data_cfg=data_cfg, + pipeline=val_pipeline, + dataset_info={{_base_.dataset_info}}), + test=dict( + type='TopDownJhmdbVideoDataset', + ann_file=f'{data_root}/annotations/Sub1_test.json', + img_prefix=f'{data_root}/', + data_cfg=data_cfg, + pipeline=test_pipeline, + dataset_info={{_base_.dataset_info}}), +) diff --git a/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_368x368_stage2_ep210.py b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_368x368_stage2_ep210.py new file mode 100644 index 0000000000..7540139cbc --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_368x368_stage2_ep210.py @@ -0,0 +1,152 @@ +_base_ = ['../../../../_base_/datasets/jhmdb.py'] +log_level = 'INFO' +load_from = 'work_dirs/lstm_pm_mpii_368x368_stage1/best_PCKh_epoch_210.pth' +resume_from = None +dist_params = dict(backend='nccl') +cudnn_benchmark = True +workflow = [('train', 1)] +checkpoint_config = dict(interval=1) +evaluation = dict(interval=1, metric=['PCK', 'tPCK'], save_best='Mean PCK') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +log_config = dict( + interval=10, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=15, + dataset_joints=15, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 
7, 8, 9, 10, 11, 12, 13, 14], + ], + inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]) + +# model settings +model = dict( + type='LSTMPoseMachine', + pretrained=None, + backbone=dict( + type='LSTM_PM', + out_channels=channel_cfg['num_output_channels'], + num_stages=5, + ), + keypoint_head=dict( + type='TopdownHeatmapMultiStageHead', + num_stages=5, + num_deconv_layers=0, + extra=dict(final_conv_kernel=0, ), + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11)) + +data_cfg = dict( + image_size=[368, 368], + heatmap_size=[46, 46], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + use_nms=True, + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + use_gt_bbox=True, + det_bbox_thr=0.0, + frame_indices_train=[0, 1, 2, 3, 4], + frame_indices_test=[0, 1, 2, 3, 4], + frame_interval_train=1, + frame_interval_test=5, +) + +# take care of orders of the transforms +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=30, + scale_factor=0.25), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=2), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/jhmdb' +data = dict( + samples_per_gpu=16, + workers_per_gpu=2, + val_dataloader=dict(samples_per_gpu=16), + test_dataloader=dict(samples_per_gpu=16), + train=dict( + type='TopDownJhmdbVideoDataset', + ann_file=f'{data_root}/annotations/Sub1_train.json', + img_prefix=f'{data_root}/', + data_cfg=data_cfg, + pipeline=train_pipeline, + dataset_info={{_base_.dataset_info}}), + val=dict( + type='TopDownJhmdbVideoDataset', + ann_file=f'{data_root}/annotations/Sub1_test.json', + img_prefix=f'{data_root}/', + data_cfg=data_cfg, + pipeline=val_pipeline, + dataset_info={{_base_.dataset_info}}), + test=dict( + type='TopDownJhmdbVideoDataset', + ann_file=f'{data_root}/annotations/Sub1_test.json', + img_prefix=f'{data_root}/', + data_cfg=data_cfg, + pipeline=test_pipeline, + dataset_info={{_base_.dataset_info}}), +) From 1def7dfd5e8872146c33e636ed3358ed906f4f3b Mon Sep 17 00:00:00 2001 From: luminxu Date: Sat, 4 Jun 2022 11:24:04 +0800 Subject: [PATCH 28/33] train cpm jhmdb 50epoch --- .../jhmdb/cpm_jhmdb_sub1_256x256_ep50.py | 148 ++++++++++++++++++ 1 file changed, 148 insertions(+) create mode 100644 configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/cpm_jhmdb_sub1_256x256_ep50.py diff --git a/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/cpm_jhmdb_sub1_256x256_ep50.py b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/cpm_jhmdb_sub1_256x256_ep50.py new file 
mode 100644 index 0000000000..57f872db23 --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/cpm_jhmdb_sub1_256x256_ep50.py @@ -0,0 +1,148 @@ +_base_ = ['../../../../_base_/datasets/jhmdb.py'] +log_level = 'INFO' +load_from = None # noqa: E501 +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=1) +evaluation = dict(interval=1, metric=['PCK', 'tPCK'], save_best='Mean PCK') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[20, 30]) +total_epochs = 40 +log_config = dict( + interval=50, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=15, + dataset_joints=15, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], + ], + inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]) + +# model settings +model = dict( + type='TopDown', + pretrained=None, + backbone=dict( + type='CPM', + in_channels=3, + out_channels=channel_cfg['num_output_channels'], + feat_channels=128, + num_stages=6), + keypoint_head=dict( + type='TopdownHeatmapMultiStageHead', + in_channels=channel_cfg['num_output_channels'], + out_channels=channel_cfg['num_output_channels'], + num_stages=6, + num_deconv_layers=0, + extra=dict(final_conv_kernel=0, ), + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + train_cfg=dict(), + test_cfg=dict( + flip_test=True, + post_process='default', + shift_heatmap=True, + modulate_kernel=11)) + +data_cfg = dict( + image_size=[368, 368], + heatmap_size=[46, 46], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + use_gt_bbox=True, + det_bbox_thr=0.0, + bbox_file='', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=30, + scale_factor=0.25), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=2), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox', 'flip_pairs' + ]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox', 'flip_pairs' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/jhmdb' +data = dict( + samples_per_gpu=32, + workers_per_gpu=2, + val_dataloader=dict(samples_per_gpu=32), + test_dataloader=dict(samples_per_gpu=32), + train=dict( + type='TopDownJhmdbDataset', + ann_file=f'{data_root}/annotations/Sub1_train.json', + img_prefix=f'{data_root}/', + data_cfg=data_cfg, + pipeline=train_pipeline, + dataset_info={{_base_.dataset_info}}), + val=dict( + type='TopDownJhmdbDataset', + ann_file=f'{data_root}/annotations/Sub1_test.json', + img_prefix=f'{data_root}/', 
+ data_cfg=data_cfg, + pipeline=val_pipeline, + dataset_info={{_base_.dataset_info}}), + test=dict( + type='TopDownJhmdbDataset', + ann_file=f'{data_root}/annotations/Sub1_test.json', + img_prefix=f'{data_root}/', + data_cfg=data_cfg, + pipeline=test_pipeline, + dataset_info={{_base_.dataset_info}}), +) From 69396723ade74ba990219bb5fc284b3217603dbd Mon Sep 17 00:00:00 2001 From: luminxu Date: Sat, 4 Jun 2022 11:31:09 +0800 Subject: [PATCH 29/33] train cpm jhmdb 50epoch --- .../lstm_pm/jhmdb/cpm_jhmdb_sub1_256x256_ep50.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/cpm_jhmdb_sub1_256x256_ep50.py b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/cpm_jhmdb_sub1_256x256_ep50.py index 57f872db23..e217938393 100644 --- a/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/cpm_jhmdb_sub1_256x256_ep50.py +++ b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/cpm_jhmdb_sub1_256x256_ep50.py @@ -55,14 +55,14 @@ loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), train_cfg=dict(), test_cfg=dict( - flip_test=True, + flip_test=False, post_process='default', shift_heatmap=True, modulate_kernel=11)) data_cfg = dict( - image_size=[368, 368], - heatmap_size=[46, 46], + image_size=[256, 256], + heatmap_size=[32, 32], num_output_channels=channel_cfg['num_output_channels'], num_joints=channel_cfg['dataset_joints'], dataset_channel=channel_cfg['dataset_channel'], From b3ef167857c4a3c521fa70b9178a95fcd320e2c5 Mon Sep 17 00:00:00 2001 From: luminxu Date: Sat, 4 Jun 2022 11:33:45 +0800 Subject: [PATCH 30/33] train cpm jhmdb 20epoch --- .../lstm_pm/jhmdb/cpm_jhmdb_sub1_256x256.py | 148 ++++++++++++++++++ 1 file changed, 148 insertions(+) create mode 100644 configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/cpm_jhmdb_sub1_256x256.py diff --git a/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/cpm_jhmdb_sub1_256x256.py b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/cpm_jhmdb_sub1_256x256.py new file mode 100644 index 0000000000..05a31e6019 --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/cpm_jhmdb_sub1_256x256.py @@ -0,0 +1,148 @@ +_base_ = ['../../../../_base_/datasets/jhmdb.py'] +log_level = 'INFO' +load_from = None # noqa: E501 +resume_from = None +dist_params = dict(backend='nccl') +workflow = [('train', 1)] +checkpoint_config = dict(interval=1) +evaluation = dict(interval=1, metric=['PCK', 'tPCK'], save_best='Mean PCK') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[8, 15]) +total_epochs = 20 +log_config = dict( + interval=10, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=15, + dataset_joints=15, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], + ], + inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]) + +# model settings +model = dict( + type='TopDown', + pretrained=None, + backbone=dict( + type='CPM', + in_channels=3, + out_channels=channel_cfg['num_output_channels'], + feat_channels=128, + num_stages=6), + keypoint_head=dict( + type='TopdownHeatmapMultiStageHead', + in_channels=channel_cfg['num_output_channels'], + out_channels=channel_cfg['num_output_channels'], + num_stages=6, + num_deconv_layers=0, + extra=dict(final_conv_kernel=0, ), + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + 
train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11)) + +data_cfg = dict( + image_size=[256, 256], + heatmap_size=[32, 32], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + use_gt_bbox=True, + det_bbox_thr=0.0, + bbox_file='', +) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=30, + scale_factor=0.25), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=2), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox', 'flip_pairs' + ]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox', 'flip_pairs' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/jhmdb' +data = dict( + samples_per_gpu=32, + workers_per_gpu=2, + val_dataloader=dict(samples_per_gpu=32), + test_dataloader=dict(samples_per_gpu=32), + train=dict( + type='TopDownJhmdbDataset', + ann_file=f'{data_root}/annotations/Sub1_train.json', + img_prefix=f'{data_root}/', + data_cfg=data_cfg, + pipeline=train_pipeline, + dataset_info={{_base_.dataset_info}}), + val=dict( + type='TopDownJhmdbDataset', + ann_file=f'{data_root}/annotations/Sub1_test.json', + img_prefix=f'{data_root}/', + data_cfg=data_cfg, + pipeline=val_pipeline, + dataset_info={{_base_.dataset_info}}), + test=dict( + type='TopDownJhmdbDataset', + ann_file=f'{data_root}/annotations/Sub1_test.json', + img_prefix=f'{data_root}/', + data_cfg=data_cfg, + pipeline=test_pipeline, + dataset_info={{_base_.dataset_info}}), +) From 4e98c1a9f99c1e0c8e6cc42737af7bc5fa3c67fd Mon Sep 17 00:00:00 2001 From: luminxu Date: Sat, 4 Jun 2022 11:53:22 +0800 Subject: [PATCH 31/33] train lstm pm stage1.5 --- .../lstm_pm_jhmdb_sub1_368x368_stage15.py | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_368x368_stage15.py diff --git a/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_368x368_stage15.py b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_368x368_stage15.py new file mode 100644 index 0000000000..dc2132746e --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_368x368_stage15.py @@ -0,0 +1,149 @@ +_base_ = ['../../../../_base_/datasets/jhmdb.py'] +log_level = 'INFO' +load_from = 'work_dirs/lstm_pm_mpii_368x368_stage1/best_PCKh_epoch_210.pth' +resume_from = None +dist_params = dict(backend='nccl') +cudnn_benchmark = True +workflow = [('train', 1)] +checkpoint_config = dict(interval=1) +evaluation = dict(interval=1, metric=['PCK', 'tPCK'], save_best='Mean PCK') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + 
policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[8, 15]) +total_epochs = 20 +log_config = dict( + interval=10, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=15, + dataset_joints=15, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], + ], + inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]) + + +# model settings +model = dict( + type='LSTMPoseMachine', + pretrained=None, + backbone=dict( + type='LSTM_PM', + out_channels=channel_cfg['num_output_channels'], + num_stages=5, + ), + keypoint_head=dict( + type='TopdownHeatmapMultiStageHead', + num_stages=5, + num_deconv_layers=0, + extra=dict(final_conv_kernel=0, ), + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11)) + +data_cfg = dict( + image_size=[368, 368], + heatmap_size=[46, 46], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + use_gt_bbox=True, + det_bbox_thr=0.0, + bbox_file='', +) + +# take care of orders of the transforms +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=30, + scale_factor=0.25), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=2), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/jhmdb' +data = dict( + samples_per_gpu=16, + workers_per_gpu=2, + val_dataloader=dict(samples_per_gpu=16), + test_dataloader=dict(samples_per_gpu=16), + train=dict( + type='TopDownJhmdbDataset', + ann_file=f'{data_root}/annotations/Sub1_train.json', + img_prefix=f'{data_root}/', + data_cfg=data_cfg, + pipeline=train_pipeline, + dataset_info={{_base_.dataset_info}}), + val=dict( + type='TopDownJhmdbDataset', + ann_file=f'{data_root}/annotations/Sub1_test.json', + img_prefix=f'{data_root}/', + data_cfg=data_cfg, + pipeline=val_pipeline, + dataset_info={{_base_.dataset_info}}), + test=dict( + type='TopDownJhmdbDataset', + ann_file=f'{data_root}/annotations/Sub1_test.json', + img_prefix=f'{data_root}/', + data_cfg=data_cfg, + pipeline=test_pipeline, + dataset_info={{_base_.dataset_info}}), +) From 35cdf1598b3503dc1aa7477d39c9899d5c152742 Mon Sep 17 00:00:00 2001 From: luminxu Date: Sat, 4 Jun 2022 11:59:34 +0800 Subject: [PATCH 32/33] train lstm pm stage1.5 210epoch --- ...stm_pm_jhmdb_sub1_368x368_stage15_ep210.py | 149 ++++++++++++++++++ 1 file changed, 149 insertions(+) create mode 100644 
configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_368x368_stage15_ep210.py diff --git a/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_368x368_stage15_ep210.py b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_368x368_stage15_ep210.py new file mode 100644 index 0000000000..b9e9461a3c --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_368x368_stage15_ep210.py @@ -0,0 +1,149 @@ +_base_ = ['../../../../_base_/datasets/jhmdb.py'] +log_level = 'INFO' +load_from = 'work_dirs/lstm_pm_mpii_368x368_stage1/best_PCKh_epoch_210.pth' +resume_from = None +dist_params = dict(backend='nccl') +cudnn_benchmark = True +workflow = [('train', 1)] +checkpoint_config = dict(interval=1) +evaluation = dict(interval=1, metric=['PCK', 'tPCK'], save_best='Mean PCK') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +log_config = dict( + interval=10, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=15, + dataset_joints=15, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], + ], + inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]) + + +# model settings +model = dict( + type='LSTMPoseMachine', + pretrained=None, + backbone=dict( + type='LSTM_PM', + out_channels=channel_cfg['num_output_channels'], + num_stages=5, + ), + keypoint_head=dict( + type='TopdownHeatmapMultiStageHead', + num_stages=5, + num_deconv_layers=0, + extra=dict(final_conv_kernel=0, ), + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11)) + +data_cfg = dict( + image_size=[368, 368], + heatmap_size=[46, 46], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + use_gt_bbox=True, + det_bbox_thr=0.0, + bbox_file='', +) + +# take care of orders of the transforms +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', flip_prob=0.5), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=30, + scale_factor=0.25), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=2), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/jhmdb' +data = dict( + samples_per_gpu=16, + workers_per_gpu=2, + val_dataloader=dict(samples_per_gpu=16), + test_dataloader=dict(samples_per_gpu=16), + train=dict( + 
type='TopDownJhmdbDataset', + ann_file=f'{data_root}/annotations/Sub1_train.json', + img_prefix=f'{data_root}/', + data_cfg=data_cfg, + pipeline=train_pipeline, + dataset_info={{_base_.dataset_info}}), + val=dict( + type='TopDownJhmdbDataset', + ann_file=f'{data_root}/annotations/Sub1_test.json', + img_prefix=f'{data_root}/', + data_cfg=data_cfg, + pipeline=val_pipeline, + dataset_info={{_base_.dataset_info}}), + test=dict( + type='TopDownJhmdbDataset', + ann_file=f'{data_root}/annotations/Sub1_test.json', + img_prefix=f'{data_root}/', + data_cfg=data_cfg, + pipeline=test_pipeline, + dataset_info={{_base_.dataset_info}}), +) From d30032d46525148e8742e45c84e4de54e8d3f095 Mon Sep 17 00:00:00 2001 From: luminxu Date: Mon, 6 Jun 2022 10:15:13 +0800 Subject: [PATCH 33/33] train lstm pm stage2.5 210epoch --- ...stm_pm_jhmdb_sub1_368x368_stage25_ep210.py | 152 ++++++++++++++++++ 1 file changed, 152 insertions(+) create mode 100644 configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_368x368_stage25_ep210.py diff --git a/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_368x368_stage25_ep210.py b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_368x368_stage25_ep210.py new file mode 100644 index 0000000000..e1b5c7d576 --- /dev/null +++ b/configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/lstm_pm_jhmdb_sub1_368x368_stage25_ep210.py @@ -0,0 +1,152 @@ +_base_ = ['../../../../_base_/datasets/jhmdb.py'] +log_level = 'INFO' +load_from = 'work_dirs/lstm_pm_jhmdb_sub1_368x368_stage15_ep210/latest.pth' +resume_from = None +dist_params = dict(backend='nccl') +cudnn_benchmark = True +workflow = [('train', 1)] +checkpoint_config = dict(interval=1) +evaluation = dict(interval=1, metric=['PCK', 'tPCK'], save_best='Mean PCK') + +optimizer = dict( + type='Adam', + lr=5e-4, +) +optimizer_config = dict(grad_clip=None) +# learning policy +lr_config = dict( + policy='step', + warmup='linear', + warmup_iters=500, + warmup_ratio=0.001, + step=[170, 200]) +total_epochs = 210 +log_config = dict( + interval=10, + hooks=[ + dict(type='TextLoggerHook'), + # dict(type='TensorboardLoggerHook') + ]) + +channel_cfg = dict( + num_output_channels=15, + dataset_joints=15, + dataset_channel=[ + [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14], + ], + inference_channel=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14]) + +# model settings +model = dict( + type='LSTMPoseMachine', + pretrained=None, + backbone=dict( + type='LSTM_PM', + out_channels=channel_cfg['num_output_channels'], + num_stages=5, + ), + keypoint_head=dict( + type='TopdownHeatmapMultiStageHead', + num_stages=5, + num_deconv_layers=0, + extra=dict(final_conv_kernel=0, ), + loss_keypoint=dict(type='JointsMSELoss', use_target_weight=True)), + train_cfg=dict(), + test_cfg=dict( + flip_test=False, + post_process='default', + shift_heatmap=True, + modulate_kernel=11)) + +data_cfg = dict( + image_size=[368, 368], + heatmap_size=[46, 46], + num_output_channels=channel_cfg['num_output_channels'], + num_joints=channel_cfg['dataset_joints'], + dataset_channel=channel_cfg['dataset_channel'], + inference_channel=channel_cfg['inference_channel'], + use_nms=True, + soft_nms=False, + nms_thr=1.0, + oks_thr=0.9, + vis_thr=0.2, + use_gt_bbox=True, + det_bbox_thr=0.0, + frame_indices_train=[0, 1, 2, 3, 4], + frame_indices_test=[0, 1, 2, 3, 4], + frame_interval_train=1, + frame_interval_test=5, +) + +# take care of orders of the transforms +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownRandomFlip', 
flip_prob=0.5), + dict( + type='TopDownGetRandomScaleRotation', rot_factor=30, + scale_factor=0.25), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict(type='TopDownGenerateTarget', sigma=2), + dict( + type='Collect', + keys=['img', 'target', 'target_weight'], + meta_keys=[ + 'image_file', 'joints_3d', 'joints_3d_visible', 'center', 'scale', + 'rotation', 'bbox_score', 'flip_pairs' + ]), +] + +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='TopDownAffine'), + dict(type='ToTensor'), + dict( + type='NormalizeTensor', + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225]), + dict( + type='Collect', + keys=[ + 'img', + ], + meta_keys=[ + 'image_file', 'center', 'scale', 'rotation', 'bbox_score', + 'flip_pairs' + ]), +] + +test_pipeline = val_pipeline + +data_root = 'data/jhmdb' +data = dict( + samples_per_gpu=16, + workers_per_gpu=2, + val_dataloader=dict(samples_per_gpu=16), + test_dataloader=dict(samples_per_gpu=16), + train=dict( + type='TopDownJhmdbVideoDataset', + ann_file=f'{data_root}/annotations/Sub1_train.json', + img_prefix=f'{data_root}/', + data_cfg=data_cfg, + pipeline=train_pipeline, + dataset_info={{_base_.dataset_info}}), + val=dict( + type='TopDownJhmdbVideoDataset', + ann_file=f'{data_root}/annotations/Sub1_test.json', + img_prefix=f'{data_root}/', + data_cfg=data_cfg, + pipeline=val_pipeline, + dataset_info={{_base_.dataset_info}}), + test=dict( + type='TopDownJhmdbVideoDataset', + ann_file=f'{data_root}/annotations/Sub1_test.json', + img_prefix=f'{data_root}/', + data_cfg=data_cfg, + pipeline=test_pipeline, + dataset_info={{_base_.dataset_info}}), +)
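
Note on the backbone input handling introduced in the "train stage1 mpii" commit: LSTM_PM.forward now accepts either a list of per-frame tensors (one tensor per stage, as the video configs feed it from TopDownJhmdbVideoDataset) or a single image tensor, which is replicated across all stages so the same network can be pretrained on still-image datasets such as MPII. Together with inference_model(..., return_last=False) from the "fix bug when testing" commit, this lets forward_test decode one keypoint result per frame for video inputs while falling back to the last heatmap for single images. The snippet below is a minimal standalone sketch of that dispatch only; the helper name to_stage_inputs and the tensor shapes are illustrative and not part of the patch.

import torch


def to_stage_inputs(inp, num_stages):
    """Mirror the dispatch added to LSTM_PM.forward: a list of per-frame
    tensors is used as-is (one frame per stage), while a single image
    tensor is repeated for every stage."""
    if isinstance(inp, list):
        assert len(inp) == num_stages
        return inp
    return [inp for _ in range(num_stages)]


clip = [torch.rand(1, 3, 368, 368) for _ in range(5)]  # video clip: one tensor per frame
still = torch.rand(1, 3, 368, 368)                      # single still image (e.g. MPII)

assert len(to_stage_inputs(clip, num_stages=5)) == 5
assert len(to_stage_inputs(still, num_stages=5)) == 5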
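
The config series encodes a staged schedule: lstm_pm_mpii_368x368_stage1.py pretrains the single-image model on MPII, the stage-1.5 configs fine-tune it on Sub1 JHMDB still frames starting from the MPII best_PCKh checkpoint, and the stage-2/2.5 configs switch to multi-frame clips via TopDownJhmdbVideoDataset, with load_from pointing at an earlier stage's checkpoint (the stage-2.5 config above resumes from the stage-1.5 run). The sketch below builds the detector from the final stage-2.5 config to check that the LSTMPoseMachine and LSTM_PM registry entries added by this series resolve; it assumes this branch of mmpose is installed and that it is run from the repository root so the _base_ path in the config resolves.

# a minimal sanity check, assuming this branch is installed and the script
# is run from the mmpose repository root
from mmcv import Config

from mmpose.models import build_posenet

cfg_path = ('configs/body/2d_kpt_sview_rgb_vid/lstm_pm/jhmdb/'
            'lstm_pm_jhmdb_sub1_368x368_stage25_ep210.py')
cfg = Config.fromfile(cfg_path)

# 'LSTMPoseMachine' and 'LSTM_PM' are registered by this patch series,
# so building the model exercises the registry wiring end to end
model = build_posenet(cfg.model)
model.eval()

n_params = sum(p.numel() for p in model.parameters())
print(f'{cfg.model.type}: {cfg.model.backbone.num_stages} stages, '
      f'{n_params / 1e6:.1f}M parameters')

From there the usual MMPose entry points apply (tools/train.py CONFIG and tools/test.py CONFIG CHECKPOINT), which is presumably how the work_dirs checkpoints referenced by load_from in these configs were produced.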