diff --git a/.circleci/config.yml b/.circleci/config.yml index 9456918aac..635a984a1c 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1,161 +1,34 @@ version: 2.1 -jobs: - lint: - docker: - - image: cimg/python:3.7.4 - steps: - - checkout - - run: - name: Install dependencies - command: | - sudo apt-add-repository ppa:brightbox/ruby-ng -y - sudo apt-get update - sudo apt-get install -y ruby2.7 - - run: - name: Install pre-commit hook - command: | - pip install pre-commit - pre-commit install - - run: - name: Linting - command: pre-commit run --all-files - - run: - name: Check docstring coverage - command: | - pip install interrogate - interrogate -v --ignore-init-method --ignore-module --ignore-nested-functions --ignore-regex "__repr__" --fail-under 50 mmseg +# this allows you to use CircleCI's dynamic configuration feature +setup: true - build_cpu: - parameters: - # The python version must match available image tags in - # https://circleci.com/developer/images/image/cimg/python - python: - type: string - default: "3.7.4" - torch: - type: string - torchvision: - type: string - docker: - - image: cimg/python:<< parameters.python >> - resource_class: large - steps: - - checkout - - run: - name: Install Libraries - command: | - sudo apt-get update - sudo apt-get install -y ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libgl1-mesa-glx libjpeg-dev zlib1g-dev libtinfo-dev libncurses5 - - run: - name: Configure Python & pip - command: | - python -m pip install --upgrade pip - python -m pip install wheel - - run: - name: Install PyTorch - command: | - python -V - python -m pip install torch==<< parameters.torch >>+cpu torchvision==<< parameters.torchvision >>+cpu -f https://download.pytorch.org/whl/torch_stable.html - - run: - name: Install mmseg dependencies - command: | - python -m pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cpu/torch<< parameters.torch >>/index.html - python -m pip install mmdet - python -m pip install -r requirements.txt - - run: - name: Build and install - command: | - python -m pip install -e . - - run: - name: Run unittests - command: | - python -m pip install timm - python -m coverage run --branch --source mmseg -m pytest tests/ - python -m coverage xml - python -m coverage report -m - - build_cu101: - machine: - image: ubuntu-1604-cuda-10.1:201909-23 - resource_class: gpu.nvidia.small - steps: - - checkout - - run: - name: Install Libraries - command: | - sudo apt-get update - sudo apt-get install -y git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libgl1-mesa-glx - - run: - name: Configure Python & pip - command: | - pyenv global 3.7.0 - python -m pip install --upgrade pip - python -m pip install wheel - - run: - name: Install PyTorch - command: | - python -V - python -m pip install torch==1.6.0+cu101 torchvision==0.7.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html - - run: - name: Install mmseg dependencies - # python -m pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu101/torch${{matrix.torch_version}}/index.html - command: | - python -m pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu101/torch1.6.0/index.html - python -m pip install mmdet - python -m pip install -r requirements.txt - - run: - name: Build and install - command: | - python setup.py check -m -s - TORCH_CUDA_ARCH_LIST=7.0 python -m pip install -e . 
- - run: - name: Run unittests - command: | - python -m pip install timm - python -m pytest tests/ +# the path-filtering orb is required to continue a pipeline based on +# the path of an updated fileset +orbs: + path-filtering: circleci/path-filtering@0.1.2 workflows: - unit_tests: + # the always-run workflow is always triggered, regardless of the pipeline parameters. + always-run: jobs: - - lint - - build_cpu: - name: build_cpu_th1.6 - torch: 1.6.0 - torchvision: 0.7.0 - requires: - - lint - - build_cpu: - name: build_cpu_th1.7 - torch: 1.7.0 - torchvision: 0.8.1 - requires: - - lint - - build_cpu: - name: build_cpu_th1.8_py3.9 - torch: 1.8.0 - torchvision: 0.9.0 - python: "3.9.0" - requires: - - lint - - build_cpu: - name: build_cpu_th1.9_py3.8 - torch: 1.9.0 - torchvision: 0.10.0 - python: "3.8.0" - requires: - - lint - - build_cpu: - name: build_cpu_th1.9_py3.9 - torch: 1.9.0 - torchvision: 0.10.0 - python: "3.9.0" - requires: - - lint - - build_cu101: - requires: - - build_cpu_th1.6 - - build_cpu_th1.7 - - build_cpu_th1.8_py3.9 - - build_cpu_th1.9_py3.8 - - build_cpu_th1.9_py3.9 + # the path-filtering/filter job determines which pipeline + # parameters to update. + - path-filtering/filter: + name: check-updated-files + # 3-column, whitespace-delimited mapping. One mapping per + # line: + # <regex path-to-test> <parameter-to-set> <value-of-pipeline-parameter> + mapping: | + mmseg/.* lint_only false + requirements/.* lint_only false + tests/.* lint_only false + tools/.* lint_only false + configs/.* lint_only false + .circleci/.* lint_only false + base-revision: main + # this is the path of the configuration we should trigger once + # path filtering and pipeline parameter value updates are + # complete. In this case, we are using the parent dynamic + # configuration itself. + config-path: .circleci/test.yml diff --git a/.circleci/docker/Dockerfile b/.circleci/docker/Dockerfile new file mode 100644 index 0000000000..b1d40e0e14 --- /dev/null +++ b/.circleci/docker/Dockerfile @@ -0,0 +1,12 @@ + +ARG PYTORCH="1.8.1" +ARG CUDA="10.2" +ARG CUDNN="7" + +FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel + +# To fix GPG key error when running apt-get update +RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub +RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub + +RUN apt-get update && apt-get install -y ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libgl1-mesa-glx diff --git a/.circleci/test.yml b/.circleci/test.yml new file mode 100644 index 0000000000..622cdf9791 --- /dev/null +++ b/.circleci/test.yml @@ -0,0 +1,196 @@ +version: 2.1 + +# the default pipeline parameters, which will be updated according to +# the results of the path-filtering orb +parameters: + lint_only: + type: boolean + default: true + +jobs: + lint: + docker: + - image: cimg/python:3.7.4 + steps: + - checkout + - run: + name: Install pre-commit hook + command: | + pip install pre-commit + pre-commit install + - run: + name: Linting + command: pre-commit run --all-files + - run: + name: Check docstring coverage + command: | + pip install interrogate + interrogate -v --ignore-init-method --ignore-module --ignore-nested-functions --ignore-magic --ignore-regex "__repr__" --fail-under 75 mmseg + build_cpu: + parameters: + # The python version must match available image tags in + # https://circleci.com/developer/images/image/cimg/python + python: + type: string + torch: + type: string + torchvision: + type: string + docker: + - image:
cimg/python:<< parameters.python >> + resource_class: large + steps: + - checkout + - run: + name: Install Libraries + command: | + sudo apt-get update + sudo apt-get install -y ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libgl1-mesa-glx libjpeg-dev zlib1g-dev libtinfo-dev libncurses5 + - run: + name: Configure Python & pip + command: | + pip install --upgrade pip + pip install wheel + - run: + name: Install PyTorch + command: | + python -V + pip install torch==<< parameters.torch >>+cpu torchvision==<< parameters.torchvision >>+cpu -f https://download.pytorch.org/whl/torch_stable.html + - run: + name: Install mmseg dependencies + command: | + pip install git+https://github.com/open-mmlab/mmengine.git@main + pip install -U openmim + mim install 'mmcv>=2.0.0' + pip install 'mmpretrain>=1.0.0rc7' + pip install 'mmdet>=3.0.0' + pip install -r requirements/tests.txt -r requirements/optional.txt + python -m pip install 'albumentations>=0.3.2' --no-binary qudida,albumentations + - run: + name: Build and install + command: | + pip install -e . + - run: + name: Skip timm unittests and generate coverage report + command: | + python -m coverage run --branch --source mmseg -m pytest tests/ --ignore tests/test_models/test_backbones/test_timm_backbone.py --ignore tests/test_apis/test_rs_inferencer.py + python -m coverage xml + python -m coverage report -m + build_cuda: + parameters: + torch: + type: string + cuda: + type: enum + enum: ["10.1", "10.2", "11.1"] + cudnn: + type: integer + default: 7 + machine: + image: linux-cuda-11:default + docker_layer_caching: true + resource_class: gpu.nvidia.small.multi + steps: + - checkout + - run: + name: Install nvidia-container-toolkit and Restart Docker + command: | + sudo apt-get update + sudo apt-get install -y nvidia-container-toolkit + sudo systemctl restart docker + - run: + # Cloning repos in VM since Docker doesn't have access to the private key + name: Clone Repos + command: | + git clone -b main --depth 1 https://github.com/open-mmlab/mmengine.git /home/circleci/mmengine + - run: + name: Build Docker image + command: | + docker build .circleci/docker -t mmseg:gpu --build-arg PYTORCH=<< parameters.torch >> --build-arg CUDA=<< parameters.cuda >> --build-arg CUDNN=<< parameters.cudnn >> + docker run --gpus all -t -d -v /home/circleci/project:/mmseg -v /home/circleci/mmengine:/mmengine -v /home/circleci/mmpretrain:/mmpretrain -v /home/circleci/mmdetection:/mmdetection -w /mmseg --name mmseg mmseg:gpu + - run: + name: Install mmseg dependencies + command: | + docker exec mmseg pip install -e /mmengine + docker exec mmseg pip install -U openmim + docker exec mmseg mim install 'mmcv>=2.0.0' + docker exec mmseg pip install 'mmpretrain>=1.0.0rc7' + docker exec mmseg mim install 'mmdet>=3.0.0' + docker exec mmseg apt-get update + docker exec mmseg apt-get install -y git + docker exec mmseg pip install -r requirements/tests.txt -r requirements/optional.txt + docker exec mmseg python -m pip install 'albumentations>=0.3.2' --no-binary qudida,albumentations + - run: + name: Build and install + command: | + docker exec mmseg pip install -e .
+ - run: + name: Run unittests but skip timm unittests + command: | + docker exec mmseg pytest tests/ --ignore tests/test_models/test_backbones/test_timm_backbone.py --ignore tests/test_apis/test_rs_inferencer.py +workflows: + pr_stage_lint: + when: << pipeline.parameters.lint_only >> + jobs: + - lint: + name: lint + filters: + branches: + ignore: + - dev-1.x + - main + pr_stage_test: + when: + not: + << pipeline.parameters.lint_only >> + jobs: + - lint: + name: lint + filters: + branches: + ignore: + - dev-1.x + - main + - build_cpu: + name: minimum_version_cpu + torch: 1.8.1 + torchvision: 0.9.1 + python: "3.7" + requires: + - lint + - build_cpu: + name: maximum_version_cpu + # TODO: Fix torch 1.13 forward crash + torch: 1.12.0 + torchvision: 0.13.0 + python: "3.9.0" + requires: + - minimum_version_cpu + - hold: + type: approval + requires: + - maximum_version_cpu + - build_cuda: + name: mainstream_version_gpu + torch: 1.8.1 + # Use double quotation marks to explicitly specify its type + # as string instead of number + cuda: "10.2" + requires: + - hold + merge_stage_test: + when: + not: + << pipeline.parameters.lint_only >> + jobs: + - build_cuda: + name: minimum_version_gpu + torch: 1.8.1 + # Use double quotation marks to explicitly specify its type + # as string instead of number + cuda: "10.2" + filters: + branches: + only: + - dev-1.x + - main diff --git a/.dev/batch_test_list.py b/.dev/batch_test_list.py deleted file mode 100644 index c4fd8f97e4..0000000000 --- a/.dev/batch_test_list.py +++ /dev/null @@ -1,133 +0,0 @@ -# yapf: disable -# Inference Speed is tested on NVIDIA V100 -hrnet = [ - dict( - config='configs/hrnet/fcn_hr18s_512x512_160k_ade20k.py', - checkpoint='fcn_hr18s_512x512_160k_ade20k_20200614_214413-870f65ac.pth', # noqa - eval='mIoU', - metric=dict(mIoU=33.0), - ), - dict( - config='configs/hrnet/fcn_hr18s_512x1024_160k_cityscapes.py', - checkpoint='fcn_hr18s_512x1024_160k_cityscapes_20200602_190901-4a0797ea.pth', # noqa - eval='mIoU', - metric=dict(mIoU=76.31), - ), - dict( - config='configs/hrnet/fcn_hr48_512x512_160k_ade20k.py', - checkpoint='fcn_hr48_512x512_160k_ade20k_20200614_214407-a52fc02c.pth', - eval='mIoU', - metric=dict(mIoU=42.02), - ), - dict( - config='configs/hrnet/fcn_hr48_512x1024_160k_cityscapes.py', - checkpoint='fcn_hr48_512x1024_160k_cityscapes_20200602_190946-59b7973e.pth', # noqa - eval='mIoU', - metric=dict(mIoU=80.65), - ), -] -pspnet = [ - dict( - config='configs/pspnet/pspnet_r50-d8_512x1024_80k_cityscapes.py', - checkpoint='pspnet_r50-d8_512x1024_80k_cityscapes_20200606_112131-2376f12b.pth', # noqa - eval='mIoU', - metric=dict(mIoU=78.55), - ), - dict( - config='configs/pspnet/pspnet_r101-d8_512x1024_80k_cityscapes.py', - checkpoint='pspnet_r101-d8_512x1024_80k_cityscapes_20200606_112211-e1e1100f.pth', # noqa - eval='mIoU', - metric=dict(mIoU=79.76), - ), - dict( - config='configs/pspnet/pspnet_r101-d8_512x512_160k_ade20k.py', - checkpoint='pspnet_r101-d8_512x512_160k_ade20k_20200615_100650-967c316f.pth', # noqa - eval='mIoU', - metric=dict(mIoU=44.39), - ), - dict( - config='configs/pspnet/pspnet_r50-d8_512x512_160k_ade20k.py', - checkpoint='pspnet_r50-d8_512x512_160k_ade20k_20200615_184358-1890b0bd.pth', # noqa - eval='mIoU', - metric=dict(mIoU=42.48), - ), -] -resnest = [ - dict( - config='configs/resnest/pspnet_s101-d8_512x512_160k_ade20k.py', - checkpoint='pspnet_s101-d8_512x512_160k_ade20k_20200807_145416-a6daa92a.pth', # noqa - eval='mIoU', - metric=dict(mIoU=45.44), - ),
- dict( - config='configs/resnest/pspnet_s101-d8_512x1024_80k_cityscapes.py', - checkpoint='pspnet_s101-d8_512x1024_80k_cityscapes_20200807_140631-c75f3b99.pth', # noqa - eval='mIoU', - metric=dict(mIoU=78.57), - ), -] -fastscnn = [ - dict( - config='configs/fastscnn/fast_scnn_lr0.12_8x4_160k_cityscapes.py', - checkpoint='fast_scnn_8x4_160k_lr0.12_cityscapes-0cec9937.pth', - eval='mIoU', - metric=dict(mIoU=70.96), - ) -] -deeplabv3plus = [ - dict( - config='configs/deeplabv3plus/deeplabv3plus_r101-d8_769x769_80k_cityscapes.py', # noqa - checkpoint='deeplabv3plus_r101-d8_769x769_80k_cityscapes_20200607_000405-a7573d20.pth', # noqa - eval='mIoU', - metric=dict(mIoU=80.98), - ), - dict( - config='configs/deeplabv3plus/deeplabv3plus_r101-d8_512x1024_80k_cityscapes.py', # noqa - checkpoint='deeplabv3plus_r101-d8_512x1024_80k_cityscapes_20200606_114143-068fcfe9.pth', # noqa - eval='mIoU', - metric=dict(mIoU=80.97), - ), - dict( - config='configs/deeplabv3plus/deeplabv3plus_r50-d8_512x1024_80k_cityscapes.py', # noqa - checkpoint='deeplabv3plus_r50-d8_512x1024_80k_cityscapes_20200606_114049-f9fb496d.pth', # noqa - eval='mIoU', - metric=dict(mIoU=80.09), - ), - dict( - config='configs/deeplabv3plus/deeplabv3plus_r50-d8_769x769_80k_cityscapes.py', # noqa - checkpoint='deeplabv3plus_r50-d8_769x769_80k_cityscapes_20200606_210233-0e9dfdc4.pth', # noqa - eval='mIoU', - metric=dict(mIoU=79.83), - ), -] -vit = [ - dict( - config='configs/vit/upernet_vit-b16_ln_mln_512x512_160k_ade20k.py', - checkpoint='upernet_vit-b16_ln_mln_512x512_160k_ade20k-f444c077.pth', - eval='mIoU', - metric=dict(mIoU=47.73), - ), - dict( - config='configs/vit/upernet_deit-s16_ln_mln_512x512_160k_ade20k.py', - checkpoint='upernet_deit-s16_ln_mln_512x512_160k_ade20k-c0cd652f.pth', - eval='mIoU', - metric=dict(mIoU=43.52), - ), -] -fp16 = [ - dict( - config='configs/deeplabv3plus/deeplabv3plus_r101-d8_fp16_512x1024_80k_cityscapes.py', # noqa - checkpoint='deeplabv3plus_r101-d8_fp16_512x1024_80k_cityscapes_20200717_230920-f1104f4b.pth', # noqa - eval='mIoU', - metric=dict(mIoU=80.46), - ) -] -swin = [ - dict( - config='configs/swin/upernet_swin_tiny_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K.py', # noqa - checkpoint='upernet_swin_tiny_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K_20210531_112542-e380ad3e.pth', # noqa - eval='mIoU', - metric=dict(mIoU=44.41), - ) -] -# yapf: enable diff --git a/.dev/batch_train_list.txt b/.dev/batch_train_list.txt deleted file mode 100644 index 17d19932e6..0000000000 --- a/.dev/batch_train_list.txt +++ /dev/null @@ -1,19 +0,0 @@ -configs/hrnet/fcn_hr18s_512x512_160k_ade20k.py -configs/hrnet/fcn_hr18s_512x1024_160k_cityscapes.py -configs/hrnet/fcn_hr48_512x512_160k_ade20k.py -configs/hrnet/fcn_hr48_512x1024_160k_cityscapes.py -configs/pspnet/pspnet_r50-d8_512x1024_80k_cityscapes.py -configs/pspnet/pspnet_r101-d8_512x1024_80k_cityscapes.py -configs/pspnet/pspnet_r101-d8_512x512_160k_ade20k.py -configs/pspnet/pspnet_r50-d8_512x512_160k_ade20k.py -configs/resnest/pspnet_s101-d8_512x512_160k_ade20k.py -configs/resnest/pspnet_s101-d8_512x1024_80k_cityscapes.py -configs/fastscnn/fast_scnn_lr0.12_8x4_160k_cityscapes.py -configs/deeplabv3plus/deeplabv3plus_r101-d8_769x769_80k_cityscapes.py -configs/deeplabv3plus/deeplabv3plus_r101-d8_512x1024_80k_cityscapes.py -configs/deeplabv3plus/deeplabv3plus_r50-d8_512x1024_80k_cityscapes.py -configs/deeplabv3plus/deeplabv3plus_r50-d8_769x769_80k_cityscapes.py -configs/vit/upernet_vit-b16_ln_mln_512x512_160k_ade20k.py 
-configs/vit/upernet_deit-s16_ln_mln_512x512_160k_ade20k.py -configs/deeplabv3plus/deeplabv3plus_r101-d8_fp16_512x1024_80k_cityscapes.py -configs/swin/upernet_swin_tiny_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K.py diff --git a/.dev/md2yml.py b/.dev/md2yml.py deleted file mode 100755 index 1d68498db2..0000000000 --- a/.dev/md2yml.py +++ /dev/null @@ -1,317 +0,0 @@ -#!/usr/bin/env python - -# Copyright (c) OpenMMLab. All rights reserved. -# This tool is used to update model-index.yml which is required by MIM, and -# will be automatically called as a pre-commit hook. The updating will be -# triggered if any change of model information (.md files in configs/) has been -# detected before a commit. - -import glob -import os -import os.path as osp -import re -import sys - -from lxml import etree -from mmcv.fileio import dump - -MMSEG_ROOT = osp.dirname(osp.dirname((osp.dirname(__file__)))) - -COLLECTIONS = [ - 'ANN', 'APCNet', 'BiSeNetV1', 'BiSeNetV2', 'CCNet', 'CGNet', 'DANet', - 'DeepLabV3', 'DeepLabV3+', 'DMNet', 'DNLNet', 'DPT', 'EMANet', 'EncNet', - 'ERFNet', 'FastFCN', 'FastSCNN', 'FCN', 'GCNet', 'ICNet', 'ISANet', 'KNet', - 'NonLocalNet', 'OCRNet', 'PointRend', 'PSANet', 'PSPNet', 'Segformer', - 'Segmenter', 'FPN', 'SETR', 'STDC', 'UNet', 'UPerNet' -] -COLLECTIONS_TEMP = [] - - -def dump_yaml_and_check_difference(obj, filename, sort_keys=False): - """Dump object to a yaml file, and check if the file content is different - from the original. - - Args: - obj (any): The python object to be dumped. - filename (str): YAML filename to dump the object to. - sort_keys (str); Sort key by dictionary order. - Returns: - Bool: If the target YAML file is different from the original. - """ - - str_dump = dump(obj, None, file_format='yaml', sort_keys=sort_keys) - if osp.isfile(filename): - file_exists = True - with open(filename, 'r', encoding='utf-8') as f: - str_orig = f.read() - else: - file_exists = False - str_orig = None - - if file_exists and str_orig == str_dump: - is_different = False - else: - is_different = True - with open(filename, 'w', encoding='utf-8') as f: - f.write(str_dump) - - return is_different - - -def parse_md(md_file): - """Parse .md file and convert it to a .yml file which can be used for MIM. - - Args: - md_file (str): Path to .md file. - Returns: - Bool: If the target YAML file is different from the original. - """ - collection_name = osp.split(osp.dirname(md_file))[1] - configs = os.listdir(osp.dirname(md_file)) - - collection = dict( - Name=collection_name, - Metadata={'Training Data': []}, - Paper={ - 'URL': '', - 'Title': '' - }, - README=md_file, - Code={ - 'URL': '', - 'Version': '' - }) - collection.update({'Converted From': {'Weights': '', 'Code': ''}}) - models = [] - datasets = [] - paper_url = None - paper_title = None - code_url = None - code_version = None - repo_url = None - - # To avoid re-counting number of backbone model in OpenMMLab, - # if certain model in configs folder is backbone whose name is already - # recorded in MMClassification, then the `COLLECTION` dict of this model - # in MMSegmentation should be deleted, and `In Collection` in `Models` - # should be set with head or neck of this config file. - is_backbone = None - - with open(md_file, 'r', encoding='UTF-8') as md: - lines = md.readlines() - i = 0 - current_dataset = '' - while i < len(lines): - line = lines[i].strip() - # In latest README.md the title and url are in the third line. 
- if i == 2: - paper_url = lines[i].split('](')[1].split(')')[0] - paper_title = lines[i].split('](')[0].split('[')[1] - if len(line) == 0: - i += 1 - continue - elif line[:3] == '<!-': + 'pip install mmcv>=2.0.0', + 'pip install mmcls==1.0.0rc6', + 'pip install mmdet==3.0.0', + 'pip install -r requirements.txt', + 'pip install timm', +] + +default_floating_range = 0.5 diff --git a/.dev/benchmark_train.sh b/.dev_scripts/benchmark_train.sh similarity index 100% rename from .dev/benchmark_train.sh rename to .dev_scripts/benchmark_train.sh diff --git a/.dev_scripts/benchmark_train_models.txt b/.dev_scripts/benchmark_train_models.txt new file mode 100644 index 0000000000..01f279d8d6 --- /dev/null +++ b/.dev_scripts/benchmark_train_models.txt @@ -0,0 +1,26 @@ +bisenetv1/bisenetv1_r18-d32_4xb4-160k_coco-stuff164k-512x512.py +bisenetv2/bisenetv2_fcn_4xb4-ohem-160k_cityscapes-1024x1024.py +deeplabv3/deeplabv3_r50-d8_4xb2-40k_cityscapes-512x1024.py +deeplabv3/deeplabv3_r101-d8_4xb4-160k_ade20k-512x512.py +deeplabv3plus/deeplabv3plus_r101-d8_4xb2-40k_cityscapes-769x769.py +deeplabv3plus/deeplabv3plus_r50-d8_4xb4-80k_ade20k-512x512.py +hrnet/fcn_hr18s_4xb4-80k_ade20k-512x512.py +hrnet/fcn_hr18_4xb2-160k_cityscapes-512x1024.py +knet/knet-s3_swin-l_upernet_8xb2-adamw-80k_ade20k-512x512.py +knet/knet-s3_r50-d8_pspnet_8xb2-adamw-80k_ade20k-512x512.py +mae/mae-base_upernet_8xb2-amp-160k_ade20k-512x512.py +mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py +maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512.py +mobilenet_v2/mobilenet-v2-d8_deeplabv3_4xb2-80k_cityscapes-512x1024.py +ocrnet/ocrnet_hr48_4xb4-80k_ade20k-512x512.py +pidnet/pidnet-m_2xb6-120k_1024x1024-cityscapes.py +pspnet/pspnet_r50-d8-rsb_4xb2-adamw-80k_cityscapes-512x1024.py +pspnet/pspnet_r101-d8_4xb4-80k_ade20k-512x512.py +segformer/segformer_mit-b5_8xb2-160k_ade20k-512x512.py +segmenter/segmenter_vit-t_mask_8xb1-160k_ade20k-512x512.py +segnext/segnext_mscan-t_1xb16-adamw-160k_ade20k-512x512.py +swin/swin-base-patch4-window12-in22k-384x384-pre_upernet_8xb2-160k_ade20k-512x512.py +twins/twins_pcpvt-l_uperhead_8xb2-160k_ade20k-512x512.py +unet/unet-s5-d16_fcn_4xb4-160k_cityscapes-512x1024.py +upernet/upernet_r101_4xb2-40k_cityscapes-512x1024.py +san/san-vit-b16_coco-stuff164k-640x640.py diff --git a/.dev/check_urls.py b/.dev_scripts/check_urls.py similarity index 97% rename from .dev/check_urls.py rename to .dev_scripts/check_urls.py index 42b64745de..58a1354ba5 100644 --- a/.dev/check_urls.py +++ b/.dev_scripts/check_urls.py @@ -56,8 +56,7 @@ def main(): for model_name, yml_path in yml_list: # Default yaml loader unsafe.
- model_infos = yml.load( - open(yml_path, 'r'), Loader=yml.CLoader)['Models'] + model_infos = yml.load(open(yml_path), Loader=yml.CLoader)['Models'] for model_info in model_infos: config_name = model_info['Name'] checkpoint_url = model_info['Weights'] diff --git a/.dev/gather_benchmark_evaluation_results.py b/.dev_scripts/gather_benchmark_evaluation_results.py similarity index 95% rename from .dev/gather_benchmark_evaluation_results.py rename to .dev_scripts/gather_benchmark_evaluation_results.py index 47b557a105..fec83f133a 100644 --- a/.dev/gather_benchmark_evaluation_results.py +++ b/.dev_scripts/gather_benchmark_evaluation_results.py @@ -3,8 +3,8 @@ import glob import os.path as osp -import mmcv -from mmcv import Config +from mmengine import Config +from mmengine.fileio import dump, load def parse_args(): @@ -56,7 +56,7 @@ def parse_args(): continue log_json_path = list(sorted(json_list))[-1] - metric = mmcv.load(log_json_path) + metric = load(log_json_path) if config not in metric.get('config', {}): print(f'{config} not included in {log_json_path}') continue @@ -84,7 +84,7 @@ def parse_args(): new=new_metrics) if metrics_out: - mmcv.dump(result_dict, metrics_out, indent=4) + dump(result_dict, metrics_out, indent=4) print('===================================') for config_name, metrics in result_dict.items(): print(config_name, metrics) diff --git a/.dev/gather_benchmark_train_results.py b/.dev_scripts/gather_benchmark_train_results.py similarity index 95% rename from .dev/gather_benchmark_train_results.py rename to .dev_scripts/gather_benchmark_train_results.py index 8aff2c4228..f801a0dde5 100644 --- a/.dev/gather_benchmark_train_results.py +++ b/.dev_scripts/gather_benchmark_train_results.py @@ -2,9 +2,9 @@ import glob import os.path as osp -import mmcv from gather_models import get_final_results -from mmcv import Config +from mmengine import Config +from mmengine.fileio import dump def parse_args(): @@ -50,7 +50,7 @@ def parse_args(): continue # parse config - cfg = mmcv.Config.fromfile(config) + cfg = Config.fromfile(config) total_iters = cfg.runner.max_iters exp_metric = cfg.evaluation.metric if not isinstance(exp_metric, list): @@ -93,7 +93,7 @@ def parse_args(): # 4 save or print results if metrics_out: - mmcv.dump(result_dict, metrics_out, indent=4) + dump(result_dict, metrics_out, indent=4) print('===================================') for config_name, metrics in result_dict.items(): print(config_name, metrics) diff --git a/.dev/gather_models.py b/.dev_scripts/gather_models.py similarity index 93% rename from .dev/gather_models.py rename to .dev_scripts/gather_models.py index 3eedf6110b..fe6c3901c8 100644 --- a/.dev/gather_models.py +++ b/.dev_scripts/gather_models.py @@ -7,8 +7,10 @@ import os.path as osp import shutil -import mmcv import torch +from mmengine import Config +from mmengine.fileio import dump +from mmengine.utils import mkdir_or_exist, scandir # build schedule look-up table to automatically find the final model RESULTS_LUT = ['mIoU', 'mAcc', 'aAcc'] @@ -33,7 +35,7 @@ def process_checkpoint(in_file, out_file): # The hash code calculation and rename command differ on different system # platform. 
sha = calculate_file_sha256(out_file) - final_file = out_file.rstrip('.pth') + '-{}.pth'.format(sha[:8]) + final_file = out_file.rstrip('.pth') + f'-{sha[:8]}.pth' os.rename(out_file, final_file) # Remove prefix and suffix @@ -52,7 +54,7 @@ def get_final_iter(config): def get_final_results(log_json_path, iter_num): result_dict = dict() last_iter = 0 - with open(log_json_path, 'r') as f: + with open(log_json_path) as f: for line in f.readlines(): log_line = json.loads(line) if 'mode' not in log_line.keys(): @@ -100,10 +102,10 @@ def main(): work_dir = args.work_dir collect_dir = args.collect_dir selected_config_name = args.config_name - mmcv.mkdir_or_exist(collect_dir) + mkdir_or_exist(collect_dir) # find all models in the root directory to be gathered - raw_configs = list(mmcv.scandir('./configs', '.py', recursive=True)) + raw_configs = list(scandir('./configs', '.py', recursive=True)) # filter configs that is not trained in the experiments dir used_configs = [] @@ -123,7 +125,7 @@ def main(): exp_dir = osp.join(work_dir, config_name) # check whether the exps is finished final_iter = get_final_iter(used_config) - final_model = 'iter_{}.pth'.format(final_iter) + final_model = f'iter_{final_iter}.pth' model_path = osp.join(exp_dir, final_model) # skip if the model is still training @@ -175,7 +177,7 @@ def main(): print(f'dir {model_publish_dir} exists, no model found') else: - mmcv.mkdir_or_exist(model_publish_dir) + mkdir_or_exist(model_publish_dir) # convert model final_model_path = process_checkpoint(trained_model_path, @@ -198,13 +200,13 @@ def main(): if args.all: # copy config to guarantee reproducibility raw_config = osp.join('./configs', f'{config_name}.py') - mmcv.Config.fromfile(raw_config).dump( + Config.fromfile(raw_config).dump( osp.join(model_publish_dir, osp.basename(raw_config))) publish_model_infos.append(model) models = dict(models=publish_model_infos) - mmcv.dump(models, osp.join(collect_dir, 'model_infos.json'), indent=4) + dump(models, osp.join(collect_dir, 'model_infos.json'), indent=4) if __name__ == '__main__': diff --git a/.dev/generate_benchmark_evaluation_script.py b/.dev_scripts/generate_benchmark_evaluation_script.py similarity index 97% rename from .dev/generate_benchmark_evaluation_script.py rename to .dev_scripts/generate_benchmark_evaluation_script.py index d86e94bc8f..4c48f85420 100644 --- a/.dev/generate_benchmark_evaluation_script.py +++ b/.dev_scripts/generate_benchmark_evaluation_script.py @@ -2,7 +2,7 @@ import argparse import os.path as osp -from mmcv import Config +from mmengine import Config def parse_args(): @@ -17,7 +17,7 @@ def parse_args(): parser.add_argument( '--out', type=str, - default='.dev/benchmark_evaluation.sh', + default='.dev_scripts/benchmark_evaluation.sh', help='path to save model benchmark script') args = parser.parse_args() diff --git a/.dev/generate_benchmark_train_script.py b/.dev_scripts/generate_benchmark_train_script.py similarity index 96% rename from .dev/generate_benchmark_train_script.py rename to .dev_scripts/generate_benchmark_train_script.py index 6e8a0ae311..4bfdfbf09f 100644 --- a/.dev/generate_benchmark_train_script.py +++ b/.dev_scripts/generate_benchmark_train_script.py @@ -19,7 +19,7 @@ def parse_args(): parser.add_argument( '--out', type=str, - default='.dev/benchmark_train.sh', + default='.dev_scripts/benchmark_train.sh', help='path to save model benchmark script') args = parser.parse_args() @@ -74,7 +74,7 @@ def main(): commands.append('\n') commands.append('\n') - with open(args.txt_path, 'r') as f: + with 
open(args.txt_path) as f: model_cfgs = f.readlines() for i, cfg in enumerate(model_cfgs): create_train_bash_info(commands, cfg, script_name, '$PARTITION', diff --git a/.dev/log_collector/example_config.py b/.dev_scripts/log_collector/example_config.py similarity index 100% rename from .dev/log_collector/example_config.py rename to .dev_scripts/log_collector/example_config.py diff --git a/.dev/log_collector/log_collector.py b/.dev_scripts/log_collector/log_collector.py similarity index 98% rename from .dev/log_collector/log_collector.py rename to .dev_scripts/log_collector/log_collector.py index d0f4080877..0c2ff61880 100644 --- a/.dev/log_collector/log_collector.py +++ b/.dev_scripts/log_collector/log_collector.py @@ -86,7 +86,7 @@ def main(): val_list = [] last_iter = 0 for log_name in log_list: - with open(os.path.join(preceding_path, log_name), 'r') as f: + with open(os.path.join(preceding_path, log_name)) as f: # ignore the info line f.readline() all_lines = f.readlines() diff --git a/.dev/log_collector/readme.md b/.dev_scripts/log_collector/readme.md similarity index 100% rename from .dev/log_collector/readme.md rename to .dev_scripts/log_collector/readme.md diff --git a/.dev/log_collector/utils.py b/.dev_scripts/log_collector/utils.py similarity index 100% rename from .dev/log_collector/utils.py rename to .dev_scripts/log_collector/utils.py diff --git a/.dev_scripts/update_model_index.py b/.dev_scripts/update_model_index.py new file mode 100755 index 0000000000..eb87c02f17 --- /dev/null +++ b/.dev_scripts/update_model_index.py @@ -0,0 +1,301 @@ +#!/usr/bin/env python + +# Copyright (c) OpenMMLab. All rights reserved. +# This tool is used to update model-index.yml which is required by MIM, and +# will be automatically called as a pre-commit hook. The updating will be +# triggered if any change of model information (.md files in configs/) has been +# detected before a commit. 
+ +import os +import os.path as osp +import re +import sys +from typing import List, Tuple + +import yaml + +MMSEG_ROOT = osp.abspath(osp.join(osp.dirname(__file__), '..')) + + +def get_collection_name_list(md_file_list: List[str]) -> List[str]: + """Get the list of collection names.""" + collection_name_list: List[str] = [] + for md_file in md_file_list: + with open(md_file) as f: + lines = f.readlines() + collection_name = lines[0].split('#')[1].strip() + collection_name_list.append(collection_name) + return collection_name_list + + +def get_md_file_list() -> Tuple[List[str], List[str]]: + """Get the list of md files.""" + md_file_list: List[str] = [] + md_dir_list: List[str] = [] + for root, _, files in os.walk(osp.join(MMSEG_ROOT, 'configs')): + for file in files: + if file.endswith('.md'): + md_file_list.append(osp.join(root, file)) + md_dir_list.append(root) + break + return md_file_list, md_dir_list + + +def get_model_info(md_file: str, config_dir: str, + collection_name_list: List[str]) -> Tuple[dict, str]: + """Get model information from md file.""" + datasets: List[str] = [] + models: List[dict] = [] + current_dataset: str = '' + paper_name: str = '' + paper_url: str = '' + code_url: str = '' + is_backbone: bool = False + is_dataset: bool = False + collection_name: str = '' + with open(md_file) as f: + lines: List[str] = f.readlines() + i: int = 0 + + while i < len(lines): + line: str = lines[i].strip() + if len(line) == 0: + i += 1 + continue + # get paper name and url + if re.match(r'> \[.*\]+\([a-zA-Z]+://[^\s]*\)', line): + paper_info = line.split('](') + paper_name = paper_info[0][paper_info[0].index('[') + 1:] + paper_url = paper_info[1][:len(paper_info[1]) - 1] + + # get code info + if 'Code Snippet' in line: + code_url = line.split('"')[1].split('"')[0] + + # get backbone and dataset flags from the README marker comments + if line.startswith('<!-- [BACKBONE] -->'): + is_backbone = True + + if '<!-- [DATASET] -->' in line: + is_dataset = True + return None, None + + # get dataset names + if line.startswith('###'): + current_dataset = line.split('###')[1].strip() + datasets.append(current_dataset) + + # get model info key id + if (line[0] == '|' and (i + 1) < len(lines) + and lines[i + 1][:3] == '| -' and 'Method' in line + and 'Crop Size' in line and 'Mem (GB)' in line): + keys: List[str] = [key.strip() for key in line.split('|')] + crop_size_idx: int = keys.index('Crop Size') + mem_idx: int = keys.index('Mem (GB)') + assert 'Device' in keys, f'No Device in {md_file}' + device_idx: int = keys.index('Device') + + if 'mIoU' in keys: + ss_idx = keys.index('mIoU') + elif 'mDice' in keys: + ss_idx = keys.index('mDice') + else: + raise ValueError(f'No mIoU or mDice in {md_file}') + if 'mIoU(ms+flip)' in keys: + ms_idx = keys.index('mIoU(ms+flip)') + elif 'Dice' in keys: + ms_idx = keys.index('Dice') + else: + ms_idx = -1 + config_idx = keys.index('config') + download_idx = keys.index('download') + j: int = i + 2 + while j < len(lines) and lines[j][0] == '|': + values = [value.strip() for value in lines[j].split('|')] + # get config name + try: + config_url = re.findall(r'[a-zA-Z]+://[^\s]*py', + values[config_idx])[0] + config_name = config_url.split('/')[-1] + model_name = config_name.replace('.py', '') + except IndexError: + raise ValueError( + f'config url is not found in {md_file}') + + # get model name + try: + weight_url = re.findall(r'[a-zA-Z]+://[^\s]*pth', + values[download_idx])[0] + log_url = re.findall(r'[a-zA-Z]+://[^\s]*.json', + values[download_idx + 1])[0] + except IndexError: + raise ValueError( + f'url is not found in {values[download_idx]}') + + # get batch size + bs = re.findall(r'[0-9]*xb[0-9]*',
config_name)[0].split('xb') + batch_size = int(bs[0]) * int(bs[1]) + + # get crop size + crop_size = values[crop_size_idx].split('x') + crop_size = [int(crop_size[0]), int(crop_size[1])] + + mem = values[mem_idx].split('\\')[0] if values[ + mem_idx] != '-' and values[mem_idx] != '' else -1 + + method = values[keys.index('Method')].strip() + # method = [method.strip()] if '+' not in method else [ + # m.strip() for m in method.split('+') + # ] + # split method name: + if ' + ' in method: + method = [m.strip() for m in method.split(' + ')] + elif ' ' in method: + method = [m for m in method.split(' ')] + else: + method = [method] + backbone: str = re.findall( + r'[^\s]*', values[keys.index('Backbone')].strip())[0] + archs = [backbone] + method + collection_name = method[0] + config_path = osp.join('configs', + config_dir.split('/')[-1], + config_name) + model = { + 'Name': model_name, + 'In Collection': collection_name, + 'Results': { + 'Task': 'Semantic Segmentation', + 'Dataset': current_dataset, + 'Metrics': { + keys[ss_idx]: float(values[ss_idx]) + } + }, + 'Config': config_path, + 'Metadata': { + 'Training Data': + current_dataset, + 'Batch Size': + batch_size, + 'Architecture': + archs, + 'Training Resources': + f'{bs[0]}x {values[device_idx]} GPUS', + }, + 'Weights': weight_url, + 'Training log': log_url, + 'Paper': { + 'Title': paper_name, + 'URL': paper_url + }, + 'Code': code_url, + 'Framework': 'PyTorch' + } + if ms_idx != -1 and values[ms_idx] != '-' and values[ + ms_idx] != '': + model['Results']['Metrics'].update( + {keys[ms_idx]: float(values[ms_idx])}) + if mem != -1: + model['Metadata']['Memory (GB)'] = float(mem) + models.append(model) + j += 1 + i = j + i += 1 + + if not (is_dataset + or is_backbone) or collection_name not in collection_name_list: + collection = { + 'Name': collection_name, + 'License': 'Apache License 2.0', + 'Metadata': { + 'Training Data': datasets + }, + 'Paper': { + 'Title': paper_name, + 'URL': paper_url, + }, + 'README': osp.join('configs', + config_dir.split('/')[-1], 'README.md'), + 'Frameworks': ['PyTorch'], + } + results = { + 'Collections': [collection], + 'Models': models + }, collection_name + else: + results = {'Models': models}, '' + + return results + + +def dump_yaml_and_check_difference(model_info: dict, filename: str) -> bool: + """Dump yaml file and check difference with the original file. + + Args: + model_info (dict): model info dict. + filename (str): filename to save.
+ + Returns: + bool: If the target YAML file is different from the original. + """ + str_dump = yaml.dump(model_info, sort_keys=False) + if osp.isfile(filename): + file_exist = True + with open(filename, encoding='utf-8') as f: + str_orig = f.read() + else: + str_orig = None + file_exist = False + + if file_exist and str_orig == str_dump: + is_different = False + else: + is_different = True + with open(filename, 'w', encoding='utf-8') as f: + f.write(str_dump) + + return is_different + + +def update_model_index(config_dir_list: List[str]) -> bool: + """Update model index.""" + yml_files = [ + osp.join('configs', + dir_name.split('/')[-1], 'metafile.yaml') + for dir_name in config_dir_list + ] + yml_files.sort() + + model_index = { + 'Import': [ + osp.relpath(yml_file, MMSEG_ROOT).replace('\\', '/') + for yml_file in yml_files + ] + } + model_index_file = osp.join(MMSEG_ROOT, 'model-index.yml') + return dump_yaml_and_check_difference(model_index, model_index_file) + + +if __name__ == '__main__': + # get md file list + md_file_list, config_dir_list = get_md_file_list() + file_modified = False + collection_name_list: List[str] = get_collection_name_list(md_file_list) + # hard code to add 'FPN' + collection_name_list.append('FPN') + remove_config_dir_list = [] + # parse md file + for md_file, config_dir in zip(md_file_list, config_dir_list): + results, collection_name = get_model_info(md_file, config_dir, + collection_name_list) + if results is None: + remove_config_dir_list.append(config_dir) + continue + filename = osp.join(config_dir, 'metafile.yaml') + file_modified |= dump_yaml_and_check_difference(results, filename) + if collection_name != '': + collection_name_list.append(collection_name) + # remove config dir + for config_dir in remove_config_dir_list: + config_dir_list.remove(config_dir) + file_modified |= update_model_index(config_dir_list) + sys.exit(1 if file_modified else 0) diff --git a/.dev/upload_modelzoo.py b/.dev_scripts/upload_modelzoo.py similarity index 100% rename from .dev/upload_modelzoo.py rename to .dev_scripts/upload_modelzoo.py diff --git a/.github/CONTRIBUTING.md b/.github/CONTRIBUTING.md index 9a7a7ea57b..8865fa84cd 100644 --- a/.github/CONTRIBUTING.md +++ b/.github/CONTRIBUTING.md @@ -1,23 +1,21 @@ -# Contributing to mmsegmentation +# Contributing to MMSegmentation 1.x All kinds of contributions are welcome, including but not limited to the following. -- Fixes (typo, bugs) -- New features and components +- Fix typos or bugs +- Add documentation or translate the documentation into other languages +- Add new features and components ## Workflow -1. fork and pull the latest mmsegmentation -2. checkout a new branch (do not use master branch for PRs) +1. fork and pull the latest MMSegmentation repository +2. checkout a new branch from 'dev-1.x' (do not use master branch for PRs) 3. commit your changes 4. create a PR -:::{note} - -- If you plan to add some new features that involve large changes, it is encouraged to open an issue for discussion first. - If you are the author of some papers and would like to include your method to mmsegmentation, - please contact Kai Chen (chenkaidev\[at\]gmail\[dot\]com). We will much appreciate your contribution. - ::: +```{note} +If you plan to add some new features that involve large changes, it is encouraged to open an issue for discussion first.
+``` ## Code style @@ -27,15 +25,18 @@ We adopt [PEP8](https://www.python.org/dev/peps/pep-0008/) as the preferred code We use the following tools for linting and formatting: -- [flake8](http://flake8.pycqa.org/en/latest/): linter -- [yapf](https://github.com/google/yapf): formatter -- [isort](https://github.com/timothycrosley/isort): sort imports +- [flake8](https://github.com/PyCQA/flake8): A wrapper around some linter tools. +- [isort](https://github.com/timothycrosley/isort): A Python utility to sort imports. +- [yapf](https://github.com/google/yapf): A formatter for Python files. +- [codespell](https://github.com/codespell-project/codespell): A Python utility to fix common misspellings in text files. +- [mdformat](https://github.com/executablebooks/mdformat): Mdformat is an opinionated Markdown formatter that can be used to enforce a consistent style in Markdown files. +- [docformatter](https://github.com/myint/docformatter): A formatter to format docstrings. -Style configurations of yapf and isort can be found in [setup.cfg](../setup.cfg) and [.isort.cfg](../.isort.cfg). +Style configurations of yapf and isort can be found in [setup.cfg](./setup.cfg). -We use [pre-commit hook](https://pre-commit.com/) that checks and formats for `flake8`, `yapf`, `isort`, `trailing whitespaces`, -fixes `end-of-files`, sorts `requirments.txt` automatically on every commit. -The config for a pre-commit hook is stored in [.pre-commit-config](../.pre-commit-config.yaml). +We use [pre-commit hook](https://pre-commit.com/) that checks and formats for `flake8`, `yapf`, `isort`, `trailing whitespaces`, `markdown files`, +fixes `end-of-files`, `double-quoted-strings`, `python-encoding-pragma`, `mixed-line-ending`, and sorts `requirements.txt` automatically on every commit. +The config for a pre-commit hook is stored in [.pre-commit-config](./.pre-commit-config.yaml). After you clone the repository, you will need to install and initialize the pre-commit hook.
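For reference, the pre-commit setup described above comes down to a few commands. A minimal sketch, assuming a fresh clone and a Python environment on PATH (the commands mirror the CI lint job earlier in this patch):

```bash
# install the pre-commit framework and register the git hook in this clone
pip install -U pre-commit
pre-commit install

# run every configured check over the whole tree, as the CI lint job does
pre-commit run --all-files
```

After `pre-commit install`, the same checks run automatically on the staged files of every `git commit`.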
diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml deleted file mode 100644 index 640dc4d0ad..0000000000 --- a/.github/workflows/build.yml +++ /dev/null @@ -1,260 +0,0 @@ -name: build - -on: - push: - paths-ignore: - - 'demo/**' - - '.dev/**' - - 'docker/**' - - 'tools/**' - - '**.md' - - pull_request: - paths-ignore: - - 'demo/**' - - '.dev/**' - - 'docker/**' - - 'tools/**' - - 'docs/**' - - '**.md' - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - build_cpu: - runs-on: ubuntu-18.04 - strategy: - matrix: - python-version: [3.7] - torch: [1.5.1, 1.6.0, 1.7.0, 1.8.0, 1.9.0] - include: - - torch: 1.5.1 - torch_version: torch1.5 - torchvision: 0.6.1 - - torch: 1.6.0 - torch_version: torch1.6 - torchvision: 0.7.0 - - torch: 1.7.0 - torch_version: torch1.7 - torchvision: 0.8.1 - - torch: 1.8.0 - torch_version: torch1.8 - torchvision: 0.9.0 - - torch: 1.9.0 - torch_version: torch1.9 - torchvision: 0.10.0 - steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Upgrade pip - run: pip install pip --upgrade - - name: Install Pillow - run: pip install Pillow==6.2.2 - if: ${{matrix.torchvision == '0.4.2'}} - - name: Install PyTorch - run: pip install torch==${{matrix.torch}}+cpu torchvision==${{matrix.torchvision}}+cpu -f https://download.pytorch.org/whl/torch_stable.html - - name: Install MMCV - run: | - pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cpu/${{matrix.torch_version}}/index.html - python -c 'import mmcv; print(mmcv.__version__)' - - name: Install unittest dependencies - run: | - pip install -r requirements.txt - - name: Build and install - run: rm -rf .eggs && pip install -e . 
- - name: Run unittests and generate coverage report - run: | - pip install timm - coverage run --branch --source mmseg -m pytest tests/ - coverage xml - coverage report -m - if: ${{matrix.torch >= '1.5.0'}} - - name: Skip timm unittests and generate coverage report - run: | - coverage run --branch --source mmseg -m pytest tests/ --ignore tests/test_models/test_backbones/test_timm_backbone.py - coverage xml - coverage report -m - if: ${{matrix.torch < '1.5.0'}} - - build_cuda101: - runs-on: ubuntu-18.04 - container: - image: pytorch/pytorch:1.6.0-cuda10.1-cudnn7-devel - - strategy: - matrix: - python-version: [3.7] - torch: - [ - 1.5.1+cu101, - 1.6.0+cu101, - 1.7.0+cu101, - 1.8.0+cu101 - ] - include: - - torch: 1.5.1+cu101 - torch_version: torch1.5 - torchvision: 0.6.1+cu101 - - torch: 1.6.0+cu101 - torch_version: torch1.6 - torchvision: 0.7.0+cu101 - - torch: 1.7.0+cu101 - torch_version: torch1.7 - torchvision: 0.8.1+cu101 - - torch: 1.8.0+cu101 - torch_version: torch1.8 - torchvision: 0.9.0+cu101 - - steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Fetch GPG keys - run: | - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub - - name: Install system dependencies - run: | - apt-get update && apt-get install -y libgl1-mesa-glx ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 python${{matrix.python-version}}-dev - apt-get clean - rm -rf /var/lib/apt/lists/* - - name: Install Pillow - run: python -m pip install Pillow==6.2.2 - if: ${{matrix.torchvision < 0.5}} - - name: Install PyTorch - run: python -m pip install torch==${{matrix.torch}} torchvision==${{matrix.torchvision}} -f https://download.pytorch.org/whl/torch_stable.html - - name: Install mmseg dependencies - run: | - python -V - python -m pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu101/${{matrix.torch_version}}/index.html - python -m pip install -r requirements.txt - python -c 'import mmcv; print(mmcv.__version__)' - - name: Build and install - run: | - rm -rf .eggs - python setup.py check -m -s - TORCH_CUDA_ARCH_LIST=7.0 pip install . 
- - name: Run unittests and generate coverage report - run: | - python -m pip install timm - coverage run --branch --source mmseg -m pytest tests/ - coverage xml - coverage report -m - if: ${{matrix.torch >= '1.5.0'}} - - name: Skip timm unittests and generate coverage report - run: | - coverage run --branch --source mmseg -m pytest tests/ --ignore tests/test_models/test_backbones/test_timm_backbone.py - coverage xml - coverage report -m - if: ${{matrix.torch < '1.5.0'}} - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v1.0.10 - with: - file: ./coverage.xml - flags: unittests - env_vars: OS,PYTHON - name: codecov-umbrella - fail_ci_if_error: false - - build_cuda102: - runs-on: ubuntu-18.04 - container: - image: pytorch/pytorch:1.9.0-cuda10.2-cudnn7-devel - - strategy: - matrix: - python-version: [3.6, 3.7, 3.8, 3.9] - torch: [1.9.0+cu102] - include: - - torch: 1.9.0+cu102 - torch_version: torch1.9 - torchvision: 0.10.0+cu102 - - steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python-version }} - - name: Fetch GPG keys - run: | - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub - - name: Install system dependencies - run: | - apt-get update && apt-get install -y libgl1-mesa-glx ffmpeg libsm6 libxext6 git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 - apt-get clean - rm -rf /var/lib/apt/lists/* - - name: Install Pillow - run: python -m pip install Pillow==6.2.2 - if: ${{matrix.torchvision < 0.5}} - - name: Install PyTorch - run: python -m pip install torch==${{matrix.torch}} torchvision==${{matrix.torchvision}} -f https://download.pytorch.org/whl/torch_stable.html - - name: Install mmseg dependencies - run: | - python -V - python -m pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu102/${{matrix.torch_version}}/index.html - python -m pip install -r requirements.txt - python -c 'import mmcv; print(mmcv.__version__)' - - name: Build and install - run: | - rm -rf .eggs - python setup.py check -m -s - TORCH_CUDA_ARCH_LIST=7.0 pip install . 
- - name: Run unittests and generate coverage report - run: | - python -m pip install timm - coverage run --branch --source mmseg -m pytest tests/ - coverage xml - coverage report -m - - name: Upload coverage to Codecov - uses: codecov/codecov-action@v2 - with: - files: ./coverage.xml - flags: unittests - env_vars: OS,PYTHON - name: codecov-umbrella - fail_ci_if_error: false - - test_windows: - runs-on: ${{ matrix.os }} - strategy: - matrix: - os: [windows-2022] - python: [3.8] - platform: [cpu, cu111] - steps: - - uses: actions/checkout@v2 - - name: Set up Python ${{ matrix.python }} - uses: actions/setup-python@v2 - with: - python-version: ${{ matrix.python }} - - name: Upgrade pip - run: pip install pip --upgrade --user - - name: Install OpenCV - run: pip install opencv-python>=3 - - name: Install PyTorch - # As a complement to Linux CI, we test on PyTorch LTS version - run: pip install torch==1.8.2+${{ matrix.platform }} torchvision==0.9.2+${{ matrix.platform }} -f https://download.pytorch.org/whl/lts/1.8/torch_lts.html - - name: Install MMCV - run: | - pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cpu/torch1.8/index.html --only-binary mmcv-full - - name: Install unittest dependencies - run: pip install -r requirements/tests.txt -r requirements/optional.txt - - name: Build and install - run: pip install -e . - - name: Run unittests - run: | - python -m pip install timm - coverage run --branch --source mmseg -m pytest tests/ - - name: Generate coverage report - run: | - coverage xml - coverage report -m diff --git a/.github/workflows/deploy.yml b/.github/workflows/deploy.yml index ab64085cba..0e0c6c992e 100644 --- a/.github/workflows/deploy.yml +++ b/.github/workflows/deploy.yml @@ -8,12 +8,12 @@ concurrency: jobs: build-n-publish: - runs-on: ubuntu-latest + runs-on: ubuntu-22.04 if: startsWith(github.event.ref, 'refs/tags') steps: - - uses: actions/checkout@v2 + - uses: actions/checkout@v3 - name: Set up Python 3.7 - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 with: python-version: 3.7 - name: Build MMSegmentation diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml deleted file mode 100644 index 7f7a309280..0000000000 --- a/.github/workflows/lint.yml +++ /dev/null @@ -1,31 +0,0 @@ -name: lint - -on: [push, pull_request] - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - lint: - runs-on: ubuntu-18.04 - steps: - - uses: actions/checkout@v2 - - name: Set up Python 3.7 - uses: actions/setup-python@v2 - with: - python-version: 3.7 - - name: Install pre-commit hook - run: | - pip install pre-commit - pre-commit install - - name: Linting - run: | - sudo apt-add-repository ppa:brightbox/ruby-ng -y - sudo apt-get update - sudo apt-get install -y ruby2.7 - pre-commit run --all-files - - name: Check docstring coverage - run: | - pip install interrogate - interrogate -v --ignore-init-method --ignore-module --ignore-nested-functions --exclude mmseg/ops --ignore-regex "__repr__" --fail-under 80 mmseg diff --git a/.gitignore b/.gitignore index f5841a1be0..787d13ec67 100644 --- a/.gitignore +++ b/.gitignore @@ -105,6 +105,7 @@ venv.bak/ # mypy .mypy_cache/ +data .vscode .idea diff --git a/.owners.yml b/.owners.yml index b850b09507..20f2070d40 100644 --- a/.owners.yml +++ b/.owners.yml @@ -1,10 +1,7 @@ assign: strategy: # random - # daily-shift-based - round-robin + # round-robin + daily-shift-based assignees: - - MeowZheng - - MengzhangLI - - linfangjian01 - - xiaoachen98 + - xiexinch diff --git 
a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 884f5cd1e9..aa5942748a 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,18 +1,18 @@ repos: - - repo: https://gitlab.com/pycqa/flake8.git - rev: 3.8.3 + - repo: https://github.com/PyCQA/flake8 + rev: 5.0.4 hooks: - id: flake8 - - repo: https://github.com/PyCQA/isort - rev: 5.10.1 + - repo: https://github.com/zhouzaida/isort + rev: 5.12.1 hooks: - id: isort - repo: https://github.com/pre-commit/mirrors-yapf - rev: v0.30.0 + rev: v0.32.0 hooks: - id: yapf - repo: https://github.com/pre-commit/pre-commit-hooks - rev: v3.1.0 + rev: v4.3.0 hooks: - id: trailing-whitespace - id: check-yaml @@ -34,7 +34,7 @@ repos: - mdformat_frontmatter - linkify-it-py - repo: https://github.com/codespell-project/codespell - rev: v2.1.0 + rev: v2.2.1 hooks: - id: codespell - repo: https://github.com/myint/docformatter @@ -42,16 +42,22 @@ repos: hooks: - id: docformatter args: ["--in-place", "--wrap-descriptions", "79"] - - repo: local - hooks: - - id: update-model-index - name: update-model-index - description: Collect model information and update model-index.yml - entry: .dev/md2yml.py - additional_dependencies: [mmcv, lxml, opencv-python] - language: python - files: ^configs/.*\.md$ - require_serial: true + # temporarily remove update-model-index to avoid conflict raised + # by depth estimator models + # - repo: local + # hooks: + # - id: update-model-index + # name: update-model-index + # description: Collect model information and update model-index.yml + # entry: .dev_scripts/update_model_index.py + # additional_dependencies: [pyyaml] + # language: python + # require_serial: true + - repo: https://github.com/asottile/pyupgrade + rev: v3.0.0 + hooks: + - id: pyupgrade + args: ["--py36-plus"] - repo: https://github.com/open-mmlab/pre-commit-hooks rev: v0.2.0 # Use the rev to fix revision hooks: diff --git a/.readthedocs.yml b/.readthedocs.yml index 6cfbf5d310..65a65ba1ba 100644 --- a/.readthedocs.yml +++ b/.readthedocs.yml @@ -1,9 +1,14 @@ version: 2 -formats: all +build: + os: ubuntu-22.04 + tools: + python: "3.7" + +formats: + - epub python: - version: 3.7 install: - requirements: requirements/docs.txt - requirements: requirements/readthedocs.txt diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000000..4feb387c61 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,35 @@ +ARG PYTORCH="1.11.0" +ARG CUDA="11.3" +ARG CUDNN="8" +ARG MMCV="2.0.1" + +FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel + +ENV TORCH_CUDA_ARCH_LIST="6.0 6.1 7.0+PTX" +ENV TORCH_NVCC_FLAGS="-Xfatbin -compress-all" +ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" + +# To fix GPG key error when running apt-get update +RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub +RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub + +RUN apt-get update && apt-get install -y git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libgl1-mesa-dev \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* + +RUN conda clean --all + +# Install MMCV +ARG PYTORCH +ARG CUDA +ARG MMCV +RUN ["/bin/bash", "-c", "pip install openmim"] +RUN ["/bin/bash", "-c", "mim install mmengine"] +RUN ["/bin/bash", "-c", "mim install mmcv==${MMCV}"] + +# Install MMSegmentation +RUN git clone -b main https://github.com/zivdar001matin/zero_mould_mmsegmentation.git /mmsegmentation +WORKDIR /mmsegmentation +ENV FORCE_CUDA="1" +RUN 
pip install -r requirements.txt +RUN pip install --no-cache-dir -e . \ No newline at end of file diff --git a/MANIFEST.in b/MANIFEST.in index e307d81817..94a0fc1c02 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,4 +1,5 @@ include requirements/*.txt include mmseg/.mim/model-index.yml -recursive-include mmseg/.mim/configs *.py *.yml +include mmseg/utils/bpe_simple_vocab_16e6.txt.gz +recursive-include mmseg/.mim/configs *.py *.yaml recursive-include mmseg/.mim/tools *.py *.sh diff --git a/README.md b/README.md index ba1d3a4445..5d63f49279 100644 --- a/README.md +++ b/README.md @@ -17,28 +17,55 @@
 
- -
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mmsegmentation)](https://pypi.org/project/mmsegmentation/) [![PyPI](https://img.shields.io/pypi/v/mmsegmentation)](https://pypi.org/project/mmsegmentation) [![docs](https://img.shields.io/badge/docs-latest-blue)](https://mmsegmentation.readthedocs.io/en/latest/) [![badge](https://github.com/open-mmlab/mmsegmentation/workflows/build/badge.svg)](https://github.com/open-mmlab/mmsegmentation/actions) [![codecov](https://codecov.io/gh/open-mmlab/mmsegmentation/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmsegmentation) -[![license](https://img.shields.io/github/license/open-mmlab/mmsegmentation.svg)](https://github.com/open-mmlab/mmsegmentation/blob/master/LICENSE) +[![license](https://img.shields.io/github/license/open-mmlab/mmsegmentation.svg)](https://github.com/open-mmlab/mmsegmentation/blob/main/LICENSE) [![issue resolution](https://isitmaintained.com/badge/resolution/open-mmlab/mmsegmentation.svg)](https://github.com/open-mmlab/mmsegmentation/issues) [![open issues](https://isitmaintained.com/badge/open/open-mmlab/mmsegmentation.svg)](https://github.com/open-mmlab/mmsegmentation/issues) +[![Open in OpenXLab](https://cdn-static.openxlab.org.cn/app-center/openxlab_demo.svg)](https://openxlab.org.cn/apps?search=mmseg) -Documentation: https://mmsegmentation.readthedocs.io/ +Documentation: English | [简体中文](README_zh-CN.md) + + +
+ ## Introduction MMSegmentation is an open source semantic segmentation toolbox based on PyTorch. It is a part of the OpenMMLab project. -The master branch works with **PyTorch 1.5+**. +The [main](https://github.com/open-mmlab/mmsegmentation/tree/main) branch works with PyTorch 1.6+. + +### 🎉 Introducing MMSegmentation v1.0.0 🎉 + +We are thrilled to announce the official release of MMSegmentation's latest version! For this new release, the [main](https://github.com/open-mmlab/mmsegmentation/tree/main) branch serves as the primary branch, while the development branch is [dev-1.x](https://github.com/open-mmlab/mmsegmentation/tree/dev-1.x). The stable branch for the previous release remains as the [0.x](https://github.com/open-mmlab/mmsegmentation/tree/0.x) branch. Please note that the [master](https://github.com/open-mmlab/mmsegmentation/tree/master) branch will only be maintained for a limited time before being removed. We encourage you to be mindful of branch selection and updates during use. Thank you for your unwavering support and enthusiasm, and let's work together to make MMSegmentation even more robust and powerful! 💪 + +MMSegmentation v1.x brings remarkable improvements over the 0.x release, offering a more flexible and feature-packed experience. To utilize the new features in v1.x, we kindly invite you to consult our detailed [📚 migration guide](https://mmsegmentation.readthedocs.io/en/latest/migration/interface.html), which will help you seamlessly transition your projects. Your support is invaluable, and we eagerly await your feedback! ![demo image](resources/seg_demo.gif) @@ -60,20 +87,96 @@ The master branch works with **PyTorch 1.5+**. The training speed is faster than or comparable to other codebases. -## License +## What's New -This project is released under the [Apache 2.0 license](LICENSE). +v1.2.0 was released on 10/12/2023. From v1.1.0 to v1.2.0, we have added or updated the following features: + +### Highlights + +- Support for the open-vocabulary semantic segmentation algorithm [SAN](configs/san/README.md) + +- Support for the monocular depth estimation task; please refer to [VPD](configs/vpd/README.md) and [Adabins](projects/Adabins/README.md) for more details. + + ![depth estimation](https://github.com/open-mmlab/mmsegmentation/assets/15952744/07afd0e9-8ace-4a00-aa1e-5bf0ca92dcbc) + +- Add new projects: the open-vocabulary semantic segmentation algorithm [CAT-Seg](projects/CAT-Seg/README.md) and the real-time semantic segmentation algorithm [PP-MobileSeg](projects/pp_mobileseg/README.md) + +## Installation + +Please refer to [get_started.md](docs/en/get_started.md#installation) for installation and [dataset_prepare.md](docs/en/user_guides/2_dataset_prepare.md#prepare-datasets) for dataset preparation. + +## Get Started + +Please see [Overview](docs/en/overview.md) for the general introduction of MMSegmentation. + +Please see [user guides](https://mmsegmentation.readthedocs.io/en/latest/user_guides/index.html#) for the basic usage of MMSegmentation. +There are also [advanced tutorials](https://mmsegmentation.readthedocs.io/en/latest/advanced_guides/index.html) for an in-depth understanding of mmseg design and implementation. + +A Colab tutorial is also provided. You may preview the notebook [here](demo/MMSegmentation_Tutorial.ipynb) or directly [run](https://colab.research.google.com/github/open-mmlab/mmsegmentation/blob/main/demo/MMSegmentation_Tutorial.ipynb) on Colab. + +To migrate from MMSegmentation 0.x, please refer to [migration](docs/en/migration). + +## Tutorial + +
+Get Started + +- [MMSeg overview](docs/en/overview.md) +- [MMSeg Installation](docs/en/get_started.md) +- [FAQ](docs/en/notes/faq.md) + +
+ +
+MMSeg Basic Tutorial + +- [Tutorial 1: Learn about Configs](docs/en/user_guides/1_config.md) +- [Tutorial 2: Prepare datasets](docs/en/user_guides/2_dataset_prepare.md) +- [Tutorial 3: Inference with existing models](docs/en/user_guides/3_inference.md) +- [Tutorial 4: Train and test with existing models](docs/en/user_guides/4_train_test.md) +- [Tutorial 5: Model deployment](docs/en/user_guides/5_deployment.md) +- [Deploy mmsegmentation on Jetson platform](docs/zh_cn/user_guides/deploy_jetson.md) +- [Useful Tools](docs/en/user_guides/useful_tools.md) +- [Feature Map Visualization](docs/en/user_guides/visualization_feature_map.md) +- [Visualization](docs/en/user_guides/visualization.md) + +
+ +
+MMSeg Detail Tutorial -## Changelog +- [MMSeg Dataset](docs/en/advanced_guides/datasets.md) +- [MMSeg Models](docs/en/advanced_guides/models.md) +- [MMSeg Dataset Structures](docs/en/advanced_guides/structures.md) +- [MMSeg Data Transforms](docs/en/advanced_guides/transforms.md) +- [MMSeg Dataflow](docs/en/advanced_guides/data_flow.md) +- [MMSeg Training Engine](docs/en/advanced_guides/engine.md) +- [MMSeg Evaluation](docs/en/advanced_guides/evaluation.md) -v0.24.1 was released in 5/1/2022. -Please refer to [changelog.md](docs/en/changelog.md) for details and release history. +
+ +
+MMSeg Development Tutorial + +- [Add New Datasets](docs/en/advanced_guides/add_datasets.md) +- [Add New Metrics](docs/en/advanced_guides/add_metrics.md) +- [Add New Modules](docs/en/advanced_guides/add_models.md) +- [Add New Data Transforms](docs/en/advanced_guides/add_transforms.md) +- [Customize Runtime Settings](docs/en/advanced_guides/customize_runtime.md) +- [Training Tricks](docs/en/advanced_guides/training_tricks.md) +- [Contribute code to MMSeg](.github/CONTRIBUTING.md) +- [Contribute a standard dataset in projects](docs/zh_cn/advanced_guides/contribute_dataset.md) +- [NPU (HUAWEI Ascend)](docs/en/device/npu.md) +- [0.x → 1.x migration](docs/en/migration/interface.md), [0.x → 1.x package](docs/en/migration/package.md) + +
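For a quick taste of the 1.x Python API that the tutorials above document, here is a minimal inference sketch. The config and checkpoint paths are illustrative placeholders, and the `mmseg.apis` signatures should be double-checked against the installed version:

```python
# Minimal single-image inference with the MMSegmentation 1.x API.
from mmseg.apis import inference_model, init_model, show_result_pyplot

# Illustrative paths; any config/checkpoint pair from the model zoo works.
config_file = 'configs/pspnet/pspnet_r50-d8_4xb2-40k_cityscapes-512x1024.py'
checkpoint_file = 'pspnet_r50-d8_512x1024_40k_cityscapes.pth'  # assumed local file

model = init_model(config_file, checkpoint_file, device='cuda:0')
result = inference_model(model, 'demo/demo.png')  # returns a SegDataSample
show_result_pyplot(model, 'demo/demo.png', result, show=False,
                   out_file='work_dirs/demo_pred.png')
```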
## Benchmark and model zoo Results and models are available in the [model zoo](docs/en/model_zoo.md). -Supported backbones: +
+Supported backbones: - [x] ResNet (CVPR'2016) - [x] ResNeXt (CVPR'2017) @@ -87,76 +190,99 @@ Supported backbones: - [x] [BEiT (ICLR'2022)](configs/beit) - [x] [ConvNeXt (CVPR'2022)](configs/convnext) - [x] [MAE (CVPR'2022)](configs/mae) +- [x] [PoolFormer (CVPR'2022)](configs/poolformer) +- [x] [SegNeXt (NeurIPS'2022)](configs/segnext) -Supported methods: +
-- [x] [FCN (CVPR'2015/TPAMI'2017)](configs/fcn) -- [x] [ERFNet (T-ITS'2017)](configs/erfnet) -- [x] [UNet (MICCAI'2016/Nat. Methods'2019)](configs/unet) -- [x] [PSPNet (CVPR'2017)](configs/pspnet) -- [x] [DeepLabV3 (ArXiv'2017)](configs/deeplabv3) -- [x] [BiSeNetV1 (ECCV'2018)](configs/bisenetv1) -- [x] [PSANet (ECCV'2018)](configs/psanet) -- [x] [DeepLabV3+ (CVPR'2018)](configs/deeplabv3plus) -- [x] [UPerNet (ECCV'2018)](configs/upernet) -- [x] [ICNet (ECCV'2018)](configs/icnet) -- [x] [NonLocal Net (CVPR'2018)](configs/nonlocal_net) -- [x] [EncNet (CVPR'2018)](configs/encnet) -- [x] [Semantic FPN (CVPR'2019)](configs/sem_fpn) -- [x] [DANet (CVPR'2019)](configs/danet) -- [x] [APCNet (CVPR'2019)](configs/apcnet) +
+Supported methods: + +- [x] [SAN (CVPR'2023)](configs/san/) +- [x] [VPD (ICCV'2023)](configs/vpd) +- [x] [DDRNet (T-ITS'2022)](configs/ddrnet) +- [x] [PIDNet (ArXiv'2022)](configs/pidnet) +- [x] [Mask2Former (CVPR'2022)](configs/mask2former) +- [x] [MaskFormer (NeurIPS'2021)](configs/maskformer) +- [x] [K-Net (NeurIPS'2021)](configs/knet) +- [x] [SegFormer (NeurIPS'2021)](configs/segformer) +- [x] [Segmenter (ICCV'2021)](configs/segmenter) +- [x] [DPT (ArXiv'2021)](configs/dpt) +- [x] [SETR (CVPR'2021)](configs/setr) +- [x] [STDC (CVPR'2021)](configs/stdc) +- [x] [BiSeNetV2 (IJCV'2021)](configs/bisenetv2) +- [x] [CGNet (TIP'2020)](configs/cgnet) +- [x] [PointRend (CVPR'2020)](configs/point_rend) +- [x] [DNLNet (ECCV'2020)](configs/dnlnet) +- [x] [OCRNet (ECCV'2020)](configs/ocrnet) +- [x] [ISANet (ArXiv'2019/IJCV'2021)](configs/isanet) +- [x] [Fast-SCNN (ArXiv'2019)](configs/fastscnn) +- [x] [FastFCN (ArXiv'2019)](configs/fastfcn) +- [x] [GCNet (ICCVW'2019/TPAMI'2020)](configs/gcnet) +- [x] [ANN (ICCV'2019)](configs/ann) - [x] [EMANet (ICCV'2019)](configs/emanet) - [x] [CCNet (ICCV'2019)](configs/ccnet) - [x] [DMNet (ICCV'2019)](configs/dmnet) -- [x] [ANN (ICCV'2019)](configs/ann) -- [x] [GCNet (ICCVW'2019/TPAMI'2020)](configs/gcnet) -- [x] [FastFCN (ArXiv'2019)](configs/fastfcn) -- [x] [Fast-SCNN (ArXiv'2019)](configs/fastscnn) -- [x] [ISANet (ArXiv'2019/IJCV'2021)](configs/isanet) -- [x] [OCRNet (ECCV'2020)](configs/ocrnet) -- [x] [DNLNet (ECCV'2020)](configs/dnlnet) -- [x] [PointRend (CVPR'2020)](configs/point_rend) -- [x] [CGNet (TIP'2020)](configs/cgnet) -- [x] [BiSeNetV2 (IJCV'2021)](configs/bisenetv2) -- [x] [STDC (CVPR'2021)](configs/stdc) -- [x] [SETR (CVPR'2021)](configs/setr) -- [x] [DPT (ArXiv'2021)](configs/dpt) -- [x] [Segmenter (ICCV'2021)](configs/segmenter) -- [x] [SegFormer (NeurIPS'2021)](configs/segformer) -- [x] [K-Net (NeurIPS'2021)](configs/knet) +- [x] [Semantic FPN (CVPR'2019)](configs/sem_fpn) +- [x] [DANet (CVPR'2019)](configs/danet) +- [x] [APCNet (CVPR'2019)](configs/apcnet) +- [x] [NonLocal Net (CVPR'2018)](configs/nonlocal_net) +- [x] [EncNet (CVPR'2018)](configs/encnet) +- [x] [DeepLabV3+ (CVPR'2018)](configs/deeplabv3plus) +- [x] [UPerNet (ECCV'2018)](configs/upernet) +- [x] [ICNet (ECCV'2018)](configs/icnet) +- [x] [PSANet (ECCV'2018)](configs/psanet) +- [x] [BiSeNetV1 (ECCV'2018)](configs/bisenetv1) +- [x] [DeepLabV3 (ArXiv'2017)](configs/deeplabv3) +- [x] [PSPNet (CVPR'2017)](configs/pspnet) +- [x] [ERFNet (T-ITS'2017)](configs/erfnet) +- [x] [UNet (MICCAI'2016/Nat. 
Methods'2019)](configs/unet) +- [x] [FCN (CVPR'2015/TPAMI'2017)](configs/fcn) -Supported datasets: - -- [x] [Cityscapes](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/en/dataset_prepare.md#cityscapes) -- [x] [PASCAL VOC](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/en/dataset_prepare.md#pascal-voc) -- [x] [ADE20K](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/en/dataset_prepare.md#ade20k) -- [x] [Pascal Context](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/en/dataset_prepare.md#pascal-context) -- [x] [COCO-Stuff 10k](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/en/dataset_prepare.md#coco-stuff-10k) -- [x] [COCO-Stuff 164k](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/en/dataset_prepare.md#coco-stuff-164k) -- [x] [CHASE_DB1](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/en/dataset_prepare.md#chase-db1) -- [x] [DRIVE](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/en/dataset_prepare.md#drive) -- [x] [HRF](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/en/dataset_prepare.md#hrf) -- [x] [STARE](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/en/dataset_prepare.md#stare) -- [x] [Dark Zurich](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/en/dataset_prepare.md#dark-zurich) -- [x] [Nighttime Driving](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/en/dataset_prepare.md#nighttime-driving) -- [x] [LoveDA](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/en/dataset_prepare.md#loveda) -- [x] [Potsdam](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/en/dataset_prepare.md#isprs-potsdam) -- [x] [Vaihingen](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/en/dataset_prepare.md#isprs-vaihingen) -- [x] [iSAID](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/en/dataset_prepare.md#isaid) +
-## Installation +
+Supported datasets: -Please refer to [get_started.md](docs/en/get_started.md#installation) for installation and [dataset_prepare.md](docs/en/dataset_prepare.md#prepare-datasets) for dataset preparation. +- [x] [Cityscapes](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/en/user_guides/2_dataset_prepare.md#cityscapes) +- [x] [PASCAL VOC](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/en/user_guides/2_dataset_prepare.md#pascal-voc) +- [x] [ADE20K](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/en/user_guides/2_dataset_prepare.md#ade20k) +- [x] [Pascal Context](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/en/user_guides/2_dataset_prepare.md#pascal-context) +- [x] [COCO-Stuff 10k](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/en/user_guides/2_dataset_prepare.md#coco-stuff-10k) +- [x] [COCO-Stuff 164k](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/en/user_guides/2_dataset_prepare.md#coco-stuff-164k) +- [x] [CHASE_DB1](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/en/user_guides/2_dataset_prepare.md#chase-db1) +- [x] [DRIVE](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/en/user_guides/2_dataset_prepare.md#drive) +- [x] [HRF](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/en/user_guides/2_dataset_prepare.md#hrf) +- [x] [STARE](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/en/user_guides/2_dataset_prepare.md#stare) +- [x] [Dark Zurich](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/en/user_guides/2_dataset_prepare.md#dark-zurich) +- [x] [Nighttime Driving](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/en/user_guides/2_dataset_prepare.md#nighttime-driving) +- [x] [LoveDA](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/en/user_guides/2_dataset_prepare.md#loveda) +- [x] [Potsdam](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/en/user_guides/2_dataset_prepare.md#isprs-potsdam) +- [x] [Vaihingen](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/en/user_guides/2_dataset_prepare.md#isprs-vaihingen) +- [x] [iSAID](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/en/user_guides/2_dataset_prepare.md#isaid) +- [x] [Mapillary Vistas](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/en/user_guides/2_dataset_prepare.md#mapillary-vistas-datasets) +- [x] [LEVIR-CD](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/en/user_guides/2_dataset_prepare.md#levir-cd) +- [x] [BDD100K](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/en/user_guides/2_dataset_prepare.md#bdd100K) +- [x] [NYU](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/en/user_guides/2_dataset_prepare.md#nyu) -## Get Started +
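The dataset base configs behind this list are also extended throughout this patch with a shared `tta_pipeline` (six resize ratios crossed with a horizontal flip, i.e. twelve views per image). Below is a hedged sketch of how such a pipeline is typically switched on; it mirrors what `tools/test.py --tta` does in 1.x, but the attribute names should be verified against the installed version, and the config/checkpoint paths are illustrative:

```python
# Hedged sketch: run testing with the tta_pipeline added in this patch.
from mmengine.config import Config
from mmengine.runner import Runner

cfg = Config.fromfile(
    'configs/pspnet/pspnet_r50-d8_4xb4-80k_ade20k-512x512.py')  # illustrative
cfg.load_from = 'pspnet_ade20k.pth'  # assumed locally downloaded checkpoint
cfg.work_dir = 'work_dirs/tta_demo'

# Swap the plain test pipeline for the 12-view TTA pipeline, then wrap the
# segmentor so the per-view predictions are merged into a single result.
cfg.test_dataloader.dataset.pipeline = cfg.tta_pipeline
cfg.model = dict(type='SegTTAModel', module=cfg.model)

Runner.from_cfg(cfg).test()
```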
+ +Please refer to [FAQ](docs/en/notes/faq.md) for frequently asked questions. + +## Projects + +[Here](projects/README.md) are some implementations of SOTA models and solutions built on MMSegmentation, which are supported and maintained by community users. These projects demonstrate the best practices based on MMSegmentation for research and product development. We welcome and appreciate all the contributions to the OpenMMLab ecosystem. + +## Contributing -Please see [train.md](docs/en/train.md) and [inference.md](docs/en/inference.md) for the basic usage of MMSegmentation. -There are also tutorials for [customizing dataset](docs/en/tutorials/customize_datasets.md), [designing data pipeline](docs/en/tutorials/data_pipeline.md), [customizing modules](docs/en/tutorials/customize_models.md), and [customizing runtime](docs/en/tutorials/customize_runtime.md). -We also provide many [training tricks](docs/en/tutorials/training_tricks.md) for better training and [useful tools](docs/en/useful_tools.md) for deployment. +We appreciate all contributions to improve MMSegmentation. Please refer to [CONTRIBUTING.md](.github/CONTRIBUTING.md) for the contributing guideline. -A Colab tutorial is also provided. You may preview the notebook [here](demo/MMSegmentation_Tutorial.ipynb) or directly [run](https://colab.research.google.com/github/open-mmlab/mmsegmentation/blob/master/demo/MMSegmentation_Tutorial.ipynb) on Colab. +## Acknowledgement -Please refer to [FAQ](docs/en/faq.md) for frequently asked questions. +MMSegmentation is an open source project that welcomes any contribution and feedback. +We wish that the toolbox and benchmark could serve the growing research +community by providing a flexible as well as standardized toolkit to reimplement existing methods +and develop their own new semantic segmentation methods. ## Citation @@ -171,35 +297,29 @@ If you find this project useful in your research, please consider citing: } ``` -## Contributing - -We appreciate all contributions to improve MMSegmentation. Please refer to [CONTRIBUTING.md](.github/CONTRIBUTING.md) for the contributing guideline. - -## Acknowledgement +## License -MMSegmentation is an open source project that welcome any contribution and feedback. -We wish that the toolbox and benchmark could serve the growing research -community by providing a flexible as well as standardized toolkit to reimplement existing methods -and develop their own new semantic segmentation methods. +This project is released under the [Apache 2.0 license](LICENSE). -## Projects in OpenMMLab +## OpenMMLab Family +- [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab foundational library for training deep learning models. - [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab foundational library for computer vision. -- [MIM](https://github.com/open-mmlab/mim): MIM installs OpenMMLab packages. -- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab image classification toolbox and benchmark. +- [MMPreTrain](https://github.com/open-mmlab/mmpretrain): OpenMMLab pre-training toolbox and benchmark. +- [MMagic](https://github.com/open-mmlab/mmagic): Open**MM**Lab **A**dvanced, **G**enerative and **I**ntelligent **C**reation toolbox. - [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab detection toolbox and benchmark. +- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO series toolbox and benchmark. 
- [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab's next-generation platform for general 3D object detection. - [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab rotated object detection toolbox and benchmark. +- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab video perception toolbox and benchmark. - [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab semantic segmentation toolbox and benchmark. - [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab text detection, recognition, and understanding toolbox. - [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab pose estimation toolbox and benchmark. - [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 3D human parametric model toolbox and benchmark. -- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab self-supervised learning toolbox and benchmark. -- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab model compression toolbox and benchmark. - [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab fewshot learning toolbox and benchmark. - [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab's next-generation action understanding toolbox and benchmark. -- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab video perception toolbox and benchmark. - [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab optical flow toolbox and benchmark. -- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab image and video editing toolbox. -- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab image and video generative models toolbox. - [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab Model Deployment Framework. +- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab model compression toolbox and benchmark. +- [MIM](https://github.com/open-mmlab/mim): MIM installs OpenMMLab packages. +- [Playground](https://github.com/open-mmlab/playground): A central hub for gathering and showcasing amazing projects built upon OpenMMLab. diff --git a/README_zh-CN.md b/README_zh-CN.md index 9ba0969aba..fa4c82a280 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -17,27 +17,54 @@
 
- -
[![PyPI - Python Version](https://img.shields.io/pypi/pyversions/mmsegmentation)](https://pypi.org/project/mmsegmentation/) [![PyPI](https://img.shields.io/pypi/v/mmsegmentation)](https://pypi.org/project/mmsegmentation) [![docs](https://img.shields.io/badge/docs-latest-blue)](https://mmsegmentation.readthedocs.io/zh_CN/latest/) [![badge](https://github.com/open-mmlab/mmsegmentation/workflows/build/badge.svg)](https://github.com/open-mmlab/mmsegmentation/actions) [![codecov](https://codecov.io/gh/open-mmlab/mmsegmentation/branch/master/graph/badge.svg)](https://codecov.io/gh/open-mmlab/mmsegmentation) -[![license](https://img.shields.io/github/license/open-mmlab/mmsegmentation.svg)](https://github.com/open-mmlab/mmsegmentation/blob/master/LICENSE) +[![license](https://img.shields.io/github/license/open-mmlab/mmsegmentation.svg)](https://github.com/open-mmlab/mmsegmentation/blob/main/LICENSE) [![issue resolution](https://isitmaintained.com/badge/resolution/open-mmlab/mmsegmentation.svg)](https://github.com/open-mmlab/mmsegmentation/issues) [![open issues](https://isitmaintained.com/badge/open/open-mmlab/mmsegmentation.svg)](https://github.com/open-mmlab/mmsegmentation/issues) +[![Open in OpenXLab](https://cdn-static.openxlab.org.cn/app-center/openxlab_demo.svg)](https://openxlab.org.cn/apps?search=mmseg) -文档: https://mmsegmentation.readthedocs.io/zh_CN/latest +文档: [English](README.md) | 简体中文 + + +
+ ## 简介 MMSegmentation 是一个基于 PyTorch 的语义分割开源工具箱。它是 OpenMMLab 项目的一部分。 -主分支代码目前支持 PyTorch 1.5 以上的版本。 +[main](https://github.com/open-mmlab/mmsegmentation/tree/main) 分支代码目前支持 PyTorch 1.6 以上的版本。 + +### 🎉 MMSegmentation v1.0.0 简介 🎉 + +我们非常高兴地宣布 MMSegmentation 最新版本的正式发布!在这个新版本中,主要分支是 [main](https://github.com/open-mmlab/mmsegmentation/tree/main) 分支,开发分支是 [dev-1.x](https://github.com/open-mmlab/mmsegmentation/tree/dev-1.x)。而之前版本的稳定分支保留为 [0.x](https://github.com/open-mmlab/mmsegmentation/tree/0.x) 分支。请注意,[master](https://github.com/open-mmlab/mmsegmentation/tree/master) 分支将只在有限的时间内维护,然后将被删除。我们鼓励您在使用过程中注意分支选择和更新。感谢您一如既往的支持和热情,让我们共同努力,使 MMSegmentation 变得更加健壮和强大!💪 + +MMSegmentation v1.x 在 0.x 版本的基础上有了显著的提升,提供了更加灵活和功能丰富的体验。为了更好使用 v1.x 中的新功能,我们诚挚邀请您查阅我们详细的 [📚 迁移指南](https://mmsegmentation.readthedocs.io/zh_CN/latest/migration/interface.html),以帮助您无缝地过渡您的项目。您的支持对我们来说非常宝贵,我们热切期待您的反馈! ![示例图片](resources/seg_demo.gif) @@ -59,20 +86,86 @@ MMSegmentation 是一个基于 PyTorch 的语义分割开源工具箱。它是 O 训练速度比其他语义分割代码库更快或者相当。 -## 开源许可证 +## 更新日志 -该项目采用 [Apache 2.0 开源许可证](LICENSE)。 +最新版本 v1.2.0 在 2023.10.12 发布。 +如果想了解更多版本更新细节和历史信息,请阅读[更新日志](docs/en/notes/changelog.md)。 -## 更新日志 + +## 安装 + +请参考[快速入门文档](docs/zh_cn/get_started.md#installation)进行安装,参考[数据集准备](docs/zh_cn/user_guides/2_dataset_prepare.md)处理数据。 + +## 快速入门 + +请参考[概述](docs/zh_cn/overview.md)对 MMSegmentation 进行初步了解。 + +请参考[用户指南](https://mmsegmentation.readthedocs.io/zh_CN/latest/user_guides/index.html)了解 mmseg 的基本使用,以及[进阶指南](https://mmsegmentation.readthedocs.io/zh_CN/latest/advanced_guides/index.html)深入了解 mmseg 设计和代码实现。 + +同时,我们提供了 Colab 教程。你可以在[这里](demo/MMSegmentation_Tutorial.ipynb)浏览教程,或者直接在 Colab 上[运行](https://colab.research.google.com/github/open-mmlab/mmsegmentation/blob/main/demo/MMSegmentation_Tutorial.ipynb)。 + +若需要将 0.x 版本的代码迁移至新版,请参考[迁移文档](docs/zh_cn/migration)。 + +## 教程文档 + +
+开启 MMSeg 之旅 + +- [MMSeg 概述](docs/zh_cn/overview.md) +- [安装和验证](docs/zh_cn/get_started.md) +- [常见问题解答](docs/zh_cn/notes/faq.md) + +
+ +
+MMSeg 快速入门教程 + +- [教程1:了解配置文件](docs/zh_cn/user_guides/1_config.md) +- [教程2:准备数据集](docs/zh_cn/user_guides/2_dataset_prepare.md) +- [教程3:使用预训练模型推理](docs/zh_cn/user_guides/3_inference.md) +- [教程4:使用现有模型进行训练和测试](docs/zh_cn/user_guides/4_train_test.md) +- [教程5:模型部署](docs/zh_cn/user_guides/5_deployment.md) +- [在 Jetson 平台部署 mmsegmentation](docs/zh_cn/user_guides/deploy_jetson.md) +- [常用工具](docs/zh_cn/user_guides/useful_tools.md) +- [特征图可视化](docs/zh_cn/user_guides/visualization_feature_map.md) +- [可视化](docs/zh_cn/user_guides/visualization.md) + +
+ +
+MMSeg 细节介绍 -最新版本 v0.24.1 在 2022.5.1 发布。 -如果想了解更多版本更新细节和历史信息,请阅读[更新日志](docs/en/changelog.md)。 +- [MMSeg 数据集介绍](docs/zh_cn/advanced_guides/datasets.md) +- [MMSeg 模型介绍](docs/zh_cn/advanced_guides/models.md) +- [MMSeg 数据结构介绍](docs/zh_cn/advanced_guides/structures.md) +- [MMSeg 数据增强介绍](docs/zh_cn/advanced_guides/transforms.md) +- [MMSeg 数据流介绍](docs/zh_cn/advanced_guides/data_flow.md) +- [MMSeg 训练引擎介绍](docs/zh_cn/advanced_guides/engine.md) +- [MMSeg 模型评测介绍](docs/zh_cn/advanced_guides/evaluation.md) + +
+ +
+MMSeg 开发教程 + +- [新增自定义数据集](docs/zh_cn/advanced_guides/add_datasets.md) +- [新增评测指标](docs/zh_cn/advanced_guides/add_metrics.md) +- [新增自定义模型](docs/zh_cn/advanced_guides/add_models.md) +- [新增自定义数据增强](docs/zh_cn/advanced_guides/add_transforms.md) +- [自定义运行设定](docs/zh_cn/advanced_guides/customize_runtime.md) +- [训练技巧](docs/zh_cn/advanced_guides/training_tricks.md) +- [如何给 MMSeg 贡献代码](.github/CONTRIBUTING.md) +- [在 projects 给 MMSeg 贡献一个标准数据集](docs/zh_cn/advanced_guides/contribute_dataset.md) +- [NPU (华为 昇腾)](docs/zh_cn/device/npu.md) +- [0.x → 1.x 迁移文档](docs/zh_cn/migration/interface.md),[0.x → 1.x 库变更文档](docs/zh_cn/migration/package.md) + +
## 基准测试和模型库 测试结果和模型可以在[模型库](docs/zh_cn/model_zoo.md)中找到。 -已支持的骨干网络: +
+已支持的骨干网络: - [x] ResNet (CVPR'2016) - [x] ResNeXt (CVPR'2017) @@ -86,76 +179,97 @@ MMSegmentation 是一个基于 PyTorch 的语义分割开源工具箱。它是 O - [x] [BEiT (ICLR'2022)](configs/beit) - [x] [ConvNeXt (CVPR'2022)](configs/convnext) - [x] [MAE (CVPR'2022)](configs/mae) +- [x] [PoolFormer (CVPR'2022)](configs/poolformer) +- [x] [SegNeXt (NeurIPS'2022)](configs/segnext) -已支持的算法: +
-- [x] [FCN (CVPR'2015/TPAMI'2017)](configs/fcn) -- [x] [ERFNet (T-ITS'2017)](configs/erfnet) -- [x] [UNet (MICCAI'2016/Nat. Methods'2019)](configs/unet) -- [x] [PSPNet (CVPR'2017)](configs/pspnet) -- [x] [DeepLabV3 (ArXiv'2017)](configs/deeplabv3) -- [x] [BiSeNetV1 (ECCV'2018)](configs/bisenetv1) -- [x] [PSANet (ECCV'2018)](configs/psanet) -- [x] [DeepLabV3+ (CVPR'2018)](configs/deeplabv3plus) -- [x] [UPerNet (ECCV'2018)](configs/upernet) -- [x] [ICNet (ECCV'2018)](configs/icnet) -- [x] [NonLocal Net (CVPR'2018)](configs/nonlocal_net) -- [x] [EncNet (CVPR'2018)](configs/encnet) -- [x] [Semantic FPN (CVPR'2019)](configs/sem_fpn) -- [x] [DANet (CVPR'2019)](configs/danet) -- [x] [APCNet (CVPR'2019)](configs/apcnet) +
+已支持的算法: + +- [x] [SAN (CVPR'2023)](configs/san/) +- [x] [VPD (ICCV'2023)](configs/vpd) +- [x] [DDRNet (T-ITS'2022)](configs/ddrnet) +- [x] [PIDNet (ArXiv'2022)](configs/pidnet) +- [x] [Mask2Former (CVPR'2022)](configs/mask2former) +- [x] [MaskFormer (NeurIPS'2021)](configs/maskformer) +- [x] [K-Net (NeurIPS'2021)](configs/knet) +- [x] [SegFormer (NeurIPS'2021)](configs/segformer) +- [x] [Segmenter (ICCV'2021)](configs/segmenter) +- [x] [DPT (ArXiv'2021)](configs/dpt) +- [x] [SETR (CVPR'2021)](configs/setr) +- [x] [STDC (CVPR'2021)](configs/stdc) +- [x] [BiSeNetV2 (IJCV'2021)](configs/bisenetv2) +- [x] [CGNet (TIP'2020)](configs/cgnet) +- [x] [PointRend (CVPR'2020)](configs/point_rend) +- [x] [DNLNet (ECCV'2020)](configs/dnlnet) +- [x] [OCRNet (ECCV'2020)](configs/ocrnet) +- [x] [ISANet (ArXiv'2019/IJCV'2021)](configs/isanet) +- [x] [Fast-SCNN (ArXiv'2019)](configs/fastscnn) +- [x] [FastFCN (ArXiv'2019)](configs/fastfcn) +- [x] [GCNet (ICCVW'2019/TPAMI'2020)](configs/gcnet) +- [x] [ANN (ICCV'2019)](configs/ann) - [x] [EMANet (ICCV'2019)](configs/emanet) - [x] [CCNet (ICCV'2019)](configs/ccnet) - [x] [DMNet (ICCV'2019)](configs/dmnet) -- [x] [ANN (ICCV'2019)](configs/ann) -- [x] [GCNet (ICCVW'2019/TPAMI'2020)](configs/gcnet) -- [x] [FastFCN (ArXiv'2019)](configs/fastfcn) -- [x] [Fast-SCNN (ArXiv'2019)](configs/fastscnn) -- [x] [ISANet (ArXiv'2019/IJCV'2021)](configs/isanet) -- [x] [OCRNet (ECCV'2020)](configs/ocrnet) -- [x] [DNLNet (ECCV'2020)](configs/dnlnet) -- [x] [PointRend (CVPR'2020)](configs/point_rend) -- [x] [CGNet (TIP'2020)](configs/cgnet) -- [x] [BiSeNetV2 (IJCV'2021)](configs/bisenetv2) -- [x] [STDC (CVPR'2021)](configs/stdc) -- [x] [SETR (CVPR'2021)](configs/setr) -- [x] [DPT (ArXiv'2021)](configs/dpt) -- [x] [Segmenter (ICCV'2021)](configs/segmenter) -- [x] [SegFormer (NeurIPS'2021)](configs/segformer) -- [x] [K-Net (NeurIPS'2021)](configs/knet) - -已支持的数据集: - -- [x] [Cityscapes](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/zh_cn/dataset_prepare.md#cityscapes) -- [x] [PASCAL VOC](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/zh_cn/dataset_prepare.md#pascal-voc) -- [x] [ADE20K](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/zh_cn/dataset_prepare.md#ade20k) -- [x] [Pascal Context](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/zh_cn/dataset_prepare.md#pascal-context) -- [x] [COCO-Stuff 10k](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/zh_cn/dataset_prepare.md#coco-stuff-10k) -- [x] [COCO-Stuff 164k](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/zh_cn/dataset_prepare.md#coco-stuff-164k) -- [x] [CHASE_DB1](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/zh_cn/dataset_prepare.md#chase-db1) -- [x] [DRIVE](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/zh_cn/dataset_prepare.md#drive) -- [x] [HRF](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/zh_cn/dataset_prepare.md#hrf) -- [x] [STARE](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/zh_cn/dataset_prepare.md#stare) -- [x] [Dark Zurich](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/zh_cn/dataset_prepare.md#dark-zurich) -- [x] [Nighttime Driving](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/zh_cn/dataset_prepare.md#nighttime-driving) -- [x] [LoveDA](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/zh_cn/dataset_prepare.md#loveda) -- [x] 
[Potsdam](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/zh_cn/dataset_prepare.md#isprs-potsdam) -- [x] [Vaihingen](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/zh_cn/dataset_prepare.md#isprs-vaihingen) -- [x] [iSAID](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/zh_cn/dataset_prepare.md#isaid) - -## 安装 +- [x] [Semantic FPN (CVPR'2019)](configs/sem_fpn) +- [x] [DANet (CVPR'2019)](configs/danet) +- [x] [APCNet (CVPR'2019)](configs/apcnet) +- [x] [NonLocal Net (CVPR'2018)](configs/nonlocal_net) +- [x] [EncNet (CVPR'2018)](configs/encnet) +- [x] [DeepLabV3+ (CVPR'2018)](configs/deeplabv3plus) +- [x] [UPerNet (ECCV'2018)](configs/upernet) +- [x] [ICNet (ECCV'2018)](configs/icnet) +- [x] [PSANet (ECCV'2018)](configs/psanet) +- [x] [BiSeNetV1 (ECCV'2018)](configs/bisenetv1) +- [x] [DeepLabV3 (ArXiv'2017)](configs/deeplabv3) +- [x] [PSPNet (CVPR'2017)](configs/pspnet) +- [x] [ERFNet (T-ITS'2017)](configs/erfnet) +- [x] [UNet (MICCAI'2016/Nat. Methods'2019)](configs/unet) +- [x] [FCN (CVPR'2015/TPAMI'2017)](configs/fcn) -请参考[快速入门文档](docs/zh_cn/get_started.md#installation)进行安装,参考[数据集准备](docs/zh_cn/dataset_prepare.md)处理数据。 +
+ +
+已支持的数据集: + +- [x] [Cityscapes](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/zh_cn/user_guides/2_dataset_prepare.md#cityscapes) +- [x] [PASCAL VOC](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/zh_cn/user_guides/2_dataset_prepare.md#pascal-voc) +- [x] [ADE20K](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/zh_cn/user_guides/2_dataset_prepare.md#ade20k) +- [x] [Pascal Context](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/zh_cn/user_guides/2_dataset_prepare.md#pascal-context) +- [x] [COCO-Stuff 10k](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/zh_cn/user_guides/2_dataset_prepare.md#coco-stuff-10k) +- [x] [COCO-Stuff 164k](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/zh_cn/user_guides/2_dataset_prepare.md#coco-stuff-164k) +- [x] [CHASE_DB1](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/zh_cn/user_guides/2_dataset_prepare.md#chase-db1) +- [x] [DRIVE](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/zh_cn/user_guides/2_dataset_prepare.md#drive) +- [x] [HRF](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/zh_cn/user_guides/2_dataset_prepare.md#hrf) +- [x] [STARE](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/zh_cn/user_guides/2_dataset_prepare.md#stare) +- [x] [Dark Zurich](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/zh_cn/user_guides/2_dataset_prepare.md#dark-zurich) +- [x] [Nighttime Driving](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/zh_cn/user_guides/2_dataset_prepare.md#nighttime-driving) +- [x] [LoveDA](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/zh_cn/user_guides/2_dataset_prepare.md#loveda) +- [x] [Potsdam](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/zh_cn/user_guides/2_dataset_prepare.md#isprs-potsdam) +- [x] [Vaihingen](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/zh_cn/user_guides/2_dataset_prepare.md#isprs-vaihingen) +- [x] [iSAID](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/zh_cn/user_guides/2_dataset_prepare.md#isaid) +- [x] [Mapillary Vistas](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/zh_cn/user_guides/2_dataset_prepare.md#mapillary-vistas-datasets) +- [x] [LEVIR-CD](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/zh_cn/user_guides/2_dataset_prepare.md#levir-cd) +- [x] [BDD100K](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/zh_cn/user_guides/2_dataset_prepare.md#bdd100K) +- [x] [NYU](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/zh_cn/user_guides/2_dataset_prepare.md#nyu) + +
+ +如果遇到问题,请参考 [常见问题解答](docs/zh_cn/notes/faq.md)。 + +## 社区项目 + +[这里](projects/README.md)有一些由社区用户支持和维护的基于 MMSegmentation 的 SOTA 模型和解决方案的实现。这些项目展示了基于 MMSegmentation 的研究和产品开发的最佳实践。 +我们欢迎并感谢对 OpenMMLab 生态系统的所有贡献。 -## 快速入门 +## 贡献指南 -请参考[训练教程](docs/zh_cn/train.md)和[测试教程](docs/zh_cn/inference.md)学习 MMSegmentation 的基本使用。 -我们也提供了一些进阶教程,内容覆盖了[增加自定义数据集](docs/zh_cn/tutorials/customize_datasets.md),[设计新的数据预处理流程](docs/zh_cn/tutorials/data_pipeline.md),[增加自定义模型](docs/zh_cn/tutorials/customize_models.md),[增加自定义的运行时配置](docs/zh_cn/tutorials/customize_runtime.md)。 -除此之外,我们也提供了很多实用的[训练技巧说明](docs/zh_cn/tutorials/training_tricks.md)和模型部署相关的[有用的工具](docs/zh_cn/useful_tools.md)。 +我们感谢所有的贡献者为改进和提升 MMSegmentation 所作出的努力。请参考[贡献指南](.github/CONTRIBUTING.md)来了解参与项目贡献的相关指引。 -同时,我们提供了 Colab 教程。你可以在[这里](demo/MMSegmentation_Tutorial.ipynb)浏览教程,或者直接在 Colab 上[运行](https://colab.research.google.com/github/open-mmlab/mmsegmentation/blob/master/demo/MMSegmentation_Tutorial.ipynb)。 +## 致谢 -如果遇到问题,请参考 [常见问题解答](docs/zh_cn/faq.md)。 +MMSegmentation 是一个由来自不同高校和企业的研发人员共同参与贡献的开源项目。我们感谢所有为项目提供算法复现和新功能支持的贡献者,以及提供宝贵反馈的用户。我们希望这个工具箱和基准测试可以为社区提供灵活的代码工具,供用户复现已有算法并开发自己的新模型,从而不断为开源社区提供贡献。 ## 引用 @@ -170,43 +284,40 @@ MMSegmentation 是一个基于 PyTorch 的语义分割开源工具箱。它是 O } ``` -## 贡献指南 - -我们感谢所有的贡献者为改进和提升 MMSegmentation 所作出的努力。请参考[贡献指南](.github/CONTRIBUTING.md)来了解参与项目贡献的相关指引。 - -## 致谢 +## 开源许可证 -MMSegmentation 是一个由来自不同高校和企业的研发人员共同参与贡献的开源项目。我们感谢所有为项目提供算法复现和新功能支持的贡献者,以及提供宝贵反馈的用户。 我们希望这个工具箱和基准测试可以为社区提供灵活的代码工具,供用户复现已有算法并开发自己的新模型,从而不断为开源社区提供贡献。 +该项目采用 [Apache 2.0 开源许可证](LICENSE)。 ## OpenMMLab 的其他项目 +- [MMEngine](https://github.com/open-mmlab/mmengine): OpenMMLab 深度学习模型训练基础库 - [MMCV](https://github.com/open-mmlab/mmcv): OpenMMLab 计算机视觉基础库 -- [MIM](https://github.com/open-mmlab/mim): MIM 是 OpenMMlab 项目、算法、模型的统一入口 -- [MMClassification](https://github.com/open-mmlab/mmclassification): OpenMMLab 图像分类工具箱 +- [MMPreTrain](https://github.com/open-mmlab/mmpretrain): OpenMMLab 深度学习预训练工具箱 +- [MMagic](https://github.com/open-mmlab/mmagic): OpenMMLab 新一代人工智能内容生成(AIGC)工具箱 - [MMDetection](https://github.com/open-mmlab/mmdetection): OpenMMLab 目标检测工具箱 +- [MMYOLO](https://github.com/open-mmlab/mmyolo): OpenMMLab YOLO 系列工具箱与测试基准 - [MMDetection3D](https://github.com/open-mmlab/mmdetection3d): OpenMMLab 新一代通用 3D 目标检测平台 - [MMRotate](https://github.com/open-mmlab/mmrotate): OpenMMLab 旋转框检测工具箱与测试基准 +- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab 一体化视频目标感知平台 - [MMSegmentation](https://github.com/open-mmlab/mmsegmentation): OpenMMLab 语义分割工具箱 - [MMOCR](https://github.com/open-mmlab/mmocr): OpenMMLab 全流程文字检测识别理解工具包 - [MMPose](https://github.com/open-mmlab/mmpose): OpenMMLab 姿态估计工具箱 - [MMHuman3D](https://github.com/open-mmlab/mmhuman3d): OpenMMLab 人体参数化模型工具箱与测试基准 -- [MMSelfSup](https://github.com/open-mmlab/mmselfsup): OpenMMLab 自监督学习工具箱与测试基准 -- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab 模型压缩工具箱与测试基准 - [MMFewShot](https://github.com/open-mmlab/mmfewshot): OpenMMLab 少样本学习工具箱与测试基准 - [MMAction2](https://github.com/open-mmlab/mmaction2): OpenMMLab 新一代视频理解工具箱 -- [MMTracking](https://github.com/open-mmlab/mmtracking): OpenMMLab 一体化视频目标感知平台 - [MMFlow](https://github.com/open-mmlab/mmflow): OpenMMLab 光流估计工具箱与测试基准 -- [MMEditing](https://github.com/open-mmlab/mmediting): OpenMMLab 图像视频编辑工具箱 -- [MMGeneration](https://github.com/open-mmlab/mmgeneration): OpenMMLab 图片视频生成模型工具箱 - [MMDeploy](https://github.com/open-mmlab/mmdeploy): OpenMMLab 模型部署框架 +- [MMRazor](https://github.com/open-mmlab/mmrazor): OpenMMLab 模型压缩工具箱与测试基准 +- [MIM](https://github.com/open-mmlab/mim): 
OpenMMLab 项目、算法、模型的统一入口 +- [Playground](https://github.com/open-mmlab/playground): 收集和展示 OpenMMLab 相关的前沿、有趣的社区项目 ## 欢迎加入 OpenMMLab 社区 -扫描下方的二维码可关注 OpenMMLab 团队的 [知乎官方账号](https://www.zhihu.com/people/openmmlab),加入 [OpenMMLab 团队](https://jq.qq.com/?_wv=1027&k=aCvMxdr3) 以及 [MMSegmentation](https://jq.qq.com/?_wv=1027&k=ukevz6Ie) 的 QQ 群。 +扫描下方的二维码可关注 OpenMMLab 团队的 [知乎官方账号](https://www.zhihu.com/people/openmmlab),扫描下方微信二维码添加喵喵好友,进入 MMSegmentation 微信交流社群。【加好友申请格式:研究方向+地区+学校/公司+姓名】
- -
+ + 我们会在 OpenMMLab 社区为大家 diff --git a/configs/_base_/datasets/ade20k.py b/configs/_base_/datasets/ade20k.py index 4303b094c5..48340d11ee 100644 --- a/configs/_base_/datasets/ade20k.py +++ b/configs/_base_/datasets/ade20k.py @@ -23,6 +23,22 @@ dict(type='LoadAnnotations', reduce_zero_label=True), dict(type='PackSegInputs') ] +img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=None), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='Resize', scale_factor=r, keep_ratio=True) + for r in img_ratios + ], + [ + dict(type='RandomFlip', prob=0., direction='horizontal'), + dict(type='RandomFlip', prob=1., direction='horizontal') + ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] + ]) +] train_dataloader = dict( batch_size=4, num_workers=4, diff --git a/configs/_base_/datasets/ade20k_640x640.py b/configs/_base_/datasets/ade20k_640x640.py index 8478585915..c1f642da7f 100644 --- a/configs/_base_/datasets/ade20k_640x640.py +++ b/configs/_base_/datasets/ade20k_640x640.py @@ -23,6 +23,22 @@ dict(type='LoadAnnotations', reduce_zero_label=True), dict(type='PackSegInputs') ] +img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=None), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='Resize', scale_factor=r, keep_ratio=True) + for r in img_ratios + ], + [ + dict(type='RandomFlip', prob=0., direction='horizontal'), + dict(type='RandomFlip', prob=1., direction='horizontal') + ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] + ]) +] train_dataloader = dict( batch_size=4, num_workers=4, diff --git a/configs/_base_/datasets/bdd100k.py b/configs/_base_/datasets/bdd100k.py new file mode 100644 index 0000000000..24cec69bfe --- /dev/null +++ b/configs/_base_/datasets/bdd100k.py @@ -0,0 +1,70 @@ +# dataset settings +dataset_type = 'BDD100KDataset' +data_root = 'data/bdd100k/' + +crop_size = (512, 1024) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict( + type='RandomResize', + scale=(2048, 1024), + ratio_range=(0.5, 2.0), + keep_ratio=True), + dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(2048, 1024), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=None), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='Resize', scale_factor=r, keep_ratio=True) + for r in img_ratios + ], + [ + dict(type='RandomFlip', prob=0., direction='horizontal'), + dict(type='RandomFlip', prob=1., direction='horizontal') + ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] + ]) +] +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='images/10k/train', + seg_map_path='labels/sem_seg/masks/train'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + 
type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='images/10k/val', + seg_map_path='labels/sem_seg/masks/val'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) +test_evaluator = val_evaluator diff --git a/configs/_base_/datasets/chase_db1.py b/configs/_base_/datasets/chase_db1.py index 1622bec957..ed47c2dbe5 100644 --- a/configs/_base_/datasets/chase_db1.py +++ b/configs/_base_/datasets/chase_db1.py @@ -24,6 +24,22 @@ dict(type='LoadAnnotations'), dict(type='PackSegInputs') ] +img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=None), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='Resize', scale_factor=r, keep_ratio=True) + for r in img_ratios + ], + [ + dict(type='RandomFlip', prob=0., direction='horizontal'), + dict(type='RandomFlip', prob=1., direction='horizontal') + ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] + ]) +] train_dataloader = dict( batch_size=4, @@ -55,5 +71,5 @@ pipeline=test_pipeline)) test_dataloader = val_dataloader -val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) +val_evaluator = dict(type='IoUMetric', iou_metrics=['mDice']) test_evaluator = val_evaluator diff --git a/configs/_base_/datasets/cityscapes.py b/configs/_base_/datasets/cityscapes.py index c2fdee473b..b63a4cdfe7 100644 --- a/configs/_base_/datasets/cityscapes.py +++ b/configs/_base_/datasets/cityscapes.py @@ -23,6 +23,22 @@ dict(type='LoadAnnotations'), dict(type='PackSegInputs') ] +img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=None), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='Resize', scale_factor=r, keep_ratio=True) + for r in img_ratios + ], + [ + dict(type='RandomFlip', prob=0., direction='horizontal'), + dict(type='RandomFlip', prob=1., direction='horizontal') + ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] + ]) +] train_dataloader = dict( batch_size=2, num_workers=2, diff --git a/configs/_base_/datasets/coco-stuff10k.py b/configs/_base_/datasets/coco-stuff10k.py index b00db24691..5d6bb12b97 100644 --- a/configs/_base_/datasets/coco-stuff10k.py +++ b/configs/_base_/datasets/coco-stuff10k.py @@ -23,6 +23,22 @@ dict(type='LoadAnnotations', reduce_zero_label=True), dict(type='PackSegInputs') ] +img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=None), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='Resize', scale_factor=r, keep_ratio=True) + for r in img_ratios + ], + [ + dict(type='RandomFlip', prob=0., direction='horizontal'), + dict(type='RandomFlip', prob=1., direction='horizontal') + ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] + ]) +] train_dataloader = dict( batch_size=4, num_workers=4, diff --git a/configs/_base_/datasets/coco-stuff164k.py b/configs/_base_/datasets/coco-stuff164k.py index e879bdb2aa..a9b9d90117 100644 --- a/configs/_base_/datasets/coco-stuff164k.py +++ b/configs/_base_/datasets/coco-stuff164k.py @@ -23,6 +23,22 @@ dict(type='LoadAnnotations'), dict(type='PackSegInputs') ] +img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=None), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='Resize', scale_factor=r, keep_ratio=True) + for r in img_ratios + ], + [ + dict(type='RandomFlip', prob=0., direction='horizontal'), + 
dict(type='RandomFlip', prob=1., direction='horizontal') + ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] + ]) +] train_dataloader = dict( batch_size=4, num_workers=4, @@ -32,7 +48,7 @@ type=dataset_type, data_root=data_root, data_prefix=dict( - img_path='images/train2017', seg_map_path='annotations/val2017'), + img_path='images/train2017', seg_map_path='annotations/train2017'), pipeline=train_pipeline)) val_dataloader = dict( batch_size=1, diff --git a/configs/_base_/datasets/drive.py b/configs/_base_/datasets/drive.py index 523354d059..6a3dd82c64 100644 --- a/configs/_base_/datasets/drive.py +++ b/configs/_base_/datasets/drive.py @@ -24,6 +24,22 @@ dict(type='LoadAnnotations'), dict(type='PackSegInputs') ] +img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=None), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='Resize', scale_factor=r, keep_ratio=True) + for r in img_ratios + ], + [ + dict(type='RandomFlip', prob=0., direction='horizontal'), + dict(type='RandomFlip', prob=1., direction='horizontal') + ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] + ]) +] train_dataloader = dict( batch_size=4, num_workers=4, @@ -53,5 +69,5 @@ pipeline=test_pipeline)) test_dataloader = val_dataloader -val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) +val_evaluator = dict(type='IoUMetric', iou_metrics=['mDice']) test_evaluator = val_evaluator diff --git a/configs/_base_/datasets/hrf.py b/configs/_base_/datasets/hrf.py index 3ca2263abd..353d070472 100644 --- a/configs/_base_/datasets/hrf.py +++ b/configs/_base_/datasets/hrf.py @@ -24,6 +24,22 @@ dict(type='LoadAnnotations'), dict(type='PackSegInputs') ] +img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=None), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='Resize', scale_factor=r, keep_ratio=True) + for r in img_ratios + ], + [ + dict(type='RandomFlip', prob=0., direction='horizontal'), + dict(type='RandomFlip', prob=1., direction='horizontal') + ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] + ]) +] train_dataloader = dict( batch_size=4, num_workers=4, @@ -53,5 +69,5 @@ pipeline=test_pipeline)) test_dataloader = val_dataloader -val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) +val_evaluator = dict(type='IoUMetric', iou_metrics=['mDice']) test_evaluator = val_evaluator diff --git a/configs/_base_/datasets/isaid.py b/configs/_base_/datasets/isaid.py index 8dafae8fd4..5cd4309f6d 100644 --- a/configs/_base_/datasets/isaid.py +++ b/configs/_base_/datasets/isaid.py @@ -30,6 +30,22 @@ dict(type='LoadAnnotations'), dict(type='PackSegInputs') ] +img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=None), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='Resize', scale_factor=r, keep_ratio=True) + for r in img_ratios + ], + [ + dict(type='RandomFlip', prob=0., direction='horizontal'), + dict(type='RandomFlip', prob=1., direction='horizontal') + ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] + ]) +] train_dataloader = dict( batch_size=4, num_workers=4, diff --git a/configs/_base_/datasets/levir_256x256.py b/configs/_base_/datasets/levir_256x256.py new file mode 100644 index 0000000000..a2a69aa9e9 --- /dev/null +++ b/configs/_base_/datasets/levir_256x256.py @@ -0,0 +1,59 @@ +# dataset settings +dataset_type = 'LEVIRCDDataset' +data_root = r'data/LEVIRCD' + 
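# A hedged aside on the evaluator switch in the CHASE_DB1, DRIVE and HRF
# diffs above: mDice is the customary headline metric for binary vessel
# segmentation. Per class, Dice and IoU carry the same information, since
#
#     dice = 2 * iou / (1 + iou)    # e.g. IoU 0.60 <-> Dice 0.75
#
# so the switch changes the reported numbers, not the underlying quality.
def dice_from_iou(iou: float) -> float:
    """Per-class Dice score equivalent to a given IoU."""
    return 2.0 * iou / (1.0 + iou)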
+albu_train_transforms = [ + dict(type='RandomBrightnessContrast', p=0.2), + dict(type='HorizontalFlip', p=0.5), + dict(type='VerticalFlip', p=0.5) +] + +train_pipeline = [ + dict(type='LoadMultipleRSImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='Albu', transforms=albu_train_transforms), + dict(type='ConcatCDInput'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadMultipleRSImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='ConcatCDInput'), + dict(type='PackSegInputs') +] + +tta_pipeline = [ + dict(type='LoadMultipleRSImageFromFile'), + dict( + type='TestTimeAug', + transforms=[[dict(type='LoadAnnotations')], + [dict(type='ConcatCDInput')], + [dict(type='PackSegInputs')]]) +] +train_dataloader = dict( + batch_size=4, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='train/A', + img_path2='train/B', + seg_map_path='train/label'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='test/A', img_path2='test/B', seg_map_path='test/label'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) +test_evaluator = val_evaluator diff --git a/configs/_base_/datasets/loveda.py b/configs/_base_/datasets/loveda.py index fcdb05865e..b93bc74af1 100644 --- a/configs/_base_/datasets/loveda.py +++ b/configs/_base_/datasets/loveda.py @@ -23,6 +23,22 @@ dict(type='LoadAnnotations', reduce_zero_label=True), dict(type='PackSegInputs') ] +img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=None), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='Resize', scale_factor=r, keep_ratio=True) + for r in img_ratios + ], + [ + dict(type='RandomFlip', prob=0., direction='horizontal'), + dict(type='RandomFlip', prob=1., direction='horizontal') + ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] + ]) +] train_dataloader = dict( batch_size=4, num_workers=4, diff --git a/configs/_base_/datasets/mapillary_v1.py b/configs/_base_/datasets/mapillary_v1.py new file mode 100644 index 0000000000..611aa4741b --- /dev/null +++ b/configs/_base_/datasets/mapillary_v1.py @@ -0,0 +1,68 @@ +# dataset settings +dataset_type = 'MapillaryDataset_v1' +data_root = 'data/mapillary/' +crop_size = (512, 1024) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict( + type='RandomResize', + scale=(2048, 1024), + ratio_range=(0.5, 2.0), + keep_ratio=True), + dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(2048, 1024), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] +tta_pipeline = [ + dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='Resize', scale_factor=r, keep_ratio=True) + for r in img_ratios + ], + 
[ + dict(type='RandomFlip', prob=0., direction='horizontal'), + dict(type='RandomFlip', prob=1., direction='horizontal') + ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] + ]) +] +train_dataloader = dict( + batch_size=2, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='training/images', seg_map_path='training/v1.2/labels'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='validation/images', + seg_map_path='validation/v1.2/labels'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) +test_evaluator = val_evaluator diff --git a/configs/_base_/datasets/mapillary_v1_65.py b/configs/_base_/datasets/mapillary_v1_65.py new file mode 100644 index 0000000000..f594f37333 --- /dev/null +++ b/configs/_base_/datasets/mapillary_v1_65.py @@ -0,0 +1,37 @@ +# dataset settings +_base_ = './mapillary_v1.py' +metainfo = dict( + classes=('Bird', 'Ground Animal', 'Curb', 'Fence', 'Guard Rail', 'Barrier', + 'Wall', 'Bike Lane', 'Crosswalk - Plain', 'Curb Cut', 'Parking', + 'Pedestrian Area', 'Rail Track', 'Road', 'Service Lane', + 'Sidewalk', 'Bridge', 'Building', 'Tunnel', 'Person', 'Bicyclist', + 'Motorcyclist', 'Other Rider', 'Lane Marking - Crosswalk', + 'Lane Marking - General', 'Mountain', 'Sand', 'Sky', 'Snow', + 'Terrain', 'Vegetation', 'Water', 'Banner', 'Bench', 'Bike Rack', + 'Billboard', 'Catch Basin', 'CCTV Camera', 'Fire Hydrant', + 'Junction Box', 'Mailbox', 'Manhole', 'Phone Booth', 'Pothole', + 'Street Light', 'Pole', 'Traffic Sign Frame', 'Utility Pole', + 'Traffic Light', 'Traffic Sign (Back)', 'Traffic Sign (Front)', + 'Trash Can', 'Bicycle', 'Boat', 'Bus', 'Car', 'Caravan', + 'Motorcycle', 'On Rails', 'Other Vehicle', 'Trailer', 'Truck', + 'Wheeled Slow', 'Car Mount', 'Ego Vehicle'), + palette=[[165, 42, 42], [0, 192, 0], [196, 196, 196], [190, 153, 153], + [180, 165, 180], [90, 120, 150], [102, 102, 156], [128, 64, 255], + [140, 140, 200], [170, 170, 170], [250, 170, 160], [96, 96, 96], + [230, 150, 140], [128, 64, 128], [110, 110, 110], [244, 35, 232], + [150, 100, 100], [70, 70, 70], [150, 120, 90], [220, 20, 60], + [255, 0, 0], [255, 0, 100], [255, 0, 200], [200, 128, 128], + [255, 255, 255], [64, 170, 64], [230, 160, 50], [70, 130, 180], + [190, 255, 255], [152, 251, 152], [107, 142, 35], [0, 170, 30], + [255, 255, 128], [250, 0, 30], [100, 140, 180], [220, 220, 220], + [220, 128, 128], [222, 40, 40], [100, 170, 30], [40, 40, 40], + [33, 33, 33], [100, 128, 160], [142, 0, 0], [70, 100, 150], + [210, 170, 100], [153, 153, 153], [128, 128, 128], [0, 0, 80], + [250, 170, 30], [192, 192, 192], [220, 220, 0], [140, 140, 20], + [119, 11, 32], [150, 0, 255], [0, 60, 100], [0, 0, 142], + [0, 0, 90], [0, 0, 230], [0, 80, 100], [128, 64, 64], [0, 0, 110], + [0, 0, 70], [0, 0, 192], [32, 32, 32], [120, 10, 10]]) + +train_dataloader = dict(dataset=dict(metainfo=metainfo)) +val_dataloader = dict(dataset=dict(metainfo=metainfo)) +test_dataloader = val_dataloader diff --git a/configs/_base_/datasets/mapillary_v2.py b/configs/_base_/datasets/mapillary_v2.py new file mode 100644 index 0000000000..7cb7a958e5 --- /dev/null +++ b/configs/_base_/datasets/mapillary_v2.py @@ 
-0,0 +1,68 @@ +# dataset settings +dataset_type = 'MapillaryDataset_v2' +data_root = 'data/mapillary/' +crop_size = (512, 1024) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict( + type='RandomResize', + scale=(2048, 1024), + ratio_range=(0.5, 2.0), + keep_ratio=True), + dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(2048, 1024), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] +tta_pipeline = [ + dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='Resize', scale_factor=r, keep_ratio=True) + for r in img_ratios + ], + [ + dict(type='RandomFlip', prob=0., direction='horizontal'), + dict(type='RandomFlip', prob=1., direction='horizontal') + ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] + ]) +] +train_dataloader = dict( + batch_size=2, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='training/images', seg_map_path='training/v2.0/labels'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='validation/images', + seg_map_path='validation/v2.0/labels'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) +test_evaluator = val_evaluator diff --git a/configs/_base_/datasets/nyu.py b/configs/_base_/datasets/nyu.py new file mode 100644 index 0000000000..74d57c5fc5 --- /dev/null +++ b/configs/_base_/datasets/nyu.py @@ -0,0 +1,67 @@ +# dataset settings +dataset_type = 'NYUDataset' +data_root = 'data/nyu' + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadDepthAnnotation', depth_rescale_factor=1e-3), + dict(type='RandomDepthMix', prob=0.25), + dict(type='RandomFlip', prob=0.5), + dict(type='RandomCrop', crop_size=(480, 480)), + dict( + type='Albu', + transforms=[ + dict(type='RandomBrightnessContrast'), + dict(type='RandomGamma'), + dict(type='HueSaturationValue'), + ]), + dict( + type='PackSegInputs', + meta_keys=('img_path', 'depth_map_path', 'ori_shape', 'img_shape', + 'pad_shape', 'scale_factor', 'flip', 'flip_direction', + 'category_id')), +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(2000, 480), keep_ratio=True), + dict(type='LoadDepthAnnotation', depth_rescale_factor=1e-3), + dict( + type='PackSegInputs', + meta_keys=('img_path', 'depth_map_path', 'ori_shape', 'img_shape', + 'pad_shape', 'scale_factor', 'flip', 'flip_direction', + 'category_id')) +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='images/train', depth_map_path='annotations/train'), + pipeline=train_pipeline)) + +val_dataloader = dict( + 
diff --git a/configs/_base_/datasets/nyu.py b/configs/_base_/datasets/nyu.py new file mode 100644 index 0000000000..74d57c5fc5 --- /dev/null +++ b/configs/_base_/datasets/nyu.py @@ -0,0 +1,67 @@ +# dataset settings +dataset_type = 'NYUDataset' +data_root = 'data/nyu' + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadDepthAnnotation', depth_rescale_factor=1e-3), + dict(type='RandomDepthMix', prob=0.25), + dict(type='RandomFlip', prob=0.5), + dict(type='RandomCrop', crop_size=(480, 480)), + dict( + type='Albu', + transforms=[ + dict(type='RandomBrightnessContrast'), + dict(type='RandomGamma'), + dict(type='HueSaturationValue'), + ]), + dict( + type='PackSegInputs', + meta_keys=('img_path', 'depth_map_path', 'ori_shape', 'img_shape', + 'pad_shape', 'scale_factor', 'flip', 'flip_direction', + 'category_id')), +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(2000, 480), keep_ratio=True), + dict(type='LoadDepthAnnotation', depth_rescale_factor=1e-3), + dict( + type='PackSegInputs', + meta_keys=('img_path', 'depth_map_path', 'ori_shape', 'img_shape', + 'pad_shape', 'scale_factor', 'flip', 'flip_direction', + 'category_id')) +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='images/train', depth_map_path='annotations/train'), + pipeline=train_pipeline)) + +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + test_mode=True, + data_prefix=dict( + img_path='images/test', depth_map_path='annotations/test'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='DepthMetric', + min_depth_eval=0.001, + max_depth_eval=10.0, + crop_type='nyu_crop') +test_evaluator = val_evaluator diff --git a/configs/_base_/datasets/nyu_512x512.py b/configs/_base_/datasets/nyu_512x512.py new file mode 100644 index 0000000000..88e3878d33 --- /dev/null +++ b/configs/_base_/datasets/nyu_512x512.py @@ -0,0 +1,72 @@ +# dataset settings +dataset_type = 'NYUDataset' +data_root = 'data/nyu' + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadDepthAnnotation', depth_rescale_factor=1e-3), + dict(type='RandomDepthMix', prob=0.25), + dict(type='RandomFlip', prob=0.5), + dict( + type='RandomResize', + scale=(768, 512), + ratio_range=(0.8, 1.5), + keep_ratio=True), + dict(type='RandomCrop', crop_size=(512, 512)), + dict( + type='Albu', + transforms=[ + dict(type='RandomBrightnessContrast'), + dict(type='RandomGamma'), + dict(type='HueSaturationValue'), + ]), + dict( + type='PackSegInputs', + meta_keys=('img_path', 'depth_map_path', 'ori_shape', 'img_shape', + 'pad_shape', 'scale_factor', 'flip', 'flip_direction', + 'category_id')), +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(2048, 512), keep_ratio=True), + dict(type='LoadDepthAnnotation', depth_rescale_factor=1e-3), + dict( + type='PackSegInputs', + meta_keys=('img_path', 'depth_map_path', 'ori_shape', 'img_shape', + 'pad_shape', 'scale_factor', 'flip', 'flip_direction', + 'category_id')) +] + +train_dataloader = dict( + batch_size=8, + num_workers=8, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='images/train', depth_map_path='annotations/train'), + pipeline=train_pipeline)) + +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + test_mode=True, + data_prefix=dict( + img_path='images/test', depth_map_path='annotations/test'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='DepthMetric', + min_depth_eval=0.001, + max_depth_eval=10.0, + crop_type='nyu_crop') +test_evaluator = val_evaluator
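The `depth_rescale_factor=1e-3` in `LoadDepthAnnotation` above is what converts the stored integer depth maps into meters, which is also the unit `DepthMetric` evaluates in (`min_depth_eval=0.001`, `max_depth_eval=10.0`). A hedged sketch of that conversion — the 16-bit-PNG-in-millimeters storage format and the file path are assumptions for illustration, not guaranteed by this diff:

```python
# Hedged sketch: the load-and-rescale step LoadDepthAnnotation is configured
# to perform above. Assumes depth ground truth is a 16-bit PNG storing
# millimeters; the path is illustrative only.
import numpy as np
from PIL import Image

depth_mm = np.asarray(Image.open('data/nyu/annotations/test/0001.png'))
depth_m = depth_mm.astype(np.float32) * 1e-3  # depth_rescale_factor=1e-3

# DepthMetric only scores pixels inside the configured evaluation window.
valid = (depth_m > 0.001) & (depth_m < 10.0)
print('mean GT depth (m):', depth_m[valid].mean())
```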
diff --git a/configs/_base_/datasets/pascal_context_59.py b/configs/_base_/datasets/pascal_context_59.py index 9103fe7e3f..7f31043ed0 100644 --- a/configs/_base_/datasets/pascal_context_59.py +++ b/configs/_base_/datasets/pascal_context_59.py @@ -26,6 +26,22 @@ dict(type='LoadAnnotations', reduce_zero_label=True), dict(type='PackSegInputs') ] +img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=None), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='Resize', scale_factor=r, keep_ratio=True) + for r in img_ratios + ], + [ + dict(type='RandomFlip', prob=0., direction='horizontal'), + dict(type='RandomFlip', prob=1., direction='horizontal') + ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] + ]) +] train_dataloader = dict( batch_size=4, num_workers=4, diff --git a/configs/_base_/datasets/pascal_voc12.py b/configs/_base_/datasets/pascal_voc12.py index aeb38d0613..5235ca9cfe 100644 --- a/configs/_base_/datasets/pascal_voc12.py +++ b/configs/_base_/datasets/pascal_voc12.py @@ -23,6 +23,22 @@ dict(type='LoadAnnotations'), dict(type='PackSegInputs') ] +img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=None), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='Resize', scale_factor=r, keep_ratio=True) + for r in img_ratios + ], + [ + dict(type='RandomFlip', prob=0., direction='horizontal'), + dict(type='RandomFlip', prob=1., direction='horizontal') + ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] + ]) +] train_dataloader = dict( batch_size=4, num_workers=4, diff --git a/configs/_base_/datasets/pascal_voc12_aug.py b/configs/_base_/datasets/pascal_voc12_aug.py index cd0d3e8682..69c3654880 100644 --- a/configs/_base_/datasets/pascal_voc12_aug.py +++ b/configs/_base_/datasets/pascal_voc12_aug.py @@ -25,7 +25,22 @@ dict(type='LoadAnnotations'), dict(type='PackSegInputs') ] - +img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=None), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='Resize', scale_factor=r, keep_ratio=True) + for r in img_ratios + ], + [ + dict(type='RandomFlip', prob=0., direction='horizontal'), + dict(type='RandomFlip', prob=1., direction='horizontal') + ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] + ]) +] dataset_train = dict( type=dataset_type, data_root=data_root, diff --git a/configs/_base_/datasets/potsdam.py b/configs/_base_/datasets/potsdam.py index ef9761c76e..95f6039351 100644 --- a/configs/_base_/datasets/potsdam.py +++ b/configs/_base_/datasets/potsdam.py @@ -23,6 +23,22 @@ dict(type='LoadAnnotations', reduce_zero_label=True), dict(type='PackSegInputs') ] +img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=None), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='Resize', scale_factor=r, keep_ratio=True) + for r in img_ratios + ], + [ + dict(type='RandomFlip', prob=0., direction='horizontal'), + dict(type='RandomFlip', prob=1., direction='horizontal') + ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] + ]) +] train_dataloader = dict( batch_size=4, num_workers=4, diff --git a/configs/_base_/datasets/refuge.py b/configs/_base_/datasets/refuge.py new file mode 100644 index 0000000000..79bb4d4e94 --- /dev/null +++ b/configs/_base_/datasets/refuge.py @@ -0,0 +1,90 @@ +# dataset settings +dataset_type = 'REFUGEDataset' +data_root = 'data/REFUGE' +train_img_scale = (2056, 2124) +val_img_scale = (1634, 1634) +test_img_scale = (1634, 1634) +crop_size = (512, 512) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', reduce_zero_label=False), + dict( + type='RandomResize', + scale=train_img_scale, + ratio_range=(0.5, 2.0), + keep_ratio=True), + dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +val_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=val_img_scale, keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type='LoadAnnotations', reduce_zero_label=False), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), +
dict(type='Resize', scale=test_img_scale, keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type='LoadAnnotations', reduce_zero_label=False), + dict(type='PackSegInputs') +] +img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=dict(backend='local')), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='Resize', scale_factor=r, keep_ratio=True) + for r in img_ratios + ], + [ + dict(type='RandomFlip', prob=0., direction='horizontal'), + dict(type='RandomFlip', prob=1., direction='horizontal') + ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] + ]) +] +train_dataloader = dict( + batch_size=4, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='images/training', seg_map_path='annotations/training'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='images/validation', + seg_map_path='annotations/validation'), + pipeline=val_pipeline)) +test_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='images/test', seg_map_path='annotations/test'), + pipeline=val_pipeline)) + +val_evaluator = dict(type='IoUMetric', iou_metrics=['mDice']) +test_evaluator = val_evaluator diff --git a/configs/_base_/datasets/stare.py b/configs/_base_/datasets/stare.py index a904fa8fd4..b7545dc623 100644 --- a/configs/_base_/datasets/stare.py +++ b/configs/_base_/datasets/stare.py @@ -24,6 +24,22 @@ dict(type='LoadAnnotations'), dict(type='PackSegInputs') ] +img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=None), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='Resize', scale_factor=r, keep_ratio=True) + for r in img_ratios + ], + [ + dict(type='RandomFlip', prob=0., direction='horizontal'), + dict(type='RandomFlip', prob=1., direction='horizontal') + ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] + ]) +] train_dataloader = dict( batch_size=4, num_workers=4, @@ -53,5 +69,5 @@ pipeline=test_pipeline)) test_dataloader = val_dataloader -val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) +val_evaluator = dict(type='IoUMetric', iou_metrics=['mDice']) test_evaluator = val_evaluator diff --git a/configs/_base_/datasets/synapse.py b/configs/_base_/datasets/synapse.py new file mode 100644 index 0000000000..86852918cd --- /dev/null +++ b/configs/_base_/datasets/synapse.py @@ -0,0 +1,41 @@ +dataset_type = 'SynapseDataset' +data_root = 'data/synapse/' +img_scale = (224, 224) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='Resize', scale=img_scale, keep_ratio=True), + dict(type='RandomRotFlip', rotate_prob=0.5, flip_prob=0.5, degree=20), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=img_scale, keep_ratio=True), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +train_dataloader = dict( + batch_size=6, + num_workers=2, + 
persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='img_dir/train', seg_map_path='ann_dir/train'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict(img_path='img_dir/val', seg_map_path='ann_dir/val'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict(type='IoUMetric', iou_metrics=['mDice']) +test_evaluator = val_evaluator diff --git a/configs/_base_/datasets/vaihingen.py b/configs/_base_/datasets/vaihingen.py index 2b52135567..6c78994fe7 100644 --- a/configs/_base_/datasets/vaihingen.py +++ b/configs/_base_/datasets/vaihingen.py @@ -23,6 +23,22 @@ dict(type='LoadAnnotations', reduce_zero_label=True), dict(type='PackSegInputs') ] +img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=None), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='Resize', scale_factor=r, keep_ratio=True) + for r in img_ratios + ], + [ + dict(type='RandomFlip', prob=0., direction='horizontal'), + dict(type='RandomFlip', prob=1., direction='horizontal') + ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] + ]) +] train_dataloader = dict( batch_size=4, num_workers=4, diff --git a/configs/_base_/default_runtime.py b/configs/_base_/default_runtime.py index 5925c69267..272b4d2467 100644 --- a/configs/_base_/default_runtime.py +++ b/configs/_base_/default_runtime.py @@ -4,6 +4,12 @@ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), dist_cfg=dict(backend='nccl'), ) +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='SegLocalVisualizer', vis_backends=vis_backends, name='visualizer') +log_processor = dict(by_epoch=False) log_level = 'INFO' load_from = None resume = False + +tta_model = dict(type='SegTTAModel') diff --git a/configs/_base_/models/fpn_poolformer_s12.py b/configs/_base_/models/fpn_poolformer_s12.py new file mode 100644 index 0000000000..086c804837 --- /dev/null +++ b/configs/_base_/models/fpn_poolformer_s12.py @@ -0,0 +1,54 @@ +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-s12_3rdparty_32xb128_in1k_20220414-f8d83051.pth' # noqa +# TODO: delete custom_imports after mmpretrain supports auto import +# please install mmpretrain >= 1.0.0rc7 +# import mmpretrain.models to trigger register_module in mmpretrain +custom_imports = dict( + imports=['mmpretrain.models'], allow_failed_imports=False) +data_preprocessor = dict( + type='SegDataPreProcessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255) +model = dict( + type='EncoderDecoder', + data_preprocessor=data_preprocessor, + backbone=dict( + type='mmpretrain.PoolFormer', + arch='s12', + init_cfg=dict( + type='Pretrained', checkpoint=checkpoint_file, prefix='backbone.'), + in_patch_size=7, + in_stride=4, + in_pad=2, + down_patch_size=3, + down_stride=2, + down_pad=1, + drop_rate=0., + drop_path_rate=0., + out_indices=(0, 2, 4, 6), + frozen_stages=0, + ), + neck=dict( + type='FPN', + in_channels=[256, 512, 1024, 2048], + out_channels=256, + num_outs=4), + decode_head=dict( + type='FPNHead', + in_channels=[256, 256, 256, 256], + 
in_index=[0, 1, 2, 3], + feature_strides=[4, 8, 16, 32], + channels=128, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/configs/_base_/models/san_vit-b16.py b/configs/_base_/models/san_vit-b16.py new file mode 100644 index 0000000000..96ac41b8da --- /dev/null +++ b/configs/_base_/models/san_vit-b16.py @@ -0,0 +1,137 @@ +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) + +data_preprocessor = dict( + type='SegDataPreProcessor', + mean=[122.7709, 116.7460, 104.0937], + std=[68.5005, 66.6322, 70.3232], + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + size_divisor=640, + test_cfg=dict(size_divisor=32)) + +num_classes = 171 +model = dict( + type='MultimodalEncoderDecoder', + data_preprocessor=data_preprocessor, + pretrained='pretrain/clip_vit_base_patch16_224.pth', + asymetric_input=True, + encoder_resolution=0.5, + image_encoder=dict( + type='VisionTransformer', + img_size=(224, 224), + patch_size=16, + patch_pad=0, + in_channels=3, + embed_dims=768, + num_layers=9, + num_heads=12, + mlp_ratio=4, + out_origin=True, + out_indices=(2, 5, 8), + qkv_bias=True, + drop_rate=0.0, + attn_drop_rate=0.0, + drop_path_rate=0.0, + with_cls_token=True, + output_cls_token=True, + patch_bias=False, + pre_norm=True, + norm_cfg=dict(type='LN', eps=1e-5), + act_cfg=dict(type='QuickGELU'), + norm_eval=False, + interpolate_mode='bicubic', + frozen_exclude=['pos_embed']), + text_encoder=dict( + type='CLIPTextEncoder', + dataset_name=None, + templates='vild', + embed_dims=512, + num_layers=12, + num_heads=8, + mlp_ratio=4, + output_dims=512, + cache_feature=True, + cat_bg=True, + norm_cfg=dict(type='LN', eps=1e-5) + ), + decode_head=dict( + type='SideAdapterCLIPHead', + num_classes=num_classes, + deep_supervision_idxs=[7], + san_cfg=dict( + in_channels=3, + clip_channels=768, + embed_dims=240, + patch_size=16, + patch_bias=True, + num_queries=100, + cfg_encoder=dict( + num_encode_layer=8, + num_heads=6, + mlp_ratio=4 + ), + fusion_index=[0, 1, 2, 3], + cfg_decoder=dict( + num_heads=12, + num_layers=1, + embed_channels=256, + mlp_channels=256, + num_mlp=3, + rescale=True), + norm_cfg=dict(type='LN', eps=1e-6), + ), + maskgen_cfg=dict( + sos_token_format='cls_token', + sos_token_num=100, + cross_attn=False, + num_layers=3, + embed_dims=768, + num_heads=12, + mlp_ratio=4, + qkv_bias=True, + out_dims=512, + final_norm=True, + act_cfg=dict(type='QuickGELU'), + norm_cfg=dict(type='LN', eps=1e-5), + frozen_exclude=[] + ), + align_corners=False, + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type='HungarianAssigner', + match_costs=[ + dict(type='ClassificationCost', weight=2.0), + dict( + type='CrossEntropyLossCost', + weight=5.0, + use_sigmoid=True), + dict( + type='DiceCost', + weight=5.0, + pred_act=True, + eps=1.0) + ])), + loss_decode=[dict(type='CrossEntropyLoss', + loss_name='loss_cls_ce', + loss_weight=2.0, + class_weight=[1.0] * num_classes + [0.1]), + dict(type='CrossEntropyLoss', + use_sigmoid=True, + loss_name='loss_mask_ce', + loss_weight=5.0), + dict(type='DiceLoss', + ignore_index=None, + naive_dice=True, + eps=1, + loss_name='loss_mask_dice', + loss_weight=5.0) + ]), + + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) # yapf: disable diff --git 
a/configs/_base_/models/upernet_convnext.py b/configs/_base_/models/upernet_convnext.py index 7595295871..958994c91e 100644 --- a/configs/_base_/models/upernet_convnext.py +++ b/configs/_base_/models/upernet_convnext.py @@ -1,5 +1,5 @@ norm_cfg = dict(type='SyncBN', requires_grad=True) -custom_imports = dict(imports='mmcls.models', allow_failed_imports=False) +custom_imports = dict(imports='mmpretrain.models', allow_failed_imports=False) checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/convnext/downstream/convnext-base_3rdparty_32xb128-noema_in1k_20220301-2a0ee547.pth' # noqa data_preprocessor = dict( type='SegDataPreProcessor', @@ -13,7 +13,7 @@ data_preprocessor=data_preprocessor, pretrained=None, backbone=dict( - type='mmcls.ConvNeXt', + type='mmpretrain.ConvNeXt', arch='base', out_indices=[0, 1, 2, 3], drop_path_rate=0.4, diff --git a/configs/_base_/models/vpd_sd.py b/configs/_base_/models/vpd_sd.py new file mode 100644 index 0000000000..87321e74f0 --- /dev/null +++ b/configs/_base_/models/vpd_sd.py @@ -0,0 +1,86 @@ +# model settings +data_preprocessor = dict( + type='SegDataPreProcessor', + mean=[127.5, 127.5, 127.5], + std=[127.5, 127.5, 127.5], + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=0) + +# adapted from stable-diffusion/configs/stable-diffusion/v1-inference.yaml +stable_diffusion_cfg = dict( + base_learning_rate=0.0001, + target='ldm.models.diffusion.ddpm.LatentDiffusion', + checkpoint='https://download.openmmlab.com/mmsegmentation/v0.5/' + 'vpd/stable_diffusion_v1-5_pretrain_third_party.pth', + params=dict( + linear_start=0.00085, + linear_end=0.012, + num_timesteps_cond=1, + log_every_t=200, + timesteps=1000, + first_stage_key='jpg', + cond_stage_key='txt', + image_size=64, + channels=4, + cond_stage_trainable=False, + conditioning_key='crossattn', + monitor='val/loss_simple_ema', + scale_factor=0.18215, + use_ema=False, + scheduler_config=dict( + target='ldm.lr_scheduler.LambdaLinearScheduler', + params=dict( + warm_up_steps=[10000], + cycle_lengths=[10000000000000], + f_start=[1e-06], + f_max=[1.0], + f_min=[1.0])), + unet_config=dict( + target='ldm.modules.diffusionmodules.openaimodel.UNetModel', + params=dict( + image_size=32, + in_channels=4, + out_channels=4, + model_channels=320, + attention_resolutions=[4, 2, 1], + num_res_blocks=2, + channel_mult=[1, 2, 4, 4], + num_heads=8, + use_spatial_transformer=True, + transformer_depth=1, + context_dim=768, + use_checkpoint=True, + legacy=False)), + first_stage_config=dict( + target='ldm.models.autoencoder.AutoencoderKL', + params=dict( + embed_dim=4, + monitor='val/rec_loss', + ddconfig=dict( + double_z=True, + z_channels=4, + resolution=256, + in_channels=3, + out_ch=3, + ch=128, + ch_mult=[1, 2, 4, 4], + num_res_blocks=2, + attn_resolutions=[], + dropout=0.0), + lossconfig=dict(target='torch.nn.Identity'))), + cond_stage_config=dict( + target='ldm.modules.encoders.modules.AbstractEncoder'))) + +model = dict( + type='DepthEstimator', + data_preprocessor=data_preprocessor, + backbone=dict( + type='VPD', + diffusion_cfg=stable_diffusion_cfg, + ), +) + +# some of the parameters in stable-diffusion model will not be updated +# during training +find_unused_parameters = True diff --git a/configs/_base_/schedules/schedule_160k.py b/configs/_base_/schedules/schedule_160k.py index 1055958a31..60d7bec762 100644 --- a/configs/_base_/schedules/schedule_160k.py +++ b/configs/_base_/schedules/schedule_160k.py @@ -18,8 +18,8 @@ test_cfg = dict(type='TestLoop') default_hooks = dict( timer=dict(type='IterTimerHook'), - 
logger=dict(type='LoggerHook', interval=50), + logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False), param_scheduler=dict(type='ParamSchedulerHook'), checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=16000), sampler_seed=dict(type='DistSamplerSeedHook'), -) + visualization=dict(type='SegVisualizationHook')) diff --git a/configs/_base_/schedules/schedule_20k.py b/configs/_base_/schedules/schedule_20k.py index b4bc083def..e809e3e880 100644 --- a/configs/_base_/schedules/schedule_20k.py +++ b/configs/_base_/schedules/schedule_20k.py @@ -17,8 +17,8 @@ test_cfg = dict(type='TestLoop') default_hooks = dict( timer=dict(type='IterTimerHook'), - logger=dict(type='LoggerHook', interval=50), + logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False), param_scheduler=dict(type='ParamSchedulerHook'), checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=2000), sampler_seed=dict(type='DistSamplerSeedHook'), -) + visualization=dict(type='SegVisualizationHook')) diff --git a/configs/_base_/schedules/schedule_240k.py b/configs/_base_/schedules/schedule_240k.py new file mode 100644 index 0000000000..feb2ce9637 --- /dev/null +++ b/configs/_base_/schedules/schedule_240k.py @@ -0,0 +1,25 @@ +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) +optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer, clip_grad=None) +# learning policy +param_scheduler = [ + dict( + type='PolyLR', + eta_min=1e-4, + power=0.9, + begin=0, + end=240000, + by_epoch=False) +] +# training schedule for 240k +train_cfg = dict( + type='IterBasedTrainLoop', max_iters=240000, val_interval=24000) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=24000), + sampler_seed=dict(type='DistSamplerSeedHook'), + visualization=dict(type='SegVisualizationHook')) diff --git a/configs/_base_/schedules/schedule_25k.py b/configs/_base_/schedules/schedule_25k.py new file mode 100644 index 0000000000..825e141ed1 --- /dev/null +++ b/configs/_base_/schedules/schedule_25k.py @@ -0,0 +1,28 @@ +# optimizer +optimizer = dict(type='AdamW', lr=0.001, weight_decay=0.1) +optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer, clip_grad=None) +# learning policy +param_scheduler = [ + dict( + type='LinearLR', start_factor=3e-2, begin=0, end=12000, + by_epoch=False), + dict( + type='PolyLRRatio', + eta_min_ratio=3e-2, + power=0.9, + begin=12000, + end=24000, + by_epoch=False), + dict(type='ConstantLR', by_epoch=False, factor=1, begin=24000, end=25000) +] +# training schedule for 25k +train_cfg = dict(type='IterBasedTrainLoop', max_iters=25000, val_interval=1000) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=2000), + sampler_seed=dict(type='DistSamplerSeedHook'), + visualization=dict(type='SegVisualizationHook'))
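`schedule_25k.py` above chains three schedulers: a linear warmup to the base AdamW lr over the first 12k iterations, a poly decay back down over the next 12k, and a constant tail for the final 1k. A back-of-the-envelope sketch of the resulting lr curve — this approximates mmengine's step-wise computation, and the `PolyLRRatio` behavior (decay toward `eta_min_ratio * base_lr`, by analogy with `PolyLR`) is an assumption:

```python
# Back-of-the-envelope lr curve for schedule_25k.py (base AdamW lr = 1e-3).
# Assumption: PolyLRRatio decays toward eta_min_ratio * base_lr with the
# given power, by analogy with PolyLR's eta_min.
BASE_LR = 1e-3
RATIO = 3e-2  # start_factor and eta_min_ratio in the config

def lr_at(it: int) -> float:
    if it < 12000:  # LinearLR: ramp the factor from 3e-2 up to 1.0
        return BASE_LR * (RATIO + (1 - RATIO) * it / 12000)
    if it < 24000:  # PolyLRRatio: poly decay toward 3e-2 * base lr
        eta_min = RATIO * BASE_LR
        coeff = (1 - (it - 12000) / 12000) ** 0.9
        return (BASE_LR - eta_min) * coeff + eta_min
    return RATIO * BASE_LR  # ConstantLR(factor=1): hold the final value

for it in (0, 6000, 12000, 18000, 24000, 25000):
    print(f'iter {it:>5}: lr = {lr_at(it):.6f}')
```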
diff --git a/configs/_base_/schedules/schedule_320k.py b/configs/_base_/schedules/schedule_320k.py index 1c6f3500e6..70b063afc9 100644 --- a/configs/_base_/schedules/schedule_320k.py +++ b/configs/_base_/schedules/schedule_320k.py @@ -18,8 +18,8 @@ test_cfg = dict(type='TestLoop') default_hooks = dict( timer=dict(type='IterTimerHook'), - logger=dict(type='LoggerHook', interval=50), + logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False), param_scheduler=dict(type='ParamSchedulerHook'), checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=32000), sampler_seed=dict(type='DistSamplerSeedHook'), -) + visualization=dict(type='SegVisualizationHook')) diff --git a/configs/_base_/schedules/schedule_40k.py b/configs/_base_/schedules/schedule_40k.py index d768012e82..4b823339a2 100644 --- a/configs/_base_/schedules/schedule_40k.py +++ b/configs/_base_/schedules/schedule_40k.py @@ -17,8 +17,8 @@ test_cfg = dict(type='TestLoop') default_hooks = dict( timer=dict(type='IterTimerHook'), - logger=dict(type='LoggerHook', interval=50), + logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False), param_scheduler=dict(type='ParamSchedulerHook'), checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=4000), sampler_seed=dict(type='DistSamplerSeedHook'), -) + visualization=dict(type='SegVisualizationHook')) diff --git a/configs/_base_/schedules/schedule_80k.py b/configs/_base_/schedules/schedule_80k.py index 170a2ecb81..0dcd6c4d1b 100644 --- a/configs/_base_/schedules/schedule_80k.py +++ b/configs/_base_/schedules/schedule_80k.py @@ -17,8 +17,8 @@ test_cfg = dict(type='TestLoop') default_hooks = dict( timer=dict(type='IterTimerHook'), - logger=dict(type='LoggerHook', interval=50), + logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False), param_scheduler=dict(type='ParamSchedulerHook'), checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=8000), sampler_seed=dict(type='DistSamplerSeedHook'), -) + visualization=dict(type='SegVisualizationHook')) diff --git a/configs/ann/README.md b/configs/ann/README.md index ba4cfe2595..1281a9ee14 100644 --- a/configs/ann/README.md +++ b/configs/ann/README.md @@ -1,6 +1,6 @@ # ANN -[Asymmetric Non-local Neural Networks for Semantic Segmentation](https://arxiv.org/abs/1908.07678) +> [Asymmetric Non-local Neural Networks for Semantic Segmentation](https://arxiv.org/abs/1908.07678) ## Introduction @@ -22,6 +22,39 @@ The non-local module works as a particularly useful technique for semantic segme +## Results and models + +### Cityscapes + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| ANN | R-50-D8 | 512x1024 | 40000 | 6 | 3.71 | V100 | 77.40 | 78.57 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ann/ann_r50-d8_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x1024_40k_cityscapes/ann_r50-d8_512x1024_40k_cityscapes_20200605_095211-049fc292.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x1024_40k_cityscapes/ann_r50-d8_512x1024_40k_cityscapes_20200605_095211.log.json) | +| ANN | R-101-D8 |
512x1024 | 40000 | 9.5 | 2.55 | V100 | 76.55 | 78.85 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ann/ann_r101-d8_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x1024_40k_cityscapes/ann_r101-d8_512x1024_40k_cityscapes_20200605_095243-adf6eece.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x1024_40k_cityscapes/ann_r101-d8_512x1024_40k_cityscapes_20200605_095243.log.json) | +| ANN | R-50-D8 | 769x769 | 40000 | 6.8 | 1.70 | V100 | 78.89 | 80.46 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ann/ann_r50-d8_4xb2-40k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_769x769_40k_cityscapes/ann_r50-d8_769x769_40k_cityscapes_20200530_025712-2b46b04d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_769x769_40k_cityscapes/ann_r50-d8_769x769_40k_cityscapes_20200530_025712.log.json) | +| ANN | R-101-D8 | 769x769 | 40000 | 10.7 | 1.15 | V100 | 79.32 | 80.94 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ann/ann_r101-d8_4xb2-40k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_769x769_40k_cityscapes/ann_r101-d8_769x769_40k_cityscapes_20200530_025720-059bff28.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_769x769_40k_cityscapes/ann_r101-d8_769x769_40k_cityscapes_20200530_025720.log.json) | +| ANN | R-50-D8 | 512x1024 | 80000 | - | - | V100 | 77.34 | 78.65 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ann/ann_r50-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x1024_80k_cityscapes/ann_r50-d8_512x1024_80k_cityscapes_20200607_101911-5a9ad545.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x1024_80k_cityscapes/ann_r50-d8_512x1024_80k_cityscapes_20200607_101911.log.json) | +| ANN | R-101-D8 | 512x1024 | 80000 | - | - | V100 | 77.14 | 78.81 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ann/ann_r101-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x1024_80k_cityscapes/ann_r101-d8_512x1024_80k_cityscapes_20200607_013728-aceccc6e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x1024_80k_cityscapes/ann_r101-d8_512x1024_80k_cityscapes_20200607_013728.log.json) | +| ANN | R-50-D8 | 769x769 | 80000 | - | - | V100 | 78.88 | 80.57 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ann/ann_r50-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_769x769_80k_cityscapes/ann_r50-d8_769x769_80k_cityscapes_20200607_044426-cc7ff323.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_769x769_80k_cityscapes/ann_r50-d8_769x769_80k_cityscapes_20200607_044426.log.json) | +| ANN | R-101-D8 | 769x769 | 80000 | - | - | V100 | 78.80 | 80.34 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ann/ann_r101-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_769x769_80k_cityscapes/ann_r101-d8_769x769_80k_cityscapes_20200607_013713-a9d4be8d.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_769x769_80k_cityscapes/ann_r101-d8_769x769_80k_cityscapes_20200607_013713.log.json) | + +### ADE20K + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | -------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| ANN | R-50-D8 | 512x512 | 80000 | 9.1 | 21.01 | V100 | 41.01 | 42.30 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ann/ann_r50-d8_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x512_80k_ade20k/ann_r50-d8_512x512_80k_ade20k_20200615_014818-26f75e11.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x512_80k_ade20k/ann_r50-d8_512x512_80k_ade20k_20200615_014818.log.json) | +| ANN | R-101-D8 | 512x512 | 80000 | 12.5 | 14.12 | V100 | 42.94 | 44.18 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ann/ann_r101-d8_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x512_80k_ade20k/ann_r101-d8_512x512_80k_ade20k_20200615_014818-c0153543.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x512_80k_ade20k/ann_r101-d8_512x512_80k_ade20k_20200615_014818.log.json) | +| ANN | R-50-D8 | 512x512 | 160000 | - | - | V100 | 41.74 | 42.62 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ann/ann_r50-d8_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x512_160k_ade20k/ann_r50-d8_512x512_160k_ade20k_20200615_231733-892247bc.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x512_160k_ade20k/ann_r50-d8_512x512_160k_ade20k_20200615_231733.log.json) | +| ANN | R-101-D8 | 512x512 | 160000 | - | - | V100 | 42.94 | 44.06 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ann/ann_r101-d8_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x512_160k_ade20k/ann_r101-d8_512x512_160k_ade20k_20200615_231733-955eb1ec.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x512_160k_ade20k/ann_r101-d8_512x512_160k_ade20k_20200615_231733.log.json) | + +### Pascal VOC 2012 + Aug + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | --------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| ANN | R-50-D8 | 
512x512 | 20000 | 6 | 20.92 | V100 | 74.86 | 76.13 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ann/ann_r50-d8_4xb4-20k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x512_20k_voc12aug/ann_r50-d8_512x512_20k_voc12aug_20200617_222246-dfcb1c62.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x512_20k_voc12aug/ann_r50-d8_512x512_20k_voc12aug_20200617_222246.log.json) | +| ANN | R-101-D8 | 512x512 | 20000 | 9.5 | 13.94 | V100 | 77.47 | 78.70 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ann/ann_r101-d8_4xb4-20k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x512_20k_voc12aug/ann_r101-d8_512x512_20k_voc12aug_20200617_222246-2fad0042.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x512_20k_voc12aug/ann_r101-d8_512x512_20k_voc12aug_20200617_222246.log.json) | +| ANN | R-50-D8 | 512x512 | 40000 | - | - | V100 | 76.56 | 77.51 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ann/ann_r50-d8_4xb4-40k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x512_40k_voc12aug/ann_r50-d8_512x512_40k_voc12aug_20200613_231314-b5dac322.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x512_40k_voc12aug/ann_r50-d8_512x512_40k_voc12aug_20200613_231314.log.json) | +| ANN | R-101-D8 | 512x512 | 40000 | - | - | V100 | 76.70 | 78.06 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ann/ann_r101-d8_4xb4-40k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x512_40k_voc12aug/ann_r101-d8_512x512_40k_voc12aug_20200613_231314-bd205bbe.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x512_40k_voc12aug/ann_r101-d8_512x512_40k_voc12aug_20200613_231314.log.json) | + ## Citation ```bibtex @@ -33,36 +66,3 @@ The non-local module works as a particularly useful technique for semantic segme year={2019} } ``` - -## Results and models - -### Cityscapes - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | --------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| ANN | R-50-D8 | 512x1024 | 40000 | 6 | 3.71 | 77.40 | 78.57 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ann/ann_r50-d8_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x1024_40k_cityscapes/ann_r50-d8_512x1024_40k_cityscapes_20200605_095211-049fc292.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x1024_40k_cityscapes/ann_r50-d8_512x1024_40k_cityscapes_20200605_095211.log.json) | -| ANN | R-101-D8 | 512x1024 | 40000 | 9.5 | 2.55 | 76.55 | 78.85 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ann/ann_r101-d8_512x1024_40k_cityscapes.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x1024_40k_cityscapes/ann_r101-d8_512x1024_40k_cityscapes_20200605_095243-adf6eece.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x1024_40k_cityscapes/ann_r101-d8_512x1024_40k_cityscapes_20200605_095243.log.json) | -| ANN | R-50-D8 | 769x769 | 40000 | 6.8 | 1.70 | 78.89 | 80.46 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ann/ann_r50-d8_769x769_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_769x769_40k_cityscapes/ann_r50-d8_769x769_40k_cityscapes_20200530_025712-2b46b04d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_769x769_40k_cityscapes/ann_r50-d8_769x769_40k_cityscapes_20200530_025712.log.json) | -| ANN | R-101-D8 | 769x769 | 40000 | 10.7 | 1.15 | 79.32 | 80.94 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ann/ann_r101-d8_769x769_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_769x769_40k_cityscapes/ann_r101-d8_769x769_40k_cityscapes_20200530_025720-059bff28.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_769x769_40k_cityscapes/ann_r101-d8_769x769_40k_cityscapes_20200530_025720.log.json) | -| ANN | R-50-D8 | 512x1024 | 80000 | - | - | 77.34 | 78.65 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ann/ann_r50-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x1024_80k_cityscapes/ann_r50-d8_512x1024_80k_cityscapes_20200607_101911-5a9ad545.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x1024_80k_cityscapes/ann_r50-d8_512x1024_80k_cityscapes_20200607_101911.log.json) | -| ANN | R-101-D8 | 512x1024 | 80000 | - | - | 77.14 | 78.81 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ann/ann_r101-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x1024_80k_cityscapes/ann_r101-d8_512x1024_80k_cityscapes_20200607_013728-aceccc6e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x1024_80k_cityscapes/ann_r101-d8_512x1024_80k_cityscapes_20200607_013728.log.json) | -| ANN | R-50-D8 | 769x769 | 80000 | - | - | 78.88 | 80.57 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ann/ann_r50-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_769x769_80k_cityscapes/ann_r50-d8_769x769_80k_cityscapes_20200607_044426-cc7ff323.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_769x769_80k_cityscapes/ann_r50-d8_769x769_80k_cityscapes_20200607_044426.log.json) | -| ANN | R-101-D8 | 769x769 | 80000 | - | - | 78.80 | 80.34 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ann/ann_r101-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_769x769_80k_cityscapes/ann_r101-d8_769x769_80k_cityscapes_20200607_013713-a9d4be8d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_769x769_80k_cityscapes/ann_r101-d8_769x769_80k_cityscapes_20200607_013713.log.json) | - -### ADE20K - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | 
-------- | -------------- | ----: | ------------: | ----------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| ANN | R-50-D8 | 512x512 | 80000 | 9.1 | 21.01 | 41.01 | 42.30 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ann/ann_r50-d8_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x512_80k_ade20k/ann_r50-d8_512x512_80k_ade20k_20200615_014818-26f75e11.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x512_80k_ade20k/ann_r50-d8_512x512_80k_ade20k_20200615_014818.log.json) | -| ANN | R-101-D8 | 512x512 | 80000 | 12.5 | 14.12 | 42.94 | 44.18 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ann/ann_r101-d8_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x512_80k_ade20k/ann_r101-d8_512x512_80k_ade20k_20200615_014818-c0153543.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x512_80k_ade20k/ann_r101-d8_512x512_80k_ade20k_20200615_014818.log.json) | -| ANN | R-50-D8 | 512x512 | 160000 | - | - | 41.74 | 42.62 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ann/ann_r50-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x512_160k_ade20k/ann_r50-d8_512x512_160k_ade20k_20200615_231733-892247bc.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x512_160k_ade20k/ann_r50-d8_512x512_160k_ade20k_20200615_231733.log.json) | -| ANN | R-101-D8 | 512x512 | 160000 | - | - | 42.94 | 44.06 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ann/ann_r101-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x512_160k_ade20k/ann_r101-d8_512x512_160k_ade20k_20200615_231733-955eb1ec.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x512_160k_ade20k/ann_r101-d8_512x512_160k_ade20k_20200615_231733.log.json) | - -### Pascal VOC 2012 + Aug - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| ANN | R-50-D8 | 512x512 | 20000 | 6 | 20.92 | 74.86 | 76.13 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ann/ann_r50-d8_512x512_20k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x512_20k_voc12aug/ann_r50-d8_512x512_20k_voc12aug_20200617_222246-dfcb1c62.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x512_20k_voc12aug/ann_r50-d8_512x512_20k_voc12aug_20200617_222246.log.json) | -| ANN | R-101-D8 | 512x512 | 20000 | 9.5 | 13.94 | 77.47 | 78.70 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ann/ann_r101-d8_512x512_20k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x512_20k_voc12aug/ann_r101-d8_512x512_20k_voc12aug_20200617_222246-2fad0042.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x512_20k_voc12aug/ann_r101-d8_512x512_20k_voc12aug_20200617_222246.log.json) | -| ANN | R-50-D8 | 512x512 | 40000 | - | - | 76.56 | 77.51 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ann/ann_r50-d8_512x512_40k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x512_40k_voc12aug/ann_r50-d8_512x512_40k_voc12aug_20200613_231314-b5dac322.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x512_40k_voc12aug/ann_r50-d8_512x512_40k_voc12aug_20200613_231314.log.json) | -| ANN | R-101-D8 | 512x512 | 40000 | - | - | 76.70 | 78.06 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ann/ann_r101-d8_512x512_40k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x512_40k_voc12aug/ann_r101-d8_512x512_40k_voc12aug_20200613_231314-bd205bbe.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x512_40k_voc12aug/ann_r101-d8_512x512_40k_voc12aug_20200613_231314.log.json) | diff --git a/configs/ann/ann.yml b/configs/ann/ann.yml deleted file mode 100644 index ff6bea653b..0000000000 --- a/configs/ann/ann.yml +++ /dev/null @@ -1,305 +0,0 @@ -Collections: -- Name: ANN - Metadata: - Training Data: - - Cityscapes - - ADE20K - - Pascal VOC 2012 + Aug - Paper: - URL: https://arxiv.org/abs/1908.07678 - Title: Asymmetric Non-local Neural Networks for Semantic Segmentation - README: configs/ann/README.md - Code: - URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ann_head.py#L185 - Version: v0.17.0 - Converted From: - Code: https://github.com/MendelXu/ANN -Models: -- Name: ann_r50-d8_512x1024_40k_cityscapes - In Collection: ANN - Metadata: - backbone: R-50-D8 - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 269.54 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 6.0 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 77.4 - mIoU(ms+flip): 78.57 - Config: configs/ann/ann_r50-d8_512x1024_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x1024_40k_cityscapes/ann_r50-d8_512x1024_40k_cityscapes_20200605_095211-049fc292.pth -- Name: ann_r101-d8_512x1024_40k_cityscapes - In Collection: ANN - Metadata: - backbone: R-101-D8 - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 392.16 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 9.5 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 76.55 - mIoU(ms+flip): 78.85 - Config: configs/ann/ann_r101-d8_512x1024_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x1024_40k_cityscapes/ann_r101-d8_512x1024_40k_cityscapes_20200605_095243-adf6eece.pth -- 
Name: ann_r50-d8_769x769_40k_cityscapes - In Collection: ANN - Metadata: - backbone: R-50-D8 - crop size: (769,769) - lr schd: 40000 - inference time (ms/im): - - value: 588.24 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 6.8 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.89 - mIoU(ms+flip): 80.46 - Config: configs/ann/ann_r50-d8_769x769_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_769x769_40k_cityscapes/ann_r50-d8_769x769_40k_cityscapes_20200530_025712-2b46b04d.pth -- Name: ann_r101-d8_769x769_40k_cityscapes - In Collection: ANN - Metadata: - backbone: R-101-D8 - crop size: (769,769) - lr schd: 40000 - inference time (ms/im): - - value: 869.57 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 10.7 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.32 - mIoU(ms+flip): 80.94 - Config: configs/ann/ann_r101-d8_769x769_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_769x769_40k_cityscapes/ann_r101-d8_769x769_40k_cityscapes_20200530_025720-059bff28.pth -- Name: ann_r50-d8_512x1024_80k_cityscapes - In Collection: ANN - Metadata: - backbone: R-50-D8 - crop size: (512,1024) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 77.34 - mIoU(ms+flip): 78.65 - Config: configs/ann/ann_r50-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x1024_80k_cityscapes/ann_r50-d8_512x1024_80k_cityscapes_20200607_101911-5a9ad545.pth -- Name: ann_r101-d8_512x1024_80k_cityscapes - In Collection: ANN - Metadata: - backbone: R-101-D8 - crop size: (512,1024) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 77.14 - mIoU(ms+flip): 78.81 - Config: configs/ann/ann_r101-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x1024_80k_cityscapes/ann_r101-d8_512x1024_80k_cityscapes_20200607_013728-aceccc6e.pth -- Name: ann_r50-d8_769x769_80k_cityscapes - In Collection: ANN - Metadata: - backbone: R-50-D8 - crop size: (769,769) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.88 - mIoU(ms+flip): 80.57 - Config: configs/ann/ann_r50-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_769x769_80k_cityscapes/ann_r50-d8_769x769_80k_cityscapes_20200607_044426-cc7ff323.pth -- Name: ann_r101-d8_769x769_80k_cityscapes - In Collection: ANN - Metadata: - backbone: R-101-D8 - crop size: (769,769) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.8 - mIoU(ms+flip): 80.34 - Config: configs/ann/ann_r101-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_769x769_80k_cityscapes/ann_r101-d8_769x769_80k_cityscapes_20200607_013713-a9d4be8d.pth -- Name: ann_r50-d8_512x512_80k_ade20k - In Collection: ANN - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 47.6 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 9.1 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 41.01 - 
mIoU(ms+flip): 42.3 - Config: configs/ann/ann_r50-d8_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x512_80k_ade20k/ann_r50-d8_512x512_80k_ade20k_20200615_014818-26f75e11.pth -- Name: ann_r101-d8_512x512_80k_ade20k - In Collection: ANN - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 70.82 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 12.5 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 42.94 - mIoU(ms+flip): 44.18 - Config: configs/ann/ann_r101-d8_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x512_80k_ade20k/ann_r101-d8_512x512_80k_ade20k_20200615_014818-c0153543.pth -- Name: ann_r50-d8_512x512_160k_ade20k - In Collection: ANN - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 41.74 - mIoU(ms+flip): 42.62 - Config: configs/ann/ann_r50-d8_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x512_160k_ade20k/ann_r50-d8_512x512_160k_ade20k_20200615_231733-892247bc.pth -- Name: ann_r101-d8_512x512_160k_ade20k - In Collection: ANN - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 42.94 - mIoU(ms+flip): 44.06 - Config: configs/ann/ann_r101-d8_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x512_160k_ade20k/ann_r101-d8_512x512_160k_ade20k_20200615_231733-955eb1ec.pth -- Name: ann_r50-d8_512x512_20k_voc12aug - In Collection: ANN - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 20000 - inference time (ms/im): - - value: 47.8 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 6.0 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 74.86 - mIoU(ms+flip): 76.13 - Config: configs/ann/ann_r50-d8_512x512_20k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x512_20k_voc12aug/ann_r50-d8_512x512_20k_voc12aug_20200617_222246-dfcb1c62.pth -- Name: ann_r101-d8_512x512_20k_voc12aug - In Collection: ANN - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 20000 - inference time (ms/im): - - value: 71.74 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 9.5 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 77.47 - mIoU(ms+flip): 78.7 - Config: configs/ann/ann_r101-d8_512x512_20k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x512_20k_voc12aug/ann_r101-d8_512x512_20k_voc12aug_20200617_222246-2fad0042.pth -- Name: ann_r50-d8_512x512_40k_voc12aug - In Collection: ANN - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 40000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 76.56 - mIoU(ms+flip): 77.51 - Config: configs/ann/ann_r50-d8_512x512_40k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x512_40k_voc12aug/ann_r50-d8_512x512_40k_voc12aug_20200613_231314-b5dac322.pth -- Name: ann_r101-d8_512x512_40k_voc12aug - 
In Collection: ANN - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 40000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 76.7 - mIoU(ms+flip): 78.06 - Config: configs/ann/ann_r101-d8_512x512_40k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x512_40k_voc12aug/ann_r101-d8_512x512_40k_voc12aug_20200613_231314-bd205bbe.pth diff --git a/configs/ann/ann_r101-d8_4xb2-40k_cityscapes-512x1024.py b/configs/ann/ann_r101-d8_4xb2-40k_cityscapes-512x1024.py new file mode 100644 index 0000000000..0da7e0b702 --- /dev/null +++ b/configs/ann/ann_r101-d8_4xb2-40k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './ann_r50-d8_4xb2-40k_cityscapes-512x1024.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/ann/ann_r101-d8_4xb2-40k_cityscapes-769x769.py b/configs/ann/ann_r101-d8_4xb2-40k_cityscapes-769x769.py new file mode 100644 index 0000000000..08459c0a50 --- /dev/null +++ b/configs/ann/ann_r101-d8_4xb2-40k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './ann_r50-d8_4xb2-40k_cityscapes-769x769.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/ann/ann_r101-d8_4xb2-80k_cityscapes-512x1024.py b/configs/ann/ann_r101-d8_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..46781fa9f2 --- /dev/null +++ b/configs/ann/ann_r101-d8_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './ann_r50-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/ann/ann_r101-d8_4xb2-80k_cityscapes-769x769.py b/configs/ann/ann_r101-d8_4xb2-80k_cityscapes-769x769.py new file mode 100644 index 0000000000..c951d8704c --- /dev/null +++ b/configs/ann/ann_r101-d8_4xb2-80k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './ann_r50-d8_4xb2-80k_cityscapes-769x769.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/ann/ann_r101-d8_4xb4-160k_ade20k-512x512.py b/configs/ann/ann_r101-d8_4xb4-160k_ade20k-512x512.py new file mode 100644 index 0000000000..9f14327542 --- /dev/null +++ b/configs/ann/ann_r101-d8_4xb4-160k_ade20k-512x512.py @@ -0,0 +1,2 @@ +_base_ = './ann_r50-d8_4xb4-160k_ade20k-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/ann/ann_r101-d8_4xb4-20k_voc12aug-512x512.py b/configs/ann/ann_r101-d8_4xb4-20k_voc12aug-512x512.py new file mode 100644 index 0000000000..c3c1a3f706 --- /dev/null +++ b/configs/ann/ann_r101-d8_4xb4-20k_voc12aug-512x512.py @@ -0,0 +1,2 @@ +_base_ = './ann_r50-d8_4xb4-20k_voc12aug-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/ann/ann_r101-d8_4xb4-40k_voc12aug-512x512.py b/configs/ann/ann_r101-d8_4xb4-40k_voc12aug-512x512.py new file mode 100644 index 0000000000..c3c1a3f706 --- /dev/null +++ b/configs/ann/ann_r101-d8_4xb4-40k_voc12aug-512x512.py @@ -0,0 +1,2 @@ +_base_ = './ann_r50-d8_4xb4-40k_voc12aug-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/ann/ann_r101-d8_4xb4-80k_ade20k-512x512.py b/configs/ann/ann_r101-d8_4xb4-80k_ade20k-512x512.py new file mode 100644 index 0000000000..3cc5b8e300 --- /dev/null +++ b/configs/ann/ann_r101-d8_4xb4-80k_ade20k-512x512.py @@ -0,0 +1,2 @@ +_base_ = './ann_r50-d8_4xb4-80k_ade20k-512x512.py' +model =
dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/ann/ann_r101-d8_512x1024_40k_cityscapes.py b/configs/ann/ann_r101-d8_512x1024_40k_cityscapes.py deleted file mode 100644 index d494e07333..0000000000 --- a/configs/ann/ann_r101-d8_512x1024_40k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './ann_r50-d8_512x1024_40k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/ann/ann_r101-d8_512x1024_80k_cityscapes.py b/configs/ann/ann_r101-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index 1eeff0b030..0000000000 --- a/configs/ann/ann_r101-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './ann_r50-d8_512x1024_80k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/ann/ann_r101-d8_512x512_160k_ade20k.py b/configs/ann/ann_r101-d8_512x512_160k_ade20k.py deleted file mode 100644 index 9e43af541f..0000000000 --- a/configs/ann/ann_r101-d8_512x512_160k_ade20k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './ann_r50-d8_512x512_160k_ade20k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/ann/ann_r101-d8_512x512_20k_voc12aug.py b/configs/ann/ann_r101-d8_512x512_20k_voc12aug.py deleted file mode 100644 index d854f2e422..0000000000 --- a/configs/ann/ann_r101-d8_512x512_20k_voc12aug.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './ann_r50-d8_512x512_20k_voc12aug.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/ann/ann_r101-d8_512x512_40k_voc12aug.py b/configs/ann/ann_r101-d8_512x512_40k_voc12aug.py deleted file mode 100644 index 893c53b1ca..0000000000 --- a/configs/ann/ann_r101-d8_512x512_40k_voc12aug.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './ann_r50-d8_512x512_40k_voc12aug.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/ann/ann_r101-d8_512x512_80k_ade20k.py b/configs/ann/ann_r101-d8_512x512_80k_ade20k.py deleted file mode 100644 index a64dac670e..0000000000 --- a/configs/ann/ann_r101-d8_512x512_80k_ade20k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './ann_r50-d8_512x512_80k_ade20k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/ann/ann_r101-d8_769x769_40k_cityscapes.py b/configs/ann/ann_r101-d8_769x769_40k_cityscapes.py deleted file mode 100644 index 5950824849..0000000000 --- a/configs/ann/ann_r101-d8_769x769_40k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './ann_r50-d8_769x769_40k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/ann/ann_r101-d8_769x769_80k_cityscapes.py b/configs/ann/ann_r101-d8_769x769_80k_cityscapes.py deleted file mode 100644 index a9c712d1cc..0000000000 --- a/configs/ann/ann_r101-d8_769x769_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './ann_r50-d8_769x769_80k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/ann/ann_r50-d8_512x1024_40k_cityscapes.py b/configs/ann/ann_r50-d8_4xb2-40k_cityscapes-512x1024.py similarity index 100% rename from configs/ann/ann_r50-d8_512x1024_40k_cityscapes.py rename to configs/ann/ann_r50-d8_4xb2-40k_cityscapes-512x1024.py diff --git a/configs/ann/ann_r50-d8_769x769_40k_cityscapes.py b/configs/ann/ann_r50-d8_4xb2-40k_cityscapes-769x769.py similarity index 100% rename from 
configs/ann/ann_r50-d8_769x769_40k_cityscapes.py rename to configs/ann/ann_r50-d8_4xb2-40k_cityscapes-769x769.py diff --git a/configs/ann/ann_r50-d8_512x1024_80k_cityscapes.py b/configs/ann/ann_r50-d8_4xb2-80k_cityscapes-512x1024.py similarity index 100% rename from configs/ann/ann_r50-d8_512x1024_80k_cityscapes.py rename to configs/ann/ann_r50-d8_4xb2-80k_cityscapes-512x1024.py diff --git a/configs/ann/ann_r50-d8_769x769_80k_cityscapes.py b/configs/ann/ann_r50-d8_4xb2-80k_cityscapes-769x769.py similarity index 100% rename from configs/ann/ann_r50-d8_769x769_80k_cityscapes.py rename to configs/ann/ann_r50-d8_4xb2-80k_cityscapes-769x769.py diff --git a/configs/ann/ann_r50-d8_512x512_160k_ade20k.py b/configs/ann/ann_r50-d8_4xb4-160k_ade20k-512x512.py similarity index 100% rename from configs/ann/ann_r50-d8_512x512_160k_ade20k.py rename to configs/ann/ann_r50-d8_4xb4-160k_ade20k-512x512.py diff --git a/configs/ann/ann_r50-d8_512x512_20k_voc12aug.py b/configs/ann/ann_r50-d8_4xb4-20k_voc12aug-512x512.py similarity index 100% rename from configs/ann/ann_r50-d8_512x512_20k_voc12aug.py rename to configs/ann/ann_r50-d8_4xb4-20k_voc12aug-512x512.py diff --git a/configs/ann/ann_r50-d8_512x512_40k_voc12aug.py b/configs/ann/ann_r50-d8_4xb4-40k_voc12aug-512x512.py similarity index 100% rename from configs/ann/ann_r50-d8_512x512_40k_voc12aug.py rename to configs/ann/ann_r50-d8_4xb4-40k_voc12aug-512x512.py diff --git a/configs/ann/ann_r50-d8_512x512_80k_ade20k.py b/configs/ann/ann_r50-d8_4xb4-80k_ade20k-512x512.py similarity index 100% rename from configs/ann/ann_r50-d8_512x512_80k_ade20k.py rename to configs/ann/ann_r50-d8_4xb4-80k_ade20k-512x512.py diff --git a/configs/ann/metafile.yaml b/configs/ann/metafile.yaml new file mode 100644 index 0000000000..0d118681fd --- /dev/null +++ b/configs/ann/metafile.yaml @@ -0,0 +1,391 @@ +Collections: +- Name: ANN + License: Apache License 2.0 + Metadata: + Training Data: + - Cityscapes + - ADE20K + - Pascal VOC 2012 + Aug + Paper: + Title: Asymmetric Non-local Neural Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1908.07678 + README: configs/ann/README.md + Frameworks: + - PyTorch +Models: +- Name: ann_r50-d8_4xb2-40k_cityscapes-512x1024 + In Collection: ANN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 77.4 + mIoU(ms+flip): 78.57 + Config: configs/ann/ann_r50-d8_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - ANN + Training Resources: 4x V100 GPUS + Memory (GB): 6.0 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x1024_40k_cityscapes/ann_r50-d8_512x1024_40k_cityscapes_20200605_095211-049fc292.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x1024_40k_cityscapes/ann_r50-d8_512x1024_40k_cityscapes_20200605_095211.log.json + Paper: + Title: Asymmetric Non-local Neural Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1908.07678 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ann_head.py#L185 + Framework: PyTorch +- Name: ann_r101-d8_4xb2-40k_cityscapes-512x1024 + In Collection: ANN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 76.55 + mIoU(ms+flip): 78.85 + Config: configs/ann/ann_r101-d8_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - ANN + Training Resources: 4x V100 GPUS + Memory (GB): 9.5 + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x1024_40k_cityscapes/ann_r101-d8_512x1024_40k_cityscapes_20200605_095243-adf6eece.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x1024_40k_cityscapes/ann_r101-d8_512x1024_40k_cityscapes_20200605_095243.log.json + Paper: + Title: Asymmetric Non-local Neural Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1908.07678 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ann_head.py#L185 + Framework: PyTorch +- Name: ann_r50-d8_4xb2-40k_cityscapes-769x769 + In Collection: ANN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.89 + mIoU(ms+flip): 80.46 + Config: configs/ann/ann_r50-d8_4xb2-40k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - ANN + Training Resources: 4x V100 GPUS + Memory (GB): 6.8 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_769x769_40k_cityscapes/ann_r50-d8_769x769_40k_cityscapes_20200530_025712-2b46b04d.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_769x769_40k_cityscapes/ann_r50-d8_769x769_40k_cityscapes_20200530_025712.log.json + Paper: + Title: Asymmetric Non-local Neural Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1908.07678 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ann_head.py#L185 + Framework: PyTorch +- Name: ann_r101-d8_4xb2-40k_cityscapes-769x769 + In Collection: ANN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.32 + mIoU(ms+flip): 80.94 + Config: configs/ann/ann_r101-d8_4xb2-40k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - ANN + Training Resources: 4x V100 GPUS + Memory (GB): 10.7 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_769x769_40k_cityscapes/ann_r101-d8_769x769_40k_cityscapes_20200530_025720-059bff28.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_769x769_40k_cityscapes/ann_r101-d8_769x769_40k_cityscapes_20200530_025720.log.json + Paper: + Title: Asymmetric Non-local Neural Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1908.07678 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ann_head.py#L185 + Framework: PyTorch +- Name: ann_r50-d8_4xb2-80k_cityscapes-512x1024 + In Collection: ANN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 77.34 + mIoU(ms+flip): 78.65 + Config: configs/ann/ann_r50-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - ANN + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x1024_80k_cityscapes/ann_r50-d8_512x1024_80k_cityscapes_20200607_101911-5a9ad545.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x1024_80k_cityscapes/ann_r50-d8_512x1024_80k_cityscapes_20200607_101911.log.json + Paper: + Title: Asymmetric Non-local Neural Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1908.07678 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ann_head.py#L185 + Framework: PyTorch +- Name: ann_r101-d8_4xb2-80k_cityscapes-512x1024 + In Collection: ANN + 
Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 77.14 + mIoU(ms+flip): 78.81 + Config: configs/ann/ann_r101-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - ANN + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x1024_80k_cityscapes/ann_r101-d8_512x1024_80k_cityscapes_20200607_013728-aceccc6e.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x1024_80k_cityscapes/ann_r101-d8_512x1024_80k_cityscapes_20200607_013728.log.json + Paper: + Title: Asymmetric Non-local Neural Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1908.07678 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ann_head.py#L185 + Framework: PyTorch +- Name: ann_r50-d8_4xb2-80k_cityscapes-769x769 + In Collection: ANN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.88 + mIoU(ms+flip): 80.57 + Config: configs/ann/ann_r50-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - ANN + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_769x769_80k_cityscapes/ann_r50-d8_769x769_80k_cityscapes_20200607_044426-cc7ff323.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_769x769_80k_cityscapes/ann_r50-d8_769x769_80k_cityscapes_20200607_044426.log.json + Paper: + Title: Asymmetric Non-local Neural Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1908.07678 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ann_head.py#L185 + Framework: PyTorch +- Name: ann_r101-d8_4xb2-80k_cityscapes-769x769 + In Collection: ANN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.8 + mIoU(ms+flip): 80.34 + Config: configs/ann/ann_r101-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - ANN + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_769x769_80k_cityscapes/ann_r101-d8_769x769_80k_cityscapes_20200607_013713-a9d4be8d.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_769x769_80k_cityscapes/ann_r101-d8_769x769_80k_cityscapes_20200607_013713.log.json + Paper: + Title: Asymmetric Non-local Neural Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1908.07678 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ann_head.py#L185 + Framework: PyTorch +- Name: ann_r50-d8_4xb4-80k_ade20k-512x512 + In Collection: ANN + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 41.01 + mIoU(ms+flip): 42.3 + Config: configs/ann/ann_r50-d8_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D8 + - ANN + Training Resources: 4x V100 GPUS + Memory (GB): 9.1 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x512_80k_ade20k/ann_r50-d8_512x512_80k_ade20k_20200615_014818-26f75e11.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x512_80k_ade20k/ann_r50-d8_512x512_80k_ade20k_20200615_014818.log.json + Paper: + Title: Asymmetric Non-local Neural Networks for Semantic 
Segmentation + URL: https://arxiv.org/abs/1908.07678 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ann_head.py#L185 + Framework: PyTorch +- Name: ann_r101-d8_4xb4-80k_ade20k-512x512 + In Collection: ANN + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 42.94 + mIoU(ms+flip): 44.18 + Config: configs/ann/ann_r101-d8_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-101-D8 + - ANN + Training Resources: 4x V100 GPUS + Memory (GB): 12.5 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x512_80k_ade20k/ann_r101-d8_512x512_80k_ade20k_20200615_014818-c0153543.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x512_80k_ade20k/ann_r101-d8_512x512_80k_ade20k_20200615_014818.log.json + Paper: + Title: Asymmetric Non-local Neural Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1908.07678 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ann_head.py#L185 + Framework: PyTorch +- Name: ann_r50-d8_4xb4-160k_ade20k-512x512 + In Collection: ANN + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 41.74 + mIoU(ms+flip): 42.62 + Config: configs/ann/ann_r50-d8_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D8 + - ANN + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x512_160k_ade20k/ann_r50-d8_512x512_160k_ade20k_20200615_231733-892247bc.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x512_160k_ade20k/ann_r50-d8_512x512_160k_ade20k_20200615_231733.log.json + Paper: + Title: Asymmetric Non-local Neural Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1908.07678 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ann_head.py#L185 + Framework: PyTorch +- Name: ann_r101-d8_4xb4-160k_ade20k-512x512 + In Collection: ANN + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 42.94 + mIoU(ms+flip): 44.06 + Config: configs/ann/ann_r101-d8_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-101-D8 + - ANN + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x512_160k_ade20k/ann_r101-d8_512x512_160k_ade20k_20200615_231733-955eb1ec.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x512_160k_ade20k/ann_r101-d8_512x512_160k_ade20k_20200615_231733.log.json + Paper: + Title: Asymmetric Non-local Neural Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1908.07678 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ann_head.py#L185 + Framework: PyTorch +- Name: ann_r50-d8_4xb4-20k_voc12aug-512x512 + In Collection: ANN + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 74.86 + mIoU(ms+flip): 76.13 + Config: configs/ann/ann_r50-d8_4xb4-20k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-50-D8 + - ANN + Training Resources: 4x V100 GPUS + Memory (GB): 6.0 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x512_20k_voc12aug/ann_r50-d8_512x512_20k_voc12aug_20200617_222246-dfcb1c62.pth + 
Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x512_20k_voc12aug/ann_r50-d8_512x512_20k_voc12aug_20200617_222246.log.json + Paper: + Title: Asymmetric Non-local Neural Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1908.07678 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ann_head.py#L185 + Framework: PyTorch +- Name: ann_r101-d8_4xb4-20k_voc12aug-512x512 + In Collection: ANN + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 77.47 + mIoU(ms+flip): 78.7 + Config: configs/ann/ann_r101-d8_4xb4-20k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-101-D8 + - ANN + Training Resources: 4x V100 GPUS + Memory (GB): 9.5 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x512_20k_voc12aug/ann_r101-d8_512x512_20k_voc12aug_20200617_222246-2fad0042.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x512_20k_voc12aug/ann_r101-d8_512x512_20k_voc12aug_20200617_222246.log.json + Paper: + Title: Asymmetric Non-local Neural Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1908.07678 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ann_head.py#L185 + Framework: PyTorch +- Name: ann_r50-d8_4xb4-40k_voc12aug-512x512 + In Collection: ANN + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 76.56 + mIoU(ms+flip): 77.51 + Config: configs/ann/ann_r50-d8_4xb4-40k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-50-D8 + - ANN + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x512_40k_voc12aug/ann_r50-d8_512x512_40k_voc12aug_20200613_231314-b5dac322.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x512_40k_voc12aug/ann_r50-d8_512x512_40k_voc12aug_20200613_231314.log.json + Paper: + Title: Asymmetric Non-local Neural Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1908.07678 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ann_head.py#L185 + Framework: PyTorch +- Name: ann_r101-d8_4xb4-40k_voc12aug-512x512 + In Collection: ANN + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 76.7 + mIoU(ms+flip): 78.06 + Config: configs/ann/ann_r101-d8_4xb4-40k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-101-D8 + - ANN + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x512_40k_voc12aug/ann_r101-d8_512x512_40k_voc12aug_20200613_231314-bd205bbe.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r101-d8_512x512_40k_voc12aug/ann_r101-d8_512x512_40k_voc12aug_20200613_231314.log.json + Paper: + Title: Asymmetric Non-local Neural Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1908.07678 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ann_head.py#L185 + Framework: PyTorch diff --git a/configs/apcnet/README.md b/configs/apcnet/README.md index f101a02d1d..9104f3c87f 100644 --- a/configs/apcnet/README.md +++ b/configs/apcnet/README.md @@ -1,6 +1,6 @@ # APCNet -[Adaptive Pyramid Context Network for 
Semantic Segmentation](https://openaccess.thecvf.com/content_CVPR_2019/html/He_Adaptive_Pyramid_Context_Network_for_Semantic_Segmentation_CVPR_2019_paper.html) +> [Adaptive Pyramid Context Network for Semantic Segmentation](https://openaccess.thecvf.com/content_CVPR_2019/html/He_Adaptive_Pyramid_Context_Network_for_Semantic_Segmentation_CVPR_2019_paper.html) ## Introduction @@ -22,6 +22,30 @@ Recent studies witnessed that context features can significantly improve the per +## Results and models + +### Cityscapes + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| APCNet | R-50-D8 | 512x1024 | 40000 | 7.7 | 3.57 | V100 | 78.02 | 79.26 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/apcnet/apcnet_r50-d8_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_512x1024_40k_cityscapes/apcnet_r50-d8_512x1024_40k_cityscapes_20201214_115717-5e88fa33.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_512x1024_40k_cityscapes/apcnet_r50-d8_512x1024_40k_cityscapes-20201214_115717.log.json) | +| APCNet | R-101-D8 | 512x1024 | 40000 | 11.2 | 2.15 | V100 | 79.08 | 80.34 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/apcnet/apcnet_r101-d8_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_512x1024_40k_cityscapes/apcnet_r101-d8_512x1024_40k_cityscapes_20201214_115716-abc9d111.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_512x1024_40k_cityscapes/apcnet_r101-d8_512x1024_40k_cityscapes-20201214_115716.log.json) | +| APCNet | R-50-D8 | 769x769 | 40000 | 8.7 | 1.52 | V100 | 77.89 | 79.75 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/apcnet/apcnet_r50-d8_4xb2-40k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_769x769_40k_cityscapes/apcnet_r50-d8_769x769_40k_cityscapes_20201214_115717-2a2628d7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_769x769_40k_cityscapes/apcnet_r50-d8_769x769_40k_cityscapes-20201214_115717.log.json) | +| APCNet | R-101-D8 | 769x769 | 40000 | 12.7 | 1.03 | V100 | 77.96 | 79.24 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/apcnet/apcnet_r101-d8_4xb2-40k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_769x769_40k_cityscapes/apcnet_r101-d8_769x769_40k_cityscapes_20201214_115718-b650de90.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_769x769_40k_cityscapes/apcnet_r101-d8_769x769_40k_cityscapes-20201214_115718.log.json) | +| APCNet | R-50-D8 | 512x1024 | 80000 | - | - | V100 | 78.96 | 79.94 | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/apcnet/apcnet_r50-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_512x1024_80k_cityscapes/apcnet_r50-d8_512x1024_80k_cityscapes_20201214_115716-987f51e3.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_512x1024_80k_cityscapes/apcnet_r50-d8_512x1024_80k_cityscapes-20201214_115716.log.json) | +| APCNet | R-101-D8 | 512x1024 | 80000 | - | - | V100 | 79.64 | 80.61 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/apcnet/apcnet_r101-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_512x1024_80k_cityscapes/apcnet_r101-d8_512x1024_80k_cityscapes_20201214_115705-b1ff208a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_512x1024_80k_cityscapes/apcnet_r101-d8_512x1024_80k_cityscapes-20201214_115705.log.json) | +| APCNet | R-50-D8 | 769x769 | 80000 | - | - | V100 | 78.79 | 80.35 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/apcnet/apcnet_r50-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_769x769_80k_cityscapes/apcnet_r50-d8_769x769_80k_cityscapes_20201214_115718-7ea9fa12.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_769x769_80k_cityscapes/apcnet_r50-d8_769x769_80k_cityscapes-20201214_115718.log.json) | +| APCNet | R-101-D8 | 769x769 | 80000 | - | - | V100 | 78.45 | 79.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/apcnet/apcnet_r101-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_769x769_80k_cityscapes/apcnet_r101-d8_769x769_80k_cityscapes_20201214_115716-a7fbc2ab.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_769x769_80k_cityscapes/apcnet_r101-d8_769x769_80k_cityscapes-20201214_115716.log.json) | + +### ADE20K + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | -------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| APCNet | R-50-D8 | 512x512 | 80000 | 10.1 | 19.61 | V100 | 42.20 | 43.30 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/apcnet/apcnet_r50-d8_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_512x512_80k_ade20k/apcnet_r50-d8_512x512_80k_ade20k_20201214_115705-a8626293.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_512x512_80k_ade20k/apcnet_r50-d8_512x512_80k_ade20k-20201214_115705.log.json) | +| APCNet | R-101-D8 | 512x512 | 80000 | 13.6 | 13.10 | V100 | 45.54 | 46.65 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/apcnet/apcnet_r101-d8_4xb4-80k_ade20k-512x512.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_512x512_80k_ade20k/apcnet_r101-d8_512x512_80k_ade20k_20201214_115704-c656c3fb.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_512x512_80k_ade20k/apcnet_r101-d8_512x512_80k_ade20k-20201214_115704.log.json) | +| APCNet | R-50-D8 | 512x512 | 160000 | - | - | V100 | 43.40 | 43.94 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/apcnet/apcnet_r50-d8_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_512x512_160k_ade20k/apcnet_r50-d8_512x512_160k_ade20k_20201214_115706-25fb92c2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_512x512_160k_ade20k/apcnet_r50-d8_512x512_160k_ade20k-20201214_115706.log.json) | +| APCNet | R-101-D8 | 512x512 | 160000 | - | - | V100 | 45.41 | 46.63 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/apcnet/apcnet_r101-d8_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_512x512_160k_ade20k/apcnet_r101-d8_512x512_160k_ade20k_20201214_115705-73f9a8d7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_512x512_160k_ade20k/apcnet_r101-d8_512x512_160k_ade20k-20201214_115705.log.json) | + ## Citation ```bibtex @@ -33,27 +57,3 @@ month = {June}, year = {2019} } ``` - -## Results and models - -### Cityscapes - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | --------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| APCNet | R-50-D8 | 512x1024 | 40000 | 7.7 | 3.57 | 78.02 | 79.26 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/apcnet/apcnet_r50-d8_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_512x1024_40k_cityscapes/apcnet_r50-d8_512x1024_40k_cityscapes_20201214_115717-5e88fa33.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_512x1024_40k_cityscapes/apcnet_r50-d8_512x1024_40k_cityscapes-20201214_115717.log.json) | -| APCNet | R-101-D8 | 512x1024 | 40000 | 11.2 | 2.15 | 79.08 | 80.34 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/apcnet/apcnet_r101-d8_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_512x1024_40k_cityscapes/apcnet_r101-d8_512x1024_40k_cityscapes_20201214_115716-abc9d111.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_512x1024_40k_cityscapes/apcnet_r101-d8_512x1024_40k_cityscapes-20201214_115716.log.json) | -| APCNet | R-50-D8 | 769x769 | 40000 | 8.7 | 1.52 | 77.89 | 79.75 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/apcnet/apcnet_r50-d8_769x769_40k_cityscapes.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_769x769_40k_cityscapes/apcnet_r50-d8_769x769_40k_cityscapes_20201214_115717-2a2628d7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_769x769_40k_cityscapes/apcnet_r50-d8_769x769_40k_cityscapes-20201214_115717.log.json) | -| APCNet | R-101-D8 | 769x769 | 40000 | 12.7 | 1.03 | 77.96 | 79.24 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/apcnet/apcnet_r101-d8_769x769_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_769x769_40k_cityscapes/apcnet_r101-d8_769x769_40k_cityscapes_20201214_115718-b650de90.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_769x769_40k_cityscapes/apcnet_r101-d8_769x769_40k_cityscapes-20201214_115718.log.json) | -| APCNet | R-50-D8 | 512x1024 | 80000 | - | - | 78.96 | 79.94 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/apcnet/apcnet_r50-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_512x1024_80k_cityscapes/apcnet_r50-d8_512x1024_80k_cityscapes_20201214_115716-987f51e3.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_512x1024_80k_cityscapes/apcnet_r50-d8_512x1024_80k_cityscapes-20201214_115716.log.json) | -| APCNet | R-101-D8 | 512x1024 | 80000 | - | - | 79.64 | 80.61 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/apcnet/apcnet_r101-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_512x1024_80k_cityscapes/apcnet_r101-d8_512x1024_80k_cityscapes_20201214_115705-b1ff208a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_512x1024_80k_cityscapes/apcnet_r101-d8_512x1024_80k_cityscapes-20201214_115705.log.json) | -| APCNet | R-50-D8 | 769x769 | 80000 | - | - | 78.79 | 80.35 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/apcnet/apcnet_r50-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_769x769_80k_cityscapes/apcnet_r50-d8_769x769_80k_cityscapes_20201214_115718-7ea9fa12.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_769x769_80k_cityscapes/apcnet_r50-d8_769x769_80k_cityscapes-20201214_115718.log.json) | -| APCNet | R-101-D8 | 769x769 | 80000 | - | - | 78.45 | 79.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/apcnet/apcnet_r101-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_769x769_80k_cityscapes/apcnet_r101-d8_769x769_80k_cityscapes_20201214_115716-a7fbc2ab.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_769x769_80k_cityscapes/apcnet_r101-d8_769x769_80k_cityscapes-20201214_115716.log.json) | - -### ADE20K - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ----------------------------------------------------------------------------------------------------------------------- | 
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| APCNet | R-50-D8 | 512x512 | 80000 | 10.1 | 19.61 | 42.20 | 43.30 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/apcnet/apcnet_r50-d8_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_512x512_80k_ade20k/apcnet_r50-d8_512x512_80k_ade20k_20201214_115705-a8626293.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_512x512_80k_ade20k/apcnet_r50-d8_512x512_80k_ade20k-20201214_115705.log.json) | -| APCNet | R-101-D8 | 512x512 | 80000 | 13.6 | 13.10 | 45.54 | 46.65 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/apcnet/apcnet_r101-d8_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_512x512_80k_ade20k/apcnet_r101-d8_512x512_80k_ade20k_20201214_115704-c656c3fb.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_512x512_80k_ade20k/apcnet_r101-d8_512x512_80k_ade20k-20201214_115704.log.json) | -| APCNet | R-50-D8 | 512x512 | 160000 | - | - | 43.40 | 43.94 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/apcnet/apcnet_r50-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_512x512_160k_ade20k/apcnet_r50-d8_512x512_160k_ade20k_20201214_115706-25fb92c2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_512x512_160k_ade20k/apcnet_r50-d8_512x512_160k_ade20k-20201214_115706.log.json) | -| APCNet | R-101-D8 | 512x512 | 160000 | - | - | 45.41 | 46.63 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/apcnet/apcnet_r101-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_512x512_160k_ade20k/apcnet_r101-d8_512x512_160k_ade20k_20201214_115705-73f9a8d7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_512x512_160k_ade20k/apcnet_r101-d8_512x512_160k_ade20k-20201214_115705.log.json) | diff --git a/configs/apcnet/apcnet.yml b/configs/apcnet/apcnet.yml deleted file mode 100644 index 7a453a3607..0000000000 --- a/configs/apcnet/apcnet.yml +++ /dev/null @@ -1,232 +0,0 @@ -Collections: -- Name: APCNet - Metadata: - Training Data: - - Cityscapes - - ADE20K - Paper: - URL: https://openaccess.thecvf.com/content_CVPR_2019/html/He_Adaptive_Pyramid_Context_Network_for_Semantic_Segmentation_CVPR_2019_paper.html - Title: Adaptive Pyramid Context Network for Semantic Segmentation - README: configs/apcnet/README.md - Code: - URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/apc_head.py#L111 - Version: v0.17.0 - Converted From: - Code: https://github.com/Junjun2016/APCNet -Models: -- Name: apcnet_r50-d8_512x1024_40k_cityscapes - In Collection: APCNet - Metadata: - backbone: R-50-D8 - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 280.11 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 7.7 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.02 - mIoU(ms+flip): 79.26 - 
Config: configs/apcnet/apcnet_r50-d8_512x1024_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_512x1024_40k_cityscapes/apcnet_r50-d8_512x1024_40k_cityscapes_20201214_115717-5e88fa33.pth -- Name: apcnet_r101-d8_512x1024_40k_cityscapes - In Collection: APCNet - Metadata: - backbone: R-101-D8 - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 465.12 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 11.2 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.08 - mIoU(ms+flip): 80.34 - Config: configs/apcnet/apcnet_r101-d8_512x1024_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_512x1024_40k_cityscapes/apcnet_r101-d8_512x1024_40k_cityscapes_20201214_115716-abc9d111.pth -- Name: apcnet_r50-d8_769x769_40k_cityscapes - In Collection: APCNet - Metadata: - backbone: R-50-D8 - crop size: (769,769) - lr schd: 40000 - inference time (ms/im): - - value: 657.89 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 8.7 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 77.89 - mIoU(ms+flip): 79.75 - Config: configs/apcnet/apcnet_r50-d8_769x769_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_769x769_40k_cityscapes/apcnet_r50-d8_769x769_40k_cityscapes_20201214_115717-2a2628d7.pth -- Name: apcnet_r101-d8_769x769_40k_cityscapes - In Collection: APCNet - Metadata: - backbone: R-101-D8 - crop size: (769,769) - lr schd: 40000 - inference time (ms/im): - - value: 970.87 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 12.7 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 77.96 - mIoU(ms+flip): 79.24 - Config: configs/apcnet/apcnet_r101-d8_769x769_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_769x769_40k_cityscapes/apcnet_r101-d8_769x769_40k_cityscapes_20201214_115718-b650de90.pth -- Name: apcnet_r50-d8_512x1024_80k_cityscapes - In Collection: APCNet - Metadata: - backbone: R-50-D8 - crop size: (512,1024) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.96 - mIoU(ms+flip): 79.94 - Config: configs/apcnet/apcnet_r50-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_512x1024_80k_cityscapes/apcnet_r50-d8_512x1024_80k_cityscapes_20201214_115716-987f51e3.pth -- Name: apcnet_r101-d8_512x1024_80k_cityscapes - In Collection: APCNet - Metadata: - backbone: R-101-D8 - crop size: (512,1024) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.64 - mIoU(ms+flip): 80.61 - Config: configs/apcnet/apcnet_r101-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_512x1024_80k_cityscapes/apcnet_r101-d8_512x1024_80k_cityscapes_20201214_115705-b1ff208a.pth -- Name: apcnet_r50-d8_769x769_80k_cityscapes - In Collection: APCNet - Metadata: - backbone: R-50-D8 - crop size: (769,769) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.79 - mIoU(ms+flip): 80.35 - Config: configs/apcnet/apcnet_r50-d8_769x769_80k_cityscapes.py - Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_769x769_80k_cityscapes/apcnet_r50-d8_769x769_80k_cityscapes_20201214_115718-7ea9fa12.pth -- Name: apcnet_r101-d8_769x769_80k_cityscapes - In Collection: APCNet - Metadata: - backbone: R-101-D8 - crop size: (769,769) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.45 - mIoU(ms+flip): 79.91 - Config: configs/apcnet/apcnet_r101-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_769x769_80k_cityscapes/apcnet_r101-d8_769x769_80k_cityscapes_20201214_115716-a7fbc2ab.pth -- Name: apcnet_r50-d8_512x512_80k_ade20k - In Collection: APCNet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 50.99 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 10.1 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 42.2 - mIoU(ms+flip): 43.3 - Config: configs/apcnet/apcnet_r50-d8_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_512x512_80k_ade20k/apcnet_r50-d8_512x512_80k_ade20k_20201214_115705-a8626293.pth -- Name: apcnet_r101-d8_512x512_80k_ade20k - In Collection: APCNet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 76.34 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 13.6 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 45.54 - mIoU(ms+flip): 46.65 - Config: configs/apcnet/apcnet_r101-d8_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_512x512_80k_ade20k/apcnet_r101-d8_512x512_80k_ade20k_20201214_115704-c656c3fb.pth -- Name: apcnet_r50-d8_512x512_160k_ade20k - In Collection: APCNet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 43.4 - mIoU(ms+flip): 43.94 - Config: configs/apcnet/apcnet_r50-d8_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_512x512_160k_ade20k/apcnet_r50-d8_512x512_160k_ade20k_20201214_115706-25fb92c2.pth -- Name: apcnet_r101-d8_512x512_160k_ade20k - In Collection: APCNet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 45.41 - mIoU(ms+flip): 46.63 - Config: configs/apcnet/apcnet_r101-d8_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_512x512_160k_ade20k/apcnet_r101-d8_512x512_160k_ade20k_20201214_115705-73f9a8d7.pth diff --git a/configs/apcnet/apcnet_r101-d8_4xb2-40k_cityscapes-512x1024.py b/configs/apcnet/apcnet_r101-d8_4xb2-40k_cityscapes-512x1024.py new file mode 100644 index 0000000000..754b2d1a08 --- /dev/null +++ b/configs/apcnet/apcnet_r101-d8_4xb2-40k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './apcnet_r50-d8_4xb2-40k_cityscapes-512x1024.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/apcnet/apcnet_r101-d8_4xb2-40k_cityscapes-769x769.py b/configs/apcnet/apcnet_r101-d8_4xb2-40k_cityscapes-769x769.py new file mode 100644 index 0000000000..d2b5fe1360 --- /dev/null +++ 
b/configs/apcnet/apcnet_r101-d8_4xb2-40k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './apcnet_r50-d8_4xb2-40k_cityscapes-769x769.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/apcnet/apcnet_r101-d8_4xb2-80k_cityscapes-512x1024.py b/configs/apcnet/apcnet_r101-d8_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..03b018d2ff --- /dev/null +++ b/configs/apcnet/apcnet_r101-d8_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './apcnet_r50-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/apcnet/apcnet_r101-d8_4xb2-80k_cityscapes-769x769.py b/configs/apcnet/apcnet_r101-d8_4xb2-80k_cityscapes-769x769.py new file mode 100644 index 0000000000..0cbbfadbdd --- /dev/null +++ b/configs/apcnet/apcnet_r101-d8_4xb2-80k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './apcnet_r50-d8_4xb2-80k_cityscapes-769x769.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/apcnet/apcnet_r101-d8_4xb4-160k_ade20k-512x512.py b/configs/apcnet/apcnet_r101-d8_4xb4-160k_ade20k-512x512.py new file mode 100644 index 0000000000..f0aacc06e0 --- /dev/null +++ b/configs/apcnet/apcnet_r101-d8_4xb4-160k_ade20k-512x512.py @@ -0,0 +1,2 @@ +_base_ = './apcnet_r50-d8_4xb4-160k_ade20k-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/apcnet/apcnet_r101-d8_4xb4-80k_ade20k-512x512.py b/configs/apcnet/apcnet_r101-d8_4xb4-80k_ade20k-512x512.py new file mode 100644 index 0000000000..219d07ae55 --- /dev/null +++ b/configs/apcnet/apcnet_r101-d8_4xb4-80k_ade20k-512x512.py @@ -0,0 +1,2 @@ +_base_ = './apcnet_r50-d8_4xb4-80k_ade20k-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/apcnet/apcnet_r101-d8_512x1024_40k_cityscapes.py b/configs/apcnet/apcnet_r101-d8_512x1024_40k_cityscapes.py deleted file mode 100644 index 1e1cec6735..0000000000 --- a/configs/apcnet/apcnet_r101-d8_512x1024_40k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './apcnet_r50-d8_512x1024_40k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/apcnet/apcnet_r101-d8_512x1024_80k_cityscapes.py b/configs/apcnet/apcnet_r101-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index 04cb006ba1..0000000000 --- a/configs/apcnet/apcnet_r101-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './apcnet_r50-d8_512x1024_80k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/apcnet/apcnet_r101-d8_512x512_160k_ade20k.py b/configs/apcnet/apcnet_r101-d8_512x512_160k_ade20k.py deleted file mode 100644 index 1ce2279a0f..0000000000 --- a/configs/apcnet/apcnet_r101-d8_512x512_160k_ade20k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './apcnet_r50-d8_512x512_160k_ade20k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/apcnet/apcnet_r101-d8_512x512_80k_ade20k.py b/configs/apcnet/apcnet_r101-d8_512x512_80k_ade20k.py deleted file mode 100644 index 8f10b98406..0000000000 --- a/configs/apcnet/apcnet_r101-d8_512x512_80k_ade20k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './apcnet_r50-d8_512x512_80k_ade20k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git 
a/configs/apcnet/apcnet_r101-d8_769x769_40k_cityscapes.py b/configs/apcnet/apcnet_r101-d8_769x769_40k_cityscapes.py deleted file mode 100644 index 5c44ebcaf3..0000000000 --- a/configs/apcnet/apcnet_r101-d8_769x769_40k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './apcnet_r50-d8_769x769_40k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/apcnet/apcnet_r101-d8_769x769_80k_cityscapes.py b/configs/apcnet/apcnet_r101-d8_769x769_80k_cityscapes.py deleted file mode 100644 index 616984575d..0000000000 --- a/configs/apcnet/apcnet_r101-d8_769x769_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './apcnet_r50-d8_769x769_80k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/apcnet/apcnet_r50-d8_512x1024_40k_cityscapes.py b/configs/apcnet/apcnet_r50-d8_4xb2-40k_cityscapes-512x1024.py similarity index 100% rename from configs/apcnet/apcnet_r50-d8_512x1024_40k_cityscapes.py rename to configs/apcnet/apcnet_r50-d8_4xb2-40k_cityscapes-512x1024.py diff --git a/configs/apcnet/apcnet_r50-d8_769x769_40k_cityscapes.py b/configs/apcnet/apcnet_r50-d8_4xb2-40k_cityscapes-769x769.py similarity index 100% rename from configs/apcnet/apcnet_r50-d8_769x769_40k_cityscapes.py rename to configs/apcnet/apcnet_r50-d8_4xb2-40k_cityscapes-769x769.py diff --git a/configs/apcnet/apcnet_r50-d8_512x1024_80k_cityscapes.py b/configs/apcnet/apcnet_r50-d8_4xb2-80k_cityscapes-512x1024.py similarity index 100% rename from configs/apcnet/apcnet_r50-d8_512x1024_80k_cityscapes.py rename to configs/apcnet/apcnet_r50-d8_4xb2-80k_cityscapes-512x1024.py diff --git a/configs/apcnet/apcnet_r50-d8_769x769_80k_cityscapes.py b/configs/apcnet/apcnet_r50-d8_4xb2-80k_cityscapes-769x769.py similarity index 100% rename from configs/apcnet/apcnet_r50-d8_769x769_80k_cityscapes.py rename to configs/apcnet/apcnet_r50-d8_4xb2-80k_cityscapes-769x769.py diff --git a/configs/apcnet/apcnet_r50-d8_512x512_160k_ade20k.py b/configs/apcnet/apcnet_r50-d8_4xb4-160k_ade20k-512x512.py similarity index 100% rename from configs/apcnet/apcnet_r50-d8_512x512_160k_ade20k.py rename to configs/apcnet/apcnet_r50-d8_4xb4-160k_ade20k-512x512.py diff --git a/configs/apcnet/apcnet_r50-d8_512x512_80k_ade20k.py b/configs/apcnet/apcnet_r50-d8_4xb4-80k_ade20k-512x512.py similarity index 100% rename from configs/apcnet/apcnet_r50-d8_512x512_80k_ade20k.py rename to configs/apcnet/apcnet_r50-d8_4xb4-80k_ade20k-512x512.py diff --git a/configs/apcnet/metafile.yaml b/configs/apcnet/metafile.yaml new file mode 100644 index 0000000000..3f4072c8fd --- /dev/null +++ b/configs/apcnet/metafile.yaml @@ -0,0 +1,296 @@ +Collections: +- Name: APCNet + License: Apache License 2.0 + Metadata: + Training Data: + - Cityscapes + - ADE20K + Paper: + Title: Adaptive Pyramid Context Network for Semantic Segmentation + URL: https://openaccess.thecvf.com/content_CVPR_2019/html/He_Adaptive_Pyramid_Context_Network_for_Semantic_Segmentation_CVPR_2019_paper.html + README: configs/apcnet/README.md + Frameworks: + - PyTorch +Models: +- Name: apcnet_r50-d8_4xb2-40k_cityscapes-512x1024 + In Collection: APCNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.02 + mIoU(ms+flip): 79.26 + Config: configs/apcnet/apcnet_r50-d8_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - APCNet + Training Resources: 4x V100 GPUS + Memory (GB): 7.7 + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_512x1024_40k_cityscapes/apcnet_r50-d8_512x1024_40k_cityscapes_20201214_115717-5e88fa33.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_512x1024_40k_cityscapes/apcnet_r50-d8_512x1024_40k_cityscapes-20201214_115717.log.json + Paper: + Title: Adaptive Pyramid Context Network for Semantic Segmentation + URL: https://openaccess.thecvf.com/content_CVPR_2019/html/He_Adaptive_Pyramid_Context_Network_for_Semantic_Segmentation_CVPR_2019_paper.html + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/apc_head.py#L111 + Framework: PyTorch +- Name: apcnet_r101-d8_4xb2-40k_cityscapes-512x1024 + In Collection: APCNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.08 + mIoU(ms+flip): 80.34 + Config: configs/apcnet/apcnet_r101-d8_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - APCNet + Training Resources: 4x V100 GPUS + Memory (GB): 11.2 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_512x1024_40k_cityscapes/apcnet_r101-d8_512x1024_40k_cityscapes_20201214_115716-abc9d111.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_512x1024_40k_cityscapes/apcnet_r101-d8_512x1024_40k_cityscapes-20201214_115716.log.json + Paper: + Title: Adaptive Pyramid Context Network for Semantic Segmentation + URL: https://openaccess.thecvf.com/content_CVPR_2019/html/He_Adaptive_Pyramid_Context_Network_for_Semantic_Segmentation_CVPR_2019_paper.html + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/apc_head.py#L111 + Framework: PyTorch +- Name: apcnet_r50-d8_4xb2-40k_cityscapes-769x769 + In Collection: APCNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 77.89 + mIoU(ms+flip): 79.75 + Config: configs/apcnet/apcnet_r50-d8_4xb2-40k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - APCNet + Training Resources: 4x V100 GPUS + Memory (GB): 8.7 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_769x769_40k_cityscapes/apcnet_r50-d8_769x769_40k_cityscapes_20201214_115717-2a2628d7.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_769x769_40k_cityscapes/apcnet_r50-d8_769x769_40k_cityscapes-20201214_115717.log.json + Paper: + Title: Adaptive Pyramid Context Network for Semantic Segmentation + URL: https://openaccess.thecvf.com/content_CVPR_2019/html/He_Adaptive_Pyramid_Context_Network_for_Semantic_Segmentation_CVPR_2019_paper.html + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/apc_head.py#L111 + Framework: PyTorch +- Name: apcnet_r101-d8_4xb2-40k_cityscapes-769x769 + In Collection: APCNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 77.96 + mIoU(ms+flip): 79.24 + Config: configs/apcnet/apcnet_r101-d8_4xb2-40k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - APCNet + Training Resources: 4x V100 GPUS + Memory (GB): 12.7 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_769x769_40k_cityscapes/apcnet_r101-d8_769x769_40k_cityscapes_20201214_115718-b650de90.pth + Training log: 
https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_769x769_40k_cityscapes/apcnet_r101-d8_769x769_40k_cityscapes-20201214_115718.log.json + Paper: + Title: Adaptive Pyramid Context Network for Semantic Segmentation + URL: https://openaccess.thecvf.com/content_CVPR_2019/html/He_Adaptive_Pyramid_Context_Network_for_Semantic_Segmentation_CVPR_2019_paper.html + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/apc_head.py#L111 + Framework: PyTorch +- Name: apcnet_r50-d8_4xb2-80k_cityscapes-512x1024 + In Collection: APCNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.96 + mIoU(ms+flip): 79.94 + Config: configs/apcnet/apcnet_r50-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - APCNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_512x1024_80k_cityscapes/apcnet_r50-d8_512x1024_80k_cityscapes_20201214_115716-987f51e3.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_512x1024_80k_cityscapes/apcnet_r50-d8_512x1024_80k_cityscapes-20201214_115716.log.json + Paper: + Title: Adaptive Pyramid Context Network for Semantic Segmentation + URL: https://openaccess.thecvf.com/content_CVPR_2019/html/He_Adaptive_Pyramid_Context_Network_for_Semantic_Segmentation_CVPR_2019_paper.html + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/apc_head.py#L111 + Framework: PyTorch +- Name: apcnet_r101-d8_4xb2-80k_cityscapes-512x1024 + In Collection: APCNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.64 + mIoU(ms+flip): 80.61 + Config: configs/apcnet/apcnet_r101-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - APCNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_512x1024_80k_cityscapes/apcnet_r101-d8_512x1024_80k_cityscapes_20201214_115705-b1ff208a.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_512x1024_80k_cityscapes/apcnet_r101-d8_512x1024_80k_cityscapes-20201214_115705.log.json + Paper: + Title: Adaptive Pyramid Context Network for Semantic Segmentation + URL: https://openaccess.thecvf.com/content_CVPR_2019/html/He_Adaptive_Pyramid_Context_Network_for_Semantic_Segmentation_CVPR_2019_paper.html + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/apc_head.py#L111 + Framework: PyTorch +- Name: apcnet_r50-d8_4xb2-80k_cityscapes-769x769 + In Collection: APCNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.79 + mIoU(ms+flip): 80.35 + Config: configs/apcnet/apcnet_r50-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - APCNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_769x769_80k_cityscapes/apcnet_r50-d8_769x769_80k_cityscapes_20201214_115718-7ea9fa12.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_769x769_80k_cityscapes/apcnet_r50-d8_769x769_80k_cityscapes-20201214_115718.log.json + Paper: + Title: Adaptive Pyramid Context Network for Semantic Segmentation + URL: 
https://openaccess.thecvf.com/content_CVPR_2019/html/He_Adaptive_Pyramid_Context_Network_for_Semantic_Segmentation_CVPR_2019_paper.html + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/apc_head.py#L111 + Framework: PyTorch +- Name: apcnet_r101-d8_4xb2-80k_cityscapes-769x769 + In Collection: APCNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.45 + mIoU(ms+flip): 79.91 + Config: configs/apcnet/apcnet_r101-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - APCNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_769x769_80k_cityscapes/apcnet_r101-d8_769x769_80k_cityscapes_20201214_115716-a7fbc2ab.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_769x769_80k_cityscapes/apcnet_r101-d8_769x769_80k_cityscapes-20201214_115716.log.json + Paper: + Title: Adaptive Pyramid Context Network for Semantic Segmentation + URL: https://openaccess.thecvf.com/content_CVPR_2019/html/He_Adaptive_Pyramid_Context_Network_for_Semantic_Segmentation_CVPR_2019_paper.html + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/apc_head.py#L111 + Framework: PyTorch +- Name: apcnet_r50-d8_4xb4-80k_ade20k-512x512 + In Collection: APCNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 42.2 + mIoU(ms+flip): 43.3 + Config: configs/apcnet/apcnet_r50-d8_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D8 + - APCNet + Training Resources: 4x V100 GPUS + Memory (GB): 10.1 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_512x512_80k_ade20k/apcnet_r50-d8_512x512_80k_ade20k_20201214_115705-a8626293.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_512x512_80k_ade20k/apcnet_r50-d8_512x512_80k_ade20k-20201214_115705.log.json + Paper: + Title: Adaptive Pyramid Context Network for Semantic Segmentation + URL: https://openaccess.thecvf.com/content_CVPR_2019/html/He_Adaptive_Pyramid_Context_Network_for_Semantic_Segmentation_CVPR_2019_paper.html + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/apc_head.py#L111 + Framework: PyTorch +- Name: apcnet_r101-d8_4xb4-80k_ade20k-512x512 + In Collection: APCNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 45.54 + mIoU(ms+flip): 46.65 + Config: configs/apcnet/apcnet_r101-d8_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-101-D8 + - APCNet + Training Resources: 4x V100 GPUS + Memory (GB): 13.6 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_512x512_80k_ade20k/apcnet_r101-d8_512x512_80k_ade20k_20201214_115704-c656c3fb.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_512x512_80k_ade20k/apcnet_r101-d8_512x512_80k_ade20k-20201214_115704.log.json + Paper: + Title: Adaptive Pyramid Context Network for Semantic Segmentation + URL: https://openaccess.thecvf.com/content_CVPR_2019/html/He_Adaptive_Pyramid_Context_Network_for_Semantic_Segmentation_CVPR_2019_paper.html + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/apc_head.py#L111 + Framework: PyTorch +- Name: apcnet_r50-d8_4xb4-160k_ade20k-512x512 + In 
Collection: APCNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 43.4 + mIoU(ms+flip): 43.94 + Config: configs/apcnet/apcnet_r50-d8_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D8 + - APCNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_512x512_160k_ade20k/apcnet_r50-d8_512x512_160k_ade20k_20201214_115706-25fb92c2.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r50-d8_512x512_160k_ade20k/apcnet_r50-d8_512x512_160k_ade20k-20201214_115706.log.json + Paper: + Title: Adaptive Pyramid Context Network for Semantic Segmentation + URL: https://openaccess.thecvf.com/content_CVPR_2019/html/He_Adaptive_Pyramid_Context_Network_for_Semantic_Segmentation_CVPR_2019_paper.html + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/apc_head.py#L111 + Framework: PyTorch +- Name: apcnet_r101-d8_4xb4-160k_ade20k-512x512 + In Collection: APCNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 45.41 + mIoU(ms+flip): 46.63 + Config: configs/apcnet/apcnet_r101-d8_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-101-D8 + - APCNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_512x512_160k_ade20k/apcnet_r101-d8_512x512_160k_ade20k_20201214_115705-73f9a8d7.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/apcnet/apcnet_r101-d8_512x512_160k_ade20k/apcnet_r101-d8_512x512_160k_ade20k-20201214_115705.log.json + Paper: + Title: Adaptive Pyramid Context Network for Semantic Segmentation + URL: https://openaccess.thecvf.com/content_CVPR_2019/html/He_Adaptive_Pyramid_Context_Network_for_Semantic_Segmentation_CVPR_2019_paper.html + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/apc_head.py#L111 + Framework: PyTorch diff --git a/configs/beit/README.md b/configs/beit/README.md index 31e1bd6a83..b005c88c50 100644 --- a/configs/beit/README.md +++ b/configs/beit/README.md @@ -1,6 +1,6 @@ # BEiT -[BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) +> [BEiT: BERT Pre-Training of Image Transformers](https://arxiv.org/abs/2106.08254) ## Introduction @@ -22,18 +22,6 @@ We introduce a self-supervised vision representation model BEiT, which stands fo -## Citation - -```bibtex -@inproceedings{beit, - title={{BEiT}: {BERT} Pre-Training of Image Transformers}, - author={Hangbo Bao and Li Dong and Songhao Piao and Furu Wei}, - booktitle={International Conference on Learning Representations}, - year={2022}, - url={https://openreview.net/forum?id=p-BhZSz59o4} -} -``` - ## Usage To use other repositories' pre-trained models, it is necessary to convert keys. 
@@ -79,7 +67,19 @@ upernet_beit-large_fp16_8x1_640x640_160k_ade20k-8fc0dd5d.pth $GPUS --eval mIoU ### ADE20K -| Method | Backbone | Crop Size | pretrain | pretrain img size | Batch Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------- | -------- | --------- | ------------ | ----------------- | ---------- | ------- | -------- | -------------- | ----- | ------------: | ---------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| UPerNet | BEiT-B | 640x640 | ImageNet-22K | 224x224 | 16 | 160000 | 15.88 | 2.00 | 53.08 | 53.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit-base_8x2_640x640_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-base_8x2_640x640_160k_ade20k/upernet_beit-base_8x2_640x640_160k_ade20k-eead221d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-base_8x2_640x640_160k_ade20k/upernet_beit-base_8x2_640x640_160k_ade20k.log.json) | -| UPerNet | BEiT-L | 640x640 | ImageNet-22K | 224x224 | 8 | 320000 | 22.64 | 0.96 | 56.33 | 56.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit/upernet_beit-large_fp16_8x1_640x640_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-large_fp16_8x1_640x640_160k_ade20k/upernet_beit-large_fp16_8x1_640x640_160k_ade20k-8fc0dd5d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-large_fp16_8x1_640x640_160k_ade20k/upernet_beit-large_fp16_8x1_640x640_160k_ade20k.log.json) | +| Method | Backbone | Crop Size | pretrain | pretrain img size | Batch Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------- | -------- | --------- | ------------ | ----------------- | ---------- | ------- | -------- | -------------- | ------ | ----- | ------------: | -------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| UPerNet | BEiT-B | 640x640 | ImageNet-22K | 224x224 | 16 | 160000 | 15.88 | 2.00 | V100 | 53.08 | 53.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/beit/beit-base_upernet_8xb2-160k_ade20k-640x640.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-base_8x2_640x640_160k_ade20k/upernet_beit-base_8x2_640x640_160k_ade20k-eead221d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-base_8x2_640x640_160k_ade20k/upernet_beit-base_8x2_640x640_160k_ade20k.log.json) | +| UPerNet | BEiT-L | 640x640 | ImageNet-22K | 224x224 | 8 | 320000 | 22.64 | 0.96 | V100 | 56.33 | 56.84 | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/beit/beit-large_upernet_8xb1-amp-160k_ade20k-640x640.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-large_fp16_8x1_640x640_160k_ade20k/upernet_beit-large_fp16_8x1_640x640_160k_ade20k-8fc0dd5d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-large_fp16_8x1_640x640_160k_ade20k/upernet_beit-large_fp16_8x1_640x640_160k_ade20k.log.json) | + +## Citation + +```bibtex +@inproceedings{beit, + title={{BEiT}: {BERT} Pre-Training of Image Transformers}, + author={Hangbo Bao and Li Dong and Songhao Piao and Furu Wei}, + booktitle={International Conference on Learning Representations}, + year={2022}, + url={https://openreview.net/forum?id=p-BhZSz59o4} +} +``` diff --git a/configs/beit/upernet_beit-base_8x2_640x640_160k_ade20k.py b/configs/beit/beit-base_upernet_8xb2-160k_ade20k-640x640.py similarity index 100% rename from configs/beit/upernet_beit-base_8x2_640x640_160k_ade20k.py rename to configs/beit/beit-base_upernet_8xb2-160k_ade20k-640x640.py diff --git a/configs/beit/beit-base_upernet_8xb2-160k_ade20k-640x640_ms.py b/configs/beit/beit-base_upernet_8xb2-160k_ade20k-640x640_ms.py new file mode 100644 index 0000000000..02480222c4 --- /dev/null +++ b/configs/beit/beit-base_upernet_8xb2-160k_ade20k-640x640_ms.py @@ -0,0 +1,16 @@ +_base_ = './beit-base_upernet_8xb2-160k_ade20k-640x640.py' + +test_pipeline = [ + dict(type='LoadImageFromFile'), + # TODO: Refactor 'MultiScaleFlipAug' which supports + # `min_size` feature in `Resize` class + # img_ratios is [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] + # original image scale is (2560, 640) + dict(type='Resize', scale=(2560, 640), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type='LoadAnnotations', reduce_zero_label=True), + dict(type='PackSegInputs'), +] +val_dataloader = dict(batch_size=1, dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader diff --git a/configs/beit/upernet_beit-large_fp16_8x1_640x640_160k_ade20k.py b/configs/beit/beit-large_upernet_8xb1-amp-160k_ade20k-640x640.py similarity index 100% rename from configs/beit/upernet_beit-large_fp16_8x1_640x640_160k_ade20k.py rename to configs/beit/beit-large_upernet_8xb1-amp-160k_ade20k-640x640.py diff --git a/configs/beit/beit-large_upernet_8xb1-amp-160k_ade20k-640x640_ms.py b/configs/beit/beit-large_upernet_8xb1-amp-160k_ade20k-640x640_ms.py new file mode 100644 index 0000000000..fc6f049d11 --- /dev/null +++ b/configs/beit/beit-large_upernet_8xb1-amp-160k_ade20k-640x640_ms.py @@ -0,0 +1,16 @@ +_base_ = './beit-large_upernet_8xb1-amp-160k_ade20k-640x640.py' + +test_pipeline = [ + dict(type='LoadImageFromFile'), + # TODO: Refactor 'MultiScaleFlipAug' which supports + # `min_size` feature in `Resize` class + # img_ratios is [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] + # original image scale is (2560, 640) + dict(type='Resize', scale=(2560, 640), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type='LoadAnnotations', reduce_zero_label=True), + dict(type='PackSegInputs'), +] +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader diff --git a/configs/beit/beit.yml b/configs/beit/beit.yml deleted file mode 100644 index 602a887d40..0000000000 --- a/configs/beit/beit.yml +++ /dev/null @@ -1,45 +0,0 @@ -Models: -- Name: 
upernet_beit-base_8x2_640x640_160k_ade20k - In Collection: UPerNet - Metadata: - backbone: BEiT-B - crop size: (640,640) - lr schd: 160000 - inference time (ms/im): - - value: 500.0 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (640,640) - Training Memory (GB): 15.88 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 53.08 - mIoU(ms+flip): 53.84 - Config: configs/beit/upernet_beit-base_8x2_640x640_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-base_8x2_640x640_160k_ade20k/upernet_beit-base_8x2_640x640_160k_ade20k-eead221d.pth -- Name: upernet_beit-large_fp16_8x1_640x640_160k_ade20k - In Collection: UPerNet - Metadata: - backbone: BEiT-L - crop size: (640,640) - lr schd: 320000 - inference time (ms/im): - - value: 1041.67 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP16 - resolution: (640,640) - Training Memory (GB): 22.64 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 56.33 - mIoU(ms+flip): 56.84 - Config: configs/beit/upernet_beit-large_fp16_8x1_640x640_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-large_fp16_8x1_640x640_160k_ade20k/upernet_beit-large_fp16_8x1_640x640_160k_ade20k-8fc0dd5d.pth diff --git a/configs/beit/metafile.yaml b/configs/beit/metafile.yaml new file mode 100644 index 0000000000..ef6124e8dc --- /dev/null +++ b/configs/beit/metafile.yaml @@ -0,0 +1,49 @@ +Models: +- Name: beit-base_upernet_8xb2-160k_ade20k-640x640 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 53.08 + mIoU(ms+flip): 53.84 + Config: configs/beit/beit-base_upernet_8xb2-160k_ade20k-640x640.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - BEiT-B + - UPerNet + Training Resources: 8x V100 GPUS + Memory (GB): 15.88 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-base_8x2_640x640_160k_ade20k/upernet_beit-base_8x2_640x640_160k_ade20k-eead221d.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-base_8x2_640x640_160k_ade20k/upernet_beit-base_8x2_640x640_160k_ade20k.log.json + Paper: + Title: 'BEiT: BERT Pre-Training of Image Transformers' + URL: https://arxiv.org/abs/2106.08254 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.23.0/mmseg/models/backbones/beit.py#L1404 + Framework: PyTorch +- Name: beit-large_upernet_8xb1-amp-160k_ade20k-640x640 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 56.33 + mIoU(ms+flip): 56.84 + Config: configs/beit/beit-large_upernet_8xb1-amp-160k_ade20k-640x640.py + Metadata: + Training Data: ADE20K + Batch Size: 8 + Architecture: + - BEiT-L + - UPerNet + Training Resources: 8x V100 GPUS + Memory (GB): 22.64 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-large_fp16_8x1_640x640_160k_ade20k/upernet_beit-large_fp16_8x1_640x640_160k_ade20k-8fc0dd5d.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/beit/upernet_beit-large_fp16_8x1_640x640_160k_ade20k/upernet_beit-large_fp16_8x1_640x640_160k_ade20k.log.json + Paper: + Title: 'BEiT: BERT Pre-Training of Image Transformers' + URL: https://arxiv.org/abs/2106.08254 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.23.0/mmseg/models/backbones/beit.py#L1404 + Framework: PyTorch diff --git a/configs/beit/upernet_beit-base_640x640_160k_ade20k_ms.py 
b/configs/beit/upernet_beit-base_640x640_160k_ade20k_ms.py deleted file mode 100644 index 323cdb13be..0000000000 --- a/configs/beit/upernet_beit-base_640x640_160k_ade20k_ms.py +++ /dev/null @@ -1,16 +0,0 @@ -_base_ = './upernet_beit-base_8x2_640x640_160k_ade20k.py' - -test_pipeline = [ - dict(type='LoadImageFromFile'), - # TODO: Refactor 'MultiScaleFlipAug' which supports - # `min_size` feature in `Resize` class - # img_ratios is [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] - # original image scale is (2560, 640) - dict(type='Resize', scale=(2560, 640), keep_ratio=True), - # add loading annotation after ``Resize`` because ground truth - # does not need to do resize data transform - dict(type='LoadAnnotations', reduce_zero_label=True), - dict(type='PackSegInputs'), -] -val_dataloader = dict(batch_size=1, dataset=dict(pipeline=test_pipeline)) -test_dataloader = val_dataloader diff --git a/configs/beit/upernet_beit-large_fp16_640x640_160k_ade20k_ms.py b/configs/beit/upernet_beit-large_fp16_640x640_160k_ade20k_ms.py deleted file mode 100644 index 279e7ace26..0000000000 --- a/configs/beit/upernet_beit-large_fp16_640x640_160k_ade20k_ms.py +++ /dev/null @@ -1,16 +0,0 @@ -_base_ = './upernet_beit-large_fp16_8x1_640x640_160k_ade20k.py' - -test_pipeline = [ - dict(type='LoadImageFromFile'), - # TODO: Refactor 'MultiScaleFlipAug' which supports - # `min_size` feature in `Resize` class - # img_ratios is [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] - # original image scale is (2560, 640) - dict(type='Resize', scale=(2560, 640), keep_ratio=True), - # add loading annotation after ``Resize`` because ground truth - # does not need to do resize data transform - dict(type='LoadAnnotations', reduce_zero_label=True), - dict(type='PackSegInputs'), -] -val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) -test_dataloader = val_dataloader diff --git a/configs/bisenetv1/README.md b/configs/bisenetv1/README.md index 58092d6bcc..a5058957f0 100644 --- a/configs/bisenetv1/README.md +++ b/configs/bisenetv1/README.md @@ -1,6 +1,6 @@ # BiSeNetV1 -[BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation](https://arxiv.org/abs/1808.00897) +> [BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation](https://arxiv.org/abs/1808.00897) ## Introduction @@ -22,43 +22,43 @@ Semantic segmentation requires both rich spatial information and sizeable recept -## Citation - -```bibtex -@inproceedings{yu2018bisenet, - title={Bisenet: Bilateral segmentation network for real-time semantic segmentation}, - author={Yu, Changqian and Wang, Jingbo and Peng, Chao and Gao, Changxin and Yu, Gang and Sang, Nong}, - booktitle={Proceedings of the European conference on computer vision (ECCV)}, - pages={325--341}, - year={2018} -} -``` - ## Results and models ### Cityscapes -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ----------------------- | -------- | --------- | ------: | -------- | -------------- | ----: | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------ | 
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| BiSeNetV1 (No Pretrain) | R-18-D32 | 1024x1024 | 160000 | 5.69 | 31.77 | 74.44 | 77.05 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239-c55e78e2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239.log.json) | -| BiSeNetV1 | R-18-D32 | 1024x1024 | 160000 | 5.69 | 31.77 | 74.37 | 76.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251-8ba80eff.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251.log.json) | -| BiSeNetV1 (4x8) | R-18-D32 | 1024x1024 | 160000 | 11.17 | 31.77 | 75.16 | 77.24 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322-bb8db75f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322.log.json) | -| BiSeNetV1 (No Pretrain) | R-50-D32 | 1024x1024 | 160000 | 15.39 | 7.71 | 76.92 | 78.87 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639-7b28a2a6.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639.log.json) | -| BiSeNetV1 | R-50-D32 | 1024x1024 | 160000 | 15.39 | 7.71 | 77.68 | 79.57 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628-8b304447.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| --------- | ---------------------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| BiSeNetV1 | R-18-D32 (No Pretrain) | 1024x1024 | 160000 | 5.69 | 31.77 | V100 | 74.44 | 77.05 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/bisenetv1/bisenetv1_r18-d32_4xb4-160k_cityscapes-1024x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239-c55e78e2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239.log.json) | +| BiSeNetV1 | R-18-D32 | 1024x1024 | 160000 | 5.69 | 31.77 | V100 | 74.37 | 76.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/bisenetv1/bisenetv1_r18-d32-in1k-pre_4xb4-160k_cityscapes-1024x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251-8ba80eff.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251.log.json) | +| BiSeNetV1 | R-18-D32 (4x8) | 1024x1024 | 160000 | 11.17 | 31.77 | V100 | 75.16 | 77.24 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/bisenetv1/bisenetv1_r18-d32-in1k-pre_4xb8-160k_cityscapes-1024x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322-bb8db75f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322.log.json) | +| BiSeNetV1 | R-50-D32 (No Pretrain) | 1024x1024 | 160000 | 15.39 | 7.71 | V100 | 76.92 | 78.87 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/bisenetv1/bisenetv1_r50-d32_4xb4-160k_cityscapes-1024x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639-7b28a2a6.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639.log.json) | +| BiSeNetV1 | R-50-D32 | 1024x1024 | 160000 | 15.39 | 7.71 | V100 | 77.68 | 79.57 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/bisenetv1/bisenetv1_r50-d32-in1k-pre_4xb4-160k_cityscapes-1024x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628-8b304447.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628.log.json) | ### COCO-Stuff 164k -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ----------------------- | --------- | --------- | ------: | -------- | -------------- | ----: | ------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| BiSeNetV1 (No Pretrain) | R-18-D32 | 512x512 | 160000 | - | - | 25.45 | 26.15 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r18-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211022_054328-046aa2f2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r18-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211022_054328.log.json) | -| BiSeNetV1 | R-18-D32 | 512x512 | 160000 | 6.33 | 74.24 | 28.55 | 29.26 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r18-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211023_013100-f700dbf7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r18-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211023_013100.log.json) | -| BiSeNetV1 (No Pretrain) | R-50-D32 | 512x512 | 160000 | - | - | 29.82 | 30.33 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r50-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r50-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211101_040616-d2bb0df4.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r50-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211101_040616.log.json) | -| BiSeNetV1 | R-50-D32 | 512x512 | 160000 | 9.28 | 32.60 | 34.88 | 35.37 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r50-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211101_181932-66747911.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r50-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211101_181932.log.json) | -| BiSeNetV1 (No Pretrain) | R-101-D32 | 512x512 | 160000 | - | - | 31.14 | 31.76 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r101-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r101-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r101-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211102_164147-c6b32c3b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r101-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r101-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211102_164147.log.json) | -| BiSeNetV1 | R-101-D32 | 512x512 | 160000 | 10.36 | 25.25 | 37.38 | 37.99 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1/bisenetv1_r101-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r101-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r101-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211101_225220-28c8f092.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r101-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r101-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211101_225220.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| --------- | ----------------------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| BiSeNetV1 | R-18-D32 (No Pretrain) | 512x512 | 160000 | - | - | V100 | 25.45 | 26.15 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/bisenetv1/bisenetv1_r18-d32_4xb4-160k_coco-stuff164k-512x512.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r18-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211022_054328-046aa2f2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r18-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211022_054328.log.json) | +| BiSeNetV1 | R-18-D32 | 512x512 | 160000 | 6.33 | 74.24 | V100 | 28.55 | 29.26 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/bisenetv1/bisenetv1_r18-d32-in1k-pre_4xb4-160k_coco-stuff164k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r18-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211023_013100-f700dbf7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r18-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211023_013100.log.json) | +| BiSeNetV1 | R-50-D32 (No Pretrain) | 512x512 | 160000 | - | - | V100 | 29.82 | 30.33 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/bisenetv1/bisenetv1_r50-d32_4xb4-160k_coco-stuff164k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r50-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211101_040616-d2bb0df4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r50-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211101_040616.log.json) | +| BiSeNetV1 | R-50-D32 | 512x512 | 160000 | 9.28 | 32.60 | V100 | 34.88 | 35.37 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/bisenetv1/bisenetv1_r50-d32-in1k-pre_4xb4-160k_coco-stuff164k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r50-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211101_181932-66747911.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r50-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211101_181932.log.json) | +| BiSeNetV1 | R-101-D32 (No Pretrain) | 512x512 | 160000 | - | - | V100 | 31.14 | 31.76 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/bisenetv1/bisenetv1_r101-d32_4xb4-160k_coco-stuff164k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r101-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r101-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211102_164147-c6b32c3b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r101-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r101-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211102_164147.log.json) | +| BiSeNetV1 | R-101-D32 | 512x512 | 160000 | 10.36 | 25.25 | V100 | 37.38 | 37.99 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/bisenetv1/bisenetv1_r101-d32-in1k-pre_4xb4-160k_coco-stuff164k-512x512.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r101-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r101-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211101_225220-28c8f092.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r101-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r101-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211101_225220.log.json) | Note: - `4x8`: Using 4 GPUs with 8 samples per GPU in training. - For BiSeNetV1 on Cityscapes dataset, default setting is 4 GPUs with 4 samples per GPU in training. - `No Pretrain` means the model is trained from scratch. + +## Citation + +```bibtex +@inproceedings{yu2018bisenet, + title={Bisenet: Bilateral segmentation network for real-time semantic segmentation}, + author={Yu, Changqian and Wang, Jingbo and Peng, Chao and Gao, Changxin and Yu, Gang and Sang, Nong}, + booktitle={Proceedings of the European conference on computer vision (ECCV)}, + pages={325--341}, + year={2018} +} +``` diff --git a/configs/bisenetv1/bisenetv1.yml b/configs/bisenetv1/bisenetv1.yml deleted file mode 100644 index 61f264b056..0000000000 --- a/configs/bisenetv1/bisenetv1.yml +++ /dev/null @@ -1,234 +0,0 @@ -Collections: -- Name: BiSeNetV1 - Metadata: - Training Data: - - Cityscapes - - COCO-Stuff 164k - Paper: - URL: https://arxiv.org/abs/1808.00897 - Title: 'BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation' - README: configs/bisenetv1/README.md - Code: - URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/backbones/bisenetv1.py#L266 - Version: v0.18.0 - Converted From: - Code: https://github.com/ycszen/TorchSeg/tree/master/model/bisenet -Models: -- Name: bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes - In Collection: BiSeNetV1 - Metadata: - backbone: R-18-D32 - crop size: (1024,1024) - lr schd: 160000 - inference time (ms/im): - - value: 31.48 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (1024,1024) - Training Memory (GB): 5.69 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 74.44 - mIoU(ms+flip): 77.05 - Config: configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239-c55e78e2.pth -- Name: bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes - In Collection: BiSeNetV1 - Metadata: - backbone: R-18-D32 - crop size: (1024,1024) - lr schd: 160000 - inference time (ms/im): - - value: 31.48 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (1024,1024) - Training Memory (GB): 5.69 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 74.37 - mIoU(ms+flip): 76.91 - Config: configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251-8ba80eff.pth -- Name: bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes - In Collection: BiSeNetV1 - Metadata: - backbone: R-18-D32 - crop size: (1024,1024) - lr schd: 160000 - inference time (ms/im): - - value: 31.48 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (1024,1024) - Training Memory (GB): 11.17 
- Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 75.16 - mIoU(ms+flip): 77.24 - Config: configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322-bb8db75f.pth -- Name: bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes - In Collection: BiSeNetV1 - Metadata: - backbone: R-50-D32 - crop size: (1024,1024) - lr schd: 160000 - inference time (ms/im): - - value: 129.7 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (1024,1024) - Training Memory (GB): 15.39 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 76.92 - mIoU(ms+flip): 78.87 - Config: configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639-7b28a2a6.pth -- Name: bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes - In Collection: BiSeNetV1 - Metadata: - backbone: R-50-D32 - crop size: (1024,1024) - lr schd: 160000 - inference time (ms/im): - - value: 129.7 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (1024,1024) - Training Memory (GB): 15.39 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 77.68 - mIoU(ms+flip): 79.57 - Config: configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628-8b304447.pth -- Name: bisenetv1_r18-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k - In Collection: BiSeNetV1 - Metadata: - backbone: R-18-D32 - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: COCO-Stuff 164k - Metrics: - mIoU: 25.45 - mIoU(ms+flip): 26.15 - Config: configs/bisenetv1/bisenetv1_r18-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r18-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211022_054328-046aa2f2.pth -- Name: bisenetv1_r18-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k - In Collection: BiSeNetV1 - Metadata: - backbone: R-18-D32 - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 13.47 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 6.33 - Results: - - Task: Semantic Segmentation - Dataset: COCO-Stuff 164k - Metrics: - mIoU: 28.55 - mIoU(ms+flip): 29.26 - Config: configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r18-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211023_013100-f700dbf7.pth -- Name: bisenetv1_r50-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k - In Collection: BiSeNetV1 - Metadata: - backbone: R-50-D32 - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: COCO-Stuff 164k - Metrics: - mIoU: 29.82 - mIoU(ms+flip): 30.33 - Config: 
configs/bisenetv1/bisenetv1_r50-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r50-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211101_040616-d2bb0df4.pth -- Name: bisenetv1_r50-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k - In Collection: BiSeNetV1 - Metadata: - backbone: R-50-D32 - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 30.67 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 9.28 - Results: - - Task: Semantic Segmentation - Dataset: COCO-Stuff 164k - Metrics: - mIoU: 34.88 - mIoU(ms+flip): 35.37 - Config: configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r50-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211101_181932-66747911.pth -- Name: bisenetv1_r101-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k - In Collection: BiSeNetV1 - Metadata: - backbone: R-101-D32 - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: COCO-Stuff 164k - Metrics: - mIoU: 31.14 - mIoU(ms+flip): 31.76 - Config: configs/bisenetv1/bisenetv1_r101-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r101-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r101-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211102_164147-c6b32c3b.pth -- Name: bisenetv1_r101-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k - In Collection: BiSeNetV1 - Metadata: - backbone: R-101-D32 - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 39.6 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 10.36 - Results: - - Task: Semantic Segmentation - Dataset: COCO-Stuff 164k - Metrics: - mIoU: 37.38 - mIoU(ms+flip): 37.99 - Config: configs/bisenetv1/bisenetv1_r101-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r101-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r101-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211101_225220-28c8f092.pth diff --git a/configs/bisenetv1/bisenetv1_r101-d32-in1k-pre_4xb4-160k_coco-stuff164k-512x512.py b/configs/bisenetv1/bisenetv1_r101-d32-in1k-pre_4xb4-160k_coco-stuff164k-512x512.py new file mode 100644 index 0000000000..ac63447d47 --- /dev/null +++ b/configs/bisenetv1/bisenetv1_r101-d32-in1k-pre_4xb4-160k_coco-stuff164k-512x512.py @@ -0,0 +1,6 @@ +_base_ = './bisenetv1_r101-d32_4xb4-160k_coco-stuff164k-512x512.py' +model = dict( + backbone=dict( + backbone_cfg=dict( + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnet101_v1c')))) diff --git a/configs/bisenetv1/bisenetv1_r101-d32_4xb4-160k_coco-stuff164k-512x512.py b/configs/bisenetv1/bisenetv1_r101-d32_4xb4-160k_coco-stuff164k-512x512.py new file mode 100644 index 0000000000..02e4e9be05 --- /dev/null +++ b/configs/bisenetv1/bisenetv1_r101-d32_4xb4-160k_coco-stuff164k-512x512.py @@ -0,0 +1,58 @@ +_base_ = [ + '../_base_/models/bisenetv1_r18-d32.py', + '../_base_/datasets/coco-stuff164k.py', '../_base_/default_runtime.py', + '../_base_/schedules/schedule_160k.py' +] +crop_size = (512, 512) +data_preprocessor = 
dict(size=crop_size) +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + data_preprocessor=data_preprocessor, + backbone=dict( + context_channels=(512, 1024, 2048), + spatial_channels=(256, 256, 256, 512), + out_channels=1024, + backbone_cfg=dict(type='ResNet', depth=101)), + decode_head=dict(in_channels=1024, channels=1024, num_classes=171), + auxiliary_head=[ + dict( + type='FCNHead', + in_channels=512, + channels=256, + num_convs=1, + num_classes=171, + in_index=1, + norm_cfg=norm_cfg, + concat_input=False, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + dict( + type='FCNHead', + in_channels=512, + channels=256, + num_convs=1, + num_classes=171, + in_index=2, + norm_cfg=norm_cfg, + concat_input=False, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + ]) +param_scheduler = [ + dict(type='LinearLR', by_epoch=False, start_factor=0.1, begin=0, end=1000), + dict( + type='PolyLR', + eta_min=1e-4, + power=0.9, + begin=1000, + end=160000, + by_epoch=False, + ) +] +optimizer = dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0005) +optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer) +train_dataloader = dict(batch_size=4, num_workers=4) +val_dataloader = dict(batch_size=1, num_workers=4) +test_dataloader = val_dataloader diff --git a/configs/bisenetv1/bisenetv1_r101-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k.py b/configs/bisenetv1/bisenetv1_r101-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k.py deleted file mode 100644 index c3fe21597d..0000000000 --- a/configs/bisenetv1/bisenetv1_r101-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k.py +++ /dev/null @@ -1,6 +0,0 @@ -_base_ = './bisenetv1_r101-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k.py' -model = dict( - backbone=dict( - backbone_cfg=dict( - init_cfg=dict( - type='Pretrained', checkpoint='open-mmlab://resnet101_v1c')))) diff --git a/configs/bisenetv1/bisenetv1_r101-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k.py b/configs/bisenetv1/bisenetv1_r101-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k.py deleted file mode 100644 index c52fffd53f..0000000000 --- a/configs/bisenetv1/bisenetv1_r101-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k.py +++ /dev/null @@ -1,35 +0,0 @@ -_base_ = [ - '../_base_/models/bisenetv1_r18-d32.py', - '../_base_/datasets/coco-stuff164k.py', '../_base_/default_runtime.py', - '../_base_/schedules/schedule_160k.py' -] -crop_size = (512, 512) -data_preprocessor = dict(size=crop_size) -model = dict( - data_preprocessor=data_preprocessor, - backbone=dict( - context_channels=(512, 1024, 2048), - spatial_channels=(256, 256, 256, 512), - out_channels=1024, - backbone_cfg=dict(type='ResNet', depth=101)), - decode_head=dict(in_channels=1024, channels=1024, num_classes=171), - auxiliary_head=[ - dict(in_channels=512, channels=256, num_classes=171), - dict(in_channels=512, channels=256, num_classes=171), - ]) -param_scheduler = [ - dict(type='LinearLR', by_epoch=False, start_factor=0.1, begin=0, end=1000), - dict( - type='PolyLR', - eta_min=1e-4, - power=0.9, - begin=1000, - end=160000, - by_epoch=False, - ) -] -optimizer = dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0005) -optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer) -train_dataloader = dict(batch_size=4, num_workers=4) -val_dataloader = dict(batch_size=1, num_workers=4) -test_dataloader = val_dataloader diff --git a/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py 
b/configs/bisenetv1/bisenetv1_r18-d32-in1k-pre_4xb4-160k_cityscapes-1024x1024.py similarity index 100% rename from configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py rename to configs/bisenetv1/bisenetv1_r18-d32-in1k-pre_4xb4-160k_cityscapes-1024x1024.py diff --git a/configs/bisenetv1/bisenetv1_r18-d32-in1k-pre_4xb4-160k_coco-stuff164k-512x512.py b/configs/bisenetv1/bisenetv1_r18-d32-in1k-pre_4xb4-160k_coco-stuff164k-512x512.py new file mode 100644 index 0000000000..9de889f001 --- /dev/null +++ b/configs/bisenetv1/bisenetv1_r18-d32-in1k-pre_4xb4-160k_coco-stuff164k-512x512.py @@ -0,0 +1,10 @@ +_base_ = './bisenetv1_r18-d32_4xb4-160k_coco-stuff164k-512x512.py' +crop_size = (512, 512) +data_preprocessor = dict(size=crop_size) +model = dict( + data_preprocessor=data_preprocessor, + backbone=dict( + backbone_cfg=dict( + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnet18_v1c'))), +) diff --git a/configs/bisenetv1/bisenetv1_r18-d32-in1k-pre_4xb8-160k_cityscapes-1024x1024.py b/configs/bisenetv1/bisenetv1_r18-d32-in1k-pre_4xb8-160k_cityscapes-1024x1024.py new file mode 100644 index 0000000000..0580ce11e6 --- /dev/null +++ b/configs/bisenetv1/bisenetv1_r18-d32-in1k-pre_4xb8-160k_cityscapes-1024x1024.py @@ -0,0 +1,4 @@ +_base_ = './bisenetv1_r18-d32-in1k-pre_4xb4-160k_cityscapes-1024x1024.py' +train_dataloader = dict(batch_size=8, num_workers=4) +val_dataloader = dict(batch_size=1, num_workers=4) +test_dataloader = val_dataloader diff --git a/configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py b/configs/bisenetv1/bisenetv1_r18-d32_4xb4-160k_cityscapes-1024x1024.py similarity index 100% rename from configs/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes.py rename to configs/bisenetv1/bisenetv1_r18-d32_4xb4-160k_cityscapes-1024x1024.py diff --git a/configs/bisenetv1/bisenetv1_r18-d32_4xb4-160k_coco-stuff164k-512x512.py b/configs/bisenetv1/bisenetv1_r18-d32_4xb4-160k_coco-stuff164k-512x512.py new file mode 100644 index 0000000000..2109d689d0 --- /dev/null +++ b/configs/bisenetv1/bisenetv1_r18-d32_4xb4-160k_coco-stuff164k-512x512.py @@ -0,0 +1,53 @@ +_base_ = [ + '../_base_/models/bisenetv1_r18-d32.py', + '../_base_/datasets/coco-stuff164k.py', '../_base_/default_runtime.py', + '../_base_/schedules/schedule_160k.py' +] +crop_size = (512, 512) +data_preprocessor = dict(size=crop_size) +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=171), + auxiliary_head=[ + dict( + type='FCNHead', + in_channels=128, + channels=64, + num_convs=1, + num_classes=171, + in_index=1, + norm_cfg=norm_cfg, + concat_input=False, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + dict( + type='FCNHead', + in_channels=128, + channels=64, + num_convs=1, + num_classes=171, + in_index=2, + norm_cfg=norm_cfg, + concat_input=False, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + ]) +param_scheduler = [ + dict(type='LinearLR', by_epoch=False, start_factor=0.1, begin=0, end=1000), + dict( + type='PolyLR', + eta_min=1e-4, + power=0.9, + begin=1000, + end=160000, + by_epoch=False, + ) +] +optimizer = dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0005) +optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer) +train_dataloader = dict(batch_size=4, num_workers=4) +val_dataloader = dict(batch_size=1, num_workers=4) +test_dataloader = val_dataloader 
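The new COCO-Stuff configs above expand the decode and auxiliary heads to 171 classes and set the per-GPU batch size that the `4xb4` filename encodes. As a minimal sketch, assuming an mmsegmentation 1.x checkout with mmengine installed, the merged result of the `_base_` chain can be inspected with `Config.fromfile`:

```python
# Minimal sketch: inspect the merged config of the new COCO-Stuff variant.
# Assumes the working directory is an mmsegmentation 1.x checkout and
# mmengine is installed; Config.fromfile resolves the _base_ chain.
from mmengine.config import Config

cfg = Config.fromfile(
    'configs/bisenetv1/bisenetv1_r18-d32_4xb4-160k_coco-stuff164k-512x512.py')
print(cfg.model.decode_head.num_classes)  # 171 COCO-Stuff classes
print(cfg.train_dataloader.batch_size)    # 4 per GPU, the "b4" in "4xb4"
print(len(cfg.model.auxiliary_head))      # 2 auxiliary FCN heads
```
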
diff --git a/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py b/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py deleted file mode 100644 index d37b3c5d20..0000000000 --- a/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes.py +++ /dev/null @@ -1,4 +0,0 @@ -_base_ = './bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py' -train_dataloader = dict(batch_size=8, num_workers=4) -val_dataloader = dict(batch_size=1, num_workers=4) -test_dataloader = val_dataloader diff --git a/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k.py b/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k.py deleted file mode 100644 index 7b686add5c..0000000000 --- a/configs/bisenetv1/bisenetv1_r18-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k.py +++ /dev/null @@ -1,10 +0,0 @@ -_base_ = './bisenetv1_r18-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k.py' -crop_size = (512, 512) -data_preprocessor = dict(size=crop_size) -model = dict( - data_preprocessor=data_preprocessor, - backbone=dict( - backbone_cfg=dict( - init_cfg=dict( - type='Pretrained', checkpoint='open-mmlab://resnet18_v1c'))), -) diff --git a/configs/bisenetv1/bisenetv1_r18-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k.py b/configs/bisenetv1/bisenetv1_r18-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k.py deleted file mode 100644 index edd516a2fc..0000000000 --- a/configs/bisenetv1/bisenetv1_r18-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k.py +++ /dev/null @@ -1,30 +0,0 @@ -_base_ = [ - '../_base_/models/bisenetv1_r18-d32.py', - '../_base_/datasets/coco-stuff164k.py', '../_base_/default_runtime.py', - '../_base_/schedules/schedule_160k.py' -] -crop_size = (512, 512) -data_preprocessor = dict(size=crop_size) -model = dict( - data_preprocessor=data_preprocessor, - decode_head=dict(num_classes=171), - auxiliary_head=[ - dict(num_classes=171), - dict(num_classes=171), - ]) -param_scheduler = [ - dict(type='LinearLR', by_epoch=False, start_factor=0.1, begin=0, end=1000), - dict( - type='PolyLR', - eta_min=1e-4, - power=0.9, - begin=1000, - end=160000, - by_epoch=False, - ) -] -optimizer = dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0005) -optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer) -train_dataloader = dict(batch_size=4, num_workers=4) -val_dataloader = dict(batch_size=1, num_workers=4) -test_dataloader = val_dataloader diff --git a/configs/bisenetv1/bisenetv1_r50-d32-in1k-pre_4xb4-160k_cityscapes-1024x1024.py b/configs/bisenetv1/bisenetv1_r50-d32-in1k-pre_4xb4-160k_cityscapes-1024x1024.py new file mode 100644 index 0000000000..013c4ff162 --- /dev/null +++ b/configs/bisenetv1/bisenetv1_r50-d32-in1k-pre_4xb4-160k_cityscapes-1024x1024.py @@ -0,0 +1,7 @@ +_base_ = './bisenetv1_r50-d32_4xb4-160k_cityscapes-1024x1024.py' +model = dict( + type='EncoderDecoder', + backbone=dict( + backbone_cfg=dict( + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnet50_v1c')))) diff --git a/configs/bisenetv1/bisenetv1_r50-d32-in1k-pre_4xb4-160k_coco-stuff164k-512x512.py b/configs/bisenetv1/bisenetv1_r50-d32-in1k-pre_4xb4-160k_coco-stuff164k-512x512.py new file mode 100644 index 0000000000..b35259c725 --- /dev/null +++ b/configs/bisenetv1/bisenetv1_r50-d32-in1k-pre_4xb4-160k_coco-stuff164k-512x512.py @@ -0,0 +1,7 @@ +_base_ = './bisenetv1_r50-d32_4xb4-160k_coco-stuff164k-512x512.py' + +model = dict( + backbone=dict( + backbone_cfg=dict( + init_cfg=dict( + type='Pretrained', 
checkpoint='open-mmlab://resnet50_v1c')))) diff --git a/configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py b/configs/bisenetv1/bisenetv1_r50-d32_4xb4-160k_cityscapes-1024x1024.py similarity index 100% rename from configs/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py rename to configs/bisenetv1/bisenetv1_r50-d32_4xb4-160k_cityscapes-1024x1024.py diff --git a/configs/bisenetv1/bisenetv1_r50-d32_4xb4-160k_coco-stuff164k-512x512.py b/configs/bisenetv1/bisenetv1_r50-d32_4xb4-160k_coco-stuff164k-512x512.py new file mode 100644 index 0000000000..8b6ef74c1a --- /dev/null +++ b/configs/bisenetv1/bisenetv1_r50-d32_4xb4-160k_coco-stuff164k-512x512.py @@ -0,0 +1,58 @@ +_base_ = [ + '../_base_/models/bisenetv1_r18-d32.py', + '../_base_/datasets/coco-stuff164k.py', '../_base_/default_runtime.py', + '../_base_/schedules/schedule_160k.py' +] +crop_size = (512, 512) +data_preprocessor = dict(size=crop_size) +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + data_preprocessor=data_preprocessor, + backbone=dict( + context_channels=(512, 1024, 2048), + spatial_channels=(256, 256, 256, 512), + out_channels=1024, + backbone_cfg=dict(type='ResNet', depth=50)), + decode_head=dict(in_channels=1024, channels=1024, num_classes=171), + auxiliary_head=[ + dict( + type='FCNHead', + in_channels=512, + channels=256, + num_convs=1, + num_classes=171, + in_index=1, + norm_cfg=norm_cfg, + concat_input=False, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + dict( + type='FCNHead', + in_channels=512, + channels=256, + num_convs=1, + num_classes=171, + in_index=2, + norm_cfg=norm_cfg, + concat_input=False, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + ]) +param_scheduler = [ + dict(type='LinearLR', by_epoch=False, start_factor=0.1, begin=0, end=1000), + dict( + type='PolyLR', + eta_min=1e-4, + power=0.9, + begin=1000, + end=160000, + by_epoch=False, + ) +] +optimizer = dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0005) +optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer) +train_dataloader = dict(batch_size=4, num_workers=4) +val_dataloader = dict(batch_size=1, num_workers=4) +test_dataloader = val_dataloader diff --git a/configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py b/configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py deleted file mode 100644 index 5625a76c08..0000000000 --- a/configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes.py +++ /dev/null @@ -1,7 +0,0 @@ -_base_ = './bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes.py' -model = dict( - type='EncoderDecoder', - backbone=dict( - backbone_cfg=dict( - init_cfg=dict( - type='Pretrained', checkpoint='open-mmlab://resnet50_v1c')))) diff --git a/configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k.py b/configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k.py deleted file mode 100644 index f0fea69f2f..0000000000 --- a/configs/bisenetv1/bisenetv1_r50-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k.py +++ /dev/null @@ -1,7 +0,0 @@ -_base_ = './bisenetv1_r50-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k.py' - -model = dict( - backbone=dict( - backbone_cfg=dict( - init_cfg=dict( - type='Pretrained', checkpoint='open-mmlab://resnet50_v1c')))) diff --git a/configs/bisenetv1/bisenetv1_r50-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k.py 
b/configs/bisenetv1/bisenetv1_r50-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k.py deleted file mode 100644 index 07900720c1..0000000000 --- a/configs/bisenetv1/bisenetv1_r50-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k.py +++ /dev/null @@ -1,35 +0,0 @@ -_base_ = [ - '../_base_/models/bisenetv1_r18-d32.py', - '../_base_/datasets/coco-stuff164k.py', '../_base_/default_runtime.py', - '../_base_/schedules/schedule_160k.py' -] -crop_size = (512, 512) -data_preprocessor = dict(size=crop_size) -model = dict( - data_preprocessor=data_preprocessor, - backbone=dict( - context_channels=(512, 1024, 2048), - spatial_channels=(256, 256, 256, 512), - out_channels=1024, - backbone_cfg=dict(type='ResNet', depth=50)), - decode_head=dict(in_channels=1024, channels=1024, num_classes=171), - auxiliary_head=[ - dict(in_channels=512, channels=256, num_classes=171), - dict(in_channels=512, channels=256, num_classes=171), - ]) -param_scheduler = [ - dict(type='LinearLR', by_epoch=False, start_factor=0.1, begin=0, end=1000), - dict( - type='PolyLR', - eta_min=1e-4, - power=0.9, - begin=1000, - end=160000, - by_epoch=False, - ) -] -optimizer = dict(type='SGD', lr=0.005, momentum=0.9, weight_decay=0.0005) -optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer) -train_dataloader = dict(batch_size=4, num_workers=4) -val_dataloader = dict(batch_size=1, num_workers=4) -test_dataloader = val_dataloader diff --git a/configs/bisenetv1/metafile.yaml b/configs/bisenetv1/metafile.yaml new file mode 100644 index 0000000000..e37f632b2f --- /dev/null +++ b/configs/bisenetv1/metafile.yaml @@ -0,0 +1,275 @@ +Collections: +- Name: BiSeNetV1 + License: Apache License 2.0 + Metadata: + Training Data: + - Cityscapes + - COCO-Stuff 164k + Paper: + Title: 'BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation' + URL: https://arxiv.org/abs/1808.00897 + README: configs/bisenetv1/README.md + Frameworks: + - PyTorch +Models: +- Name: bisenetv1_r18-d32_4xb4-160k_cityscapes-1024x1024 + In Collection: BiSeNetV1 + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 74.44 + mIoU(ms+flip): 77.05 + Config: configs/bisenetv1/bisenetv1_r18-d32_4xb4-160k_cityscapes-1024x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 16 + Architecture: + - R-18-D32 + - BiSeNetV1 + Training Resources: 4x V100 GPUS + Memory (GB): 5.69 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239-c55e78e2.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_4x4_1024x1024_160k_cityscapes_20210922_172239.log.json + Paper: + Title: 'BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation' + URL: https://arxiv.org/abs/1808.00897 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/backbones/bisenetv1.py#L266 + Framework: PyTorch +- Name: bisenetv1_r18-d32-in1k-pre_4xb4-160k_cityscapes-1024x1024 + In Collection: BiSeNetV1 + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 74.37 + mIoU(ms+flip): 76.91 + Config: configs/bisenetv1/bisenetv1_r18-d32-in1k-pre_4xb4-160k_cityscapes-1024x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 16 + Architecture: + - R-18-D32 + - BiSeNetV1 + Training Resources: 4x V100 GPUS + Memory (GB): 5.69 + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251-8ba80eff.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210905_220251.log.json + Paper: + Title: 'BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation' + URL: https://arxiv.org/abs/1808.00897 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/backbones/bisenetv1.py#L266 + Framework: PyTorch +- Name: bisenetv1_r18-d32-in1k-pre_4xb8-160k_cityscapes-1024x1024 + In Collection: BiSeNetV1 + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 75.16 + mIoU(ms+flip): 77.24 + Config: configs/bisenetv1/bisenetv1_r18-d32-in1k-pre_4xb8-160k_cityscapes-1024x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 32 + Architecture: + - R-18-D32 + - BiSeNetV1 + Training Resources: 4x V100 GPUS + Memory (GB): 11.17 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322-bb8db75f.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes/bisenetv1_r18-d32_in1k-pre_4x8_1024x1024_160k_cityscapes_20210905_220322.log.json + Paper: + Title: 'BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation' + URL: https://arxiv.org/abs/1808.00897 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/backbones/bisenetv1.py#L266 + Framework: PyTorch +- Name: bisenetv1_r50-d32_4xb4-160k_cityscapes-1024x1024 + In Collection: BiSeNetV1 + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 76.92 + mIoU(ms+flip): 78.87 + Config: configs/bisenetv1/bisenetv1_r50-d32_4xb4-160k_cityscapes-1024x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 16 + Architecture: + - R-50-D32 + - BiSeNetV1 + Training Resources: 4x V100 GPUS + Memory (GB): 15.39 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639-7b28a2a6.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_4x4_1024x1024_160k_cityscapes_20210923_222639.log.json + Paper: + Title: 'BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation' + URL: https://arxiv.org/abs/1808.00897 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/backbones/bisenetv1.py#L266 + Framework: PyTorch +- Name: bisenetv1_r50-d32-in1k-pre_4xb4-160k_cityscapes-1024x1024 + In Collection: BiSeNetV1 + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 77.68 + mIoU(ms+flip): 79.57 + Config: configs/bisenetv1/bisenetv1_r50-d32-in1k-pre_4xb4-160k_cityscapes-1024x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 16 + Architecture: + - R-50-D32 + - BiSeNetV1 + Training Resources: 4x V100 GPUS + Memory (GB): 15.39 + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628-8b304447.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes/bisenetv1_r50-d32_in1k-pre_4x4_1024x1024_160k_cityscapes_20210917_234628.log.json + Paper: + Title: 'BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation' + URL: https://arxiv.org/abs/1808.00897 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/backbones/bisenetv1.py#L266 + Framework: PyTorch +- Name: bisenetv1_r18-d32_4xb4-160k_coco-stuff164k-512x512 + In Collection: BiSeNetV1 + Results: + Task: Semantic Segmentation + Dataset: COCO-Stuff 164k + Metrics: + mIoU: 25.45 + mIoU(ms+flip): 26.15 + Config: configs/bisenetv1/bisenetv1_r18-d32_4xb4-160k_coco-stuff164k-512x512.py + Metadata: + Training Data: COCO-Stuff 164k + Batch Size: 16 + Architecture: + - R-18-D32 + - BiSeNetV1 + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r18-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211022_054328-046aa2f2.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r18-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211022_054328.log.json + Paper: + Title: 'BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation' + URL: https://arxiv.org/abs/1808.00897 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/backbones/bisenetv1.py#L266 + Framework: PyTorch +- Name: bisenetv1_r18-d32-in1k-pre_4xb4-160k_coco-stuff164k-512x512 + In Collection: BiSeNetV1 + Results: + Task: Semantic Segmentation + Dataset: COCO-Stuff 164k + Metrics: + mIoU: 28.55 + mIoU(ms+flip): 29.26 + Config: configs/bisenetv1/bisenetv1_r18-d32-in1k-pre_4xb4-160k_coco-stuff164k-512x512.py + Metadata: + Training Data: COCO-Stuff 164k + Batch Size: 16 + Architecture: + - R-18-D32 + - BiSeNetV1 + Training Resources: 4x V100 GPUS + Memory (GB): 6.33 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r18-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211023_013100-f700dbf7.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r18-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r18-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211023_013100.log.json + Paper: + Title: 'BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation' + URL: https://arxiv.org/abs/1808.00897 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/backbones/bisenetv1.py#L266 + Framework: PyTorch +- Name: bisenetv1_r50-d32_4xb4-160k_coco-stuff164k-512x512 + In Collection: BiSeNetV1 + Results: + Task: Semantic Segmentation + Dataset: COCO-Stuff 164k + Metrics: + mIoU: 29.82 + mIoU(ms+flip): 30.33 + Config: configs/bisenetv1/bisenetv1_r50-d32_4xb4-160k_coco-stuff164k-512x512.py + Metadata: + Training Data: COCO-Stuff 164k + Batch Size: 16 + Architecture: + - R-50-D32 + - BiSeNetV1 + Training Resources: 4x V100 GPUS + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r50-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211101_040616-d2bb0df4.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r50-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211101_040616.log.json + Paper: + Title: 'BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation' + URL: https://arxiv.org/abs/1808.00897 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/backbones/bisenetv1.py#L266 + Framework: PyTorch +- Name: bisenetv1_r50-d32-in1k-pre_4xb4-160k_coco-stuff164k-512x512 + In Collection: BiSeNetV1 + Results: + Task: Semantic Segmentation + Dataset: COCO-Stuff 164k + Metrics: + mIoU: 34.88 + mIoU(ms+flip): 35.37 + Config: configs/bisenetv1/bisenetv1_r50-d32-in1k-pre_4xb4-160k_coco-stuff164k-512x512.py + Metadata: + Training Data: COCO-Stuff 164k + Batch Size: 16 + Architecture: + - R-50-D32 + - BiSeNetV1 + Training Resources: 4x V100 GPUS + Memory (GB): 9.28 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r50-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211101_181932-66747911.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r50-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r50-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211101_181932.log.json + Paper: + Title: 'BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation' + URL: https://arxiv.org/abs/1808.00897 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/backbones/bisenetv1.py#L266 + Framework: PyTorch +- Name: bisenetv1_r101-d32_4xb4-160k_coco-stuff164k-512x512 + In Collection: BiSeNetV1 + Results: + Task: Semantic Segmentation + Dataset: COCO-Stuff 164k + Metrics: + mIoU: 31.14 + mIoU(ms+flip): 31.76 + Config: configs/bisenetv1/bisenetv1_r101-d32_4xb4-160k_coco-stuff164k-512x512.py + Metadata: + Training Data: COCO-Stuff 164k + Batch Size: 16 + Architecture: + - R-101-D32 + - BiSeNetV1 + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r101-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r101-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211102_164147-c6b32c3b.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r101-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r101-d32_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211102_164147.log.json + Paper: + Title: 'BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation' + URL: https://arxiv.org/abs/1808.00897 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/backbones/bisenetv1.py#L266 + Framework: PyTorch +- Name: bisenetv1_r101-d32-in1k-pre_4xb4-160k_coco-stuff164k-512x512 + In Collection: BiSeNetV1 + Results: + Task: Semantic Segmentation + Dataset: COCO-Stuff 164k + Metrics: + mIoU: 37.38 + mIoU(ms+flip): 37.99 + Config: configs/bisenetv1/bisenetv1_r101-d32-in1k-pre_4xb4-160k_coco-stuff164k-512x512.py + Metadata: + Training Data: COCO-Stuff 164k + Batch Size: 16 + Architecture: + - R-101-D32 + - BiSeNetV1 + Training Resources: 4x V100 GPUS + Memory (GB): 10.36 + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r101-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r101-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211101_225220-28c8f092.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv1/bisenetv1_r101-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k/bisenetv1_r101-d32_in1k-pre_lr5e-3_4x4_512x512_160k_coco-stuff164k_20211101_225220.log.json + Paper: + Title: 'BiSeNet: Bilateral Segmentation Network for Real-time Semantic Segmentation' + URL: https://arxiv.org/abs/1808.00897 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/backbones/bisenetv1.py#L266 + Framework: PyTorch diff --git a/configs/bisenetv2/README.md b/configs/bisenetv2/README.md index 6b74b7ee41..a5871dfeb9 100644 --- a/configs/bisenetv2/README.md +++ b/configs/bisenetv2/README.md @@ -1,6 +1,6 @@ # BiSeNetV2 -[Bisenet v2: Bilateral Network with Guided Aggregation for Real-time Semantic Segmentation](https://arxiv.org/abs/2004.02147) +> [Bisenet v2: Bilateral Network with Guided Aggregation for Real-time Semantic Segmentation](https://arxiv.org/abs/2004.02147) ## Introduction @@ -22,6 +22,23 @@ The low-level details and high-level semantics are both essential to the semanti +## Results and models + +### Cityscapes + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| --------- | ---------------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | --------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| BiSeNetV2 | BiSeNetV2 | 1024x1024 | 160000 | 7.64 | 31.77 | V100 | 73.21 | 75.74 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/bisenetv2/bisenetv2_fcn_4xb4-160k_cityscapes-1024x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv2/bisenetv2_fcn_4x4_1024x1024_160k_cityscapes/bisenetv2_fcn_4x4_1024x1024_160k_cityscapes_20210902_015551-bcf10f09.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv2/bisenetv2_fcn_4x4_1024x1024_160k_cityscapes/bisenetv2_fcn_4x4_1024x1024_160k_cityscapes_20210902_015551.log.json) | +| BiSeNetV2 | BiSeNetV2 (OHEM) | 1024x1024 | 160000 | 7.64 | - | V100 | 73.57 | 75.80 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/bisenetv2/bisenetv2_fcn_4xb4-ohem-160k_cityscapes-1024x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv2/bisenetv2_fcn_ohem_4x4_1024x1024_160k_cityscapes/bisenetv2_fcn_ohem_4x4_1024x1024_160k_cityscapes_20210902_112947-5f8103b4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv2/bisenetv2_fcn_ohem_4x4_1024x1024_160k_cityscapes/bisenetv2_fcn_ohem_4x4_1024x1024_160k_cityscapes_20210902_112947.log.json) | +| BiSeNetV2 | BiSeNetV2 (4x8) | 1024x1024 | 160000 | 15.05 | - | V100 | 75.76 | 77.79 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/bisenetv2/bisenetv2_fcn_4xb8-160k_cityscapes-1024x1024.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv2/bisenetv2_fcn_4x8_1024x1024_160k_cityscapes/bisenetv2_fcn_4x8_1024x1024_160k_cityscapes_20210903_000032-e1a2eed6.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv2/bisenetv2_fcn_4x8_1024x1024_160k_cityscapes/bisenetv2_fcn_4x8_1024x1024_160k_cityscapes_20210903_000032.log.json) | +| BiSeNetV2 | BiSeNetV2 (FP16) | 1024x1024 | 160000 | 5.77 | 36.65 | V100 | 73.07 | 75.13 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/bisenetv2/bisenetv2_fcn_4xb4-amp-160k_cityscapes-1024x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv2/bisenetv2_fcn_fp16_4x4_1024x1024_160k_cityscapes/bisenetv2_fcn_fp16_4x4_1024x1024_160k_cityscapes_20210902_045942-b979777b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv2/bisenetv2_fcn_fp16_4x4_1024x1024_160k_cityscapes/bisenetv2_fcn_fp16_4x4_1024x1024_160k_cityscapes_20210902_045942.log.json) | + +Note: + +- `OHEM` means Online Hard Example Mining (OHEM) is adopted in training. +- `FP16` means Mixed Precision (FP16) is adopted in training. +- `4x8` means 4 GPUs with 8 samples per GPU in training. + ## Citation ```bibtex @@ -34,20 +51,3 @@ The low-level details and high-level semantics are both essential to the semanti publisher={Springer} } ``` - -## Results and models - -### Cityscapes - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ---------------- | --------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ---------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| BiSeNetV2 | BiSeNetV2 | 1024x1024 | 160000 | 7.64 | 31.77 | 73.21 | 75.74 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv2/bisenetv2_fcn_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv2/bisenetv2_fcn_4x4_1024x1024_160k_cityscapes/bisenetv2_fcn_4x4_1024x1024_160k_cityscapes_20210902_015551-bcf10f09.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv2/bisenetv2_fcn_4x4_1024x1024_160k_cityscapes/bisenetv2_fcn_4x4_1024x1024_160k_cityscapes_20210902_015551.log.json) | -| BiSeNetV2 (OHEM) | BiSeNetV2 | 1024x1024 | 160000 | 7.64 | - | 73.57 | 75.80 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv2/bisenetv2_fcn_ohem_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv2/bisenetv2_fcn_ohem_4x4_1024x1024_160k_cityscapes/bisenetv2_fcn_ohem_4x4_1024x1024_160k_cityscapes_20210902_112947-5f8103b4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv2/bisenetv2_fcn_ohem_4x4_1024x1024_160k_cityscapes/bisenetv2_fcn_ohem_4x4_1024x1024_160k_cityscapes_20210902_112947.log.json) | -| BiSeNetV2 (4x8) | BiSeNetV2 | 1024x1024 | 160000 | 15.05 | - | 75.76 | 77.79 | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv2/bisenetv2_fcn_4x8_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv2/bisenetv2_fcn_4x8_1024x1024_160k_cityscapes/bisenetv2_fcn_4x8_1024x1024_160k_cityscapes_20210903_000032-e1a2eed6.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv2/bisenetv2_fcn_4x8_1024x1024_160k_cityscapes/bisenetv2_fcn_4x8_1024x1024_160k_cityscapes_20210903_000032.log.json) | -| BiSeNetV2 (FP16) | BiSeNetV2 | 1024x1024 | 160000 | 5.77 | 36.65 | 73.07 | 75.13 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv2/bisenetv2_fcn_fp16_4x4_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv2/bisenetv2_fcn_fp16_4x4_1024x1024_160k_cityscapes/bisenetv2_fcn_fp16_4x4_1024x1024_160k_cityscapes_20210902_045942-b979777b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv2/bisenetv2_fcn_fp16_4x4_1024x1024_160k_cityscapes/bisenetv2_fcn_fp16_4x4_1024x1024_160k_cityscapes_20210902_045942.log.json) | - -Note: - -- `OHEM` means Online Hard Example Mining (OHEM) is adopted in training. -- `FP16` means Mixed Precision (FP16) is adopted in training. -- `4x8` means 4 GPUs with 8 samples per GPU in training. diff --git a/configs/bisenetv2/bisenetv2.yml b/configs/bisenetv2/bisenetv2.yml deleted file mode 100644 index 455fa6c479..0000000000 --- a/configs/bisenetv2/bisenetv2.yml +++ /dev/null @@ -1,88 +0,0 @@ -Collections: -- Name: BiSeNetV2 - Metadata: - Training Data: - - Cityscapes - Paper: - URL: https://arxiv.org/abs/2004.02147 - Title: 'Bisenet v2: Bilateral Network with Guided Aggregation for Real-time Semantic - Segmentation' - README: configs/bisenetv2/README.md - Code: - URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/backbones/bisenetv2.py#L545 - Version: v0.18.0 -Models: -- Name: bisenetv2_fcn_4x4_1024x1024_160k_cityscapes - In Collection: BiSeNetV2 - Metadata: - backbone: BiSeNetV2 - crop size: (1024,1024) - lr schd: 160000 - inference time (ms/im): - - value: 31.48 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (1024,1024) - Training Memory (GB): 7.64 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 73.21 - mIoU(ms+flip): 75.74 - Config: configs/bisenetv2/bisenetv2_fcn_4x4_1024x1024_160k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv2/bisenetv2_fcn_4x4_1024x1024_160k_cityscapes/bisenetv2_fcn_4x4_1024x1024_160k_cityscapes_20210902_015551-bcf10f09.pth -- Name: bisenetv2_fcn_ohem_4x4_1024x1024_160k_cityscapes - In Collection: BiSeNetV2 - Metadata: - backbone: BiSeNetV2 - crop size: (1024,1024) - lr schd: 160000 - Training Memory (GB): 7.64 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 73.57 - mIoU(ms+flip): 75.8 - Config: configs/bisenetv2/bisenetv2_fcn_ohem_4x4_1024x1024_160k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv2/bisenetv2_fcn_ohem_4x4_1024x1024_160k_cityscapes/bisenetv2_fcn_ohem_4x4_1024x1024_160k_cityscapes_20210902_112947-5f8103b4.pth -- Name: bisenetv2_fcn_4x8_1024x1024_160k_cityscapes - In Collection: BiSeNetV2 - Metadata: - backbone: BiSeNetV2 - crop size: (1024,1024) - lr schd: 160000 - Training Memory (GB): 15.05 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 75.76 - mIoU(ms+flip): 77.79 - Config: 
configs/bisenetv2/bisenetv2_fcn_4x8_1024x1024_160k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv2/bisenetv2_fcn_4x8_1024x1024_160k_cityscapes/bisenetv2_fcn_4x8_1024x1024_160k_cityscapes_20210903_000032-e1a2eed6.pth -- Name: bisenetv2_fcn_fp16_4x4_1024x1024_160k_cityscapes - In Collection: BiSeNetV2 - Metadata: - backbone: BiSeNetV2 - crop size: (1024,1024) - lr schd: 160000 - inference time (ms/im): - - value: 27.29 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP16 - resolution: (1024,1024) - Training Memory (GB): 5.77 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 73.07 - mIoU(ms+flip): 75.13 - Config: configs/bisenetv2/bisenetv2_fcn_fp16_4x4_1024x1024_160k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv2/bisenetv2_fcn_fp16_4x4_1024x1024_160k_cityscapes/bisenetv2_fcn_fp16_4x4_1024x1024_160k_cityscapes_20210902_045942-b979777b.pth diff --git a/configs/bisenetv2/bisenetv2_fcn_4x4_1024x1024_160k_cityscapes.py b/configs/bisenetv2/bisenetv2_fcn_4xb4-160k_cityscapes-1024x1024.py similarity index 100% rename from configs/bisenetv2/bisenetv2_fcn_4x4_1024x1024_160k_cityscapes.py rename to configs/bisenetv2/bisenetv2_fcn_4xb4-160k_cityscapes-1024x1024.py diff --git a/configs/bisenetv2/bisenetv2_fcn_4xb4-amp-160k_cityscapes-1024x1024.py b/configs/bisenetv2/bisenetv2_fcn_4xb4-amp-160k_cityscapes-1024x1024.py new file mode 100644 index 0000000000..8ed338c00b --- /dev/null +++ b/configs/bisenetv2/bisenetv2_fcn_4xb4-amp-160k_cityscapes-1024x1024.py @@ -0,0 +1,6 @@ +_base_ = './bisenetv2_fcn_4xb4-160k_cityscapes-1024x1024.py' +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='SGD', lr=0.05, momentum=0.9, weight_decay=0.0005), + loss_scale=512.) diff --git a/configs/bisenetv2/bisenetv2_fcn_ohem_4x4_1024x1024_160k_cityscapes.py b/configs/bisenetv2/bisenetv2_fcn_4xb4-ohem-160k_cityscapes-1024x1024.py similarity index 100% rename from configs/bisenetv2/bisenetv2_fcn_ohem_4x4_1024x1024_160k_cityscapes.py rename to configs/bisenetv2/bisenetv2_fcn_4xb4-ohem-160k_cityscapes-1024x1024.py diff --git a/configs/bisenetv2/bisenetv2_fcn_4x8_1024x1024_160k_cityscapes.py b/configs/bisenetv2/bisenetv2_fcn_4xb8-160k_cityscapes-1024x1024.py similarity index 100% rename from configs/bisenetv2/bisenetv2_fcn_4x8_1024x1024_160k_cityscapes.py rename to configs/bisenetv2/bisenetv2_fcn_4xb8-160k_cityscapes-1024x1024.py diff --git a/configs/bisenetv2/bisenetv2_fcn_fp16_4x4_1024x1024_160k_cityscapes.py b/configs/bisenetv2/bisenetv2_fcn_fp16_4x4_1024x1024_160k_cityscapes.py deleted file mode 100644 index 1bdb60b846..0000000000 --- a/configs/bisenetv2/bisenetv2_fcn_fp16_4x4_1024x1024_160k_cityscapes.py +++ /dev/null @@ -1,6 +0,0 @@ -_base_ = './bisenetv2_fcn_4x4_1024x1024_160k_cityscapes.py' -optim_wrapper = dict( - _delete_=True, - type='AmpOptimWrapper', - optimizer=dict(type='SGD', lr=0.05, momentum=0.9, weight_decay=0.0005), - loss_scale=512.) 
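The `fp16` to `amp` rename above tracks the switch to MMEngine's `AmpOptimWrapper`, which drives PyTorch-native automatic mixed precision; a float `loss_scale` is used as the gradient scaler's initial scale. A rough plain-PyTorch sketch of the equivalent training step, assuming a CUDA device (toy model and data for illustration, not mmseg's actual loop):

```python
import torch
from torch import nn

# Toy stand-ins; the real pipeline builds these from the config above.
model = nn.Conv2d(3, 19, kernel_size=1).cuda()
optimizer = torch.optim.SGD(
    model.parameters(), lr=0.05, momentum=0.9, weight_decay=0.0005)
scaler = torch.cuda.amp.GradScaler(init_scale=512.)  # loss_scale=512.

images = torch.randn(4, 3, 512, 512, device='cuda')
with torch.cuda.amp.autocast():      # forward + loss run in mixed precision
    loss = model(images).float().mean()
scaler.scale(loss).backward()        # scale loss so fp16 grads don't underflow
scaler.step(optimizer)               # unscales grads, then steps the optimizer
scaler.update()                      # adjusts the scale for the next iteration
```

Nothing in the model config changes; only the optimizer wrapper differs, which is why the replacement amp config is a six-line `_delete_=True` override.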
diff --git a/configs/bisenetv2/metafile.yaml b/configs/bisenetv2/metafile.yaml new file mode 100644 index 0000000000..5430ec3071 --- /dev/null +++ b/configs/bisenetv2/metafile.yaml @@ -0,0 +1,114 @@ +Collections: +- Name: BiSeNetV2 + License: Apache License 2.0 + Metadata: + Training Data: + - Cityscapes + Paper: + Title: 'Bisenet v2: Bilateral Network with Guided Aggregation for Real-time Semantic + Segmentation' + URL: https://arxiv.org/abs/2004.02147 + README: configs/bisenetv2/README.md + Frameworks: + - PyTorch +Models: +- Name: bisenetv2_fcn_4xb4-160k_cityscapes-1024x1024 + In Collection: BiSeNetV2 + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 73.21 + mIoU(ms+flip): 75.74 + Config: configs/bisenetv2/bisenetv2_fcn_4xb4-160k_cityscapes-1024x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 16 + Architecture: + - BiSeNetV2 + - BiSeNetV2 + Training Resources: 4x V100 GPUS + Memory (GB): 7.64 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv2/bisenetv2_fcn_4x4_1024x1024_160k_cityscapes/bisenetv2_fcn_4x4_1024x1024_160k_cityscapes_20210902_015551-bcf10f09.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv2/bisenetv2_fcn_4x4_1024x1024_160k_cityscapes/bisenetv2_fcn_4x4_1024x1024_160k_cityscapes_20210902_015551.log.json + Paper: + Title: 'Bisenet v2: Bilateral Network with Guided Aggregation for Real-time Semantic + Segmentation' + URL: https://arxiv.org/abs/2004.02147 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/backbones/bisenetv2.py#L545 + Framework: PyTorch +- Name: bisenetv2_fcn_4xb4-ohem-160k_cityscapes-1024x1024 + In Collection: BiSeNetV2 + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 73.57 + mIoU(ms+flip): 75.8 + Config: configs/bisenetv2/bisenetv2_fcn_4xb4-ohem-160k_cityscapes-1024x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 16 + Architecture: + - BiSeNetV2 + - BiSeNetV2 + Training Resources: 4x V100 GPUS + Memory (GB): 7.64 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv2/bisenetv2_fcn_ohem_4x4_1024x1024_160k_cityscapes/bisenetv2_fcn_ohem_4x4_1024x1024_160k_cityscapes_20210902_112947-5f8103b4.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv2/bisenetv2_fcn_ohem_4x4_1024x1024_160k_cityscapes/bisenetv2_fcn_ohem_4x4_1024x1024_160k_cityscapes_20210902_112947.log.json + Paper: + Title: 'Bisenet v2: Bilateral Network with Guided Aggregation for Real-time Semantic + Segmentation' + URL: https://arxiv.org/abs/2004.02147 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/backbones/bisenetv2.py#L545 + Framework: PyTorch +- Name: bisenetv2_fcn_4xb8-160k_cityscapes-1024x1024 + In Collection: BiSeNetV2 + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 75.76 + mIoU(ms+flip): 77.79 + Config: configs/bisenetv2/bisenetv2_fcn_4xb8-160k_cityscapes-1024x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 32 + Architecture: + - BiSeNetV2 + - BiSeNetV2 + Training Resources: 4x V100 GPUS + Memory (GB): 15.05 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv2/bisenetv2_fcn_4x8_1024x1024_160k_cityscapes/bisenetv2_fcn_4x8_1024x1024_160k_cityscapes_20210903_000032-e1a2eed6.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv2/bisenetv2_fcn_4x8_1024x1024_160k_cityscapes/bisenetv2_fcn_4x8_1024x1024_160k_cityscapes_20210903_000032.log.json + Paper: + Title: 
'Bisenet v2: Bilateral Network with Guided Aggregation for Real-time Semantic + Segmentation' + URL: https://arxiv.org/abs/2004.02147 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/backbones/bisenetv2.py#L545 + Framework: PyTorch +- Name: bisenetv2_fcn_4xb4-amp-160k_cityscapes-1024x1024 + In Collection: BiSeNetV2 + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 73.07 + mIoU(ms+flip): 75.13 + Config: configs/bisenetv2/bisenetv2_fcn_4xb4-amp-160k_cityscapes-1024x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 16 + Architecture: + - BiSeNetV2 + - BiSeNetV2 + Training Resources: 4x V100 GPUS + Memory (GB): 5.77 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv2/bisenetv2_fcn_fp16_4x4_1024x1024_160k_cityscapes/bisenetv2_fcn_fp16_4x4_1024x1024_160k_cityscapes_20210902_045942-b979777b.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/bisenetv2/bisenetv2_fcn_fp16_4x4_1024x1024_160k_cityscapes/bisenetv2_fcn_fp16_4x4_1024x1024_160k_cityscapes_20210902_045942.log.json + Paper: + Title: 'Bisenet v2: Bilateral Network with Guided Aggregation for Real-time Semantic + Segmentation' + URL: https://arxiv.org/abs/2004.02147 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/backbones/bisenetv2.py#L545 + Framework: PyTorch diff --git a/configs/ccnet/README.md b/configs/ccnet/README.md index 48c37a8e53..64dd5f0298 100644 --- a/configs/ccnet/README.md +++ b/configs/ccnet/README.md @@ -1,6 +1,6 @@ # CCNet -[CCNet: Criss-Cross Attention for Semantic Segmentation](https://arxiv.org/abs/1811.11721) +> [CCNet: Criss-Cross Attention for Semantic Segmentation](https://arxiv.org/abs/1811.11721) ## Introduction @@ -22,46 +22,46 @@ Contextual information is vital in visual understanding problems, such as semant -## Citation - -```bibtex -@article{huang2018ccnet, - title={CCNet: Criss-Cross Attention for Semantic Segmentation}, - author={Huang, Zilong and Wang, Xinggang and Huang, Lichao and Huang, Chang and Wei, Yunchao and Liu, Wenyu}, - booktitle={ICCV}, - year={2019} -} -``` - ## Results and models ### Cityscapes -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| CCNet | R-50-D8 | 512x1024 | 40000 | 6 | 3.32 | 77.76 | 78.87 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ccnet/ccnet_r50-d8_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x1024_40k_cityscapes/ccnet_r50-d8_512x1024_40k_cityscapes_20200616_142517-4123f401.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x1024_40k_cityscapes/ccnet_r50-d8_512x1024_40k_cityscapes_20200616_142517.log.json) | -| CCNet | R-101-D8 | 512x1024 | 40000 | 9.5 | 2.31 | 76.35 | 78.19 | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ccnet/ccnet_r101-d8_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x1024_40k_cityscapes/ccnet_r101-d8_512x1024_40k_cityscapes_20200616_142540-a3b84ba6.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x1024_40k_cityscapes/ccnet_r101-d8_512x1024_40k_cityscapes_20200616_142540.log.json) | -| CCNet | R-50-D8 | 769x769 | 40000 | 6.8 | 1.43 | 78.46 | 79.93 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ccnet/ccnet_r50-d8_769x769_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_769x769_40k_cityscapes/ccnet_r50-d8_769x769_40k_cityscapes_20200616_145125-76d11884.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_769x769_40k_cityscapes/ccnet_r50-d8_769x769_40k_cityscapes_20200616_145125.log.json) | -| CCNet | R-101-D8 | 769x769 | 40000 | 10.7 | 1.01 | 76.94 | 78.62 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ccnet/ccnet_r101-d8_769x769_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_769x769_40k_cityscapes/ccnet_r101-d8_769x769_40k_cityscapes_20200617_101428-4f57c8d0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_769x769_40k_cityscapes/ccnet_r101-d8_769x769_40k_cityscapes_20200617_101428.log.json) | -| CCNet | R-50-D8 | 512x1024 | 80000 | - | - | 79.03 | 80.16 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ccnet/ccnet_r50-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x1024_80k_cityscapes/ccnet_r50-d8_512x1024_80k_cityscapes_20200617_010421-869a3423.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x1024_80k_cityscapes/ccnet_r50-d8_512x1024_80k_cityscapes_20200617_010421.log.json) | -| CCNet | R-101-D8 | 512x1024 | 80000 | - | - | 78.87 | 79.90 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ccnet/ccnet_r101-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x1024_80k_cityscapes/ccnet_r101-d8_512x1024_80k_cityscapes_20200617_203935-ffae8917.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x1024_80k_cityscapes/ccnet_r101-d8_512x1024_80k_cityscapes_20200617_203935.log.json) | -| CCNet | R-50-D8 | 769x769 | 80000 | - | - | 79.29 | 81.08 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ccnet/ccnet_r50-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_769x769_80k_cityscapes/ccnet_r50-d8_769x769_80k_cityscapes_20200617_010421-73eed8ca.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_769x769_80k_cityscapes/ccnet_r50-d8_769x769_80k_cityscapes_20200617_010421.log.json) | -| CCNet | R-101-D8 | 769x769 | 80000 | - | - | 79.45 | 80.66 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ccnet/ccnet_r101-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_769x769_80k_cityscapes/ccnet_r101-d8_769x769_80k_cityscapes_20200618_011502-ad3cd481.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_769x769_80k_cityscapes/ccnet_r101-d8_769x769_80k_cityscapes_20200618_011502.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ---------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| CCNet | R-50-D8 | 512x1024 | 40000 | 6 | 3.32 | V100 | 77.76 | 78.87 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ccnet/ccnet_r50-d8_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x1024_40k_cityscapes/ccnet_r50-d8_512x1024_40k_cityscapes_20200616_142517-4123f401.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x1024_40k_cityscapes/ccnet_r50-d8_512x1024_40k_cityscapes_20200616_142517.log.json) | +| CCNet | R-101-D8 | 512x1024 | 40000 | 9.5 | 2.31 | V100 | 76.35 | 78.19 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ccnet/ccnet_r101-d8_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x1024_40k_cityscapes/ccnet_r101-d8_512x1024_40k_cityscapes_20200616_142540-a3b84ba6.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x1024_40k_cityscapes/ccnet_r101-d8_512x1024_40k_cityscapes_20200616_142540.log.json) | +| CCNet | R-50-D8 | 769x769 | 40000 | 6.8 | 1.43 | V100 | 78.46 | 79.93 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ccnet/ccnet_r50-d8_4xb2-40k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_769x769_40k_cityscapes/ccnet_r50-d8_769x769_40k_cityscapes_20200616_145125-76d11884.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_769x769_40k_cityscapes/ccnet_r50-d8_769x769_40k_cityscapes_20200616_145125.log.json) | +| CCNet | R-101-D8 | 769x769 | 40000 | 10.7 | 1.01 | V100 | 76.94 | 78.62 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ccnet/ccnet_r101-d8_4xb2-40k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_769x769_40k_cityscapes/ccnet_r101-d8_769x769_40k_cityscapes_20200617_101428-4f57c8d0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_769x769_40k_cityscapes/ccnet_r101-d8_769x769_40k_cityscapes_20200617_101428.log.json) | +| CCNet | R-50-D8 | 512x1024 | 80000 | - | - | V100 | 79.03 | 80.16 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ccnet/ccnet_r50-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x1024_80k_cityscapes/ccnet_r50-d8_512x1024_80k_cityscapes_20200617_010421-869a3423.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x1024_80k_cityscapes/ccnet_r50-d8_512x1024_80k_cityscapes_20200617_010421.log.json) | 
+| CCNet | R-101-D8 | 512x1024 | 80000 | - | - | V100 | 78.87 | 79.90 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ccnet/ccnet_r101-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x1024_80k_cityscapes/ccnet_r101-d8_512x1024_80k_cityscapes_20200617_203935-ffae8917.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x1024_80k_cityscapes/ccnet_r101-d8_512x1024_80k_cityscapes_20200617_203935.log.json) | +| CCNet | R-50-D8 | 769x769 | 80000 | - | - | V100 | 79.29 | 81.08 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ccnet/ccnet_r50-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_769x769_80k_cityscapes/ccnet_r50-d8_769x769_80k_cityscapes_20200617_010421-73eed8ca.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_769x769_80k_cityscapes/ccnet_r50-d8_769x769_80k_cityscapes_20200617_010421.log.json) | +| CCNet | R-101-D8 | 769x769 | 80000 | - | - | V100 | 79.45 | 80.66 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ccnet/ccnet_r101-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_769x769_80k_cityscapes/ccnet_r101-d8_769x769_80k_cityscapes_20200618_011502-ad3cd481.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_769x769_80k_cityscapes/ccnet_r101-d8_769x769_80k_cityscapes_20200618_011502.log.json) | ### ADE20K -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | --------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| CCNet | R-50-D8 | 512x512 | 80000 | 8.8 | 20.89 | 41.78 | 42.98 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ccnet/ccnet_r50-d8_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x512_80k_ade20k/ccnet_r50-d8_512x512_80k_ade20k_20200615_014848-aa37f61e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x512_80k_ade20k/ccnet_r50-d8_512x512_80k_ade20k_20200615_014848.log.json) | -| CCNet | R-101-D8 | 512x512 | 80000 | 12.2 | 14.11 | 43.97 | 45.13 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ccnet/ccnet_r101-d8_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x512_80k_ade20k/ccnet_r101-d8_512x512_80k_ade20k_20200615_014848-1f4929a3.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x512_80k_ade20k/ccnet_r101-d8_512x512_80k_ade20k_20200615_014848.log.json) | -| CCNet | R-50-D8 | 512x512 | 160000 | - | - | 42.08 | 43.13 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ccnet/ccnet_r50-d8_512x512_160k_ade20k.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x512_160k_ade20k/ccnet_r50-d8_512x512_160k_ade20k_20200616_084435-7c97193b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x512_160k_ade20k/ccnet_r50-d8_512x512_160k_ade20k_20200616_084435.log.json) | -| CCNet | R-101-D8 | 512x512 | 160000 | - | - | 43.71 | 45.04 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ccnet/ccnet_r101-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x512_160k_ade20k/ccnet_r101-d8_512x512_160k_ade20k_20200616_000644-e849e007.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x512_160k_ade20k/ccnet_r101-d8_512x512_160k_ade20k_20200616_000644.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| CCNet | R-50-D8 | 512x512 | 80000 | 8.8 | 20.89 | V100 | 41.78 | 42.98 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ccnet/ccnet_r50-d8_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x512_80k_ade20k/ccnet_r50-d8_512x512_80k_ade20k_20200615_014848-aa37f61e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x512_80k_ade20k/ccnet_r50-d8_512x512_80k_ade20k_20200615_014848.log.json) | +| CCNet | R-101-D8 | 512x512 | 80000 | 12.2 | 14.11 | V100 | 43.97 | 45.13 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ccnet/ccnet_r101-d8_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x512_80k_ade20k/ccnet_r101-d8_512x512_80k_ade20k_20200615_014848-1f4929a3.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x512_80k_ade20k/ccnet_r101-d8_512x512_80k_ade20k_20200615_014848.log.json) | +| CCNet | R-50-D8 | 512x512 | 160000 | - | - | V100 | 42.08 | 43.13 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ccnet/ccnet_r50-d8_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x512_160k_ade20k/ccnet_r50-d8_512x512_160k_ade20k_20200616_084435-7c97193b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x512_160k_ade20k/ccnet_r50-d8_512x512_160k_ade20k_20200616_084435.log.json) | +| CCNet | R-101-D8 | 512x512 | 160000 | - | - | V100 | 43.71 | 45.04 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ccnet/ccnet_r101-d8_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x512_160k_ade20k/ccnet_r101-d8_512x512_160k_ade20k_20200616_000644-e849e007.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x512_160k_ade20k/ccnet_r101-d8_512x512_160k_ade20k_20200616_000644.log.json) | ### Pascal VOC 2012 + Aug -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ---------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| CCNet | R-50-D8 | 512x512 | 20000 | 6 | 20.45 | 76.17 | 77.51 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ccnet/ccnet_r50-d8_512x512_20k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x512_20k_voc12aug/ccnet_r50-d8_512x512_20k_voc12aug_20200617_193212-fad81784.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x512_20k_voc12aug/ccnet_r50-d8_512x512_20k_voc12aug_20200617_193212.log.json) | -| CCNet | R-101-D8 | 512x512 | 20000 | 9.5 | 13.64 | 77.27 | 79.02 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ccnet/ccnet_r101-d8_512x512_20k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x512_20k_voc12aug/ccnet_r101-d8_512x512_20k_voc12aug_20200617_193212-0007b61d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x512_20k_voc12aug/ccnet_r101-d8_512x512_20k_voc12aug_20200617_193212.log.json) | -| CCNet | R-50-D8 | 512x512 | 40000 | - | - | 75.96 | 77.04 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ccnet/ccnet_r50-d8_512x512_40k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x512_40k_voc12aug/ccnet_r50-d8_512x512_40k_voc12aug_20200613_232127-c2a15f02.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x512_40k_voc12aug/ccnet_r50-d8_512x512_40k_voc12aug_20200613_232127.log.json) | -| CCNet | R-101-D8 | 512x512 | 40000 | - | - | 77.87 | 78.90 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ccnet/ccnet_r101-d8_512x512_40k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x512_40k_voc12aug/ccnet_r101-d8_512x512_40k_voc12aug_20200613_232127-c30da577.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x512_40k_voc12aug/ccnet_r101-d8_512x512_40k_voc12aug_20200613_232127.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------- | 
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| CCNet | R-50-D8 | 512x512 | 20000 | 6 | 20.45 | V100 | 76.17 | 77.51 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ccnet/ccnet_r50-d8_4xb4-20k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x512_20k_voc12aug/ccnet_r50-d8_512x512_20k_voc12aug_20200617_193212-fad81784.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x512_20k_voc12aug/ccnet_r50-d8_512x512_20k_voc12aug_20200617_193212.log.json) | +| CCNet | R-101-D8 | 512x512 | 20000 | 9.5 | 13.64 | V100 | 77.27 | 79.02 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ccnet/ccnet_r101-d8_4xb4-20k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x512_20k_voc12aug/ccnet_r101-d8_512x512_20k_voc12aug_20200617_193212-0007b61d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x512_20k_voc12aug/ccnet_r101-d8_512x512_20k_voc12aug_20200617_193212.log.json) | +| CCNet | R-50-D8 | 512x512 | 40000 | - | - | V100 | 75.96 | 77.04 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ccnet/ccnet_r50-d8_4xb4-40k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x512_40k_voc12aug/ccnet_r50-d8_512x512_40k_voc12aug_20200613_232127-c2a15f02.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x512_40k_voc12aug/ccnet_r50-d8_512x512_40k_voc12aug_20200613_232127.log.json) | +| CCNet | R-101-D8 | 512x512 | 40000 | - | - | V100 | 77.87 | 78.90 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ccnet/ccnet_r101-d8_4xb4-40k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x512_40k_voc12aug/ccnet_r101-d8_512x512_40k_voc12aug_20200613_232127-c30da577.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x512_40k_voc12aug/ccnet_r101-d8_512x512_40k_voc12aug_20200613_232127.log.json) | + +## Citation + +```bibtex +@article{huang2018ccnet, + title={CCNet: Criss-Cross Attention for Semantic Segmentation}, + author={Huang, Zilong and Wang, Xinggang and Huang, Lichao and Huang, Chang and Wei, Yunchao and Liu, Wenyu}, + booktitle={ICCV}, + year={2019} +} +``` diff --git a/configs/ccnet/ccnet.yml b/configs/ccnet/ccnet.yml deleted file mode 100644 index b264f2e6c2..0000000000 --- a/configs/ccnet/ccnet.yml +++ /dev/null @@ -1,305 +0,0 @@ -Collections: -- Name: CCNet - Metadata: - Training Data: - - Cityscapes - - ADE20K - - Pascal VOC 2012 + Aug - Paper: - URL: https://arxiv.org/abs/1811.11721 - Title: 'CCNet: Criss-Cross Attention for Semantic Segmentation' - README: configs/ccnet/README.md - Code: - URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/apc_head.py#L111 - Version: v0.17.0 - Converted From: - Code: https://github.com/speedinghzl/CCNet -Models: -- Name: ccnet_r50-d8_512x1024_40k_cityscapes - In Collection: CCNet - Metadata: - backbone: R-50-D8 - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 301.2 - hardware: V100 - backend: 
PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 6.0 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 77.76 - mIoU(ms+flip): 78.87 - Config: configs/ccnet/ccnet_r50-d8_512x1024_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x1024_40k_cityscapes/ccnet_r50-d8_512x1024_40k_cityscapes_20200616_142517-4123f401.pth -- Name: ccnet_r101-d8_512x1024_40k_cityscapes - In Collection: CCNet - Metadata: - backbone: R-101-D8 - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 432.9 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 9.5 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 76.35 - mIoU(ms+flip): 78.19 - Config: configs/ccnet/ccnet_r101-d8_512x1024_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x1024_40k_cityscapes/ccnet_r101-d8_512x1024_40k_cityscapes_20200616_142540-a3b84ba6.pth -- Name: ccnet_r50-d8_769x769_40k_cityscapes - In Collection: CCNet - Metadata: - backbone: R-50-D8 - crop size: (769,769) - lr schd: 40000 - inference time (ms/im): - - value: 699.3 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 6.8 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.46 - mIoU(ms+flip): 79.93 - Config: configs/ccnet/ccnet_r50-d8_769x769_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_769x769_40k_cityscapes/ccnet_r50-d8_769x769_40k_cityscapes_20200616_145125-76d11884.pth -- Name: ccnet_r101-d8_769x769_40k_cityscapes - In Collection: CCNet - Metadata: - backbone: R-101-D8 - crop size: (769,769) - lr schd: 40000 - inference time (ms/im): - - value: 990.1 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 10.7 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 76.94 - mIoU(ms+flip): 78.62 - Config: configs/ccnet/ccnet_r101-d8_769x769_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_769x769_40k_cityscapes/ccnet_r101-d8_769x769_40k_cityscapes_20200617_101428-4f57c8d0.pth -- Name: ccnet_r50-d8_512x1024_80k_cityscapes - In Collection: CCNet - Metadata: - backbone: R-50-D8 - crop size: (512,1024) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.03 - mIoU(ms+flip): 80.16 - Config: configs/ccnet/ccnet_r50-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x1024_80k_cityscapes/ccnet_r50-d8_512x1024_80k_cityscapes_20200617_010421-869a3423.pth -- Name: ccnet_r101-d8_512x1024_80k_cityscapes - In Collection: CCNet - Metadata: - backbone: R-101-D8 - crop size: (512,1024) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.87 - mIoU(ms+flip): 79.9 - Config: configs/ccnet/ccnet_r101-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x1024_80k_cityscapes/ccnet_r101-d8_512x1024_80k_cityscapes_20200617_203935-ffae8917.pth -- Name: ccnet_r50-d8_769x769_80k_cityscapes - In Collection: CCNet - Metadata: - backbone: R-50-D8 - crop size: (769,769) - lr schd: 80000 - Results: - - Task: Semantic 
Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.29 - mIoU(ms+flip): 81.08 - Config: configs/ccnet/ccnet_r50-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_769x769_80k_cityscapes/ccnet_r50-d8_769x769_80k_cityscapes_20200617_010421-73eed8ca.pth -- Name: ccnet_r101-d8_769x769_80k_cityscapes - In Collection: CCNet - Metadata: - backbone: R-101-D8 - crop size: (769,769) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.45 - mIoU(ms+flip): 80.66 - Config: configs/ccnet/ccnet_r101-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_769x769_80k_cityscapes/ccnet_r101-d8_769x769_80k_cityscapes_20200618_011502-ad3cd481.pth -- Name: ccnet_r50-d8_512x512_80k_ade20k - In Collection: CCNet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 47.87 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 8.8 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 41.78 - mIoU(ms+flip): 42.98 - Config: configs/ccnet/ccnet_r50-d8_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x512_80k_ade20k/ccnet_r50-d8_512x512_80k_ade20k_20200615_014848-aa37f61e.pth -- Name: ccnet_r101-d8_512x512_80k_ade20k - In Collection: CCNet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 70.87 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 12.2 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 43.97 - mIoU(ms+flip): 45.13 - Config: configs/ccnet/ccnet_r101-d8_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x512_80k_ade20k/ccnet_r101-d8_512x512_80k_ade20k_20200615_014848-1f4929a3.pth -- Name: ccnet_r50-d8_512x512_160k_ade20k - In Collection: CCNet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 42.08 - mIoU(ms+flip): 43.13 - Config: configs/ccnet/ccnet_r50-d8_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x512_160k_ade20k/ccnet_r50-d8_512x512_160k_ade20k_20200616_084435-7c97193b.pth -- Name: ccnet_r101-d8_512x512_160k_ade20k - In Collection: CCNet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 43.71 - mIoU(ms+flip): 45.04 - Config: configs/ccnet/ccnet_r101-d8_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x512_160k_ade20k/ccnet_r101-d8_512x512_160k_ade20k_20200616_000644-e849e007.pth -- Name: ccnet_r50-d8_512x512_20k_voc12aug - In Collection: CCNet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 20000 - inference time (ms/im): - - value: 48.9 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 6.0 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 76.17 - mIoU(ms+flip): 77.51 - Config: configs/ccnet/ccnet_r50-d8_512x512_20k_voc12aug.py - Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x512_20k_voc12aug/ccnet_r50-d8_512x512_20k_voc12aug_20200617_193212-fad81784.pth -- Name: ccnet_r101-d8_512x512_20k_voc12aug - In Collection: CCNet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 20000 - inference time (ms/im): - - value: 73.31 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 9.5 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 77.27 - mIoU(ms+flip): 79.02 - Config: configs/ccnet/ccnet_r101-d8_512x512_20k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x512_20k_voc12aug/ccnet_r101-d8_512x512_20k_voc12aug_20200617_193212-0007b61d.pth -- Name: ccnet_r50-d8_512x512_40k_voc12aug - In Collection: CCNet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 40000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 75.96 - mIoU(ms+flip): 77.04 - Config: configs/ccnet/ccnet_r50-d8_512x512_40k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x512_40k_voc12aug/ccnet_r50-d8_512x512_40k_voc12aug_20200613_232127-c2a15f02.pth -- Name: ccnet_r101-d8_512x512_40k_voc12aug - In Collection: CCNet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 40000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 77.87 - mIoU(ms+flip): 78.9 - Config: configs/ccnet/ccnet_r101-d8_512x512_40k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x512_40k_voc12aug/ccnet_r101-d8_512x512_40k_voc12aug_20200613_232127-c30da577.pth diff --git a/configs/ccnet/ccnet_r101-d8_4xb2-40k_cityscapes-512x1024.py b/configs/ccnet/ccnet_r101-d8_4xb2-40k_cityscapes-512x1024.py new file mode 100644 index 0000000000..0c49e1edc2 --- /dev/null +++ b/configs/ccnet/ccnet_r101-d8_4xb2-40k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './ccnet_r50-d8_4xb2-40k_cityscapes-512x1024.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/ccnet/ccnet_r101-d8_4xb2-40k_cityscapes-769x769.py b/configs/ccnet/ccnet_r101-d8_4xb2-40k_cityscapes-769x769.py new file mode 100644 index 0000000000..f24f5a70ed --- /dev/null +++ b/configs/ccnet/ccnet_r101-d8_4xb2-40k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './ccnet_r50-d8_4xb2-40k_cityscapes-769x769.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/ccnet/ccnet_r101-d8_4xb2-80k_cityscapes-512x1024.py b/configs/ccnet/ccnet_r101-d8_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..b358e12c4e --- /dev/null +++ b/configs/ccnet/ccnet_r101-d8_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './ccnet_r50-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/ccnet/ccnet_r101-d8_4xb2-80k_cityscapes-769x769.py b/configs/ccnet/ccnet_r101-d8_4xb2-80k_cityscapes-769x769.py new file mode 100644 index 0000000000..75750768b2 --- /dev/null +++ b/configs/ccnet/ccnet_r101-d8_4xb2-80k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './ccnet_r50-d8_4xb2-80k_cityscapes-769x769.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/ccnet/ccnet_r101-d8_4xb4-160k_ade20k-512x512.py 
b/configs/ccnet/ccnet_r101-d8_4xb4-160k_ade20k-512x512.py new file mode 100644 index 0000000000..a29d118f41 --- /dev/null +++ b/configs/ccnet/ccnet_r101-d8_4xb4-160k_ade20k-512x512.py @@ -0,0 +1,2 @@ +_base_ = './ccnet_r50-d8_4xb4-160k_ade20k-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/ccnet/ccnet_r101-d8_4xb4-20k_voc12aug-512x512.py b/configs/ccnet/ccnet_r101-d8_4xb4-20k_voc12aug-512x512.py new file mode 100644 index 0000000000..fd421a2ed5 --- /dev/null +++ b/configs/ccnet/ccnet_r101-d8_4xb4-20k_voc12aug-512x512.py @@ -0,0 +1,2 @@ +_base_ = './ccnet_r50-d8_4xb4-20k_voc12aug-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/ccnet/ccnet_r101-d8_4xb4-40k_voc12aug-512x512.py b/configs/ccnet/ccnet_r101-d8_4xb4-40k_voc12aug-512x512.py new file mode 100644 index 0000000000..425dfcf339 --- /dev/null +++ b/configs/ccnet/ccnet_r101-d8_4xb4-40k_voc12aug-512x512.py @@ -0,0 +1,2 @@ +_base_ = './ccnet_r50-d8_4xb4-40k_voc12aug-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/ccnet/ccnet_r101-d8_4xb4-80k_ade20k-512x512.py b/configs/ccnet/ccnet_r101-d8_4xb4-80k_ade20k-512x512.py new file mode 100644 index 0000000000..f6dcb9cf50 --- /dev/null +++ b/configs/ccnet/ccnet_r101-d8_4xb4-80k_ade20k-512x512.py @@ -0,0 +1,2 @@ +_base_ = './ccnet_r50-d8_4xb4-80k_ade20k-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/ccnet/ccnet_r101-d8_512x1024_40k_cityscapes.py b/configs/ccnet/ccnet_r101-d8_512x1024_40k_cityscapes.py deleted file mode 100644 index d2bac38ca6..0000000000 --- a/configs/ccnet/ccnet_r101-d8_512x1024_40k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './ccnet_r50-d8_512x1024_40k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/ccnet/ccnet_r101-d8_512x1024_80k_cityscapes.py b/configs/ccnet/ccnet_r101-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index 989928ab7f..0000000000 --- a/configs/ccnet/ccnet_r101-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './ccnet_r50-d8_512x1024_80k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/ccnet/ccnet_r101-d8_512x512_160k_ade20k.py b/configs/ccnet/ccnet_r101-d8_512x512_160k_ade20k.py deleted file mode 100644 index c32bf48751..0000000000 --- a/configs/ccnet/ccnet_r101-d8_512x512_160k_ade20k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './ccnet_r50-d8_512x512_160k_ade20k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/ccnet/ccnet_r101-d8_512x512_20k_voc12aug.py b/configs/ccnet/ccnet_r101-d8_512x512_20k_voc12aug.py deleted file mode 100644 index 53eb77c0cd..0000000000 --- a/configs/ccnet/ccnet_r101-d8_512x512_20k_voc12aug.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './ccnet_r50-d8_512x512_20k_voc12aug.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/ccnet/ccnet_r101-d8_512x512_40k_voc12aug.py b/configs/ccnet/ccnet_r101-d8_512x512_40k_voc12aug.py deleted file mode 100644 index d7eb668f39..0000000000 --- a/configs/ccnet/ccnet_r101-d8_512x512_40k_voc12aug.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './ccnet_r50-d8_512x512_40k_voc12aug.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git 
a/configs/ccnet/ccnet_r101-d8_512x512_80k_ade20k.py b/configs/ccnet/ccnet_r101-d8_512x512_80k_ade20k.py deleted file mode 100644 index 029c1d525b..0000000000 --- a/configs/ccnet/ccnet_r101-d8_512x512_80k_ade20k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './ccnet_r50-d8_512x512_80k_ade20k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/ccnet/ccnet_r101-d8_769x769_40k_cityscapes.py b/configs/ccnet/ccnet_r101-d8_769x769_40k_cityscapes.py deleted file mode 100644 index 43f05fab05..0000000000 --- a/configs/ccnet/ccnet_r101-d8_769x769_40k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './ccnet_r50-d8_769x769_40k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/ccnet/ccnet_r101-d8_769x769_80k_cityscapes.py b/configs/ccnet/ccnet_r101-d8_769x769_80k_cityscapes.py deleted file mode 100644 index 654f377b6f..0000000000 --- a/configs/ccnet/ccnet_r101-d8_769x769_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './ccnet_r50-d8_769x769_80k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/ccnet/ccnet_r50-d8_512x1024_40k_cityscapes.py b/configs/ccnet/ccnet_r50-d8_4xb2-40k_cityscapes-512x1024.py similarity index 100% rename from configs/ccnet/ccnet_r50-d8_512x1024_40k_cityscapes.py rename to configs/ccnet/ccnet_r50-d8_4xb2-40k_cityscapes-512x1024.py diff --git a/configs/ccnet/ccnet_r50-d8_769x769_40k_cityscapes.py b/configs/ccnet/ccnet_r50-d8_4xb2-40k_cityscapes-769x769.py similarity index 100% rename from configs/ccnet/ccnet_r50-d8_769x769_40k_cityscapes.py rename to configs/ccnet/ccnet_r50-d8_4xb2-40k_cityscapes-769x769.py diff --git a/configs/ccnet/ccnet_r50-d8_512x1024_80k_cityscapes.py b/configs/ccnet/ccnet_r50-d8_4xb2-80k_cityscapes-512x1024.py similarity index 100% rename from configs/ccnet/ccnet_r50-d8_512x1024_80k_cityscapes.py rename to configs/ccnet/ccnet_r50-d8_4xb2-80k_cityscapes-512x1024.py diff --git a/configs/ccnet/ccnet_r50-d8_769x769_80k_cityscapes.py b/configs/ccnet/ccnet_r50-d8_4xb2-80k_cityscapes-769x769.py similarity index 100% rename from configs/ccnet/ccnet_r50-d8_769x769_80k_cityscapes.py rename to configs/ccnet/ccnet_r50-d8_4xb2-80k_cityscapes-769x769.py diff --git a/configs/ccnet/ccnet_r50-d8_512x512_160k_ade20k.py b/configs/ccnet/ccnet_r50-d8_4xb4-160k_ade20k-512x512.py similarity index 100% rename from configs/ccnet/ccnet_r50-d8_512x512_160k_ade20k.py rename to configs/ccnet/ccnet_r50-d8_4xb4-160k_ade20k-512x512.py diff --git a/configs/ccnet/ccnet_r50-d8_512x512_20k_voc12aug.py b/configs/ccnet/ccnet_r50-d8_4xb4-20k_voc12aug-512x512.py similarity index 100% rename from configs/ccnet/ccnet_r50-d8_512x512_20k_voc12aug.py rename to configs/ccnet/ccnet_r50-d8_4xb4-20k_voc12aug-512x512.py diff --git a/configs/ccnet/ccnet_r50-d8_512x512_40k_voc12aug.py b/configs/ccnet/ccnet_r50-d8_4xb4-40k_voc12aug-512x512.py similarity index 100% rename from configs/ccnet/ccnet_r50-d8_512x512_40k_voc12aug.py rename to configs/ccnet/ccnet_r50-d8_4xb4-40k_voc12aug-512x512.py diff --git a/configs/ccnet/ccnet_r50-d8_512x512_80k_ade20k.py b/configs/ccnet/ccnet_r50-d8_4xb4-80k_ade20k-512x512.py similarity index 100% rename from configs/ccnet/ccnet_r50-d8_512x512_80k_ade20k.py rename to configs/ccnet/ccnet_r50-d8_4xb4-80k_ade20k-512x512.py diff --git a/configs/ccnet/metafile.yaml b/configs/ccnet/metafile.yaml new file mode 100644 index 0000000000..62e5694e47 --- /dev/null +++ 
b/configs/ccnet/metafile.yaml @@ -0,0 +1,391 @@ +Collections: +- Name: CCNet + License: Apache License 2.0 + Metadata: + Training Data: + - Cityscapes + - ADE20K + - Pascal VOC 2012 + Aug + Paper: + Title: 'CCNet: Criss-Cross Attention for Semantic Segmentation' + URL: https://arxiv.org/abs/1811.11721 + README: configs/ccnet/README.md + Frameworks: + - PyTorch +Models: +- Name: ccnet_r50-d8_4xb2-40k_cityscapes-512x1024 + In Collection: CCNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 77.76 + mIoU(ms+flip): 78.87 + Config: configs/ccnet/ccnet_r50-d8_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - CCNet + Training Resources: 4x V100 GPUS + Memory (GB): 6.0 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x1024_40k_cityscapes/ccnet_r50-d8_512x1024_40k_cityscapes_20200616_142517-4123f401.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x1024_40k_cityscapes/ccnet_r50-d8_512x1024_40k_cityscapes_20200616_142517.log.json + Paper: + Title: 'CCNet: Criss-Cross Attention for Semantic Segmentation' + URL: https://arxiv.org/abs/1811.11721 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/apc_head.py#L111 + Framework: PyTorch +- Name: ccnet_r101-d8_4xb2-40k_cityscapes-512x1024 + In Collection: CCNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 76.35 + mIoU(ms+flip): 78.19 + Config: configs/ccnet/ccnet_r101-d8_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - CCNet + Training Resources: 4x V100 GPUS + Memory (GB): 9.5 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x1024_40k_cityscapes/ccnet_r101-d8_512x1024_40k_cityscapes_20200616_142540-a3b84ba6.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x1024_40k_cityscapes/ccnet_r101-d8_512x1024_40k_cityscapes_20200616_142540.log.json + Paper: + Title: 'CCNet: Criss-Cross Attention for Semantic Segmentation' + URL: https://arxiv.org/abs/1811.11721 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/apc_head.py#L111 + Framework: PyTorch +- Name: ccnet_r50-d8_4xb2-40k_cityscapes-769x769 + In Collection: CCNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.46 + mIoU(ms+flip): 79.93 + Config: configs/ccnet/ccnet_r50-d8_4xb2-40k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - CCNet + Training Resources: 4x V100 GPUS + Memory (GB): 6.8 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_769x769_40k_cityscapes/ccnet_r50-d8_769x769_40k_cityscapes_20200616_145125-76d11884.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_769x769_40k_cityscapes/ccnet_r50-d8_769x769_40k_cityscapes_20200616_145125.log.json + Paper: + Title: 'CCNet: Criss-Cross Attention for Semantic Segmentation' + URL: https://arxiv.org/abs/1811.11721 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/apc_head.py#L111 + Framework: PyTorch +- Name: ccnet_r101-d8_4xb2-40k_cityscapes-769x769 + In Collection: CCNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 76.94 + mIoU(ms+flip): 78.62 + Config: 
configs/ccnet/ccnet_r101-d8_4xb2-40k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - CCNet + Training Resources: 4x V100 GPUS + Memory (GB): 10.7 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_769x769_40k_cityscapes/ccnet_r101-d8_769x769_40k_cityscapes_20200617_101428-4f57c8d0.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_769x769_40k_cityscapes/ccnet_r101-d8_769x769_40k_cityscapes_20200617_101428.log.json + Paper: + Title: 'CCNet: Criss-Cross Attention for Semantic Segmentation' + URL: https://arxiv.org/abs/1811.11721 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/apc_head.py#L111 + Framework: PyTorch +- Name: ccnet_r50-d8_4xb2-80k_cityscapes-512x1024 + In Collection: CCNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.03 + mIoU(ms+flip): 80.16 + Config: configs/ccnet/ccnet_r50-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - CCNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x1024_80k_cityscapes/ccnet_r50-d8_512x1024_80k_cityscapes_20200617_010421-869a3423.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x1024_80k_cityscapes/ccnet_r50-d8_512x1024_80k_cityscapes_20200617_010421.log.json + Paper: + Title: 'CCNet: Criss-Cross Attention for Semantic Segmentation' + URL: https://arxiv.org/abs/1811.11721 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/apc_head.py#L111 + Framework: PyTorch +- Name: ccnet_r101-d8_4xb2-80k_cityscapes-512x1024 + In Collection: CCNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.87 + mIoU(ms+flip): 79.9 + Config: configs/ccnet/ccnet_r101-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - CCNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x1024_80k_cityscapes/ccnet_r101-d8_512x1024_80k_cityscapes_20200617_203935-ffae8917.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x1024_80k_cityscapes/ccnet_r101-d8_512x1024_80k_cityscapes_20200617_203935.log.json + Paper: + Title: 'CCNet: Criss-Cross Attention for Semantic Segmentation' + URL: https://arxiv.org/abs/1811.11721 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/apc_head.py#L111 + Framework: PyTorch +- Name: ccnet_r50-d8_4xb2-80k_cityscapes-769x769 + In Collection: CCNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.29 + mIoU(ms+flip): 81.08 + Config: configs/ccnet/ccnet_r50-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - CCNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_769x769_80k_cityscapes/ccnet_r50-d8_769x769_80k_cityscapes_20200617_010421-73eed8ca.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_769x769_80k_cityscapes/ccnet_r50-d8_769x769_80k_cityscapes_20200617_010421.log.json + Paper: + Title: 'CCNet: Criss-Cross Attention for Semantic Segmentation' + URL: 
https://arxiv.org/abs/1811.11721 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/apc_head.py#L111 + Framework: PyTorch +- Name: ccnet_r101-d8_4xb2-80k_cityscapes-769x769 + In Collection: CCNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.45 + mIoU(ms+flip): 80.66 + Config: configs/ccnet/ccnet_r101-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - CCNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_769x769_80k_cityscapes/ccnet_r101-d8_769x769_80k_cityscapes_20200618_011502-ad3cd481.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_769x769_80k_cityscapes/ccnet_r101-d8_769x769_80k_cityscapes_20200618_011502.log.json + Paper: + Title: 'CCNet: Criss-Cross Attention for Semantic Segmentation' + URL: https://arxiv.org/abs/1811.11721 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/apc_head.py#L111 + Framework: PyTorch +- Name: ccnet_r50-d8_4xb4-80k_ade20k-512x512 + In Collection: CCNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 41.78 + mIoU(ms+flip): 42.98 + Config: configs/ccnet/ccnet_r50-d8_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D8 + - CCNet + Training Resources: 4x V100 GPUS + Memory (GB): 8.8 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x512_80k_ade20k/ccnet_r50-d8_512x512_80k_ade20k_20200615_014848-aa37f61e.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x512_80k_ade20k/ccnet_r50-d8_512x512_80k_ade20k_20200615_014848.log.json + Paper: + Title: 'CCNet: Criss-Cross Attention for Semantic Segmentation' + URL: https://arxiv.org/abs/1811.11721 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/apc_head.py#L111 + Framework: PyTorch +- Name: ccnet_r101-d8_4xb4-80k_ade20k-512x512 + In Collection: CCNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 43.97 + mIoU(ms+flip): 45.13 + Config: configs/ccnet/ccnet_r101-d8_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-101-D8 + - CCNet + Training Resources: 4x V100 GPUS + Memory (GB): 12.2 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x512_80k_ade20k/ccnet_r101-d8_512x512_80k_ade20k_20200615_014848-1f4929a3.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x512_80k_ade20k/ccnet_r101-d8_512x512_80k_ade20k_20200615_014848.log.json + Paper: + Title: 'CCNet: Criss-Cross Attention for Semantic Segmentation' + URL: https://arxiv.org/abs/1811.11721 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/apc_head.py#L111 + Framework: PyTorch +- Name: ccnet_r50-d8_4xb4-160k_ade20k-512x512 + In Collection: CCNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 42.08 + mIoU(ms+flip): 43.13 + Config: configs/ccnet/ccnet_r50-d8_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D8 + - CCNet + Training Resources: 4x V100 GPUS + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x512_160k_ade20k/ccnet_r50-d8_512x512_160k_ade20k_20200616_084435-7c97193b.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x512_160k_ade20k/ccnet_r50-d8_512x512_160k_ade20k_20200616_084435.log.json + Paper: + Title: 'CCNet: Criss-Cross Attention for Semantic Segmentation' + URL: https://arxiv.org/abs/1811.11721 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/apc_head.py#L111 + Framework: PyTorch +- Name: ccnet_r101-d8_4xb4-160k_ade20k-512x512 + In Collection: CCNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 43.71 + mIoU(ms+flip): 45.04 + Config: configs/ccnet/ccnet_r101-d8_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-101-D8 + - CCNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x512_160k_ade20k/ccnet_r101-d8_512x512_160k_ade20k_20200616_000644-e849e007.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x512_160k_ade20k/ccnet_r101-d8_512x512_160k_ade20k_20200616_000644.log.json + Paper: + Title: 'CCNet: Criss-Cross Attention for Semantic Segmentation' + URL: https://arxiv.org/abs/1811.11721 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/apc_head.py#L111 + Framework: PyTorch +- Name: ccnet_r50-d8_4xb4-20k_voc12aug-512x512 + In Collection: CCNet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 76.17 + mIoU(ms+flip): 77.51 + Config: configs/ccnet/ccnet_r50-d8_4xb4-20k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-50-D8 + - CCNet + Training Resources: 4x V100 GPUS + Memory (GB): 6.0 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x512_20k_voc12aug/ccnet_r50-d8_512x512_20k_voc12aug_20200617_193212-fad81784.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x512_20k_voc12aug/ccnet_r50-d8_512x512_20k_voc12aug_20200617_193212.log.json + Paper: + Title: 'CCNet: Criss-Cross Attention for Semantic Segmentation' + URL: https://arxiv.org/abs/1811.11721 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/apc_head.py#L111 + Framework: PyTorch +- Name: ccnet_r101-d8_4xb4-20k_voc12aug-512x512 + In Collection: CCNet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 77.27 + mIoU(ms+flip): 79.02 + Config: configs/ccnet/ccnet_r101-d8_4xb4-20k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-101-D8 + - CCNet + Training Resources: 4x V100 GPUS + Memory (GB): 9.5 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x512_20k_voc12aug/ccnet_r101-d8_512x512_20k_voc12aug_20200617_193212-0007b61d.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x512_20k_voc12aug/ccnet_r101-d8_512x512_20k_voc12aug_20200617_193212.log.json + Paper: + Title: 'CCNet: Criss-Cross Attention for Semantic Segmentation' + URL: https://arxiv.org/abs/1811.11721 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/apc_head.py#L111 + Framework: PyTorch +- Name: ccnet_r50-d8_4xb4-40k_voc12aug-512x512 + 
In Collection: CCNet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 75.96 + mIoU(ms+flip): 77.04 + Config: configs/ccnet/ccnet_r50-d8_4xb4-40k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-50-D8 + - CCNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x512_40k_voc12aug/ccnet_r50-d8_512x512_40k_voc12aug_20200613_232127-c2a15f02.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r50-d8_512x512_40k_voc12aug/ccnet_r50-d8_512x512_40k_voc12aug_20200613_232127.log.json + Paper: + Title: 'CCNet: Criss-Cross Attention for Semantic Segmentation' + URL: https://arxiv.org/abs/1811.11721 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/apc_head.py#L111 + Framework: PyTorch +- Name: ccnet_r101-d8_4xb4-40k_voc12aug-512x512 + In Collection: CCNet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 77.87 + mIoU(ms+flip): 78.9 + Config: configs/ccnet/ccnet_r101-d8_4xb4-40k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-101-D8 + - CCNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x512_40k_voc12aug/ccnet_r101-d8_512x512_40k_voc12aug_20200613_232127-c30da577.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ccnet/ccnet_r101-d8_512x512_40k_voc12aug/ccnet_r101-d8_512x512_40k_voc12aug_20200613_232127.log.json + Paper: + Title: 'CCNet: Criss-Cross Attention for Semantic Segmentation' + URL: https://arxiv.org/abs/1811.11721 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/apc_head.py#L111 + Framework: PyTorch diff --git a/configs/cgnet/README.md b/configs/cgnet/README.md index b0fced44a0..96c9fcf515 100644 --- a/configs/cgnet/README.md +++ b/configs/cgnet/README.md @@ -1,6 +1,6 @@ # CGNet -[CGNet: A Light-weight Context Guided Network for Semantic Segmentation](https://arxiv.org/abs/1811.08201) +> [CGNet: A Light-weight Context Guided Network for Semantic Segmentation](https://arxiv.org/abs/1811.08201) ## Introduction @@ -22,6 +22,15 @@ The demand of applying semantic segmentation model on mobile devices has been in +## Results and models + +### Cityscapes + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| CGNet | M3N21 | 680x680 | 60000 | 7.5 | 30.51 | V100 | 65.63 | 68.04 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/cgnet/cgnet_fcn_4xb4-60k_cityscapes-680x680.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/cgnet/cgnet_680x680_60k_cityscapes/cgnet_680x680_60k_cityscapes_20201101_110253-4c0b2f2d.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/cgnet/cgnet_680x680_60k_cityscapes/cgnet_680x680_60k_cityscapes-20201101_110253.log.json) | +| CGNet | M3N21 | 512x1024 | 60000 | 8.3 | 31.14 | V100 | 68.27 | 70.33 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/cgnet/cgnet_fcn_4xb8-60k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/cgnet/cgnet_512x1024_60k_cityscapes/cgnet_512x1024_60k_cityscapes_20201101_110254-124ea03b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/cgnet/cgnet_512x1024_60k_cityscapes/cgnet_512x1024_60k_cityscapes-20201101_110254.log.json) | + ## Citation ```bibtex @@ -35,12 +44,3 @@ The demand of applying semantic segmentation model on mobile devices has been in publisher={IEEE} } ``` - -## Results and models - -### Cityscapes - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ----------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| CGNet | M3N21 | 680x680 | 60000 | 7.5 | 30.51 | 65.63 | 68.04 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/cgnet/cgnet_680x680_60k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/cgnet/cgnet_680x680_60k_cityscapes/cgnet_680x680_60k_cityscapes_20201101_110253-4c0b2f2d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/cgnet/cgnet_680x680_60k_cityscapes/cgnet_680x680_60k_cityscapes-20201101_110253.log.json) | -| CGNet | M3N21 | 512x1024 | 60000 | 8.3 | 31.14 | 68.27 | 70.33 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/cgnet/cgnet_512x1024_60k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/cgnet/cgnet_512x1024_60k_cityscapes/cgnet_512x1024_60k_cityscapes_20201101_110254-124ea03b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/cgnet/cgnet_512x1024_60k_cityscapes/cgnet_512x1024_60k_cityscapes-20201101_110254.log.json) | diff --git a/configs/cgnet/cgnet.yml b/configs/cgnet/cgnet.yml deleted file mode 100644 index bcd6d89c1b..0000000000 --- a/configs/cgnet/cgnet.yml +++ /dev/null @@ -1,59 +0,0 @@ -Collections: -- Name: CGNet - Metadata: - Training Data: - - Cityscapes - Paper: - URL: https://arxiv.org/abs/1811.08201 - Title: 'CGNet: A Light-weight Context Guided Network for Semantic Segmentation' - README: configs/cgnet/README.md - Code: - URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/cgnet.py#L187 - Version: v0.17.0 - Converted From: - Code: https://github.com/wutianyiRosun/CGNet -Models: -- Name: cgnet_680x680_60k_cityscapes - In Collection: CGNet - Metadata: - backbone: M3N21 - crop size: (680,680) - lr schd: 60000 - inference time (ms/im): - - value: 32.78 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (680,680) - Training Memory (GB): 7.5 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 65.63 - mIoU(ms+flip): 68.04 - Config: configs/cgnet/cgnet_680x680_60k_cityscapes.py -
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/cgnet/cgnet_680x680_60k_cityscapes/cgnet_680x680_60k_cityscapes_20201101_110253-4c0b2f2d.pth -- Name: cgnet_512x1024_60k_cityscapes - In Collection: CGNet - Metadata: - backbone: M3N21 - crop size: (512,1024) - lr schd: 60000 - inference time (ms/im): - - value: 32.11 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 8.3 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 68.27 - mIoU(ms+flip): 70.33 - Config: configs/cgnet/cgnet_512x1024_60k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/cgnet/cgnet_512x1024_60k_cityscapes/cgnet_512x1024_60k_cityscapes_20201101_110254-124ea03b.pth diff --git a/configs/cgnet/cgnet_512x1024_60k_cityscapes.py b/configs/cgnet/cgnet_512x1024_60k_cityscapes.py deleted file mode 100644 index fc9ad7c1c9..0000000000 --- a/configs/cgnet/cgnet_512x1024_60k_cityscapes.py +++ /dev/null @@ -1,34 +0,0 @@ -_base_ = [ - '../_base_/models/cgnet.py', '../_base_/datasets/cityscapes.py', - '../_base_/default_runtime.py' -] - -# optimizer -optimizer = dict(type='Adam', lr=0.001, eps=1e-08, weight_decay=0.0005) -optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer) -# learning policy -param_scheduler = [ - dict( - type='PolyLR', - eta_min=1e-4, - power=0.9, - by_epoch=False, - begin=0, - end=60000) -] -# runtime settings -total_iters = 60000 -train_cfg = dict( - type='IterBasedTrainLoop', max_iters=total_iters, val_interval=4000) -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') -default_hooks = dict( - checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=4000)) - -crop_size = (512, 1024) -data_preprocessor = dict(size=crop_size) -model = dict(data_preprocessor=data_preprocessor) - -train_dataloader = dict(batch_size=8) -val_dataloader = dict(batch_size=1) -test_dataloader = val_dataloader diff --git a/configs/cgnet/cgnet_680x680_60k_cityscapes.py b/configs/cgnet/cgnet_680x680_60k_cityscapes.py deleted file mode 100644 index 4854b481e1..0000000000 --- a/configs/cgnet/cgnet_680x680_60k_cityscapes.py +++ /dev/null @@ -1,55 +0,0 @@ -_base_ = [ - '../_base_/models/cgnet.py', '../_base_/datasets/cityscapes.py', - '../_base_/default_runtime.py' -] - -# optimizer -optimizer = dict(type='Adam', lr=0.001, eps=1e-08, weight_decay=0.0005) -optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer) -# learning policy -param_scheduler = [ - dict( - type='PolyLR', - eta_min=1e-4, - power=0.9, - by_epoch=False, - begin=0, - end=60000) -] -# runtime settings -total_iters = 60000 -train_cfg = dict( - type='IterBasedTrainLoop', max_iters=total_iters, val_interval=4000) -val_cfg = dict(type='ValLoop') -test_cfg = dict(type='TestLoop') -default_hooks = dict( - checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=4000)) - -crop_size = (680, 680) -data_preprocessor = dict(size=crop_size) -model = dict(data_preprocessor=data_preprocessor) -train_pipeline = [ - dict(type='LoadImageFromFile'), - dict(type='LoadAnnotations'), - dict( - type='RandomResize', - scale=(2048, 1024), - ratio_range=(0.5, 2.0), - keep_ratio=True), - dict(type='RandomCrop', crop_size=crop_size), - dict(type='RandomFlip', prob=0.5), - dict(type='PackSegInputs') -] -test_pipeline = [ - dict(type='LoadImageFromFile'), - dict(type='Resize', scale=(2048, 1024), keep_ratio=True), - # add loading annotation after ``Resize`` because ground truth - # does not need to do resize data transform - 
dict(type='LoadAnnotations'), - dict(type='PackSegInputs') -] -train_dataloader = dict( - batch_size=8, num_workers=4, dataset=dict(pipeline=train_pipeline)) -val_dataloader = dict( - batch_size=1, num_workers=4, dataset=dict(pipeline=test_pipeline)) -test_dataloader = val_dataloader diff --git a/configs/cgnet/cgnet_fcn_4xb4-60k_cityscapes-680x680.py b/configs/cgnet/cgnet_fcn_4xb4-60k_cityscapes-680x680.py new file mode 100644 index 0000000000..6a2c0ed125 --- /dev/null +++ b/configs/cgnet/cgnet_fcn_4xb4-60k_cityscapes-680x680.py @@ -0,0 +1,59 @@ +_base_ = [ + '../_base_/models/cgnet.py', '../_base_/datasets/cityscapes.py', + '../_base_/default_runtime.py' +] + +# optimizer +optimizer = dict(type='Adam', lr=0.001, eps=1e-08, weight_decay=0.0005) +optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer) +# learning policy +param_scheduler = [ + dict( + type='PolyLR', + eta_min=1e-4, + power=0.9, + by_epoch=False, + begin=0, + end=60000) +] +# runtime settings +total_iters = 60000 +train_cfg = dict( + type='IterBasedTrainLoop', max_iters=total_iters, val_interval=4000) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=4000), + sampler_seed=dict(type='DistSamplerSeedHook')) + +crop_size = (680, 680) +data_preprocessor = dict(size=crop_size) +model = dict(data_preprocessor=data_preprocessor) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict( + type='RandomResize', + scale=(2048, 1024), + ratio_range=(0.5, 2.0), + keep_ratio=True), + dict(type='RandomCrop', crop_size=crop_size), + dict(type='RandomFlip', prob=0.5), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(2048, 1024), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +train_dataloader = dict( + batch_size=8, num_workers=4, dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, num_workers=4, dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader diff --git a/configs/cgnet/cgnet_fcn_4xb8-60k_cityscapes-512x1024.py b/configs/cgnet/cgnet_fcn_4xb8-60k_cityscapes-512x1024.py new file mode 100644 index 0000000000..8be29de479 --- /dev/null +++ b/configs/cgnet/cgnet_fcn_4xb8-60k_cityscapes-512x1024.py @@ -0,0 +1,38 @@ +_base_ = [ + '../_base_/models/cgnet.py', '../_base_/datasets/cityscapes.py', + '../_base_/default_runtime.py' +] + +# optimizer +optimizer = dict(type='Adam', lr=0.001, eps=1e-08, weight_decay=0.0005) +optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer) +# learning policy +param_scheduler = [ + dict( + type='PolyLR', + eta_min=1e-4, + power=0.9, + by_epoch=False, + begin=0, + end=60000) +] +# runtime settings +total_iters = 60000 +train_cfg = dict( + type='IterBasedTrainLoop', max_iters=total_iters, val_interval=4000) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=4000), + sampler_seed=dict(type='DistSamplerSeedHook')) + +crop_size = (512, 1024) 
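+# `size` below makes the SegDataPreProcessor pad every sample up to the
+# full (512, 1024) crop, so all images in a batch share one shape.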
+data_preprocessor = dict(size=crop_size) +model = dict(data_preprocessor=data_preprocessor) + +train_dataloader = dict(batch_size=8) +val_dataloader = dict(batch_size=1) +test_dataloader = val_dataloader diff --git a/configs/cgnet/metafile.yaml b/configs/cgnet/metafile.yaml new file mode 100644 index 0000000000..063fc8b3c6 --- /dev/null +++ b/configs/cgnet/metafile.yaml @@ -0,0 +1,61 @@ +Collections: +- Name: CGNet + License: Apache License 2.0 + Metadata: + Training Data: + - Cityscapes + Paper: + Title: 'CGNet: A Light-weight Context Guided Network for Semantic Segmentation' + URL: https://arxiv.org/abs/1811.08201 + README: configs/cgnet/README.md + Frameworks: + - PyTorch +Models: +- Name: cgnet_fcn_4xb4-60k_cityscapes-680x680 + In Collection: CGNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 65.63 + mIoU(ms+flip): 68.04 + Config: configs/cgnet/cgnet_fcn_4xb4-60k_cityscapes-680x680.py + Metadata: + Training Data: Cityscapes + Batch Size: 16 + Architecture: + - M3N21 + - CGNet + Training Resources: 4x V100 GPUS + Memory (GB): 7.5 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/cgnet/cgnet_680x680_60k_cityscapes/cgnet_680x680_60k_cityscapes_20201101_110253-4c0b2f2d.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/cgnet/cgnet_680x680_60k_cityscapes/cgnet_680x680_60k_cityscapes-20201101_110253.log.json + Paper: + Title: 'CGNet: A Light-weight Context Guided Network for Semantic Segmentation' + URL: https://arxiv.org/abs/1811.08201 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/cgnet.py#L187 + Framework: PyTorch +- Name: cgnet_fcn_4xb8-60k_cityscapes-512x1024 + In Collection: CGNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 68.27 + mIoU(ms+flip): 70.33 + Config: configs/cgnet/cgnet_fcn_4xb8-60k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 32 + Architecture: + - M3N21 + - CGNet + Training Resources: 4x V100 GPUS + Memory (GB): 8.3 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/cgnet/cgnet_512x1024_60k_cityscapes/cgnet_512x1024_60k_cityscapes_20201101_110254-124ea03b.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/cgnet/cgnet_512x1024_60k_cityscapes/cgnet_512x1024_60k_cityscapes-20201101_110254.log.json + Paper: + Title: 'CGNet: A Light-weight Context Guided Network for Semantic Segmentation' + URL: https://arxiv.org/abs/1811.08201 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/cgnet.py#L187 + Framework: PyTorch diff --git a/configs/convnext/README.md b/configs/convnext/README.md index 09eb702c7f..d78fe6ee1b 100644 --- a/configs/convnext/README.md +++ b/configs/convnext/README.md @@ -1,6 +1,6 @@ # ConvNeXt -[A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) +> [A ConvNet for the 2020s](https://arxiv.org/abs/2201.03545) ## Introduction @@ -22,21 +22,12 @@ The "Roaring 20s" of visual recognition began with the introduction of Vision Tr -```bibtex -@article{liu2022convnet, - title={A ConvNet for the 2020s}, - author={Liu, Zhuang and Mao, Hanzi and Wu, Chao-Yuan and Feichtenhofer, Christoph and Darrell, Trevor and Xie, Saining}, - journal={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, - year={2022} -} -``` - ### Usage - To use the ConvNeXt backbone, install [MMPretrain](https://github.com/open-mmlab/mmpretrain) first, which provides abundant backbones for downstream tasks (see the sketch and install command below).
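After installing MMPretrain with the `pip` command that follows this sketch, its backbones become visible to mmseg configs through MMEngine's registry scope prefix. A minimal, hedged illustration of the pattern the new ConvNeXt configs in this diff rely on (the argument values mirror the convnext-base config added later in the diff):

```python
# Sketch only: the 'mmpretrain.' prefix tells the MMEngine registry to
# resolve ConvNeXt from MMPretrain's model registry instead of mmseg's own.
backbone = dict(
    type='mmpretrain.ConvNeXt',    # cross-library lookup via scope prefix
    arch='base',
    out_indices=[0, 1, 2, 3],      # expose all four stages to UPerNet
    drop_path_rate=0.4,
    layer_scale_init_value=1.0,
    gap_before_final_norm=False)   # keep spatial maps for dense prediction
```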
```shell -pip install mmcls>=0.20.1 +pip install "mmpretrain>=1.0.0rc7" ``` ### Pre-trained Models @@ -58,15 +49,26 @@ The pre-trained models on ImageNet-1k or ImageNet-21k are used to fine-tune on t ### ADE20K -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------- | ----------- | --------- | ------- | -------- | -------------- | ----- | ------------- | --------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| UPerNet | ConvNeXt-T | 512x512 | 160000 | 4.23 | 19.90 | 46.11 | 46.62 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/convnext/upernet_convnext_tiny_fp16_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_tiny_fp16_512x512_160k_ade20k/upernet_convnext_tiny_fp16_512x512_160k_ade20k_20220227_124553-cad485de.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_tiny_fp16_512x512_160k_ade20k/upernet_convnext_tiny_fp16_512x512_160k_ade20k_20220227_124553.log.json) | -| UPerNet | ConvNeXt-S | 512x512 | 160000 | 5.16 | 15.18 | 48.56 | 49.02 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/convnext/upernet_convnext_small_fp16_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_small_fp16_512x512_160k_ade20k/upernet_convnext_small_fp16_512x512_160k_ade20k_20220227_131208-1b1e394f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_small_fp16_512x512_160k_ade20k/upernet_convnext_small_fp16_512x512_160k_ade20k_20220227_131208.log.json) | -| UPerNet | ConvNeXt-B | 512x512 | 160000 | 6.33 | 14.41 | 48.71 | 49.54 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/convnext/upernet_convnext_base_fp16_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_base_fp16_512x512_160k_ade20k/upernet_convnext_base_fp16_512x512_160k_ade20k_20220227_181227-02a24fc6.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_base_fp16_512x512_160k_ade20k/upernet_convnext_base_fp16_512x512_160k_ade20k_20220227_181227.log.json) | -| UPerNet | ConvNeXt-B | 640x640 | 160000 | 8.53 | 10.88 | 52.13 | 52.66 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/convnext/upernet_convnext_base_fp16_640x640_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_base_fp16_640x640_160k_ade20k/upernet_convnext_base_fp16_640x640_160k_ade20k_20220227_182859-9280e39b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_base_fp16_640x640_160k_ade20k/upernet_convnext_base_fp16_640x640_160k_ade20k_20220227_182859.log.json) | -| UPerNet | ConvNeXt-L | 640x640 | 160000 | 12.08 | 7.69 | 53.16 | 53.38 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/convnext/upernet_convnext_large_fp16_640x640_160k_ade20k.py) |
[model](https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_large_fp16_640x640_160k_ade20k/upernet_convnext_large_fp16_640x640_160k_ade20k_20220226_040532-e57aa54d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_large_fp16_640x640_160k_ade20k/upernet_convnext_large_fp16_640x640_160k_ade20k_20220226_040532.log.json) | -| UPerNet | ConvNeXt-XL | 640x640 | 160000 | 26.16\* | 6.33 | 53.58 | 54.11 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/convnext/upernet_convnext_xlarge_fp16_640x640_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_xlarge_fp16_640x640_160k_ade20k/upernet_convnext_xlarge_fp16_640x640_160k_ade20k_20220226_080344-95fc38c2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_xlarge_fp16_640x640_160k_ade20k/upernet_convnext_xlarge_fp16_640x640_160k_ade20k_20220226_080344.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------- | ----------- | --------- | ------- | -------- | -------------- | ------ | ----- | ------------- | ----------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| UPerNet | ConvNeXt-T | 512x512 | 160000 | 4.23 | 19.90 | V100 | 46.11 | 46.62 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/convnext/convnext-tiny_upernet_8xb2-amp-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_tiny_fp16_512x512_160k_ade20k/upernet_convnext_tiny_fp16_512x512_160k_ade20k_20220227_124553-cad485de.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_tiny_fp16_512x512_160k_ade20k/upernet_convnext_tiny_fp16_512x512_160k_ade20k_20220227_124553.log.json) | +| UPerNet | ConvNeXt-S | 512x512 | 160000 | 5.16 | 15.18 | V100 | 48.56 | 49.02 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/convnext/convnext-small_upernet_8xb2-amp-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_small_fp16_512x512_160k_ade20k/upernet_convnext_small_fp16_512x512_160k_ade20k_20220227_131208-1b1e394f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_small_fp16_512x512_160k_ade20k/upernet_convnext_small_fp16_512x512_160k_ade20k_20220227_131208.log.json) | +| UPerNet | ConvNeXt-B | 512x512 | 160000 | 6.33 | 14.41 | V100 | 48.71 | 49.54 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/convnext/convnext-base_upernet_8xb2-amp-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_base_fp16_512x512_160k_ade20k/upernet_convnext_base_fp16_512x512_160k_ade20k_20220227_181227-02a24fc6.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_base_fp16_512x512_160k_ade20k/upernet_convnext_base_fp16_512x512_160k_ade20k_20220227_181227.log.json) | +| UPerNet | ConvNeXt-B | 640x640 | 160000 | 8.53 | 10.88 | V100 | 52.13 | 52.66 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/convnext/convnext-base_upernet_8xb2-amp-160k_ade20k-640x640.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_base_fp16_640x640_160k_ade20k/upernet_convnext_base_fp16_640x640_160k_ade20k_20220227_182859-9280e39b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_base_fp16_640x640_160k_ade20k/upernet_convnext_base_fp16_640x640_160k_ade20k_20220227_182859.log.json) | +| UPerNet | ConvNeXt-L | 640x640 | 160000 | 12.08 | 7.69 | V100 | 53.16 | 53.38 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/convnext/convnext-large_upernet_8xb2-amp-160k_ade20k-640x640.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_large_fp16_640x640_160k_ade20k/upernet_convnext_large_fp16_640x640_160k_ade20k_20220226_040532-e57aa54d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_large_fp16_640x640_160k_ade20k/upernet_convnext_large_fp16_640x640_160k_ade20k_20220226_040532.log.json) | +| UPerNet | ConvNeXt-XL | 640x640 | 160000 | 26.16\* | 6.33 | V100 | 53.58 | 54.11 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/convnext/convnext-xlarge_upernet_8xb2-amp-160k_ade20k-640x640.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_xlarge_fp16_640x640_160k_ade20k/upernet_convnext_xlarge_fp16_640x640_160k_ade20k_20220226_080344-95fc38c2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_xlarge_fp16_640x640_160k_ade20k/upernet_convnext_xlarge_fp16_640x640_160k_ade20k_20220226_080344.log.json) | Note: - `Mem (GB)` with * is collected when `cudnn_benchmark=True`, and hardware is V100. 
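A quick way to sanity-check the renamed config paths in the table above is to load the merged config with MMEngine. A minimal sketch, assuming mmsegmentation and mmengine are installed and this is run from the repo root:

```python
# Loads one of the new AMP configs referenced in the table and prints the
# pieces the benchmark depends on: the mixed-precision optimizer wrapper
# and the sliding-window test configuration.
from mmengine.config import Config

cfg = Config.fromfile(
    'configs/convnext/convnext-base_upernet_8xb2-amp-160k_ade20k-640x640.py')
print(cfg.optim_wrapper.type)  # 'AmpOptimWrapper' -> fp16/AMP training
print(cfg.model.test_cfg)      # slide inference, 640x640 crop, stride (426, 426)
```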
+ +## Citation + +```bibtex +@inproceedings{liu2022convnet, + title={A ConvNet for the 2020s}, + author={Liu, Zhuang and Mao, Hanzi and Wu, Chao-Yuan and Feichtenhofer, Christoph and Darrell, Trevor and Xie, Saining}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)}, + year={2022} +} +``` diff --git a/configs/convnext/upernet_convnext_base_fp16_512x512_160k_ade20k.py b/configs/convnext/convnext-base_upernet_8xb2-amp-160k_ade20k-512x512.py similarity index 100% rename from configs/convnext/upernet_convnext_base_fp16_512x512_160k_ade20k.py rename to configs/convnext/convnext-base_upernet_8xb2-amp-160k_ade20k-512x512.py diff --git a/configs/convnext/convnext-base_upernet_8xb2-amp-160k_ade20k-640x640.py b/configs/convnext/convnext-base_upernet_8xb2-amp-160k_ade20k-640x640.py new file mode 100644 index 0000000000..06a8643144 --- /dev/null +++ b/configs/convnext/convnext-base_upernet_8xb2-amp-160k_ade20k-640x640.py @@ -0,0 +1,58 @@ +_base_ = [ + '../_base_/models/upernet_convnext.py', + '../_base_/datasets/ade20k_640x640.py', '../_base_/default_runtime.py', + '../_base_/schedules/schedule_160k.py' +] +crop_size = (640, 640) +data_preprocessor = dict(size=crop_size) +checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/convnext/downstream/convnext-base_3rdparty_in21k_20220301-262fd037.pth' # noqa +model = dict( + data_preprocessor=data_preprocessor, + backbone=dict( + type='mmpretrain.ConvNeXt', + arch='base', + out_indices=[0, 1, 2, 3], + drop_path_rate=0.4, + layer_scale_init_value=1.0, + gap_before_final_norm=False, + 
init_cfg=dict( + type='Pretrained', checkpoint=checkpoint_file, + prefix='backbone.')), + decode_head=dict( + in_channels=[192, 384, 768, 1536], + num_classes=150, + ), + auxiliary_head=dict(in_channels=768, num_classes=150), + test_cfg=dict(mode='slide', crop_size=crop_size, stride=(426, 426)), +) + +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict( + type='AdamW', lr=0.0001, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg={ + 'decay_rate': 0.9, + 'decay_type': 'stage_wise', + 'num_layers': 12 + }, + constructor='LearningRateDecayOptimizerConstructor', + loss_scale='dynamic') + +param_scheduler = [ + dict( + type='LinearLR', start_factor=1e-6, by_epoch=False, begin=0, end=1500), + dict( + type='PolyLR', + power=1.0, + begin=1500, + end=160000, + eta_min=0.0, + by_epoch=False, + ) +] + +# By default, models are trained on 8 GPUs with 2 images per GPU +train_dataloader = dict(batch_size=2) +val_dataloader = dict(batch_size=1) +test_dataloader = val_dataloader diff --git a/configs/convnext/convnext-small_upernet_8xb2-amp-160k_ade20k-512x512.py b/configs/convnext/convnext-small_upernet_8xb2-amp-160k_ade20k-512x512.py new file mode 100644 index 0000000000..dbe45f10e0 --- /dev/null +++ b/configs/convnext/convnext-small_upernet_8xb2-amp-160k_ade20k-512x512.py @@ -0,0 +1,57 @@ +_base_ = [ + '../_base_/models/upernet_convnext.py', '../_base_/datasets/ade20k.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' +] +crop_size = (512, 512) +data_preprocessor = dict(size=crop_size) +checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/convnext/downstream/convnext-small_3rdparty_32xb128-noema_in1k_20220301-303e75e3.pth' # noqa +model = dict( + data_preprocessor=data_preprocessor, + backbone=dict( + type='mmpretrain.ConvNeXt', + arch='small', + out_indices=[0, 1, 2, 3], + drop_path_rate=0.3, + layer_scale_init_value=1.0, + gap_before_final_norm=False, + init_cfg=dict( + type='Pretrained', checkpoint=checkpoint_file, + prefix='backbone.')), + decode_head=dict( + in_channels=[96, 192, 384, 768], + num_classes=150, + ), + auxiliary_head=dict(in_channels=384, num_classes=150), + test_cfg=dict(mode='slide', crop_size=crop_size, stride=(341, 341)), +) + +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict( + type='AdamW', lr=0.0001, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg={ + 'decay_rate': 0.9, + 'decay_type': 'stage_wise', + 'num_layers': 12 + }, + constructor='LearningRateDecayOptimizerConstructor', + loss_scale='dynamic') + +param_scheduler = [ + dict( + type='LinearLR', start_factor=1e-6, by_epoch=False, begin=0, end=1500), + dict( + type='PolyLR', + power=1.0, + begin=1500, + end=160000, + eta_min=0.0, + by_epoch=False, + ) +] + +# By default, models are trained on 8 GPUs with 2 images per GPU +train_dataloader = dict(batch_size=2) +val_dataloader = dict(batch_size=1) +test_dataloader = val_dataloader diff --git a/configs/convnext/convnext-tiny_upernet_8xb2-amp-160k_ade20k-512x512.py b/configs/convnext/convnext-tiny_upernet_8xb2-amp-160k_ade20k-512x512.py new file mode 100644 index 0000000000..d2e545a76d --- /dev/null +++ b/configs/convnext/convnext-tiny_upernet_8xb2-amp-160k_ade20k-512x512.py @@ -0,0 +1,57 @@ +_base_ = [ + '../_base_/models/upernet_convnext.py', '../_base_/datasets/ade20k.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' +] +crop_size = (512, 512) +data_preprocessor = dict(size=crop_size) +checkpoint_file = 
'https://download.openmmlab.com/mmclassification/v0/convnext/downstream/convnext-tiny_3rdparty_32xb128-noema_in1k_20220301-795e9634.pth' # noqa +model = dict( + data_preprocessor=data_preprocessor, + backbone=dict( + type='mmpretrain.ConvNeXt', + arch='tiny', + out_indices=[0, 1, 2, 3], + drop_path_rate=0.4, + layer_scale_init_value=1.0, + gap_before_final_norm=False, + init_cfg=dict( + type='Pretrained', checkpoint=checkpoint_file, + prefix='backbone.')), + decode_head=dict( + in_channels=[96, 192, 384, 768], + num_classes=150, + ), + auxiliary_head=dict(in_channels=384, num_classes=150), + test_cfg=dict(mode='slide', crop_size=crop_size, stride=(341, 341)), +) + +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict( + type='AdamW', lr=0.0001, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg={ + 'decay_rate': 0.9, + 'decay_type': 'stage_wise', + 'num_layers': 6 + }, + constructor='LearningRateDecayOptimizerConstructor', + loss_scale='dynamic') + +param_scheduler = [ + dict( + type='LinearLR', start_factor=1e-6, by_epoch=False, begin=0, end=1500), + dict( + type='PolyLR', + power=1.0, + begin=1500, + end=160000, + eta_min=0.0, + by_epoch=False, + ) +] + +# By default, models are trained on 8 GPUs with 2 images per GPU +train_dataloader = dict(batch_size=2) +val_dataloader = dict(batch_size=1) +test_dataloader = val_dataloader diff --git a/configs/convnext/convnext-xlarge_upernet_8xb2-amp-160k_ade20k-640x640.py b/configs/convnext/convnext-xlarge_upernet_8xb2-amp-160k_ade20k-640x640.py new file mode 100644 index 0000000000..dfad734521 --- /dev/null +++ b/configs/convnext/convnext-xlarge_upernet_8xb2-amp-160k_ade20k-640x640.py @@ -0,0 +1,58 @@ +_base_ = [ + '../_base_/models/upernet_convnext.py', + '../_base_/datasets/ade20k_640x640.py', '../_base_/default_runtime.py', + '../_base_/schedules/schedule_160k.py' +] +crop_size = (640, 640) +data_preprocessor = dict(size=crop_size) +checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/convnext/downstream/convnext-xlarge_3rdparty_in21k_20220301-08aa5ddc.pth' # noqa +model = dict( + data_preprocessor=data_preprocessor, + backbone=dict( + type='mmpretrain.ConvNeXt', + arch='xlarge', + out_indices=[0, 1, 2, 3], + drop_path_rate=0.4, + layer_scale_init_value=1.0, + gap_before_final_norm=False, + init_cfg=dict( + type='Pretrained', checkpoint=checkpoint_file, + prefix='backbone.')), + decode_head=dict( + in_channels=[256, 512, 1024, 2048], + num_classes=150, + ), + auxiliary_head=dict(in_channels=1024, num_classes=150), + test_cfg=dict(mode='slide', crop_size=crop_size, stride=(426, 426)), +) + +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict( + type='AdamW', lr=0.00008, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg={ + 'decay_rate': 0.9, + 'decay_type': 'stage_wise', + 'num_layers': 12 + }, + constructor='LearningRateDecayOptimizerConstructor', + loss_scale='dynamic') + +param_scheduler = [ + dict( + type='LinearLR', start_factor=1e-6, by_epoch=False, begin=0, end=1500), + dict( + type='PolyLR', + power=1.0, + begin=1500, + end=160000, + eta_min=0.0, + by_epoch=False, + ) +] + +# By default, models are trained on 8 GPUs with 2 images per GPU +train_dataloader = dict(batch_size=2) +val_dataloader = dict(batch_size=1) +test_dataloader = val_dataloader diff --git a/configs/convnext/convnext.yml b/configs/convnext/convnext.yml deleted file mode 100644 index 2b943aa151..0000000000 --- a/configs/convnext/convnext.yml +++ /dev/null @@ -1,133 +0,0 @@ -Models: -- 
Name: upernet_convnext_tiny_fp16_512x512_160k_ade20k - In Collection: UPerNet - Metadata: - backbone: ConvNeXt-T - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 50.25 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP16 - resolution: (512,512) - Training Memory (GB): 4.23 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 46.11 - mIoU(ms+flip): 46.62 - Config: configs/convnext/upernet_convnext_tiny_fp16_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_tiny_fp16_512x512_160k_ade20k/upernet_convnext_tiny_fp16_512x512_160k_ade20k_20220227_124553-cad485de.pth -- Name: upernet_convnext_small_fp16_512x512_160k_ade20k - In Collection: UPerNet - Metadata: - backbone: ConvNeXt-S - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 65.88 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP16 - resolution: (512,512) - Training Memory (GB): 5.16 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 48.56 - mIoU(ms+flip): 49.02 - Config: configs/convnext/upernet_convnext_small_fp16_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_small_fp16_512x512_160k_ade20k/upernet_convnext_small_fp16_512x512_160k_ade20k_20220227_131208-1b1e394f.pth -- Name: upernet_convnext_base_fp16_512x512_160k_ade20k - In Collection: UPerNet - Metadata: - backbone: ConvNeXt-B - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 69.4 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP16 - resolution: (512,512) - Training Memory (GB): 6.33 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 48.71 - mIoU(ms+flip): 49.54 - Config: configs/convnext/upernet_convnext_base_fp16_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_base_fp16_512x512_160k_ade20k/upernet_convnext_base_fp16_512x512_160k_ade20k_20220227_181227-02a24fc6.pth -- Name: upernet_convnext_base_fp16_640x640_160k_ade20k - In Collection: UPerNet - Metadata: - backbone: ConvNeXt-B - crop size: (640,640) - lr schd: 160000 - inference time (ms/im): - - value: 91.91 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP16 - resolution: (640,640) - Training Memory (GB): 8.53 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 52.13 - mIoU(ms+flip): 52.66 - Config: configs/convnext/upernet_convnext_base_fp16_640x640_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_base_fp16_640x640_160k_ade20k/upernet_convnext_base_fp16_640x640_160k_ade20k_20220227_182859-9280e39b.pth -- Name: upernet_convnext_large_fp16_640x640_160k_ade20k - In Collection: UPerNet - Metadata: - backbone: ConvNeXt-L - crop size: (640,640) - lr schd: 160000 - inference time (ms/im): - - value: 130.04 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP16 - resolution: (640,640) - Training Memory (GB): 12.08 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 53.16 - mIoU(ms+flip): 53.38 - Config: configs/convnext/upernet_convnext_large_fp16_640x640_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_large_fp16_640x640_160k_ade20k/upernet_convnext_large_fp16_640x640_160k_ade20k_20220226_040532-e57aa54d.pth -- Name: upernet_convnext_xlarge_fp16_640x640_160k_ade20k 
- In Collection: UPerNet - Metadata: - backbone: ConvNeXt-XL - crop size: (640,640) - lr schd: 160000 - inference time (ms/im): - - value: 157.98 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP16 - resolution: (640,640) - Training Memory (GB): 26.16 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 53.58 - mIoU(ms+flip): 54.11 - Config: configs/convnext/upernet_convnext_xlarge_fp16_640x640_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_xlarge_fp16_640x640_160k_ade20k/upernet_convnext_xlarge_fp16_640x640_160k_ade20k_20220226_080344-95fc38c2.pth diff --git a/configs/convnext/metafile.yaml b/configs/convnext/metafile.yaml new file mode 100644 index 0000000000..8340a373c2 --- /dev/null +++ b/configs/convnext/metafile.yaml @@ -0,0 +1,145 @@ +Models: +- Name: convnext-tiny_upernet_8xb2-amp-160k_ade20k-512x512 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 46.11 + mIoU(ms+flip): 46.62 + Config: configs/convnext/convnext-tiny_upernet_8xb2-amp-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - ConvNeXt-T + - UPerNet + Training Resources: 8x V100 GPUs + Memory (GB): 4.23 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_tiny_fp16_512x512_160k_ade20k/upernet_convnext_tiny_fp16_512x512_160k_ade20k_20220227_124553-cad485de.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_tiny_fp16_512x512_160k_ade20k/upernet_convnext_tiny_fp16_512x512_160k_ade20k_20220227_124553.log.json + Paper: + Title: A ConvNet for the 2020s + URL: https://arxiv.org/abs/2201.03545 + Code: https://github.com/open-mmlab/mmclassification/blob/v0.20.1/mmcls/models/backbones/convnext.py#L133 + Framework: PyTorch +- Name: convnext-small_upernet_8xb2-amp-160k_ade20k-512x512 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 48.56 + mIoU(ms+flip): 49.02 + Config: configs/convnext/convnext-small_upernet_8xb2-amp-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - ConvNeXt-S + - UPerNet + Training Resources: 8x V100 GPUs + Memory (GB): 5.16 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_small_fp16_512x512_160k_ade20k/upernet_convnext_small_fp16_512x512_160k_ade20k_20220227_131208-1b1e394f.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_small_fp16_512x512_160k_ade20k/upernet_convnext_small_fp16_512x512_160k_ade20k_20220227_131208.log.json + Paper: + Title: A ConvNet for the 2020s + URL: https://arxiv.org/abs/2201.03545 + Code: https://github.com/open-mmlab/mmclassification/blob/v0.20.1/mmcls/models/backbones/convnext.py#L133 + Framework: PyTorch +- Name: convnext-base_upernet_8xb2-amp-160k_ade20k-512x512 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 48.71 + mIoU(ms+flip): 49.54 + Config: configs/convnext/convnext-base_upernet_8xb2-amp-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - ConvNeXt-B + - UPerNet + Training Resources: 8x V100 GPUs + Memory (GB): 6.33 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_base_fp16_512x512_160k_ade20k/upernet_convnext_base_fp16_512x512_160k_ade20k_20220227_181227-02a24fc6.pth + Training log: 
https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_base_fp16_512x512_160k_ade20k/upernet_convnext_base_fp16_512x512_160k_ade20k_20220227_181227.log.json + Paper: + Title: A ConvNet for the 2020s + URL: https://arxiv.org/abs/2201.03545 + Code: https://github.com/open-mmlab/mmclassification/blob/v0.20.1/mmcls/models/backbones/convnext.py#L133 + Framework: PyTorch +- Name: convnext-base_upernet_8xb2-amp-160k_ade20k-640x640 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 52.13 + mIoU(ms+flip): 52.66 + Config: configs/convnext/convnext-base_upernet_8xb2-amp-160k_ade20k-640x640.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - ConvNeXt-B + - UPerNet + Training Resources: 8x V100 GPUs + Memory (GB): 8.53 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_base_fp16_640x640_160k_ade20k/upernet_convnext_base_fp16_640x640_160k_ade20k_20220227_182859-9280e39b.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_base_fp16_640x640_160k_ade20k/upernet_convnext_base_fp16_640x640_160k_ade20k_20220227_182859.log.json + Paper: + Title: A ConvNet for the 2020s + URL: https://arxiv.org/abs/2201.03545 + Code: https://github.com/open-mmlab/mmclassification/blob/v0.20.1/mmcls/models/backbones/convnext.py#L133 + Framework: PyTorch +- Name: convnext-large_upernet_8xb2-amp-160k_ade20k-640x640 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 53.16 + mIoU(ms+flip): 53.38 + Config: configs/convnext/convnext-large_upernet_8xb2-amp-160k_ade20k-640x640.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - ConvNeXt-L + - UPerNet + Training Resources: 8x V100 GPUs + Memory (GB): 12.08 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_large_fp16_640x640_160k_ade20k/upernet_convnext_large_fp16_640x640_160k_ade20k_20220226_040532-e57aa54d.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_large_fp16_640x640_160k_ade20k/upernet_convnext_large_fp16_640x640_160k_ade20k_20220226_040532.log.json + Paper: + Title: A ConvNet for the 2020s + URL: https://arxiv.org/abs/2201.03545 + Code: https://github.com/open-mmlab/mmclassification/blob/v0.20.1/mmcls/models/backbones/convnext.py#L133 + Framework: PyTorch +- Name: convnext-xlarge_upernet_8xb2-amp-160k_ade20k-640x640 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 53.58 + mIoU(ms+flip): 54.11 + Config: configs/convnext/convnext-xlarge_upernet_8xb2-amp-160k_ade20k-640x640.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - ConvNeXt-XL + - UPerNet + Training Resources: 8x V100 GPUs + Memory (GB): 26.16 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_xlarge_fp16_640x640_160k_ade20k/upernet_convnext_xlarge_fp16_640x640_160k_ade20k_20220226_080344-95fc38c2.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/convnext/upernet_convnext_xlarge_fp16_640x640_160k_ade20k/upernet_convnext_xlarge_fp16_640x640_160k_ade20k_20220226_080344.log.json + Paper: + Title: A ConvNet for the 2020s + URL: https://arxiv.org/abs/2201.03545 + Code: https://github.com/open-mmlab/mmclassification/blob/v0.20.1/mmcls/models/backbones/convnext.py#L133 + Framework: PyTorch diff --git 
a/configs/convnext/upernet_convnext_base_fp16_640x640_160k_ade20k.py b/configs/convnext/upernet_convnext_base_fp16_640x640_160k_ade20k.py deleted file mode 100644 index a743e9322a..0000000000 --- a/configs/convnext/upernet_convnext_base_fp16_640x640_160k_ade20k.py +++ /dev/null @@ -1,58 +0,0 @@ -_base_ = [ - '../_base_/models/upernet_convnext.py', - '../_base_/datasets/ade20k_640x640.py', '../_base_/default_runtime.py', - '../_base_/schedules/schedule_160k.py' -] -crop_size = (640, 640) -data_preprocessor = dict(size=crop_size) -checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/convnext/downstream/convnext-base_3rdparty_in21k_20220301-262fd037.pth' # noqa -model = dict( - data_preprocessor=data_preprocessor, - backbone=dict( - type='mmcls.ConvNeXt', - arch='base', - out_indices=[0, 1, 2, 3], - drop_path_rate=0.4, - layer_scale_init_value=1.0, - gap_before_final_norm=False, - init_cfg=dict( - type='Pretrained', checkpoint=checkpoint_file, - prefix='backbone.')), - decode_head=dict( - in_channels=[128, 256, 512, 1024], - num_classes=150, - ), - auxiliary_head=dict(in_channels=512, num_classes=150), - test_cfg=dict(mode='slide', crop_size=crop_size, stride=(426, 426)), -) - -optim_wrapper = dict( - _delete_=True, - type='AmpOptimWrapper', - optimizer=dict( - type='AdamW', lr=0.0001, betas=(0.9, 0.999), weight_decay=0.05), - paramwise_cfg={ - 'decay_rate': 0.9, - 'decay_type': 'stage_wise', - 'num_layers': 12 - }, - constructor='LearningRateDecayOptimizerConstructor', - loss_scale='dynamic') - -param_scheduler = [ - dict( - type='LinearLR', start_factor=1e-6, by_epoch=False, begin=0, end=1500), - dict( - type='PolyLR', - power=1.0, - begin=1500, - end=160000, - eta_min=0.0, - by_epoch=False, - ) -] - -# By default, models are trained on 8 GPUs with 2 images per GPU -train_dataloader = dict(batch_size=2) -val_dataloader = dict(batch_size=1) -test_dataloader = val_dataloader diff --git a/configs/convnext/upernet_convnext_large_fp16_640x640_160k_ade20k.py b/configs/convnext/upernet_convnext_large_fp16_640x640_160k_ade20k.py deleted file mode 100644 index 6d94989ee1..0000000000 --- a/configs/convnext/upernet_convnext_large_fp16_640x640_160k_ade20k.py +++ /dev/null @@ -1,58 +0,0 @@ -_base_ = [ - '../_base_/models/upernet_convnext.py', - '../_base_/datasets/ade20k_640x640.py', '../_base_/default_runtime.py', - '../_base_/schedules/schedule_160k.py' -] -crop_size = (640, 640) -data_preprocessor = dict(size=crop_size) -checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/convnext/downstream/convnext-large_3rdparty_in21k_20220301-e6e0ea0a.pth' # noqa -model = dict( - data_preprocessor=data_preprocessor, - backbone=dict( - type='mmcls.ConvNeXt', - arch='large', - out_indices=[0, 1, 2, 3], - drop_path_rate=0.4, - layer_scale_init_value=1.0, - gap_before_final_norm=False, - init_cfg=dict( - type='Pretrained', checkpoint=checkpoint_file, - prefix='backbone.')), - decode_head=dict( - in_channels=[192, 384, 768, 1536], - num_classes=150, - ), - auxiliary_head=dict(in_channels=768, num_classes=150), - test_cfg=dict(mode='slide', crop_size=crop_size, stride=(426, 426)), -) - -optim_wrapper = dict( - _delete_=True, - type='AmpOptimWrapper', - optimizer=dict( - type='AdamW', lr=0.0001, betas=(0.9, 0.999), weight_decay=0.05), - paramwise_cfg={ - 'decay_rate': 0.9, - 'decay_type': 'stage_wise', - 'num_layers': 12 - }, - constructor='LearningRateDecayOptimizerConstructor', - loss_scale='dynamic') - -param_scheduler = [ - dict( - type='LinearLR', start_factor=1e-6, 
by_epoch=False, begin=0, end=1500), - dict( - type='PolyLR', - power=1.0, - begin=1500, - end=160000, - eta_min=0.0, - by_epoch=False, - ) -] - -# By default, models are trained on 8 GPUs with 2 images per GPU -train_dataloader = dict(batch_size=2) -val_dataloader = dict(batch_size=1) -test_dataloader = val_dataloader diff --git a/configs/convnext/upernet_convnext_small_fp16_512x512_160k_ade20k.py b/configs/convnext/upernet_convnext_small_fp16_512x512_160k_ade20k.py deleted file mode 100644 index 3cbf09902d..0000000000 --- a/configs/convnext/upernet_convnext_small_fp16_512x512_160k_ade20k.py +++ /dev/null @@ -1,57 +0,0 @@ -_base_ = [ - '../_base_/models/upernet_convnext.py', '../_base_/datasets/ade20k.py', - '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' -] -crop_size = (512, 512) -data_preprocessor = dict(size=crop_size) -checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/convnext/downstream/convnext-small_3rdparty_32xb128-noema_in1k_20220301-303e75e3.pth' # noqa -model = dict( - data_preprocessor=data_preprocessor, - backbone=dict( - type='mmcls.ConvNeXt', - arch='small', - out_indices=[0, 1, 2, 3], - drop_path_rate=0.3, - layer_scale_init_value=1.0, - gap_before_final_norm=False, - init_cfg=dict( - type='Pretrained', checkpoint=checkpoint_file, - prefix='backbone.')), - decode_head=dict( - in_channels=[96, 192, 384, 768], - num_classes=150, - ), - auxiliary_head=dict(in_channels=384, num_classes=150), - test_cfg=dict(mode='slide', crop_size=crop_size, stride=(341, 341)), -) - -optim_wrapper = dict( - _delete_=True, - type='AmpOptimWrapper', - optimizer=dict( - type='AdamW', lr=0.0001, betas=(0.9, 0.999), weight_decay=0.05), - paramwise_cfg={ - 'decay_rate': 0.9, - 'decay_type': 'stage_wise', - 'num_layers': 12 - }, - constructor='LearningRateDecayOptimizerConstructor', - loss_scale='dynamic') - -param_scheduler = [ - dict( - type='LinearLR', start_factor=1e-6, by_epoch=False, begin=0, end=1500), - dict( - type='PolyLR', - power=1.0, - begin=1500, - end=160000, - eta_min=0.0, - by_epoch=False, - ) -] - -# By default, models are trained on 8 GPUs with 2 images per GPU -train_dataloader = dict(batch_size=2) -val_dataloader = dict(batch_size=1) -test_dataloader = val_dataloader diff --git a/configs/convnext/upernet_convnext_tiny_fp16_512x512_160k_ade20k.py b/configs/convnext/upernet_convnext_tiny_fp16_512x512_160k_ade20k.py deleted file mode 100644 index 9d4968df60..0000000000 --- a/configs/convnext/upernet_convnext_tiny_fp16_512x512_160k_ade20k.py +++ /dev/null @@ -1,57 +0,0 @@ -_base_ = [ - '../_base_/models/upernet_convnext.py', '../_base_/datasets/ade20k.py', - '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' -] -crop_size = (512, 512) -data_preprocessor = dict(size=crop_size) -checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/convnext/downstream/convnext-tiny_3rdparty_32xb128-noema_in1k_20220301-795e9634.pth' # noqa -model = dict( - data_preprocessor=data_preprocessor, - backbone=dict( - type='mmcls.ConvNeXt', - arch='tiny', - out_indices=[0, 1, 2, 3], - drop_path_rate=0.4, - layer_scale_init_value=1.0, - gap_before_final_norm=False, - init_cfg=dict( - type='Pretrained', checkpoint=checkpoint_file, - prefix='backbone.')), - decode_head=dict( - in_channels=[96, 192, 384, 768], - num_classes=150, - ), - auxiliary_head=dict(in_channels=384, num_classes=150), - test_cfg=dict(mode='slide', crop_size=crop_size, stride=(341, 341)), -) - -optim_wrapper = dict( - _delete_=True, - type='AmpOptimWrapper', - 
optimizer=dict( - type='AdamW', lr=0.0001, betas=(0.9, 0.999), weight_decay=0.05), - paramwise_cfg={ - 'decay_rate': 0.9, - 'decay_type': 'stage_wise', - 'num_layers': 6 - }, - constructor='LearningRateDecayOptimizerConstructor', - loss_scale='dynamic') - -param_scheduler = [ - dict( - type='LinearLR', start_factor=1e-6, by_epoch=False, begin=0, end=1500), - dict( - type='PolyLR', - power=1.0, - begin=1500, - end=160000, - eta_min=0.0, - by_epoch=False, - ) -] - -# By default, models are trained on 8 GPUs with 2 images per GPU -train_dataloader = dict(batch_size=2) -val_dataloader = dict(batch_size=1) -test_dataloader = val_dataloader diff --git a/configs/convnext/upernet_convnext_xlarge_fp16_640x640_160k_ade20k.py b/configs/convnext/upernet_convnext_xlarge_fp16_640x640_160k_ade20k.py deleted file mode 100644 index 749391cac1..0000000000 --- a/configs/convnext/upernet_convnext_xlarge_fp16_640x640_160k_ade20k.py +++ /dev/null @@ -1,58 +0,0 @@ -_base_ = [ - '../_base_/models/upernet_convnext.py', - '../_base_/datasets/ade20k_640x640.py', '../_base_/default_runtime.py', - '../_base_/schedules/schedule_160k.py' -] -crop_size = (640, 640) -data_preprocessor = dict(size=crop_size) -checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/convnext/downstream/convnext-xlarge_3rdparty_in21k_20220301-08aa5ddc.pth' # noqa -model = dict( - data_preprocessor=data_preprocessor, - backbone=dict( - type='mmcls.ConvNeXt', - arch='xlarge', - out_indices=[0, 1, 2, 3], - drop_path_rate=0.4, - layer_scale_init_value=1.0, - gap_before_final_norm=False, - init_cfg=dict( - type='Pretrained', checkpoint=checkpoint_file, - prefix='backbone.')), - decode_head=dict( - in_channels=[256, 512, 1024, 2048], - num_classes=150, - ), - auxiliary_head=dict(in_channels=1024, num_classes=150), - test_cfg=dict(mode='slide', crop_size=crop_size, stride=(426, 426)), -) - -optim_wrapper = dict( - _delete_=True, - type='AmpOptimWrapper', - optimizer=dict( - type='AdamW', lr=0.00008, betas=(0.9, 0.999), weight_decay=0.05), - paramwise_cfg={ - 'decay_rate': 0.9, - 'decay_type': 'stage_wise', - 'num_layers': 12 - }, - constructor='LearningRateDecayOptimizerConstructor', - loss_scale='dynamic') - -param_scheduler = [ - dict( - type='LinearLR', start_factor=1e-6, by_epoch=False, begin=0, end=1500), - dict( - type='PolyLR', - power=1.0, - begin=1500, - end=160000, - eta_min=0.0, - by_epoch=False, - ) -] - -# By default, models are trained on 8 GPUs with 2 images per GPU -train_dataloader = dict(batch_size=2) -val_dataloader = dict(batch_size=1) -test_dataloader = val_dataloader diff --git a/configs/danet/README.md b/configs/danet/README.md index ac7634026a..90194f3073 100644 --- a/configs/danet/README.md +++ b/configs/danet/README.md @@ -1,6 +1,6 @@ # DANet -[Dual Attention Network for Scene Segmentation](https://arxiv.org/abs/1809.02983) +> [Dual Attention Network for Scene Segmentation](https://arxiv.org/abs/1809.02983) ## Introduction @@ -22,46 +22,46 @@ In this paper, we address the scene segmentation task by capturing rich contextu -## Citation - -```bibtex -@article{fu2018dual, - title={Dual Attention Network for Scene Segmentation}, - author={Jun Fu, Jing Liu, Haijie Tian, Yong Li, Yongjun Bao, Zhiwei Fang,and Hanqing Lu}, - booktitle={The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, - year={2019} -} -``` - ## Results and models ### Cityscapes -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | 
------: | -------- | -------------- | ----: | ------------- | ------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| DANet | R-50-D8 | 512x1024 | 40000 | 7.4 | 2.66 | 78.74 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/danet/danet_r50-d8_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x1024_40k_cityscapes/danet_r50-d8_512x1024_40k_cityscapes_20200605_191324-c0dbfa5f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x1024_40k_cityscapes/danet_r50-d8_512x1024_40k_cityscapes_20200605_191324.log.json) | -| DANet | R-101-D8 | 512x1024 | 40000 | 10.9 | 1.99 | 80.52 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/danet/danet_r101-d8_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x1024_40k_cityscapes/danet_r101-d8_512x1024_40k_cityscapes_20200605_200831-c57a7157.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x1024_40k_cityscapes/danet_r101-d8_512x1024_40k_cityscapes_20200605_200831.log.json) | -| DANet | R-50-D8 | 769x769 | 40000 | 8.8 | 1.56 | 78.88 | 80.62 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/danet/danet_r50-d8_769x769_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_769x769_40k_cityscapes/danet_r50-d8_769x769_40k_cityscapes_20200530_025703-76681c60.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_769x769_40k_cityscapes/danet_r50-d8_769x769_40k_cityscapes_20200530_025703.log.json) | -| DANet | R-101-D8 | 769x769 | 40000 | 12.8 | 1.07 | 79.88 | 81.47 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/danet/danet_r101-d8_769x769_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_769x769_40k_cityscapes/danet_r101-d8_769x769_40k_cityscapes_20200530_025717-dcb7fd4e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_769x769_40k_cityscapes/danet_r101-d8_769x769_40k_cityscapes_20200530_025717.log.json) | -| DANet | R-50-D8 | 512x1024 | 80000 | - | - | 79.34 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/danet/danet_r50-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x1024_80k_cityscapes/danet_r50-d8_512x1024_80k_cityscapes_20200607_133029-2bfa2293.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x1024_80k_cityscapes/danet_r50-d8_512x1024_80k_cityscapes_20200607_133029.log.json) | -| DANet | R-101-D8 | 512x1024 | 80000 | - | - | 80.41 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/danet/danet_r101-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x1024_80k_cityscapes/danet_r101-d8_512x1024_80k_cityscapes_20200607_132918-955e6350.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x1024_80k_cityscapes/danet_r101-d8_512x1024_80k_cityscapes_20200607_132918.log.json) | -| DANet | R-50-D8 | 769x769 | 80000 | - | - | 79.27 | 80.96 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/danet/danet_r50-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_769x769_80k_cityscapes/danet_r50-d8_769x769_80k_cityscapes_20200607_132954-495689b4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_769x769_80k_cityscapes/danet_r50-d8_769x769_80k_cityscapes_20200607_132954.log.json) | -| DANet | R-101-D8 | 769x769 | 80000 | - | - | 80.47 | 82.02 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/danet/danet_r101-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_769x769_80k_cityscapes/danet_r101-d8_769x769_80k_cityscapes_20200607_132918-f3a929e7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_769x769_80k_cityscapes/danet_r101-d8_769x769_80k_cityscapes_20200607_132918.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------- | ---------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| DANet | R-50-D8 | 512x1024 | 40000 | 7.4 | 2.66 | V100 | 78.74 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/danet/danet_r50-d8_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x1024_40k_cityscapes/danet_r50-d8_512x1024_40k_cityscapes_20200605_191324-c0dbfa5f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x1024_40k_cityscapes/danet_r50-d8_512x1024_40k_cityscapes_20200605_191324.log.json) | +| DANet | R-101-D8 | 512x1024 | 40000 | 10.9 | 1.99 | V100 | 80.52 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/danet/danet_r101-d8_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x1024_40k_cityscapes/danet_r101-d8_512x1024_40k_cityscapes_20200605_200831-c57a7157.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x1024_40k_cityscapes/danet_r101-d8_512x1024_40k_cityscapes_20200605_200831.log.json) | +| DANet | R-50-D8 | 769x769 | 40000 | 8.8 | 1.56 | V100 | 78.88 | 80.62 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/danet/danet_r50-d8_4xb2-40k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_769x769_40k_cityscapes/danet_r50-d8_769x769_40k_cityscapes_20200530_025703-76681c60.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_769x769_40k_cityscapes/danet_r50-d8_769x769_40k_cityscapes_20200530_025703.log.json) | +| DANet | R-101-D8 | 769x769 | 
40000 | 12.8 | 1.07 | V100 | 79.88 | 81.47 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/danet/danet_r101-d8_4xb2-40k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_769x769_40k_cityscapes/danet_r101-d8_769x769_40k_cityscapes_20200530_025717-dcb7fd4e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_769x769_40k_cityscapes/danet_r101-d8_769x769_40k_cityscapes_20200530_025717.log.json) | +| DANet | R-50-D8 | 512x1024 | 80000 | - | - | V100 | 79.34 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/danet/danet_r50-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x1024_80k_cityscapes/danet_r50-d8_512x1024_80k_cityscapes_20200607_133029-2bfa2293.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x1024_80k_cityscapes/danet_r50-d8_512x1024_80k_cityscapes_20200607_133029.log.json) | +| DANet | R-101-D8 | 512x1024 | 80000 | - | - | V100 | 80.41 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/danet/danet_r101-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x1024_80k_cityscapes/danet_r101-d8_512x1024_80k_cityscapes_20200607_132918-955e6350.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x1024_80k_cityscapes/danet_r101-d8_512x1024_80k_cityscapes_20200607_132918.log.json) | +| DANet | R-50-D8 | 769x769 | 80000 | - | - | V100 | 79.27 | 80.96 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/danet/danet_r50-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_769x769_80k_cityscapes/danet_r50-d8_769x769_80k_cityscapes_20200607_132954-495689b4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_769x769_80k_cityscapes/danet_r50-d8_769x769_80k_cityscapes_20200607_132954.log.json) | +| DANet | R-101-D8 | 769x769 | 80000 | - | - | V100 | 80.47 | 82.02 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/danet/danet_r101-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_769x769_80k_cityscapes/danet_r101-d8_769x769_80k_cityscapes_20200607_132918-f3a929e7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_769x769_80k_cityscapes/danet_r101-d8_769x769_80k_cityscapes_20200607_132918.log.json) | ### ADE20K -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | --------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| DANet | R-50-D8 | 512x512 | 80000 | 11.5 | 21.20 | 41.66 | 42.90 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/danet/danet_r50-d8_512x512_80k_ade20k.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x512_80k_ade20k/danet_r50-d8_512x512_80k_ade20k_20200615_015125-edb18e08.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x512_80k_ade20k/danet_r50-d8_512x512_80k_ade20k_20200615_015125.log.json) | -| DANet | R-101-D8 | 512x512 | 80000 | 15 | 14.18 | 43.64 | 45.19 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/danet/danet_r101-d8_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x512_80k_ade20k/danet_r101-d8_512x512_80k_ade20k_20200615_015126-d0357c73.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x512_80k_ade20k/danet_r101-d8_512x512_80k_ade20k_20200615_015126.log.json) | -| DANet | R-50-D8 | 512x512 | 160000 | - | - | 42.45 | 43.25 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/danet/danet_r50-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x512_160k_ade20k/danet_r50-d8_512x512_160k_ade20k_20200616_082340-9cb35dcd.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x512_160k_ade20k/danet_r50-d8_512x512_160k_ade20k_20200616_082340.log.json) | -| DANet | R-101-D8 | 512x512 | 160000 | - | - | 44.17 | 45.02 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/danet/danet_r101-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x512_160k_ade20k/danet_r101-d8_512x512_160k_ade20k_20200616_082348-23bf12f9.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x512_160k_ade20k/danet_r101-d8_512x512_160k_ade20k_20200616_082348.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| DANet | R-50-D8 | 512x512 | 80000 | 11.5 | 21.20 | V100 | 41.66 | 42.90 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/danet/danet_r50-d8_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x512_80k_ade20k/danet_r50-d8_512x512_80k_ade20k_20200615_015125-edb18e08.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x512_80k_ade20k/danet_r50-d8_512x512_80k_ade20k_20200615_015125.log.json) | +| DANet | R-101-D8 | 512x512 | 80000 | 15 | 14.18 | V100 | 43.64 | 45.19 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/danet/danet_r101-d8_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x512_80k_ade20k/danet_r101-d8_512x512_80k_ade20k_20200615_015126-d0357c73.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x512_80k_ade20k/danet_r101-d8_512x512_80k_ade20k_20200615_015126.log.json) | 
+| DANet | R-50-D8 | 512x512 | 160000 | - | - | V100 | 42.45 | 43.25 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/danet/danet_r50-d8_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x512_160k_ade20k/danet_r50-d8_512x512_160k_ade20k_20200616_082340-9cb35dcd.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x512_160k_ade20k/danet_r50-d8_512x512_160k_ade20k_20200616_082340.log.json) | +| DANet | R-101-D8 | 512x512 | 160000 | - | - | V100 | 44.17 | 45.02 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/danet/danet_r101-d8_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x512_160k_ade20k/danet_r101-d8_512x512_160k_ade20k_20200616_082348-23bf12f9.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x512_160k_ade20k/danet_r101-d8_512x512_160k_ade20k_20200616_082348.log.json) | ### Pascal VOC 2012 + Aug -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ---------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| DANet | R-50-D8 | 512x512 | 20000 | 6.5 | 20.94 | 74.45 | 75.69 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/danet/danet_r50-d8_512x512_20k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x512_20k_voc12aug/danet_r50-d8_512x512_20k_voc12aug_20200618_070026-9e9e3ab3.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x512_20k_voc12aug/danet_r50-d8_512x512_20k_voc12aug_20200618_070026.log.json) | -| DANet | R-101-D8 | 512x512 | 20000 | 9.9 | 13.76 | 76.02 | 77.23 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/danet/danet_r101-d8_512x512_20k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x512_20k_voc12aug/danet_r101-d8_512x512_20k_voc12aug_20200618_070026-d48d23b2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x512_20k_voc12aug/danet_r101-d8_512x512_20k_voc12aug_20200618_070026.log.json) | -| DANet | R-50-D8 | 512x512 | 40000 | - | - | 76.37 | 77.29 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/danet/danet_r50-d8_512x512_40k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x512_40k_voc12aug/danet_r50-d8_512x512_40k_voc12aug_20200613_235526-426e3a64.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x512_40k_voc12aug/danet_r50-d8_512x512_40k_voc12aug_20200613_235526.log.json) | -| DANet | R-101-D8 | 512x512 | 40000 | - | - | 76.51 | 77.32 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/danet/danet_r101-d8_512x512_40k_voc12aug.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x512_40k_voc12aug/danet_r101-d8_512x512_40k_voc12aug_20200613_223031-788e232a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x512_40k_voc12aug/danet_r101-d8_512x512_40k_voc12aug_20200613_223031.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| DANet | R-50-D8 | 512x512 | 20000 | 6.5 | 20.94 | V100 | 74.45 | 75.69 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/danet/danet_r50-d8_4xb4-20k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x512_20k_voc12aug/danet_r50-d8_512x512_20k_voc12aug_20200618_070026-9e9e3ab3.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x512_20k_voc12aug/danet_r50-d8_512x512_20k_voc12aug_20200618_070026.log.json) | +| DANet | R-101-D8 | 512x512 | 20000 | 9.9 | 13.76 | V100 | 76.02 | 77.23 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/danet/danet_r101-d8_4xb4-20k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x512_20k_voc12aug/danet_r101-d8_512x512_20k_voc12aug_20200618_070026-d48d23b2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x512_20k_voc12aug/danet_r101-d8_512x512_20k_voc12aug_20200618_070026.log.json) | +| DANet | R-50-D8 | 512x512 | 40000 | - | - | V100 | 76.37 | 77.29 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/danet/danet_r50-d8_4xb4-40k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x512_40k_voc12aug/danet_r50-d8_512x512_40k_voc12aug_20200613_235526-426e3a64.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x512_40k_voc12aug/danet_r50-d8_512x512_40k_voc12aug_20200613_235526.log.json) | +| DANet | R-101-D8 | 512x512 | 40000 | - | - | V100 | 76.51 | 77.32 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/danet/danet_r101-d8_4xb4-40k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x512_40k_voc12aug/danet_r101-d8_512x512_40k_voc12aug_20200613_223031-788e232a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x512_40k_voc12aug/danet_r101-d8_512x512_40k_voc12aug_20200613_223031.log.json) | + +## Citation + +```bibtex +@inproceedings{fu2018dual, + title={Dual Attention Network for Scene Segmentation}, + author={Jun Fu and Jing Liu and Haijie Tian and Yong Li and Yongjun Bao and Zhiwei Fang and Hanqing Lu}, + booktitle={The IEEE Conference on Computer Vision and Pattern Recognition (CVPR)}, + year={2019} +} +``` diff --git a/configs/danet/danet.yml b/configs/danet/danet.yml deleted file mode 100644 index ca2d6ff982..0000000000 --- 
a/configs/danet/danet.yml +++ /dev/null @@ -1,301 +0,0 @@ -Collections: -- Name: DANet - Metadata: - Training Data: - - Cityscapes - - ADE20K - - Pascal VOC 2012 + Aug - Paper: - URL: https://arxiv.org/abs/1809.02983 - Title: Dual Attention Network for Scene Segmentation - README: configs/danet/README.md - Code: - URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/da_head.py#L76 - Version: v0.17.0 - Converted From: - Code: https://github.com/junfu1115/DANet/ -Models: -- Name: danet_r50-d8_512x1024_40k_cityscapes - In Collection: DANet - Metadata: - backbone: R-50-D8 - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 375.94 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 7.4 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.74 - Config: configs/danet/danet_r50-d8_512x1024_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x1024_40k_cityscapes/danet_r50-d8_512x1024_40k_cityscapes_20200605_191324-c0dbfa5f.pth -- Name: danet_r101-d8_512x1024_40k_cityscapes - In Collection: DANet - Metadata: - backbone: R-101-D8 - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 502.51 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 10.9 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 80.52 - Config: configs/danet/danet_r101-d8_512x1024_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x1024_40k_cityscapes/danet_r101-d8_512x1024_40k_cityscapes_20200605_200831-c57a7157.pth -- Name: danet_r50-d8_769x769_40k_cityscapes - In Collection: DANet - Metadata: - backbone: R-50-D8 - crop size: (769,769) - lr schd: 40000 - inference time (ms/im): - - value: 641.03 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 8.8 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.88 - mIoU(ms+flip): 80.62 - Config: configs/danet/danet_r50-d8_769x769_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_769x769_40k_cityscapes/danet_r50-d8_769x769_40k_cityscapes_20200530_025703-76681c60.pth -- Name: danet_r101-d8_769x769_40k_cityscapes - In Collection: DANet - Metadata: - backbone: R-101-D8 - crop size: (769,769) - lr schd: 40000 - inference time (ms/im): - - value: 934.58 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 12.8 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.88 - mIoU(ms+flip): 81.47 - Config: configs/danet/danet_r101-d8_769x769_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_769x769_40k_cityscapes/danet_r101-d8_769x769_40k_cityscapes_20200530_025717-dcb7fd4e.pth -- Name: danet_r50-d8_512x1024_80k_cityscapes - In Collection: DANet - Metadata: - backbone: R-50-D8 - crop size: (512,1024) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.34 - Config: configs/danet/danet_r50-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x1024_80k_cityscapes/danet_r50-d8_512x1024_80k_cityscapes_20200607_133029-2bfa2293.pth -- Name: 
danet_r101-d8_512x1024_80k_cityscapes - In Collection: DANet - Metadata: - backbone: R-101-D8 - crop size: (512,1024) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 80.41 - Config: configs/danet/danet_r101-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x1024_80k_cityscapes/danet_r101-d8_512x1024_80k_cityscapes_20200607_132918-955e6350.pth -- Name: danet_r50-d8_769x769_80k_cityscapes - In Collection: DANet - Metadata: - backbone: R-50-D8 - crop size: (769,769) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.27 - mIoU(ms+flip): 80.96 - Config: configs/danet/danet_r50-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_769x769_80k_cityscapes/danet_r50-d8_769x769_80k_cityscapes_20200607_132954-495689b4.pth -- Name: danet_r101-d8_769x769_80k_cityscapes - In Collection: DANet - Metadata: - backbone: R-101-D8 - crop size: (769,769) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 80.47 - mIoU(ms+flip): 82.02 - Config: configs/danet/danet_r101-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_769x769_80k_cityscapes/danet_r101-d8_769x769_80k_cityscapes_20200607_132918-f3a929e7.pth -- Name: danet_r50-d8_512x512_80k_ade20k - In Collection: DANet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 47.17 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 11.5 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 41.66 - mIoU(ms+flip): 42.9 - Config: configs/danet/danet_r50-d8_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x512_80k_ade20k/danet_r50-d8_512x512_80k_ade20k_20200615_015125-edb18e08.pth -- Name: danet_r101-d8_512x512_80k_ade20k - In Collection: DANet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 70.52 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 15.0 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 43.64 - mIoU(ms+flip): 45.19 - Config: configs/danet/danet_r101-d8_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x512_80k_ade20k/danet_r101-d8_512x512_80k_ade20k_20200615_015126-d0357c73.pth -- Name: danet_r50-d8_512x512_160k_ade20k - In Collection: DANet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 42.45 - mIoU(ms+flip): 43.25 - Config: configs/danet/danet_r50-d8_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x512_160k_ade20k/danet_r50-d8_512x512_160k_ade20k_20200616_082340-9cb35dcd.pth -- Name: danet_r101-d8_512x512_160k_ade20k - In Collection: DANet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 44.17 - mIoU(ms+flip): 45.02 - Config: configs/danet/danet_r101-d8_512x512_160k_ade20k.py - Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x512_160k_ade20k/danet_r101-d8_512x512_160k_ade20k_20200616_082348-23bf12f9.pth -- Name: danet_r50-d8_512x512_20k_voc12aug - In Collection: DANet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 20000 - inference time (ms/im): - - value: 47.76 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 6.5 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 74.45 - mIoU(ms+flip): 75.69 - Config: configs/danet/danet_r50-d8_512x512_20k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x512_20k_voc12aug/danet_r50-d8_512x512_20k_voc12aug_20200618_070026-9e9e3ab3.pth -- Name: danet_r101-d8_512x512_20k_voc12aug - In Collection: DANet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 20000 - inference time (ms/im): - - value: 72.67 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 9.9 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 76.02 - mIoU(ms+flip): 77.23 - Config: configs/danet/danet_r101-d8_512x512_20k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x512_20k_voc12aug/danet_r101-d8_512x512_20k_voc12aug_20200618_070026-d48d23b2.pth -- Name: danet_r50-d8_512x512_40k_voc12aug - In Collection: DANet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 40000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 76.37 - mIoU(ms+flip): 77.29 - Config: configs/danet/danet_r50-d8_512x512_40k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x512_40k_voc12aug/danet_r50-d8_512x512_40k_voc12aug_20200613_235526-426e3a64.pth -- Name: danet_r101-d8_512x512_40k_voc12aug - In Collection: DANet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 40000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 76.51 - mIoU(ms+flip): 77.32 - Config: configs/danet/danet_r101-d8_512x512_40k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x512_40k_voc12aug/danet_r101-d8_512x512_40k_voc12aug_20200613_223031-788e232a.pth diff --git a/configs/danet/danet_r101-d8_4xb2-40k_cityscapes-512x1024.py b/configs/danet/danet_r101-d8_4xb2-40k_cityscapes-512x1024.py new file mode 100644 index 0000000000..4602f3318f --- /dev/null +++ b/configs/danet/danet_r101-d8_4xb2-40k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './danet_r50-d8_4xb2-40k_cityscapes-512x1024.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/danet/danet_r101-d8_4xb2-40k_cityscapes-769x769.py b/configs/danet/danet_r101-d8_4xb2-40k_cityscapes-769x769.py new file mode 100644 index 0000000000..a08c18ee46 --- /dev/null +++ b/configs/danet/danet_r101-d8_4xb2-40k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './danet_r50-d8_4xb2-40k_cityscapes-769x769.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/danet/danet_r101-d8_4xb2-80k_cityscapes-512x1024.py b/configs/danet/danet_r101-d8_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..98b1c6490b --- /dev/null +++ b/configs/danet/danet_r101-d8_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ 
+_base_ = './danet_r50-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/danet/danet_r101-d8_4xb2-80k_cityscapes-769x769.py b/configs/danet/danet_r101-d8_4xb2-80k_cityscapes-769x769.py new file mode 100644 index 0000000000..9affe306cb --- /dev/null +++ b/configs/danet/danet_r101-d8_4xb2-80k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './danet_r50-d8_4xb2-80k_cityscapes-769x769.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/danet/danet_r101-d8_4xb4-160k_ade20k-512x512.py b/configs/danet/danet_r101-d8_4xb4-160k_ade20k-512x512.py new file mode 100644 index 0000000000..0079ad65e8 --- /dev/null +++ b/configs/danet/danet_r101-d8_4xb4-160k_ade20k-512x512.py @@ -0,0 +1,2 @@ +_base_ = './danet_r50-d8_4xb4-160k_ade20k-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/danet/danet_r101-d8_4xb4-20k_voc12aug-512x512.py b/configs/danet/danet_r101-d8_4xb4-20k_voc12aug-512x512.py new file mode 100644 index 0000000000..48444514b7 --- /dev/null +++ b/configs/danet/danet_r101-d8_4xb4-20k_voc12aug-512x512.py @@ -0,0 +1,2 @@ +_base_ = './danet_r50-d8_4xb4-20k_voc12aug-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/danet/danet_r101-d8_4xb4-40k_voc12aug-512x512.py b/configs/danet/danet_r101-d8_4xb4-40k_voc12aug-512x512.py new file mode 100644 index 0000000000..2f2df7a595 --- /dev/null +++ b/configs/danet/danet_r101-d8_4xb4-40k_voc12aug-512x512.py @@ -0,0 +1,2 @@ +_base_ = './danet_r50-d8_4xb4-40k_voc12aug-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/danet/danet_r101-d8_4xb4-80k_ade20k-512x512.py b/configs/danet/danet_r101-d8_4xb4-80k_ade20k-512x512.py new file mode 100644 index 0000000000..dd75bc16b8 --- /dev/null +++ b/configs/danet/danet_r101-d8_4xb4-80k_ade20k-512x512.py @@ -0,0 +1,2 @@ +_base_ = './danet_r50-d8_4xb4-80k_ade20k-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/danet/danet_r101-d8_512x1024_40k_cityscapes.py b/configs/danet/danet_r101-d8_512x1024_40k_cityscapes.py deleted file mode 100644 index 3bfb9bdb30..0000000000 --- a/configs/danet/danet_r101-d8_512x1024_40k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './danet_r50-d8_512x1024_40k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/danet/danet_r101-d8_512x1024_80k_cityscapes.py b/configs/danet/danet_r101-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index d80b2ec160..0000000000 --- a/configs/danet/danet_r101-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './danet_r50-d8_512x1024_80k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/danet/danet_r101-d8_512x512_160k_ade20k.py b/configs/danet/danet_r101-d8_512x512_160k_ade20k.py deleted file mode 100644 index 0f22d0fb63..0000000000 --- a/configs/danet/danet_r101-d8_512x512_160k_ade20k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './danet_r50-d8_512x512_160k_ade20k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/danet/danet_r101-d8_512x512_20k_voc12aug.py b/configs/danet/danet_r101-d8_512x512_20k_voc12aug.py deleted file mode 100644 index 709f93cba3..0000000000 
--- a/configs/danet/danet_r101-d8_512x512_20k_voc12aug.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './danet_r50-d8_512x512_20k_voc12aug.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/danet/danet_r101-d8_512x512_40k_voc12aug.py b/configs/danet/danet_r101-d8_512x512_40k_voc12aug.py deleted file mode 100644 index 5c623eb568..0000000000 --- a/configs/danet/danet_r101-d8_512x512_40k_voc12aug.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './danet_r50-d8_512x512_40k_voc12aug.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/danet/danet_r101-d8_512x512_80k_ade20k.py b/configs/danet/danet_r101-d8_512x512_80k_ade20k.py deleted file mode 100644 index bd31bc8f28..0000000000 --- a/configs/danet/danet_r101-d8_512x512_80k_ade20k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './danet_r50-d8_512x512_80k_ade20k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/danet/danet_r101-d8_769x769_40k_cityscapes.py b/configs/danet/danet_r101-d8_769x769_40k_cityscapes.py deleted file mode 100644 index 597d76de79..0000000000 --- a/configs/danet/danet_r101-d8_769x769_40k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './danet_r50-d8_769x769_40k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/danet/danet_r101-d8_769x769_80k_cityscapes.py b/configs/danet/danet_r101-d8_769x769_80k_cityscapes.py deleted file mode 100644 index 70f9b31966..0000000000 --- a/configs/danet/danet_r101-d8_769x769_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './danet_r50-d8_769x769_80k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/danet/danet_r50-d8_512x1024_40k_cityscapes.py b/configs/danet/danet_r50-d8_4xb2-40k_cityscapes-512x1024.py similarity index 100% rename from configs/danet/danet_r50-d8_512x1024_40k_cityscapes.py rename to configs/danet/danet_r50-d8_4xb2-40k_cityscapes-512x1024.py diff --git a/configs/danet/danet_r50-d8_769x769_40k_cityscapes.py b/configs/danet/danet_r50-d8_4xb2-40k_cityscapes-769x769.py similarity index 100% rename from configs/danet/danet_r50-d8_769x769_40k_cityscapes.py rename to configs/danet/danet_r50-d8_4xb2-40k_cityscapes-769x769.py diff --git a/configs/danet/danet_r50-d8_512x1024_80k_cityscapes.py b/configs/danet/danet_r50-d8_4xb2-80k_cityscapes-512x1024.py similarity index 100% rename from configs/danet/danet_r50-d8_512x1024_80k_cityscapes.py rename to configs/danet/danet_r50-d8_4xb2-80k_cityscapes-512x1024.py diff --git a/configs/danet/danet_r50-d8_769x769_80k_cityscapes.py b/configs/danet/danet_r50-d8_4xb2-80k_cityscapes-769x769.py similarity index 100% rename from configs/danet/danet_r50-d8_769x769_80k_cityscapes.py rename to configs/danet/danet_r50-d8_4xb2-80k_cityscapes-769x769.py diff --git a/configs/danet/danet_r50-d8_512x512_160k_ade20k.py b/configs/danet/danet_r50-d8_4xb4-160k_ade20k-512x512.py similarity index 100% rename from configs/danet/danet_r50-d8_512x512_160k_ade20k.py rename to configs/danet/danet_r50-d8_4xb4-160k_ade20k-512x512.py diff --git a/configs/danet/danet_r50-d8_512x512_20k_voc12aug.py b/configs/danet/danet_r50-d8_4xb4-20k_voc12aug-512x512.py similarity index 100% rename from configs/danet/danet_r50-d8_512x512_20k_voc12aug.py rename to configs/danet/danet_r50-d8_4xb4-20k_voc12aug-512x512.py diff --git a/configs/danet/danet_r50-d8_512x512_40k_voc12aug.py 
b/configs/danet/danet_r50-d8_4xb4-40k_voc12aug-512x512.py similarity index 100% rename from configs/danet/danet_r50-d8_512x512_40k_voc12aug.py rename to configs/danet/danet_r50-d8_4xb4-40k_voc12aug-512x512.py diff --git a/configs/danet/danet_r50-d8_512x512_80k_ade20k.py b/configs/danet/danet_r50-d8_4xb4-80k_ade20k-512x512.py similarity index 100% rename from configs/danet/danet_r50-d8_512x512_80k_ade20k.py rename to configs/danet/danet_r50-d8_4xb4-80k_ade20k-512x512.py diff --git a/configs/danet/metafile.yaml b/configs/danet/metafile.yaml new file mode 100644 index 0000000000..daff925baf --- /dev/null +++ b/configs/danet/metafile.yaml @@ -0,0 +1,387 @@ +Collections: +- Name: DANet + License: Apache License 2.0 + Metadata: + Training Data: + - Cityscapes + - ADE20K + - Pascal VOC 2012 + Aug + Paper: + Title: Dual Attention Network for Scene Segmentation + URL: https://arxiv.org/abs/1809.02983 + README: configs/danet/README.md + Frameworks: + - PyTorch +Models: +- Name: danet_r50-d8_4xb2-40k_cityscapes-512x1024 + In Collection: DANet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.74 + Config: configs/danet/danet_r50-d8_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - DANet + Training Resources: 4x V100 GPUS + Memory (GB): 7.4 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x1024_40k_cityscapes/danet_r50-d8_512x1024_40k_cityscapes_20200605_191324-c0dbfa5f.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x1024_40k_cityscapes/danet_r50-d8_512x1024_40k_cityscapes_20200605_191324.log.json + Paper: + Title: Dual Attention Network for Scene Segmentation + URL: https://arxiv.org/abs/1809.02983 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/da_head.py#L76 + Framework: PyTorch +- Name: danet_r101-d8_4xb2-40k_cityscapes-512x1024 + In Collection: DANet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 80.52 + Config: configs/danet/danet_r101-d8_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - DANet + Training Resources: 4x V100 GPUS + Memory (GB): 10.9 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x1024_40k_cityscapes/danet_r101-d8_512x1024_40k_cityscapes_20200605_200831-c57a7157.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x1024_40k_cityscapes/danet_r101-d8_512x1024_40k_cityscapes_20200605_200831.log.json + Paper: + Title: Dual Attention Network for Scene Segmentation + URL: https://arxiv.org/abs/1809.02983 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/da_head.py#L76 + Framework: PyTorch +- Name: danet_r50-d8_4xb2-40k_cityscapes-769x769 + In Collection: DANet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.88 + mIoU(ms+flip): 80.62 + Config: configs/danet/danet_r50-d8_4xb2-40k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - DANet + Training Resources: 4x V100 GPUS + Memory (GB): 8.8 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_769x769_40k_cityscapes/danet_r50-d8_769x769_40k_cityscapes_20200530_025703-76681c60.pth + Training log: 
https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_769x769_40k_cityscapes/danet_r50-d8_769x769_40k_cityscapes_20200530_025703.log.json + Paper: + Title: Dual Attention Network for Scene Segmentation + URL: https://arxiv.org/abs/1809.02983 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/da_head.py#L76 + Framework: PyTorch +- Name: danet_r101-d8_4xb2-40k_cityscapes-769x769 + In Collection: DANet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.88 + mIoU(ms+flip): 81.47 + Config: configs/danet/danet_r101-d8_4xb2-40k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - DANet + Training Resources: 4x V100 GPUS + Memory (GB): 12.8 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_769x769_40k_cityscapes/danet_r101-d8_769x769_40k_cityscapes_20200530_025717-dcb7fd4e.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_769x769_40k_cityscapes/danet_r101-d8_769x769_40k_cityscapes_20200530_025717.log.json + Paper: + Title: Dual Attention Network for Scene Segmentation + URL: https://arxiv.org/abs/1809.02983 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/da_head.py#L76 + Framework: PyTorch +- Name: danet_r50-d8_4xb2-80k_cityscapes-512x1024 + In Collection: DANet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.34 + Config: configs/danet/danet_r50-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - DANet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x1024_80k_cityscapes/danet_r50-d8_512x1024_80k_cityscapes_20200607_133029-2bfa2293.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x1024_80k_cityscapes/danet_r50-d8_512x1024_80k_cityscapes_20200607_133029.log.json + Paper: + Title: Dual Attention Network for Scene Segmentation + URL: https://arxiv.org/abs/1809.02983 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/da_head.py#L76 + Framework: PyTorch +- Name: danet_r101-d8_4xb2-80k_cityscapes-512x1024 + In Collection: DANet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 80.41 + Config: configs/danet/danet_r101-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - DANet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x1024_80k_cityscapes/danet_r101-d8_512x1024_80k_cityscapes_20200607_132918-955e6350.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x1024_80k_cityscapes/danet_r101-d8_512x1024_80k_cityscapes_20200607_132918.log.json + Paper: + Title: Dual Attention Network for Scene Segmentation + URL: https://arxiv.org/abs/1809.02983 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/da_head.py#L76 + Framework: PyTorch +- Name: danet_r50-d8_4xb2-80k_cityscapes-769x769 + In Collection: DANet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.27 + mIoU(ms+flip): 80.96 + Config: configs/danet/danet_r50-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + 
Architecture: + - R-50-D8 + - DANet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_769x769_80k_cityscapes/danet_r50-d8_769x769_80k_cityscapes_20200607_132954-495689b4.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_769x769_80k_cityscapes/danet_r50-d8_769x769_80k_cityscapes_20200607_132954.log.json + Paper: + Title: Dual Attention Network for Scene Segmentation + URL: https://arxiv.org/abs/1809.02983 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/da_head.py#L76 + Framework: PyTorch +- Name: danet_r101-d8_4xb2-80k_cityscapes-769x769 + In Collection: DANet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 80.47 + mIoU(ms+flip): 82.02 + Config: configs/danet/danet_r101-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - DANet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_769x769_80k_cityscapes/danet_r101-d8_769x769_80k_cityscapes_20200607_132918-f3a929e7.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_769x769_80k_cityscapes/danet_r101-d8_769x769_80k_cityscapes_20200607_132918.log.json + Paper: + Title: Dual Attention Network for Scene Segmentation + URL: https://arxiv.org/abs/1809.02983 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/da_head.py#L76 + Framework: PyTorch +- Name: danet_r50-d8_4xb4-80k_ade20k-512x512 + In Collection: DANet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 41.66 + mIoU(ms+flip): 42.9 + Config: configs/danet/danet_r50-d8_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D8 + - DANet + Training Resources: 4x V100 GPUS + Memory (GB): 11.5 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x512_80k_ade20k/danet_r50-d8_512x512_80k_ade20k_20200615_015125-edb18e08.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x512_80k_ade20k/danet_r50-d8_512x512_80k_ade20k_20200615_015125.log.json + Paper: + Title: Dual Attention Network for Scene Segmentation + URL: https://arxiv.org/abs/1809.02983 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/da_head.py#L76 + Framework: PyTorch +- Name: danet_r101-d8_4xb4-80k_ade20k-512x512 + In Collection: DANet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 43.64 + mIoU(ms+flip): 45.19 + Config: configs/danet/danet_r101-d8_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-101-D8 + - DANet + Training Resources: 4x V100 GPUS + Memory (GB): 15.0 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x512_80k_ade20k/danet_r101-d8_512x512_80k_ade20k_20200615_015126-d0357c73.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x512_80k_ade20k/danet_r101-d8_512x512_80k_ade20k_20200615_015126.log.json + Paper: + Title: Dual Attention Network for Scene Segmentation + URL: https://arxiv.org/abs/1809.02983 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/da_head.py#L76 + Framework: PyTorch +- Name: danet_r50-d8_4xb4-160k_ade20k-512x512 + In Collection: 
DANet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 42.45 + mIoU(ms+flip): 43.25 + Config: configs/danet/danet_r50-d8_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D8 + - DANet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x512_160k_ade20k/danet_r50-d8_512x512_160k_ade20k_20200616_082340-9cb35dcd.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x512_160k_ade20k/danet_r50-d8_512x512_160k_ade20k_20200616_082340.log.json + Paper: + Title: Dual Attention Network for Scene Segmentation + URL: https://arxiv.org/abs/1809.02983 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/da_head.py#L76 + Framework: PyTorch +- Name: danet_r101-d8_4xb4-160k_ade20k-512x512 + In Collection: DANet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 44.17 + mIoU(ms+flip): 45.02 + Config: configs/danet/danet_r101-d8_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-101-D8 + - DANet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x512_160k_ade20k/danet_r101-d8_512x512_160k_ade20k_20200616_082348-23bf12f9.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x512_160k_ade20k/danet_r101-d8_512x512_160k_ade20k_20200616_082348.log.json + Paper: + Title: Dual Attention Network for Scene Segmentation + URL: https://arxiv.org/abs/1809.02983 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/da_head.py#L76 + Framework: PyTorch +- Name: danet_r50-d8_4xb4-20k_voc12aug-512x512 + In Collection: DANet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 74.45 + mIoU(ms+flip): 75.69 + Config: configs/danet/danet_r50-d8_4xb4-20k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-50-D8 + - DANet + Training Resources: 4x V100 GPUS + Memory (GB): 6.5 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x512_20k_voc12aug/danet_r50-d8_512x512_20k_voc12aug_20200618_070026-9e9e3ab3.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x512_20k_voc12aug/danet_r50-d8_512x512_20k_voc12aug_20200618_070026.log.json + Paper: + Title: Dual Attention Network for Scene Segmentation + URL: https://arxiv.org/abs/1809.02983 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/da_head.py#L76 + Framework: PyTorch +- Name: danet_r101-d8_4xb4-20k_voc12aug-512x512 + In Collection: DANet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 76.02 + mIoU(ms+flip): 77.23 + Config: configs/danet/danet_r101-d8_4xb4-20k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-101-D8 + - DANet + Training Resources: 4x V100 GPUS + Memory (GB): 9.9 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x512_20k_voc12aug/danet_r101-d8_512x512_20k_voc12aug_20200618_070026-d48d23b2.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x512_20k_voc12aug/danet_r101-d8_512x512_20k_voc12aug_20200618_070026.log.json + Paper: + Title: 
Dual Attention Network for Scene Segmentation + URL: https://arxiv.org/abs/1809.02983 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/da_head.py#L76 + Framework: PyTorch +- Name: danet_r50-d8_4xb4-40k_voc12aug-512x512 + In Collection: DANet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 76.37 + mIoU(ms+flip): 77.29 + Config: configs/danet/danet_r50-d8_4xb4-40k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-50-D8 + - DANet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x512_40k_voc12aug/danet_r50-d8_512x512_40k_voc12aug_20200613_235526-426e3a64.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r50-d8_512x512_40k_voc12aug/danet_r50-d8_512x512_40k_voc12aug_20200613_235526.log.json + Paper: + Title: Dual Attention Network for Scene Segmentation + URL: https://arxiv.org/abs/1809.02983 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/da_head.py#L76 + Framework: PyTorch +- Name: danet_r101-d8_4xb4-40k_voc12aug-512x512 + In Collection: DANet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 76.51 + mIoU(ms+flip): 77.32 + Config: configs/danet/danet_r101-d8_4xb4-40k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-101-D8 + - DANet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x512_40k_voc12aug/danet_r101-d8_512x512_40k_voc12aug_20200613_223031-788e232a.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/danet/danet_r101-d8_512x512_40k_voc12aug/danet_r101-d8_512x512_40k_voc12aug_20200613_223031.log.json + Paper: + Title: Dual Attention Network for Scene Segmentation + URL: https://arxiv.org/abs/1809.02983 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/da_head.py#L76 + Framework: PyTorch diff --git a/configs/ddrnet/README.md b/configs/ddrnet/README.md new file mode 100644 index 0000000000..ccbfcdff35 --- /dev/null +++ b/configs/ddrnet/README.md @@ -0,0 +1,46 @@ +# DDRNet + +> [Deep Dual-resolution Networks for Real-time and Accurate Semantic Segmentation of Road Scenes](http://arxiv.org/abs/2101.06085) + +## Introduction + + + +Official Repo + +## Abstract + + + +Semantic segmentation is a key technology for autonomous vehicles to understand the surrounding scenes. The appealing performances of contemporary models usually come at the expense of heavy computations and lengthy inference time, which is intolerable for self-driving. Using light-weight architectures (encoder-decoder or two-pathway) or reasoning on low-resolution images, recent methods realize very fast scene parsing, even running at more than 100 FPS on a single 1080Ti GPU. However, there is still a significant gap in performance between these real-time methods and the models based on dilation backbones. To tackle this problem, we proposed a family of efficient backbones specially designed for real-time semantic segmentation. The proposed deep dual-resolution networks (DDRNets) are composed of two deep branches between which multiple bilateral fusions are performed. 
Additionally, we design a new contextual information extractor named Deep Aggregation Pyramid Pooling Module (DAPPM) to enlarge effective receptive fields and fuse multi-scale context based on low-resolution feature maps. Our method achieves a new state-of-the-art trade-off between accuracy and speed on both the Cityscapes and CamVid datasets. In particular, on a single 2080Ti GPU, DDRNet-23-slim yields 77.4% mIoU at 102 FPS on the Cityscapes test set and 74.7% mIoU at 230 FPS on the CamVid test set. With widely used test augmentation, our method is superior to most state-of-the-art models and requires much less computation. Codes and trained models are available online. + + +
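The abstract's two key ingredients are the bilateral fusion between the dual-resolution branches and the DAPPM context extractor. For intuition, here is a minimal PyTorch sketch of the DAPPM idea only — pool the low-resolution feature map at several scales, project each branch, upsample, and fuse. The class name, pooling scales, and channel widths are illustrative and do not match the implementation in `mmseg/models`:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F


class DAPPMSketch(nn.Module):
    """Illustrative multi-scale context module in the spirit of DAPPM."""

    def __init__(self, in_channels, branch_channels, out_channels):
        super().__init__()
        # Progressively larger pooling windows over the low-res feature map.
        self.branches = nn.ModuleList(
            nn.Sequential(
                nn.AvgPool2d(kernel_size=k, stride=s, padding=k // 2),
                nn.Conv2d(in_channels, branch_channels, 1, bias=False),
                nn.BatchNorm2d(branch_channels),
                nn.ReLU(inplace=True),
            ) for k, s in [(5, 2), (9, 4), (17, 8)])
        self.identity = nn.Sequential(
            nn.Conv2d(in_channels, branch_channels, 1, bias=False),
            nn.BatchNorm2d(branch_channels),
            nn.ReLU(inplace=True))
        self.fuse = nn.Conv2d(branch_channels, out_channels, 1, bias=False)

    def forward(self, x):
        out = self.identity(x)
        for branch in self.branches:
            # Each pooled branch is upsampled back and fused by summation.
            y = F.interpolate(
                branch(x), size=x.shape[2:], mode='bilinear',
                align_corners=False)
            out = out + y
        return self.fuse(out)


feats = torch.randn(1, 256, 32, 64)  # low-resolution backbone features
print(DAPPMSketch(256, 128, 256)(feats).shape)  # torch.Size([1, 256, 32, 64])
```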
+ +## Results and models + +### Cityscapes + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | ------------- | --------- | ------- | -------- | -------------- | ------ | ----- | ------------- | ----------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| DDRNet | DDRNet23-slim | 1024x1024 | 120000 | 1.70 | 85.85 | A100 | 77.84 | 80.15 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ddrnet/ddrnet_23-slim_in1k-pre_2xb6-120k_cityscapes-1024x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ddrnet/ddrnet_23-slim_in1k-pre_2xb6-120k_cityscapes-1024x1024/ddrnet_23-slim_in1k-pre_2xb6-120k_cityscapes-1024x1024_20230426_145312-6a5e5174.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ddrnet/ddrnet_23-slim_in1k-pre_2xb6-120k_cityscapes-1024x1024/ddrnet_23-slim_in1k-pre_2xb6-120k_cityscapes-1024x1024_20230426_145312.json) | +| DDRNet | DDRNet23 | 1024x1024 | 120000 | 7.26 | 33.41 | A100 | 79.99 | 81.71 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ddrnet/ddrnet_23_in1k-pre_2xb6-120k_cityscapes-1024x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ddrnet/ddrnet_23_in1k-pre_2xb6-120k_cityscapes-1024x1024/ddrnet_23_in1k-pre_2xb6-120k_cityscapes-1024x1024_20230425_162633-81601db0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ddrnet/ddrnet_23_in1k-pre_2xb6-120k_cityscapes-1024x1024/ddrnet_23_in1k-pre_2xb6-120k_cityscapes-1024x1024_20230425_162633.json) | + +## Notes + +The pretrained weights in config files are converted from [the official repo](https://github.com/ydhongHIT/DDRNet#pretrained-models). 
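Because these converted weights come from a third-party repository, it can be worth inspecting a checkpoint before training. A quick sketch, using the checkpoint URL that the DDRNet-23-slim config below references; unwrapping a possible `state_dict` key is a defensive assumption, not a documented property of the file:

```python
import torch

# Checkpoint URL as referenced by the ddrnet_23-slim config in this repo.
ckpt_url = ('https://download.openmmlab.com/mmsegmentation/v0.5/ddrnet/'
            'pretrain/ddrnet23s-in1kpre_3rdparty-1ccac5b1.pth')

state = torch.hub.load_state_dict_from_url(ckpt_url, map_location='cpu')
if isinstance(state, dict) and 'state_dict' in state:
    state = state['state_dict']  # unwrap only if the file carries metadata
print(len(state), 'tensors; first key:', next(iter(state)))
```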
+ +## Citation + +```bibtex +@article{pan2022deep, + title={Deep Dual-Resolution Networks for Real-Time and Accurate Semantic Segmentation of Traffic Scenes}, + author={Pan, Huihui and Hong, Yuanduo and Sun, Weichao and Jia, Yisong}, + journal={IEEE Transactions on Intelligent Transportation Systems}, + year={2022}, + publisher={IEEE} +} +``` diff --git a/configs/ddrnet/ddrnet_23-slim_in1k-pre_2xb6-120k_cityscapes-1024x1024.py b/configs/ddrnet/ddrnet_23-slim_in1k-pre_2xb6-120k_cityscapes-1024x1024.py new file mode 100644 index 0000000000..65b0ead547 --- /dev/null +++ b/configs/ddrnet/ddrnet_23-slim_in1k-pre_2xb6-120k_cityscapes-1024x1024.py @@ -0,0 +1,93 @@ +_base_ = [ + '../_base_/datasets/cityscapes_1024x1024.py', + '../_base_/default_runtime.py', +] + +# The class_weight is borrowed from https://github.com/openseg-group/OCNet.pytorch/issues/14 # noqa +# Licensed under the MIT License +class_weight = [ + 0.8373, 0.918, 0.866, 1.0345, 1.0166, 0.9969, 0.9754, 1.0489, 0.8786, + 1.0023, 0.9539, 0.9843, 1.1116, 0.9037, 1.0865, 1.0955, 1.0865, 1.1529, + 1.0507 +] +checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/ddrnet/pretrain/ddrnet23s-in1kpre_3rdparty-1ccac5b1.pth' # noqa +crop_size = (1024, 1024) +data_preprocessor = dict( + type='SegDataPreProcessor', + size=crop_size, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255) +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + data_preprocessor=data_preprocessor, + backbone=dict( + type='DDRNet', + in_channels=3, + channels=32, + ppm_channels=128, + norm_cfg=norm_cfg, + align_corners=False, + init_cfg=dict(type='Pretrained', checkpoint=checkpoint)), + decode_head=dict( + type='DDRHead', + in_channels=32 * 4, + channels=64, + dropout_ratio=0., + num_classes=19, + align_corners=False, + norm_cfg=norm_cfg, + loss_decode=[ + dict( + type='OhemCrossEntropy', + thres=0.9, + min_kept=131072, + class_weight=class_weight, + loss_weight=1.0), + dict( + type='OhemCrossEntropy', + thres=0.9, + min_kept=131072, + class_weight=class_weight, + loss_weight=0.4), + ]), + + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) + +train_dataloader = dict(batch_size=6, num_workers=4) + +iters = 120000 +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) +optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer, clip_grad=None) +# learning policy +param_scheduler = [ + dict( + type='PolyLR', + eta_min=0, + power=0.9, + begin=0, + end=iters, + by_epoch=False) +] + +# training schedule for 120k +train_cfg = dict( + type='IterBasedTrainLoop', max_iters=iters, val_interval=iters // 10) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict( + type='CheckpointHook', by_epoch=False, interval=iters // 10), + sampler_seed=dict(type='DistSamplerSeedHook'), + visualization=dict(type='SegVisualizationHook')) + +randomness = dict(seed=304) diff --git a/configs/ddrnet/ddrnet_23_in1k-pre_2xb6-120k_cityscapes-1024x1024.py b/configs/ddrnet/ddrnet_23_in1k-pre_2xb6-120k_cityscapes-1024x1024.py new file mode 100644 index 0000000000..444efe2b88 --- /dev/null +++ b/configs/ddrnet/ddrnet_23_in1k-pre_2xb6-120k_cityscapes-1024x1024.py @@ -0,0 +1,93 @@ +_base_ = [ + 
'../_base_/datasets/cityscapes_1024x1024.py', + '../_base_/default_runtime.py', +] + +# The class_weight is borrowed from https://github.com/openseg-group/OCNet.pytorch/issues/14 # noqa +# Licensed under the MIT License +class_weight = [ + 0.8373, 0.918, 0.866, 1.0345, 1.0166, 0.9969, 0.9754, 1.0489, 0.8786, + 1.0023, 0.9539, 0.9843, 1.1116, 0.9037, 1.0865, 1.0955, 1.0865, 1.1529, + 1.0507 +] +checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/ddrnet/pretrain/ddrnet23-in1kpre_3rdparty-9ca29f62.pth' # noqa +crop_size = (1024, 1024) +data_preprocessor = dict( + type='SegDataPreProcessor', + size=crop_size, + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255) +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + data_preprocessor=data_preprocessor, + backbone=dict( + type='DDRNet', + in_channels=3, + channels=64, + ppm_channels=128, + norm_cfg=norm_cfg, + align_corners=False, + init_cfg=dict(type='Pretrained', checkpoint=checkpoint)), + decode_head=dict( + type='DDRHead', + in_channels=64 * 4, + channels=128, + dropout_ratio=0., + num_classes=19, + align_corners=False, + norm_cfg=norm_cfg, + loss_decode=[ + dict( + type='OhemCrossEntropy', + thres=0.9, + min_kept=131072, + class_weight=class_weight, + loss_weight=1.0), + dict( + type='OhemCrossEntropy', + thres=0.9, + min_kept=131072, + class_weight=class_weight, + loss_weight=0.4), + ]), + + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) + +train_dataloader = dict(batch_size=6, num_workers=4) + +iters = 120000 +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) +optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer, clip_grad=None) +# learning policy +param_scheduler = [ + dict( + type='PolyLR', + eta_min=0, + power=0.9, + begin=0, + end=iters, + by_epoch=False) +] + +# training schedule for 120k +train_cfg = dict( + type='IterBasedTrainLoop', max_iters=iters, val_interval=iters // 10) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict( + type='CheckpointHook', by_epoch=False, interval=iters // 10), + sampler_seed=dict(type='DistSamplerSeedHook'), + visualization=dict(type='SegVisualizationHook')) + +randomness = dict(seed=304) diff --git a/configs/ddrnet/metafile.yaml b/configs/ddrnet/metafile.yaml new file mode 100644 index 0000000000..07074702c2 --- /dev/null +++ b/configs/ddrnet/metafile.yaml @@ -0,0 +1,64 @@ +Collections: +- Name: DDRNet + License: Apache License 2.0 + Metadata: + Training Data: + - Cityscapes + Paper: + Title: Deep Dual-resolution Networks for Real-time and Accurate Semantic Segmentation + of Road Scenes + URL: http://arxiv.org/abs/2101.06085 + README: configs/ddrnet/README.md + Frameworks: + - PyTorch +Models: +- Name: ddrnet_23-slim_in1k-pre_2xb6-120k_cityscapes-1024x1024 + In Collection: DDRNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 77.84 + mIoU(ms+flip): 80.15 + Config: configs/ddrnet/ddrnet_23-slim_in1k-pre_2xb6-120k_cityscapes-1024x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 12 + Architecture: + - DDRNet23-slim + - DDRNet + Training Resources: 2x A100 GPUS + Memory (GB): 1.7 + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/ddrnet/ddrnet_23-slim_in1k-pre_2xb6-120k_cityscapes-1024x1024/ddrnet_23-slim_in1k-pre_2xb6-120k_cityscapes-1024x1024_20230426_145312-6a5e5174.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ddrnet/ddrnet_23-slim_in1k-pre_2xb6-120k_cityscapes-1024x1024/ddrnet_23-slim_in1k-pre_2xb6-120k_cityscapes-1024x1024_20230426_145312.json + Paper: + Title: Deep Dual-resolution Networks for Real-time and Accurate Semantic Segmentation + of Road Scenes + URL: http://arxiv.org/abs/2101.06085 + Code: '' + Framework: PyTorch +- Name: ddrnet_23_in1k-pre_2xb6-120k_cityscapes-1024x1024 + In Collection: DDRNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.99 + mIoU(ms+flip): 81.71 + Config: configs/ddrnet/ddrnet_23_in1k-pre_2xb6-120k_cityscapes-1024x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 12 + Architecture: + - DDRNet23 + - DDRNet + Training Resources: 2x A100 GPUS + Memory (GB): 7.26 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ddrnet/ddrnet_23_in1k-pre_2xb6-120k_cityscapes-1024x1024/ddrnet_23_in1k-pre_2xb6-120k_cityscapes-1024x1024_20230425_162633-81601db0.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ddrnet/ddrnet_23_in1k-pre_2xb6-120k_cityscapes-1024x1024/ddrnet_23_in1k-pre_2xb6-120k_cityscapes-1024x1024_20230425_162633.json + Paper: + Title: Deep Dual-resolution Networks for Real-time and Accurate Semantic Segmentation + of Road Scenes + URL: http://arxiv.org/abs/2101.06085 + Code: '' + Framework: PyTorch diff --git a/configs/deeplabv3/README.md b/configs/deeplabv3/README.md index 49856607b1..df50b7f90a 100644 --- a/configs/deeplabv3/README.md +++ b/configs/deeplabv3/README.md @@ -1,6 +1,6 @@ # DeepLabV3 -[Rethinking atrous convolution for semantic image segmentation](https://arxiv.org/abs/1706.05587) +> [Rethinking atrous convolution for semantic image segmentation](https://arxiv.org/abs/1706.05587) ## Introduction @@ -18,100 +18,101 @@ In this work, we revisit atrous convolution, a powerful tool to explicitly adjus -
+[figure: DEEPLABv3_ResNet-D8 model structure]
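As background for the tables below: atrous convolution enlarges a filter's field-of-view at unchanged output resolution and parameter count by spacing the kernel taps. A toy PyTorch check with arbitrary shapes, unrelated to any config in this directory:

```python
import torch
import torch.nn as nn

x = torch.randn(1, 64, 65, 65)

# A standard 3x3 conv and an atrous 3x3 conv (dilation=2) have the same
# number of weights, but the dilated one covers a 5x5 neighbourhood.
conv = nn.Conv2d(64, 64, kernel_size=3, padding=1)
atrous = nn.Conv2d(64, 64, kernel_size=3, padding=2, dilation=2)

print(conv(x).shape, atrous(x).shape)  # both torch.Size([1, 64, 65, 65])
```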
-## Citation - -```bibtext -@article{chen2017rethinking, - title={Rethinking atrous convolution for semantic image segmentation}, - author={Chen, Liang-Chieh and Papandreou, George and Schroff, Florian and Adam, Hartwig}, - journal={arXiv preprint arXiv:1706.05587}, - year={2017} -} -``` - ## Results and models ### Cityscapes -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ---------------- | --------------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ---------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| DeepLabV3 | R-50-D8 | 512x1024 | 40000 | 6.1 | 2.57 | 79.09 | 80.45 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3/deeplabv3_r50-d8_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x1024_40k_cityscapes/deeplabv3_r50-d8_512x1024_40k_cityscapes_20200605_022449-acadc2f8.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x1024_40k_cityscapes/deeplabv3_r50-d8_512x1024_40k_cityscapes_20200605_022449.log.json) | -| DeepLabV3 | R-101-D8 | 512x1024 | 40000 | 9.6 | 1.92 | 77.12 | 79.61 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3/deeplabv3_r101-d8_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x1024_40k_cityscapes/deeplabv3_r101-d8_512x1024_40k_cityscapes_20200605_012241-7fd3f799.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x1024_40k_cityscapes/deeplabv3_r101-d8_512x1024_40k_cityscapes_20200605_012241.log.json) | -| DeepLabV3 | R-50-D8 | 769x769 | 40000 | 6.9 | 1.11 | 78.58 | 79.89 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3/deeplabv3_r50-d8_769x769_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_769x769_40k_cityscapes/deeplabv3_r50-d8_769x769_40k_cityscapes_20200606_113723-7eda553c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_769x769_40k_cityscapes/deeplabv3_r50-d8_769x769_40k_cityscapes_20200606_113723.log.json) | -| DeepLabV3 | R-101-D8 | 769x769 | 40000 | 10.9 | 0.83 | 79.27 | 80.11 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3/deeplabv3_r101-d8_769x769_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_769x769_40k_cityscapes/deeplabv3_r101-d8_769x769_40k_cityscapes_20200606_113809-c64f889f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_769x769_40k_cityscapes/deeplabv3_r101-d8_769x769_40k_cityscapes_20200606_113809.log.json) | -| DeepLabV3 | R-18-D8 | 512x1024 | 80000 | 1.7 | 13.78 | 76.70 | 78.27 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3/deeplabv3_r18-d8_512x1024_80k_cityscapes.py) 
| [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r18-d8_512x1024_80k_cityscapes/deeplabv3_r18-d8_512x1024_80k_cityscapes_20201225_021506-23dffbe2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r18-d8_512x1024_80k_cityscapes/deeplabv3_r18-d8_512x1024_80k_cityscapes-20201225_021506.log.json) | -| DeepLabV3 | R-50-D8 | 512x1024 | 80000 | - | - | 79.32 | 80.57 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3/deeplabv3_r50-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x1024_80k_cityscapes/deeplabv3_r50-d8_512x1024_80k_cityscapes_20200606_113404-b92cfdd4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x1024_80k_cityscapes/deeplabv3_r50-d8_512x1024_80k_cityscapes_20200606_113404.log.json) | -| DeepLabV3 | R-101-D8 | 512x1024 | 80000 | - | - | 80.20 | 81.21 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3/deeplabv3_r101-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x1024_80k_cityscapes/deeplabv3_r101-d8_512x1024_80k_cityscapes_20200606_113503-9e428899.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x1024_80k_cityscapes/deeplabv3_r101-d8_512x1024_80k_cityscapes_20200606_113503.log.json) | -| DeepLabV3 (FP16) | R-101-D8 | 512x1024 | 80000 | 5.75 | 3.86 | 80.48 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3/deeplabv3_r101-d8_fp16_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_fp16_512x1024_80k_cityscapes/deeplabv3_r101-d8_fp16_512x1024_80k_cityscapes_20200717_230920-774d9cec.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_fp16_512x1024_80k_cityscapes/deeplabv3_r101-d8_fp16_512x1024_80k_cityscapes_20200717_230920.log.json) | -| DeepLabV3 | R-18-D8 | 769x769 | 80000 | 1.9 | 5.55 | 76.60 | 78.26 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3/deeplabv3_r18-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r18-d8_769x769_80k_cityscapes/deeplabv3_r18-d8_769x769_80k_cityscapes_20201225_021506-6452126a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r18-d8_769x769_80k_cityscapes/deeplabv3_r18-d8_769x769_80k_cityscapes-20201225_021506.log.json) | -| DeepLabV3 | R-50-D8 | 769x769 | 80000 | - | - | 79.89 | 81.06 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3/deeplabv3_r50-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_769x769_80k_cityscapes/deeplabv3_r50-d8_769x769_80k_cityscapes_20200606_221338-788d6228.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_769x769_80k_cityscapes/deeplabv3_r50-d8_769x769_80k_cityscapes_20200606_221338.log.json) | -| DeepLabV3 | R-101-D8 | 769x769 | 80000 | - | - | 79.67 | 80.81 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3/deeplabv3_r101-d8_769x769_80k_cityscapes.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_769x769_80k_cityscapes/deeplabv3_r101-d8_769x769_80k_cityscapes_20200607_013353-60e95418.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_769x769_80k_cityscapes/deeplabv3_r101-d8_769x769_80k_cityscapes_20200607_013353.log.json) | -| DeepLabV3 | R-101-D16-MG124 | 512x1024 | 40000 | 4.7 | - 6.96 | 76.71 | 78.63 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3/deeplabv3_r101-d16-mg124_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d16-mg124_512x1024_40k_cityscapes/deeplabv3_r101-d16-mg124_512x1024_40k_cityscapes_20200908_005644-67b0c992.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d16-mg124_512x1024_40k_cityscapes/deeplabv3_r101-d16-mg124_512x1024_40k_cityscapes-20200908_005644.log.json) | -| DeepLabV3 | R-101-D16-MG124 | 512x1024 | 80000 | - | - | 78.36 | 79.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3/deeplabv3_r101-d16-mg124_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d16-mg124_512x1024_80k_cityscapes/deeplabv3_r101-d16-mg124_512x1024_80k_cityscapes_20200908_005644-57bb8425.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d16-mg124_512x1024_80k_cityscapes/deeplabv3_r101-d16-mg124_512x1024_80k_cityscapes-20200908_005644.log.json) | -| DeepLabV3 | R-18b-D8 | 512x1024 | 80000 | 1.6 | 13.93 | 76.26 | 77.88 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3/deeplabv3_r18b-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r18b-d8_512x1024_80k_cityscapes/deeplabv3_r18b-d8_512x1024_80k_cityscapes_20201225_094144-46040cef.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r18b-d8_512x1024_80k_cityscapes/deeplabv3_r18b-d8_512x1024_80k_cityscapes-20201225_094144.log.json) | -| DeepLabV3 | R-50b-D8 | 512x1024 | 80000 | 6.0 | 2.74 | 79.63 | 80.98 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3/deeplabv3_r50b-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50b-d8_512x1024_80k_cityscapes/deeplabv3_r50b-d8_512x1024_80k_cityscapes_20201225_155148-ec368954.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50b-d8_512x1024_80k_cityscapes/deeplabv3_r50b-d8_512x1024_80k_cityscapes-20201225_155148.log.json) | -| DeepLabV3 | R-101b-D8 | 512x1024 | 80000 | 9.5 | 1.81 | 80.01 | 81.21 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3/deeplabv3_r101b-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101b-d8_512x1024_80k_cityscapes/deeplabv3_r101b-d8_512x1024_80k_cityscapes_20201226_171821-8fd49503.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101b-d8_512x1024_80k_cityscapes/deeplabv3_r101b-d8_512x1024_80k_cityscapes-20201226_171821.log.json) | -| DeepLabV3 | R-18b-D8 | 769x769 | 80000 | 1.8 | 5.79 | 76.63 | 77.51 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3/deeplabv3_r18b-d8_769x769_80k_cityscapes.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r18b-d8_769x769_80k_cityscapes/deeplabv3_r18b-d8_769x769_80k_cityscapes_20201225_094144-fdc985d9.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r18b-d8_769x769_80k_cityscapes/deeplabv3_r18b-d8_769x769_80k_cityscapes-20201225_094144.log.json) | -| DeepLabV3 | R-50b-D8 | 769x769 | 80000 | 6.8 | 1.16 | 78.80 | 80.27 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3/deeplabv3_r50b-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50b-d8_769x769_80k_cityscapes/deeplabv3_r50b-d8_769x769_80k_cityscapes_20201225_155404-87fb0cf4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50b-d8_769x769_80k_cityscapes/deeplabv3_r50b-d8_769x769_80k_cityscapes-20201225_155404.log.json) | -| DeepLabV3 | R-101b-D8 | 769x769 | 80000 | 10.7 | 0.82 | 79.41 | 80.73 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3/deeplabv3_r101b-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101b-d8_769x769_80k_cityscapes/deeplabv3_r101b-d8_769x769_80k_cityscapes_20201226_190843-9142ee57.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101b-d8_769x769_80k_cityscapes/deeplabv3_r101b-d8_769x769_80k_cityscapes-20201226_190843.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ---------------- | --------------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| DeepLabV3 | R-50-D8 | 512x1024 | 40000 | 6.1 | 2.57 | V100 | 79.09 | 80.45 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r50-d8_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x1024_40k_cityscapes/deeplabv3_r50-d8_512x1024_40k_cityscapes_20200605_022449-acadc2f8.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x1024_40k_cityscapes/deeplabv3_r50-d8_512x1024_40k_cityscapes_20200605_022449.log.json) | +| DeepLabV3 | R-101-D8 | 512x1024 | 40000 | 9.6 | 1.92 | V100 | 77.12 | 79.61 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r101-d8_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x1024_40k_cityscapes/deeplabv3_r101-d8_512x1024_40k_cityscapes_20200605_012241-7fd3f799.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x1024_40k_cityscapes/deeplabv3_r101-d8_512x1024_40k_cityscapes_20200605_012241.log.json) | +| DeepLabV3 | R-50-D8 | 769x769 | 40000 | 6.9 | 1.11 | V100 | 78.58 | 79.89 | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r50-d8_4xb2-40k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_769x769_40k_cityscapes/deeplabv3_r50-d8_769x769_40k_cityscapes_20200606_113723-7eda553c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_769x769_40k_cityscapes/deeplabv3_r50-d8_769x769_40k_cityscapes_20200606_113723.log.json) | +| DeepLabV3 | R-101-D8 | 769x769 | 40000 | 10.9 | 0.83 | V100 | 79.27 | 80.11 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r101-d8_4xb2-40k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_769x769_40k_cityscapes/deeplabv3_r101-d8_769x769_40k_cityscapes_20200606_113809-c64f889f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_769x769_40k_cityscapes/deeplabv3_r101-d8_769x769_40k_cityscapes_20200606_113809.log.json) | +| DeepLabV3 | R-18-D8 | 512x1024 | 80000 | 1.7 | 13.78 | V100 | 76.70 | 78.27 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r18-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r18-d8_512x1024_80k_cityscapes/deeplabv3_r18-d8_512x1024_80k_cityscapes_20201225_021506-23dffbe2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r18-d8_512x1024_80k_cityscapes/deeplabv3_r18-d8_512x1024_80k_cityscapes-20201225_021506.log.json) | +| DeepLabV3 | R-50-D8 | 512x1024 | 80000 | - | - | V100 | 79.32 | 80.57 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r50-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x1024_80k_cityscapes/deeplabv3_r50-d8_512x1024_80k_cityscapes_20200606_113404-b92cfdd4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x1024_80k_cityscapes/deeplabv3_r50-d8_512x1024_80k_cityscapes_20200606_113404.log.json) | +| DeepLabV3 | R-101-D8 | 512x1024 | 80000 | - | - | V100 | 80.20 | 81.21 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r101-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x1024_80k_cityscapes/deeplabv3_r101-d8_512x1024_80k_cityscapes_20200606_113503-9e428899.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x1024_80k_cityscapes/deeplabv3_r101-d8_512x1024_80k_cityscapes_20200606_113503.log.json) | +| DeepLabV3 (FP16) | R-101-D8 | 512x1024 | 80000 | 5.75 | 3.86 | V100 | 80.48 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r101-d8_4xb2-amp-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_fp16_512x1024_80k_cityscapes/deeplabv3_r101-d8_fp16_512x1024_80k_cityscapes_20200717_230920-774d9cec.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_fp16_512x1024_80k_cityscapes/deeplabv3_r101-d8_fp16_512x1024_80k_cityscapes_20200717_230920.log.json) | +| DeepLabV3 | R-18-D8 | 769x769 | 80000 | 1.9 | 5.55 | V100 | 76.60 | 78.26 | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r18-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r18-d8_769x769_80k_cityscapes/deeplabv3_r18-d8_769x769_80k_cityscapes_20201225_021506-6452126a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r18-d8_769x769_80k_cityscapes/deeplabv3_r18-d8_769x769_80k_cityscapes-20201225_021506.log.json) | +| DeepLabV3 | R-50-D8 | 769x769 | 80000 | - | - | V100 | 79.89 | 81.06 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r50-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_769x769_80k_cityscapes/deeplabv3_r50-d8_769x769_80k_cityscapes_20200606_221338-788d6228.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_769x769_80k_cityscapes/deeplabv3_r50-d8_769x769_80k_cityscapes_20200606_221338.log.json) | +| DeepLabV3 | R-101-D8 | 769x769 | 80000 | - | - | V100 | 79.67 | 80.81 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r101-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_769x769_80k_cityscapes/deeplabv3_r101-d8_769x769_80k_cityscapes_20200607_013353-60e95418.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_769x769_80k_cityscapes/deeplabv3_r101-d8_769x769_80k_cityscapes_20200607_013353.log.json) | +| DeepLabV3 | R-101-D16-MG124 | 512x1024 | 40000 | 4.7 | 6.96 | V100 | 76.71 | 78.63 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r101-d16-mg124_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d16-mg124_512x1024_40k_cityscapes/deeplabv3_r101-d16-mg124_512x1024_40k_cityscapes_20200908_005644-67b0c992.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d16-mg124_512x1024_40k_cityscapes/deeplabv3_r101-d16-mg124_512x1024_40k_cityscapes-20200908_005644.log.json) | +| DeepLabV3 | R-101-D16-MG124 | 512x1024 | 80000 | - | - | V100 | 78.36 | 79.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r101-d16-mg124_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d16-mg124_512x1024_80k_cityscapes/deeplabv3_r101-d16-mg124_512x1024_80k_cityscapes_20200908_005644-57bb8425.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d16-mg124_512x1024_80k_cityscapes/deeplabv3_r101-d16-mg124_512x1024_80k_cityscapes-20200908_005644.log.json) | +| DeepLabV3 | R-18b-D8 | 512x1024 | 80000 | 1.6 | 13.93 | V100 | 76.26 | 77.88 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r18b-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r18b-d8_512x1024_80k_cityscapes/deeplabv3_r18b-d8_512x1024_80k_cityscapes_20201225_094144-46040cef.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r18b-d8_512x1024_80k_cityscapes/deeplabv3_r18b-d8_512x1024_80k_cityscapes-20201225_094144.log.json) | +| DeepLabV3 | R-50b-D8 | 512x1024 | 80000 | 6.0 | 2.74 | V100 | 79.63 | 80.98 | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r50b-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50b-d8_512x1024_80k_cityscapes/deeplabv3_r50b-d8_512x1024_80k_cityscapes_20201225_155148-ec368954.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50b-d8_512x1024_80k_cityscapes/deeplabv3_r50b-d8_512x1024_80k_cityscapes-20201225_155148.log.json) | +| DeepLabV3 | R-101b-D8 | 512x1024 | 80000 | 9.5 | 1.81 | V100 | 80.01 | 81.21 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r101b-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101b-d8_512x1024_80k_cityscapes/deeplabv3_r101b-d8_512x1024_80k_cityscapes_20201226_171821-8fd49503.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101b-d8_512x1024_80k_cityscapes/deeplabv3_r101b-d8_512x1024_80k_cityscapes-20201226_171821.log.json) | +| DeepLabV3 | R-18b-D8 | 769x769 | 80000 | 1.8 | 5.79 | V100 | 75.63 | 77.51 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r18b-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r18b-d8_769x769_80k_cityscapes/deeplabv3_r18b-d8_769x769_80k_cityscapes_20201225_094144-fdc985d9.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r18b-d8_769x769_80k_cityscapes/deeplabv3_r18b-d8_769x769_80k_cityscapes-20201225_094144.log.json) | +| DeepLabV3 | R-50b-D8 | 769x769 | 80000 | 6.8 | 1.16 | V100 | 78.80 | 80.27 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r50b-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50b-d8_769x769_80k_cityscapes/deeplabv3_r50b-d8_769x769_80k_cityscapes_20201225_155404-87fb0cf4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50b-d8_769x769_80k_cityscapes/deeplabv3_r50b-d8_769x769_80k_cityscapes-20201225_155404.log.json) | +| DeepLabV3 | R-101b-D8 | 769x769 | 80000 | 10.7 | 0.82 | V100 | 79.41 | 80.73 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r101b-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101b-d8_769x769_80k_cityscapes/deeplabv3_r101b-d8_769x769_80k_cityscapes_20201226_190843-9142ee57.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101b-d8_769x769_80k_cityscapes/deeplabv3_r101b-d8_769x769_80k_cityscapes-20201226_190843.log.json) | ### ADE20K -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| --------- | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ----------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| DeepLabV3 | R-50-D8 | 
512x512 | 80000 | 8.9 | 14.76 | 42.42 | 43.28 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3/deeplabv3_r50-d8_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_80k_ade20k/deeplabv3_r50-d8_512x512_80k_ade20k_20200614_185028-0bb3f844.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_80k_ade20k/deeplabv3_r50-d8_512x512_80k_ade20k_20200614_185028.log.json) | -| DeepLabV3 | R-101-D8 | 512x512 | 80000 | 12.4 | 10.14 | 44.08 | 45.19 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3/deeplabv3_r101-d8_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_80k_ade20k/deeplabv3_r101-d8_512x512_80k_ade20k_20200615_021256-d89c7fa4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_80k_ade20k/deeplabv3_r101-d8_512x512_80k_ade20k_20200615_021256.log.json) | -| DeepLabV3 | R-50-D8 | 512x512 | 160000 | - | - | 42.66 | 44.09 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3/deeplabv3_r50-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_160k_ade20k/deeplabv3_r50-d8_512x512_160k_ade20k_20200615_123227-5d0ee427.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_160k_ade20k/deeplabv3_r50-d8_512x512_160k_ade20k_20200615_123227.log.json) | -| DeepLabV3 | R-101-D8 | 512x512 | 160000 | - | - | 45.00 | 46.66 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3/deeplabv3_r101-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_160k_ade20k/deeplabv3_r101-d8_512x512_160k_ade20k_20200615_105816-b1f72b3b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_160k_ade20k/deeplabv3_r101-d8_512x512_160k_ade20k_20200615_105816.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| --------- | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | -------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| DeepLabV3 | R-50-D8 | 512x512 | 80000 | 8.9 | 14.76 | V100 | 42.42 | 43.28 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r50-d8_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_80k_ade20k/deeplabv3_r50-d8_512x512_80k_ade20k_20200614_185028-0bb3f844.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_80k_ade20k/deeplabv3_r50-d8_512x512_80k_ade20k_20200614_185028.log.json) | +| DeepLabV3 | R-101-D8 | 512x512 | 80000 | 12.4 | 10.14 | V100 | 44.08 | 45.19 | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r101-d8_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_80k_ade20k/deeplabv3_r101-d8_512x512_80k_ade20k_20200615_021256-d89c7fa4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_80k_ade20k/deeplabv3_r101-d8_512x512_80k_ade20k_20200615_021256.log.json) | +| DeepLabV3 | R-50-D8 | 512x512 | 160000 | - | - | V100 | 42.66 | 44.09 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r50-d8_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_160k_ade20k/deeplabv3_r50-d8_512x512_160k_ade20k_20200615_123227-5d0ee427.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_160k_ade20k/deeplabv3_r50-d8_512x512_160k_ade20k_20200615_123227.log.json) | +| DeepLabV3 | R-101-D8 | 512x512 | 160000 | - | - | V100 | 45.00 | 46.66 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r101-d8_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_160k_ade20k/deeplabv3_r101-d8_512x512_160k_ade20k_20200615_105816-b1f72b3b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_160k_ade20k/deeplabv3_r101-d8_512x512_160k_ade20k_20200615_105816.log.json) | ### Pascal VOC 2012 + Aug -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| --------- | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| DeepLabV3 | R-50-D8 | 512x512 | 20000 | 6.1 | 13.88 | 76.17 | 77.42 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3/deeplabv3_r50-d8_512x512_20k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_20k_voc12aug/deeplabv3_r50-d8_512x512_20k_voc12aug_20200617_010906-596905ef.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_20k_voc12aug/deeplabv3_r50-d8_512x512_20k_voc12aug_20200617_010906.log.json) | -| DeepLabV3 | R-101-D8 | 512x512 | 20000 | 9.6 | 9.81 | 78.70 | 79.95 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3/deeplabv3_r101-d8_512x512_20k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_20k_voc12aug/deeplabv3_r101-d8_512x512_20k_voc12aug_20200617_010932-8d13832f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_20k_voc12aug/deeplabv3_r101-d8_512x512_20k_voc12aug_20200617_010932.log.json) | -| DeepLabV3 | R-50-D8 | 512x512 | 40000 | - | - | 77.68 | 78.78 | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3/deeplabv3_r50-d8_512x512_40k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_40k_voc12aug/deeplabv3_r50-d8_512x512_40k_voc12aug_20200613_161546-2ae96e7e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_40k_voc12aug/deeplabv3_r50-d8_512x512_40k_voc12aug_20200613_161546.log.json) | -| DeepLabV3 | R-101-D8 | 512x512 | 40000 | - | - | 77.92 | 79.18 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3/deeplabv3_r101-d8_512x512_40k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_40k_voc12aug/deeplabv3_r101-d8_512x512_40k_voc12aug_20200613_161432-0017d784.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_40k_voc12aug/deeplabv3_r101-d8_512x512_40k_voc12aug_20200613_161432.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| --------- | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | --------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| DeepLabV3 | R-50-D8 | 512x512 | 20000 | 6.1 | 13.88 | V100 | 76.17 | 77.42 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r50-d8_4xb4-20k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_20k_voc12aug/deeplabv3_r50-d8_512x512_20k_voc12aug_20200617_010906-596905ef.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_20k_voc12aug/deeplabv3_r50-d8_512x512_20k_voc12aug_20200617_010906.log.json) | +| DeepLabV3 | R-101-D8 | 512x512 | 20000 | 9.6 | 9.81 | V100 | 78.70 | 79.95 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r101-d8_4xb4-20k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_20k_voc12aug/deeplabv3_r101-d8_512x512_20k_voc12aug_20200617_010932-8d13832f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_20k_voc12aug/deeplabv3_r101-d8_512x512_20k_voc12aug_20200617_010932.log.json) | +| DeepLabV3 | R-50-D8 | 512x512 | 40000 | - | - | V100 | 77.68 | 78.78 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r50-d8_4xb4-40k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_40k_voc12aug/deeplabv3_r50-d8_512x512_40k_voc12aug_20200613_161546-2ae96e7e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_40k_voc12aug/deeplabv3_r50-d8_512x512_40k_voc12aug_20200613_161546.log.json) | +| DeepLabV3 | R-101-D8 | 512x512 | 40000 | - | - | V100 | 77.92 | 79.18 | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r101-d8_4xb4-40k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_40k_voc12aug/deeplabv3_r101-d8_512x512_40k_voc12aug_20200613_161432-0017d784.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_40k_voc12aug/deeplabv3_r101-d8_512x512_40k_voc12aug_20200613_161432.log.json) | ### Pascal Context -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| --------- | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| DeepLabV3 | R-101-D8 | 480x480 | 40000 | 9.2 | 7.09 | 46.55 | 47.81 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3/deeplabv3_r101-d8_480x480_40k_pascal_context.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_480x480_40k_pascal_context/deeplabv3_r101-d8_480x480_40k_pascal_context_20200911_204118-1aa27336.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_480x480_40k_pascal_context/deeplabv3_r101-d8_480x480_40k_pascal_context-20200911_204118.log.json) | -| DeepLabV3 | R-101-D8 | 480x480 | 80000 | - | - | 46.42 | 47.53 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3/deeplabv3_r101-d8_480x480_80k_pascal_context.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_480x480_80k_pascal_context/deeplabv3_r101-d8_480x480_80k_pascal_context_20200911_170155-2a21fff3.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_480x480_80k_pascal_context/deeplabv3_r101-d8_480x480_80k_pascal_context-20200911_170155.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| --------- | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | --------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| DeepLabV3 | R-101-D8 | 480x480 | 40000 | 9.2 | 7.09 | V100 | 46.55 | 47.81 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r101-d8_4xb4-40k_pascal-context-480x480.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_480x480_40k_pascal_context/deeplabv3_r101-d8_480x480_40k_pascal_context_20200911_204118-1aa27336.pth) 
\| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_480x480_40k_pascal_context/deeplabv3_r101-d8_480x480_40k_pascal_context-20200911_204118.log.json) | +| DeepLabV3 | R-101-D8 | 480x480 | 80000 | - | - | V100 | 46.42 | 47.53 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r101-d8_4xb4-80k_pascal-context-480x480.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_480x480_80k_pascal_context/deeplabv3_r101-d8_480x480_80k_pascal_context_20200911_170155-2a21fff3.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_480x480_80k_pascal_context/deeplabv3_r101-d8_480x480_80k_pascal_context-20200911_170155.log.json) | ### Pascal Context 59 -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| --------- | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | --------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| DeepLabV3 | R-101-D8 | 480x480 | 40000 | - | - | 52.61 | 54.28 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3/deeplabv3_r101-d8_480x480_40k_pascal_context_59.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_480x480_40k_pascal_context_59/deeplabv3_r101-d8_480x480_40k_pascal_context_59_20210416_110332-cb08ea46.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_480x480_40k_pascal_context_59/deeplabv3_r101-d8_480x480_40k_pascal_context_59-20210416_110332.log.json) | -| DeepLabV3 | R-101-D8 | 480x480 | 80000 | - | - | 52.46 | 54.09 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3/deeplabv3_r101-d8_480x480_80k_pascal_context_59.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_480x480_80k_pascal_context_59/deeplabv3_r101-d8_480x480_80k_pascal_context_59_20210416_113002-26303993.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_480x480_80k_pascal_context_59/deeplabv3_r101-d8_480x480_80k_pascal_context_59-20210416_113002.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| --------- | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| DeepLabV3 | R-101-D8 | 480x480 | 40000 | - | - | V100 
| 52.61 | 54.28 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r101-d8_4xb4-40k_pascal-context-59-480x480.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_480x480_40k_pascal_context_59/deeplabv3_r101-d8_480x480_40k_pascal_context_59_20210416_110332-cb08ea46.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_480x480_40k_pascal_context_59/deeplabv3_r101-d8_480x480_40k_pascal_context_59-20210416_110332.log.json) | +| DeepLabV3 | R-101-D8 | 480x480 | 80000 | - | - | V100 | 52.46 | 54.09 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r101-d8_4xb4-80k_pascal-context-59-480x480.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_480x480_80k_pascal_context_59/deeplabv3_r101-d8_480x480_80k_pascal_context_59_20210416_113002-26303993.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_480x480_80k_pascal_context_59/deeplabv3_r101-d8_480x480_80k_pascal_context_59-20210416_113002.log.json) | ### COCO-Stuff 10k -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| --------- | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ----------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| DeepLabV3 | R-50-D8 | 512x512 | 20000 | 9.6 | 10.8 | 34.66 | 36.08 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3_r50-d8_512x512_4x4_20k_coco-stuff10k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_4x4_20k_coco-stuff10k/deeplabv3_r50-d8_512x512_4x4_20k_coco-stuff10k_20210821_043025-b35f789d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_4x4_20k_coco-stuff10k/deeplabv3_r50-d8_512x512_4x4_20k_coco-stuff10k_20210821_043025.log.json) | -| DeepLabV3 | R-101-D8 | 512x512 | 20000 | 13.2 | 8.7 | 37.30 | 38.42 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3_r101-d8_512x512_4x4_20k_coco-stuff10k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_4x4_20k_coco-stuff10k/deeplabv3_r101-d8_512x512_4x4_20k_coco-stuff10k_20210821_043025-c49752cb.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_4x4_20k_coco-stuff10k/deeplabv3_r101-d8_512x512_4x4_20k_coco-stuff10k_20210821_043025.log.json) | -| DeepLabV3 | R-50-D8 | 512x512 | 40000 | - | - | 35.73 | 37.09 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3_r50-d8_512x512_4x4_40k_coco-stuff10k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_4x4_40k_coco-stuff10k/deeplabv3_r50-d8_512x512_4x4_40k_coco-stuff10k_20210821_043305-dc76f3ff.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_4x4_40k_coco-stuff10k/deeplabv3_r50-d8_512x512_4x4_40k_coco-stuff10k_20210821_043305.log.json) | -| DeepLabV3 | R-101-D8 | 512x512 | 40000 | - | - | 37.81 | 38.80 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3_r101-d8_512x512_4x4_40k_coco-stuff10k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_4x4_40k_coco-stuff10k/deeplabv3_r101-d8_512x512_4x4_40k_coco-stuff10k_20210821_043305-636cb433.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_4x4_40k_coco-stuff10k/deeplabv3_r101-d8_512x512_4x4_40k_coco-stuff10k_20210821_043305.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| --------- | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | -------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| DeepLabV3 | R-50-D8 | 512x512 | 20000 | 9.6 | 10.8 | V100 | 34.66 | 36.08 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r50-d8_4xb4-20k_coco-stuff10k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_4x4_20k_coco-stuff10k/deeplabv3_r50-d8_512x512_4x4_20k_coco-stuff10k_20210821_043025-b35f789d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_4x4_20k_coco-stuff10k/deeplabv3_r50-d8_512x512_4x4_20k_coco-stuff10k_20210821_043025.log.json) | +| DeepLabV3 | R-101-D8 | 512x512 | 20000 | 13.2 | 8.7 | V100 | 37.30 | 38.42 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r101-d8_4xb4-20k_coco-stuff10k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_4x4_20k_coco-stuff10k/deeplabv3_r101-d8_512x512_4x4_20k_coco-stuff10k_20210821_043025-c49752cb.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_4x4_20k_coco-stuff10k/deeplabv3_r101-d8_512x512_4x4_20k_coco-stuff10k_20210821_043025.log.json) | +| DeepLabV3 | R-50-D8 | 512x512 | 40000 | - | - | V100 | 35.73 | 37.09 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r50-d8_4xb4-40k_coco-stuff10k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_4x4_40k_coco-stuff10k/deeplabv3_r50-d8_512x512_4x4_40k_coco-stuff10k_20210821_043305-dc76f3ff.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_4x4_40k_coco-stuff10k/deeplabv3_r50-d8_512x512_4x4_40k_coco-stuff10k_20210821_043305.log.json) | +| DeepLabV3 | R-101-D8 | 512x512 | 40000 | - | - | V100 | 37.81 | 38.80 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r101-d8_4xb4-40k_coco-stuff10k-512x512.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_4x4_40k_coco-stuff10k/deeplabv3_r101-d8_512x512_4x4_40k_coco-stuff10k_20210821_043305-636cb433.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_4x4_40k_coco-stuff10k/deeplabv3_r101-d8_512x512_4x4_40k_coco-stuff10k_20210821_043305.log.json) | ### COCO-Stuff 164k -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| --------- | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| DeepLabV3 | R-50-D8 | 512x512 | 80000 | 9.6 | 10.8 | 39.38 | 40.03 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3_r50-d8_512x512_4x4_80k_coco-stuff164k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_4x4_80k_coco-stuff164k/deeplabv3_r50-d8_512x512_4x4_80k_coco-stuff164k_20210709_163016-88675c24.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_4x4_80k_coco-stuff164k/deeplabv3_r50-d8_512x512_4x4_80k_coco-stuff164k_20210709_163016.log.json) | -| DeepLabV3 | R-101-D8 | 512x512 | 80000 | 13.2 | 8.7 | 40.87 | 41.50 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3_r101-d8_512x512_4x4_80k_coco-stuff164k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_4x4_80k_coco-stuff164k/deeplabv3_r101-d8_512x512_4x4_80k_coco-stuff164k_20210709_201252-13600dc2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_4x4_80k_coco-stuff164k/deeplabv3_r101-d8_512x512_4x4_80k_coco-stuff164k_20210709_201252.log.json) | -| DeepLabV3 | R-50-D8 | 512x512 | 160000 | - | - | 41.09 | 41.69 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3_r50-d8_512x512_4x4_160k_coco-stuff164k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_4x4_160k_coco-stuff164k/deeplabv3_r50-d8_512x512_4x4_160k_coco-stuff164k_20210709_163016-49f2812b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_4x4_160k_coco-stuff164k/deeplabv3_r50-d8_512x512_4x4_160k_coco-stuff164k_20210709_163016.log.json) | -| DeepLabV3 | R-101-D8 | 512x512 | 160000 | - | - | 41.82 | 42.49 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3_r101-d8_512x512_4x4_160k_coco-stuff164k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_4x4_160k_coco-stuff164k/deeplabv3_r101-d8_512x512_4x4_160k_coco-stuff164k_20210709_155402-f035acfd.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_4x4_160k_coco-stuff164k/deeplabv3_r101-d8_512x512_4x4_160k_coco-stuff164k_20210709_155402.log.json) | -| DeepLabV3 | R-50-D8 | 512x512 
| 320000 | - | - | 41.37 | 42.22 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3_r50-d8_512x512_4x4_320k_coco-stuff164k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_4x4_320k_coco-stuff164k/deeplabv3_r50-d8_512x512_4x4_320k_coco-stuff164k_20210709_155403-51b21115.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_4x4_320k_coco-stuff164k/deeplabv3_r50-d8_512x512_4x4_320k_coco-stuff164k_20210709_155403.log.json) | -| DeepLabV3 | R-101-D8 | 512x512 | 320000 | - | - | 42.61 | 43.42 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3_r101-d8_512x512_4x4_320k_coco-stuff164k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_4x4_320k_coco-stuff164k/deeplabv3_r101-d8_512x512_4x4_320k_coco-stuff164k_20210709_155402-3cbca14d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_4x4_320k_coco-stuff164k/deeplabv3_r101-d8_512x512_4x4_320k_coco-stuff164k_20210709_155402.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| --------- | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ---------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| DeepLabV3 | R-50-D8 | 512x512 | 80000 | 9.6 | 10.8 | V100 | 39.38 | 40.03 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r50-d8_4xb4-80k_coco-stuff164k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_4x4_80k_coco-stuff164k/deeplabv3_r50-d8_512x512_4x4_80k_coco-stuff164k_20210709_163016-88675c24.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_4x4_80k_coco-stuff164k/deeplabv3_r50-d8_512x512_4x4_80k_coco-stuff164k_20210709_163016.log.json) | +| DeepLabV3 | R-101-D8 | 512x512 | 80000 | 13.2 | 8.7 | V100 | 40.87 | 41.50 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r101-d8_4xb4-80k_coco-stuff164k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_4x4_80k_coco-stuff164k/deeplabv3_r101-d8_512x512_4x4_80k_coco-stuff164k_20210709_201252-13600dc2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_4x4_80k_coco-stuff164k/deeplabv3_r101-d8_512x512_4x4_80k_coco-stuff164k_20210709_201252.log.json) | +| DeepLabV3 | R-50-D8 | 512x512 | 160000 | - | - | V100 | 41.09 | 41.69 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r50-d8_4xb4-160k_coco-stuff164k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_4x4_160k_coco-stuff164k/deeplabv3_r50-d8_512x512_4x4_160k_coco-stuff164k_20210709_163016-49f2812b.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_4x4_160k_coco-stuff164k/deeplabv3_r50-d8_512x512_4x4_160k_coco-stuff164k_20210709_163016.log.json) | +| DeepLabV3 | R-101-D8 | 512x512 | 160000 | - | - | V100 | 41.82 | 42.49 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r101-d8_4xb4-160k_coco-stuff164k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_4x4_160k_coco-stuff164k/deeplabv3_r101-d8_512x512_4x4_160k_coco-stuff164k_20210709_155402-f035acfd.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_4x4_160k_coco-stuff164k/deeplabv3_r101-d8_512x512_4x4_160k_coco-stuff164k_20210709_155402.log.json) | +| DeepLabV3 | R-50-D8 | 512x512 | 320000 | - | - | V100 | 41.37 | 42.22 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r50-d8_4xb4-320k_coco-stuff164k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_4x4_320k_coco-stuff164k/deeplabv3_r50-d8_512x512_4x4_320k_coco-stuff164k_20210709_155403-51b21115.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_4x4_320k_coco-stuff164k/deeplabv3_r50-d8_512x512_4x4_320k_coco-stuff164k_20210709_155403.log.json) | +| DeepLabV3 | R-101-D8 | 512x512 | 320000 | - | - | V100 | 42.61 | 43.42 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3/deeplabv3_r101-d8_4xb4-320k_coco-stuff164k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_4x4_320k_coco-stuff164k/deeplabv3_r101-d8_512x512_4x4_320k_coco-stuff164k_20210709_155402-3cbca14d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_4x4_320k_coco-stuff164k/deeplabv3_r101-d8_512x512_4x4_320k_coco-stuff164k_20210709_155402.log.json) | Note: - `D-8` here corresponds to the output stride 8 setting of the DeepLab series. - `FP16` means mixed-precision (FP16) training is adopted.
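The two notes above map directly onto fields in the mmseg configs. Below is a minimal sketch under the mmseg 1.x config conventions; the concrete values are illustrative of the usual R-50 D8 layout, not quoted from any single file in this folder:

```python
# "D-8" (output stride 8): the last two ResNet stages keep stride 1 and use
# dilated convolutions instead, so the backbone downsamples the input only 8x.
model = dict(
    type='EncoderDecoder',
    backbone=dict(
        type='ResNetV1c',
        depth=50,
        strides=(1, 2, 1, 1),     # no further downsampling in stages 3 and 4
        dilations=(1, 1, 2, 4)),  # dilation compensates for the kept resolution
    decode_head=dict(type='ASPPHead'))

# "FP16": mixed-precision training, enabled by swapping in an AMP optimizer
# wrapper (MMEngine style); the loss scale shown is a commonly used value.
optim_wrapper = dict(type='AmpOptimWrapper', loss_scale=512.)
```

To sanity-check one of the checkpoints from the tables, the high-level inference API should suffice; the config path and checkpoint name are taken from the first Cityscapes row, while `demo.png` is a placeholder image path:

```python
from mmseg.apis import inference_model, init_model

config = 'configs/deeplabv3/deeplabv3_r50-d8_4xb2-40k_cityscapes-512x1024.py'
checkpoint = 'deeplabv3_r50-d8_512x1024_40k_cityscapes_20200605_022449-acadc2f8.pth'
model = init_model(config, checkpoint, device='cuda:0')  # build the model and load weights
result = inference_model(model, 'demo.png')              # returns a SegDataSample with pred_sem_seg
```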
+ +## Citation + +```bibtex +@article{chen2017rethinking, + title={Rethinking atrous convolution for semantic image segmentation}, + author={Chen, Liang-Chieh and Papandreou, George and Schroff, Florian and Adam, Hartwig}, + journal={arXiv preprint arXiv:1706.05587}, + year={2017} +} +``` diff --git a/configs/deeplabv3/deeplabv3.yml b/configs/deeplabv3/deeplabv3.yml deleted file mode 100644 index 559af4f69c..0000000000 --- a/configs/deeplabv3/deeplabv3.yml +++ /dev/null @@ -1,756 +0,0 @@ -Collections: -- Name: DeepLabV3 - Metadata: - Training Data: - - Cityscapes - - ADE20K - - Pascal VOC 2012 + Aug - - Pascal Context - - Pascal Context 59 - - COCO-Stuff 10k - - COCO-Stuff 164k - Paper: - URL: https://arxiv.org/abs/1706.05587 - Title: Rethinking atrous convolution for semantic image segmentation - README: configs/deeplabv3/README.md - Code: - URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 - Version: v0.17.0 - Converted From: - Code: https://github.com/tensorflow/models/tree/master/research/deeplab -Models: -- Name: deeplabv3_r50-d8_512x1024_40k_cityscapes - In Collection: DeepLabV3 - Metadata: - backbone: R-50-D8 - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 389.11 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 6.1 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.09 - mIoU(ms+flip): 80.45 - Config: configs/deeplabv3/deeplabv3_r50-d8_512x1024_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x1024_40k_cityscapes/deeplabv3_r50-d8_512x1024_40k_cityscapes_20200605_022449-acadc2f8.pth -- Name: deeplabv3_r101-d8_512x1024_40k_cityscapes - In Collection: DeepLabV3 - Metadata: - backbone: R-101-D8 - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 520.83 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 9.6 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 77.12 - mIoU(ms+flip): 79.61 - Config: configs/deeplabv3/deeplabv3_r101-d8_512x1024_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x1024_40k_cityscapes/deeplabv3_r101-d8_512x1024_40k_cityscapes_20200605_012241-7fd3f799.pth -- Name: deeplabv3_r50-d8_769x769_40k_cityscapes - In Collection: DeepLabV3 - Metadata: - backbone: R-50-D8 - crop size: (769,769) - lr schd: 40000 - inference time (ms/im): - - value: 900.9 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 6.9 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.58 - mIoU(ms+flip): 79.89 - Config: configs/deeplabv3/deeplabv3_r50-d8_769x769_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_769x769_40k_cityscapes/deeplabv3_r50-d8_769x769_40k_cityscapes_20200606_113723-7eda553c.pth -- Name: deeplabv3_r101-d8_769x769_40k_cityscapes - In Collection: DeepLabV3 - Metadata: - backbone: R-101-D8 - crop size: (769,769) - lr schd: 40000 - inference time (ms/im): - - value: 1204.82 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 10.9 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.27 - mIoU(ms+flip): 80.11 - 
Config: configs/deeplabv3/deeplabv3_r101-d8_769x769_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_769x769_40k_cityscapes/deeplabv3_r101-d8_769x769_40k_cityscapes_20200606_113809-c64f889f.pth -- Name: deeplabv3_r18-d8_512x1024_80k_cityscapes - In Collection: DeepLabV3 - Metadata: - backbone: R-18-D8 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 72.57 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 1.7 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 76.7 - mIoU(ms+flip): 78.27 - Config: configs/deeplabv3/deeplabv3_r18-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r18-d8_512x1024_80k_cityscapes/deeplabv3_r18-d8_512x1024_80k_cityscapes_20201225_021506-23dffbe2.pth -- Name: deeplabv3_r50-d8_512x1024_80k_cityscapes - In Collection: DeepLabV3 - Metadata: - backbone: R-50-D8 - crop size: (512,1024) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.32 - mIoU(ms+flip): 80.57 - Config: configs/deeplabv3/deeplabv3_r50-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x1024_80k_cityscapes/deeplabv3_r50-d8_512x1024_80k_cityscapes_20200606_113404-b92cfdd4.pth -- Name: deeplabv3_r101-d8_512x1024_80k_cityscapes - In Collection: DeepLabV3 - Metadata: - backbone: R-101-D8 - crop size: (512,1024) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 80.2 - mIoU(ms+flip): 81.21 - Config: configs/deeplabv3/deeplabv3_r101-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x1024_80k_cityscapes/deeplabv3_r101-d8_512x1024_80k_cityscapes_20200606_113503-9e428899.pth -- Name: deeplabv3_r101-d8_fp16_512x1024_80k_cityscapes - In Collection: DeepLabV3 - Metadata: - backbone: R-101-D8 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 259.07 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP16 - resolution: (512,1024) - Training Memory (GB): 5.75 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 80.48 - Config: configs/deeplabv3/deeplabv3_r101-d8_fp16_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_fp16_512x1024_80k_cityscapes/deeplabv3_r101-d8_fp16_512x1024_80k_cityscapes_20200717_230920-774d9cec.pth -- Name: deeplabv3_r18-d8_769x769_80k_cityscapes - In Collection: DeepLabV3 - Metadata: - backbone: R-18-D8 - crop size: (769,769) - lr schd: 80000 - inference time (ms/im): - - value: 180.18 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 1.9 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 76.6 - mIoU(ms+flip): 78.26 - Config: configs/deeplabv3/deeplabv3_r18-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r18-d8_769x769_80k_cityscapes/deeplabv3_r18-d8_769x769_80k_cityscapes_20201225_021506-6452126a.pth -- Name: deeplabv3_r50-d8_769x769_80k_cityscapes - In Collection: DeepLabV3 - Metadata: - backbone: R-50-D8 - crop size: (769,769) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - 
mIoU: 79.89 - mIoU(ms+flip): 81.06 - Config: configs/deeplabv3/deeplabv3_r50-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_769x769_80k_cityscapes/deeplabv3_r50-d8_769x769_80k_cityscapes_20200606_221338-788d6228.pth -- Name: deeplabv3_r101-d8_769x769_80k_cityscapes - In Collection: DeepLabV3 - Metadata: - backbone: R-101-D8 - crop size: (769,769) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.67 - mIoU(ms+flip): 80.81 - Config: configs/deeplabv3/deeplabv3_r101-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_769x769_80k_cityscapes/deeplabv3_r101-d8_769x769_80k_cityscapes_20200607_013353-60e95418.pth -- Name: deeplabv3_r101-d16-mg124_512x1024_80k_cityscapes - In Collection: DeepLabV3 - Metadata: - backbone: R-101-D16-MG124 - crop size: (512,1024) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.36 - mIoU(ms+flip): 79.84 - Config: configs/deeplabv3/deeplabv3_r101-d16-mg124_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d16-mg124_512x1024_80k_cityscapes/deeplabv3_r101-d16-mg124_512x1024_80k_cityscapes_20200908_005644-57bb8425.pth -- Name: deeplabv3_r18b-d8_512x1024_80k_cityscapes - In Collection: DeepLabV3 - Metadata: - backbone: R-18b-D8 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 71.79 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 1.6 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 76.26 - mIoU(ms+flip): 77.88 - Config: configs/deeplabv3/deeplabv3_r18b-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r18b-d8_512x1024_80k_cityscapes/deeplabv3_r18b-d8_512x1024_80k_cityscapes_20201225_094144-46040cef.pth -- Name: deeplabv3_r50b-d8_512x1024_80k_cityscapes - In Collection: DeepLabV3 - Metadata: - backbone: R-50b-D8 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 364.96 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 6.0 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.63 - mIoU(ms+flip): 80.98 - Config: configs/deeplabv3/deeplabv3_r50b-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50b-d8_512x1024_80k_cityscapes/deeplabv3_r50b-d8_512x1024_80k_cityscapes_20201225_155148-ec368954.pth -- Name: deeplabv3_r101b-d8_512x1024_80k_cityscapes - In Collection: DeepLabV3 - Metadata: - backbone: R-101b-D8 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 552.49 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 9.5 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 80.01 - mIoU(ms+flip): 81.21 - Config: configs/deeplabv3/deeplabv3_r101b-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101b-d8_512x1024_80k_cityscapes/deeplabv3_r101b-d8_512x1024_80k_cityscapes_20201226_171821-8fd49503.pth -- Name: deeplabv3_r18b-d8_769x769_80k_cityscapes - In Collection: DeepLabV3 - Metadata: - backbone: R-18b-D8 - crop size: (769,769) 
- lr schd: 80000 - inference time (ms/im): - - value: 172.71 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 1.8 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 76.63 - mIoU(ms+flip): 77.51 - Config: configs/deeplabv3/deeplabv3_r18b-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r18b-d8_769x769_80k_cityscapes/deeplabv3_r18b-d8_769x769_80k_cityscapes_20201225_094144-fdc985d9.pth -- Name: deeplabv3_r50b-d8_769x769_80k_cityscapes - In Collection: DeepLabV3 - Metadata: - backbone: R-50b-D8 - crop size: (769,769) - lr schd: 80000 - inference time (ms/im): - - value: 862.07 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 6.8 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.8 - mIoU(ms+flip): 80.27 - Config: configs/deeplabv3/deeplabv3_r50b-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50b-d8_769x769_80k_cityscapes/deeplabv3_r50b-d8_769x769_80k_cityscapes_20201225_155404-87fb0cf4.pth -- Name: deeplabv3_r101b-d8_769x769_80k_cityscapes - In Collection: DeepLabV3 - Metadata: - backbone: R-101b-D8 - crop size: (769,769) - lr schd: 80000 - inference time (ms/im): - - value: 1219.51 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 10.7 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.41 - mIoU(ms+flip): 80.73 - Config: configs/deeplabv3/deeplabv3_r101b-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101b-d8_769x769_80k_cityscapes/deeplabv3_r101b-d8_769x769_80k_cityscapes_20201226_190843-9142ee57.pth -- Name: deeplabv3_r50-d8_512x512_80k_ade20k - In Collection: DeepLabV3 - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 67.75 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 8.9 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 42.42 - mIoU(ms+flip): 43.28 - Config: configs/deeplabv3/deeplabv3_r50-d8_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_80k_ade20k/deeplabv3_r50-d8_512x512_80k_ade20k_20200614_185028-0bb3f844.pth -- Name: deeplabv3_r101-d8_512x512_80k_ade20k - In Collection: DeepLabV3 - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 98.62 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 12.4 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 44.08 - mIoU(ms+flip): 45.19 - Config: configs/deeplabv3/deeplabv3_r101-d8_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_80k_ade20k/deeplabv3_r101-d8_512x512_80k_ade20k_20200615_021256-d89c7fa4.pth -- Name: deeplabv3_r50-d8_512x512_160k_ade20k - In Collection: DeepLabV3 - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 42.66 - mIoU(ms+flip): 44.09 - Config: configs/deeplabv3/deeplabv3_r50-d8_512x512_160k_ade20k.py - 
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_160k_ade20k/deeplabv3_r50-d8_512x512_160k_ade20k_20200615_123227-5d0ee427.pth -- Name: deeplabv3_r101-d8_512x512_160k_ade20k - In Collection: DeepLabV3 - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 45.0 - mIoU(ms+flip): 46.66 - Config: configs/deeplabv3/deeplabv3_r101-d8_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_160k_ade20k/deeplabv3_r101-d8_512x512_160k_ade20k_20200615_105816-b1f72b3b.pth -- Name: deeplabv3_r50-d8_512x512_20k_voc12aug - In Collection: DeepLabV3 - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 20000 - inference time (ms/im): - - value: 72.05 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 6.1 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 76.17 - mIoU(ms+flip): 77.42 - Config: configs/deeplabv3/deeplabv3_r50-d8_512x512_20k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_20k_voc12aug/deeplabv3_r50-d8_512x512_20k_voc12aug_20200617_010906-596905ef.pth -- Name: deeplabv3_r101-d8_512x512_20k_voc12aug - In Collection: DeepLabV3 - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 20000 - inference time (ms/im): - - value: 101.94 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 9.6 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 78.7 - mIoU(ms+flip): 79.95 - Config: configs/deeplabv3/deeplabv3_r101-d8_512x512_20k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_20k_voc12aug/deeplabv3_r101-d8_512x512_20k_voc12aug_20200617_010932-8d13832f.pth -- Name: deeplabv3_r50-d8_512x512_40k_voc12aug - In Collection: DeepLabV3 - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 40000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 77.68 - mIoU(ms+flip): 78.78 - Config: configs/deeplabv3/deeplabv3_r50-d8_512x512_40k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_40k_voc12aug/deeplabv3_r50-d8_512x512_40k_voc12aug_20200613_161546-2ae96e7e.pth -- Name: deeplabv3_r101-d8_512x512_40k_voc12aug - In Collection: DeepLabV3 - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 40000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 77.92 - mIoU(ms+flip): 79.18 - Config: configs/deeplabv3/deeplabv3_r101-d8_512x512_40k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_40k_voc12aug/deeplabv3_r101-d8_512x512_40k_voc12aug_20200613_161432-0017d784.pth -- Name: deeplabv3_r101-d8_480x480_40k_pascal_context - In Collection: DeepLabV3 - Metadata: - backbone: R-101-D8 - crop size: (480,480) - lr schd: 40000 - inference time (ms/im): - - value: 141.04 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (480,480) - Training Memory (GB): 9.2 - Results: - - Task: Semantic Segmentation - Dataset: Pascal Context - Metrics: - mIoU: 46.55 - mIoU(ms+flip): 47.81 - Config: 
configs/deeplabv3/deeplabv3_r101-d8_480x480_40k_pascal_context.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_480x480_40k_pascal_context/deeplabv3_r101-d8_480x480_40k_pascal_context_20200911_204118-1aa27336.pth -- Name: deeplabv3_r101-d8_480x480_80k_pascal_context - In Collection: DeepLabV3 - Metadata: - backbone: R-101-D8 - crop size: (480,480) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal Context - Metrics: - mIoU: 46.42 - mIoU(ms+flip): 47.53 - Config: configs/deeplabv3/deeplabv3_r101-d8_480x480_80k_pascal_context.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_480x480_80k_pascal_context/deeplabv3_r101-d8_480x480_80k_pascal_context_20200911_170155-2a21fff3.pth -- Name: deeplabv3_r101-d8_480x480_40k_pascal_context_59 - In Collection: DeepLabV3 - Metadata: - backbone: R-101-D8 - crop size: (480,480) - lr schd: 40000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal Context 59 - Metrics: - mIoU: 52.61 - mIoU(ms+flip): 54.28 - Config: configs/deeplabv3/deeplabv3_r101-d8_480x480_40k_pascal_context_59.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_480x480_40k_pascal_context_59/deeplabv3_r101-d8_480x480_40k_pascal_context_59_20210416_110332-cb08ea46.pth -- Name: deeplabv3_r101-d8_480x480_80k_pascal_context_59 - In Collection: DeepLabV3 - Metadata: - backbone: R-101-D8 - crop size: (480,480) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal Context 59 - Metrics: - mIoU: 52.46 - mIoU(ms+flip): 54.09 - Config: configs/deeplabv3/deeplabv3_r101-d8_480x480_80k_pascal_context_59.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_480x480_80k_pascal_context_59/deeplabv3_r101-d8_480x480_80k_pascal_context_59_20210416_113002-26303993.pth -- Name: deeplabv3_r50-d8_512x512_4x4_20k_coco-stuff10k - In Collection: DeepLabV3 - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 20000 - inference time (ms/im): - - value: 92.59 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 9.6 - Results: - - Task: Semantic Segmentation - Dataset: COCO-Stuff 10k - Metrics: - mIoU: 34.66 - mIoU(ms+flip): 36.08 - Config: configs/deeplabv3/deeplabv3_r50-d8_512x512_4x4_20k_coco-stuff10k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_4x4_20k_coco-stuff10k/deeplabv3_r50-d8_512x512_4x4_20k_coco-stuff10k_20210821_043025-b35f789d.pth -- Name: deeplabv3_r101-d8_512x512_4x4_20k_coco-stuff10k - In Collection: DeepLabV3 - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 20000 - inference time (ms/im): - - value: 114.94 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 13.2 - Results: - - Task: Semantic Segmentation - Dataset: COCO-Stuff 10k - Metrics: - mIoU: 37.3 - mIoU(ms+flip): 38.42 - Config: configs/deeplabv3/deeplabv3_r101-d8_512x512_4x4_20k_coco-stuff10k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_4x4_20k_coco-stuff10k/deeplabv3_r101-d8_512x512_4x4_20k_coco-stuff10k_20210821_043025-c49752cb.pth -- Name: deeplabv3_r50-d8_512x512_4x4_40k_coco-stuff10k - In Collection: DeepLabV3 - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 40000 - Results: - - Task: Semantic Segmentation - Dataset: COCO-Stuff 10k - Metrics: - 
mIoU: 35.73 - mIoU(ms+flip): 37.09 - Config: configs/deeplabv3/deeplabv3_r50-d8_512x512_4x4_40k_coco-stuff10k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_4x4_40k_coco-stuff10k/deeplabv3_r50-d8_512x512_4x4_40k_coco-stuff10k_20210821_043305-dc76f3ff.pth -- Name: deeplabv3_r101-d8_512x512_4x4_40k_coco-stuff10k - In Collection: DeepLabV3 - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 40000 - Results: - - Task: Semantic Segmentation - Dataset: COCO-Stuff 10k - Metrics: - mIoU: 37.81 - mIoU(ms+flip): 38.8 - Config: configs/deeplabv3/deeplabv3_r101-d8_512x512_4x4_40k_coco-stuff10k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_4x4_40k_coco-stuff10k/deeplabv3_r101-d8_512x512_4x4_40k_coco-stuff10k_20210821_043305-636cb433.pth -- Name: deeplabv3_r50-d8_512x512_4x4_80k_coco-stuff164k - In Collection: DeepLabV3 - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 92.59 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 9.6 - Results: - - Task: Semantic Segmentation - Dataset: COCO-Stuff 164k - Metrics: - mIoU: 39.38 - mIoU(ms+flip): 40.03 - Config: configs/deeplabv3/deeplabv3_r50-d8_512x512_4x4_80k_coco-stuff164k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_4x4_80k_coco-stuff164k/deeplabv3_r50-d8_512x512_4x4_80k_coco-stuff164k_20210709_163016-88675c24.pth -- Name: deeplabv3_r101-d8_512x512_4x4_80k_coco-stuff164k - In Collection: DeepLabV3 - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 114.94 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 13.2 - Results: - - Task: Semantic Segmentation - Dataset: COCO-Stuff 164k - Metrics: - mIoU: 40.87 - mIoU(ms+flip): 41.5 - Config: configs/deeplabv3/deeplabv3_r101-d8_512x512_4x4_80k_coco-stuff164k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_4x4_80k_coco-stuff164k/deeplabv3_r101-d8_512x512_4x4_80k_coco-stuff164k_20210709_201252-13600dc2.pth -- Name: deeplabv3_r50-d8_512x512_4x4_160k_coco-stuff164k - In Collection: DeepLabV3 - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: COCO-Stuff 164k - Metrics: - mIoU: 41.09 - mIoU(ms+flip): 41.69 - Config: configs/deeplabv3/deeplabv3_r50-d8_512x512_4x4_160k_coco-stuff164k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_4x4_160k_coco-stuff164k/deeplabv3_r50-d8_512x512_4x4_160k_coco-stuff164k_20210709_163016-49f2812b.pth -- Name: deeplabv3_r101-d8_512x512_4x4_160k_coco-stuff164k - In Collection: DeepLabV3 - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: COCO-Stuff 164k - Metrics: - mIoU: 41.82 - mIoU(ms+flip): 42.49 - Config: configs/deeplabv3/deeplabv3_r101-d8_512x512_4x4_160k_coco-stuff164k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_4x4_160k_coco-stuff164k/deeplabv3_r101-d8_512x512_4x4_160k_coco-stuff164k_20210709_155402-f035acfd.pth -- Name: deeplabv3_r50-d8_512x512_4x4_320k_coco-stuff164k - In Collection: DeepLabV3 - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 
320000 - Results: - - Task: Semantic Segmentation - Dataset: COCO-Stuff 164k - Metrics: - mIoU: 41.37 - mIoU(ms+flip): 42.22 - Config: configs/deeplabv3/deeplabv3_r50-d8_512x512_4x4_320k_coco-stuff164k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_4x4_320k_coco-stuff164k/deeplabv3_r50-d8_512x512_4x4_320k_coco-stuff164k_20210709_155403-51b21115.pth -- Name: deeplabv3_r101-d8_512x512_4x4_320k_coco-stuff164k - In Collection: DeepLabV3 - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 320000 - Results: - - Task: Semantic Segmentation - Dataset: COCO-Stuff 164k - Metrics: - mIoU: 42.61 - mIoU(ms+flip): 43.42 - Config: configs/deeplabv3/deeplabv3_r101-d8_512x512_4x4_320k_coco-stuff164k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_4x4_320k_coco-stuff164k/deeplabv3_r101-d8_512x512_4x4_320k_coco-stuff164k_20210709_155402-3cbca14d.pth diff --git a/configs/deeplabv3/deeplabv3_r101-d16-mg124_4xb2-40k_cityscapes-512x1024.py b/configs/deeplabv3/deeplabv3_r101-d16-mg124_4xb2-40k_cityscapes-512x1024.py new file mode 100644 index 0000000000..b9f3c178df --- /dev/null +++ b/configs/deeplabv3/deeplabv3_r101-d16-mg124_4xb2-40k_cityscapes-512x1024.py @@ -0,0 +1,11 @@ +_base_ = './deeplabv3_r50-d8_4xb2-40k_cityscapes-512x1024.py' +model = dict( + pretrained='open-mmlab://resnet101_v1c', + backbone=dict( + depth=101, + dilations=(1, 1, 1, 2), + strides=(1, 2, 2, 1), + multi_grid=(1, 2, 4)), + decode_head=dict( + dilations=(1, 6, 12, 18), + sampler=dict(type='OHEMPixelSampler', min_kept=100000))) diff --git a/configs/deeplabv3/deeplabv3_r101-d16-mg124_4xb2-80k_cityscapes-512x1024.py b/configs/deeplabv3/deeplabv3_r101-d16-mg124_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..da3a88f998 --- /dev/null +++ b/configs/deeplabv3/deeplabv3_r101-d16-mg124_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,11 @@ +_base_ = './deeplabv3_r50-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict( + pretrained='open-mmlab://resnet101_v1c', + backbone=dict( + depth=101, + dilations=(1, 1, 1, 2), + strides=(1, 2, 2, 1), + multi_grid=(1, 2, 4)), + decode_head=dict( + dilations=(1, 6, 12, 18), + sampler=dict(type='OHEMPixelSampler', min_kept=100000))) diff --git a/configs/deeplabv3/deeplabv3_r101-d16-mg124_512x1024_40k_cityscapes.py b/configs/deeplabv3/deeplabv3_r101-d16-mg124_512x1024_40k_cityscapes.py deleted file mode 100644 index f20f260e23..0000000000 --- a/configs/deeplabv3/deeplabv3_r101-d16-mg124_512x1024_40k_cityscapes.py +++ /dev/null @@ -1,11 +0,0 @@ -_base_ = './deeplabv3_r50-d8_512x1024_40k_cityscapes.py' -model = dict( - pretrained='open-mmlab://resnet101_v1c', - backbone=dict( - depth=101, - dilations=(1, 1, 1, 2), - strides=(1, 2, 2, 1), - multi_grid=(1, 2, 4)), - decode_head=dict( - dilations=(1, 6, 12, 18), - sampler=dict(type='OHEMPixelSampler', min_kept=100000))) diff --git a/configs/deeplabv3/deeplabv3_r101-d16-mg124_512x1024_80k_cityscapes.py b/configs/deeplabv3/deeplabv3_r101-d16-mg124_512x1024_80k_cityscapes.py deleted file mode 100644 index de4a8a5e9f..0000000000 --- a/configs/deeplabv3/deeplabv3_r101-d16-mg124_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,11 +0,0 @@ -_base_ = './deeplabv3_r50-d8_512x1024_80k_cityscapes.py' -model = dict( - pretrained='open-mmlab://resnet101_v1c', - backbone=dict( - depth=101, - dilations=(1, 1, 1, 2), - strides=(1, 2, 2, 1), - multi_grid=(1, 2, 4)), - decode_head=dict( - dilations=(1, 6, 12, 18), - sampler=dict(type='OHEMPixelSampler', 
min_kept=100000))) diff --git a/configs/deeplabv3/deeplabv3_r101-d8_480x480_40k_pascal_context.py b/configs/deeplabv3/deeplabv3_r101-d8_480x480_40k_pascal_context.py deleted file mode 100644 index 0b5256f7b7..0000000000 --- a/configs/deeplabv3/deeplabv3_r101-d8_480x480_40k_pascal_context.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './deeplabv3_r50-d8_480x480_40k_pascal_context.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3/deeplabv3_r101-d8_480x480_40k_pascal_context_59.py b/configs/deeplabv3/deeplabv3_r101-d8_480x480_40k_pascal_context_59.py deleted file mode 100644 index 4874121fd0..0000000000 --- a/configs/deeplabv3/deeplabv3_r101-d8_480x480_40k_pascal_context_59.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './deeplabv3_r50-d8_480x480_40k_pascal_context_59.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3/deeplabv3_r101-d8_480x480_80k_pascal_context.py b/configs/deeplabv3/deeplabv3_r101-d8_480x480_80k_pascal_context.py deleted file mode 100644 index 001b7a69c1..0000000000 --- a/configs/deeplabv3/deeplabv3_r101-d8_480x480_80k_pascal_context.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './deeplabv3_r50-d8_480x480_80k_pascal_context.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3/deeplabv3_r101-d8_480x480_80k_pascal_context_59.py b/configs/deeplabv3/deeplabv3_r101-d8_480x480_80k_pascal_context_59.py deleted file mode 100644 index 032dc8b621..0000000000 --- a/configs/deeplabv3/deeplabv3_r101-d8_480x480_80k_pascal_context_59.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './deeplabv3_r50-d8_480x480_80k_pascal_context_59.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3/deeplabv3_r101-d8_4xb2-40k_cityscapes-512x1024.py b/configs/deeplabv3/deeplabv3_r101-d8_4xb2-40k_cityscapes-512x1024.py new file mode 100644 index 0000000000..d01803ce1f --- /dev/null +++ b/configs/deeplabv3/deeplabv3_r101-d8_4xb2-40k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './deeplabv3_r50-d8_4xb2-40k_cityscapes-512x1024.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3/deeplabv3_r101-d8_4xb2-40k_cityscapes-769x769.py b/configs/deeplabv3/deeplabv3_r101-d8_4xb2-40k_cityscapes-769x769.py new file mode 100644 index 0000000000..7964b51446 --- /dev/null +++ b/configs/deeplabv3/deeplabv3_r101-d8_4xb2-40k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './deeplabv3_r50-d8_4xb2-40k_cityscapes-769x769.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3/deeplabv3_r101-d8_4xb2-80k_cityscapes-512x1024.py b/configs/deeplabv3/deeplabv3_r101-d8_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..1d1a6201a0 --- /dev/null +++ b/configs/deeplabv3/deeplabv3_r101-d8_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './deeplabv3_r50-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3/deeplabv3_r101-d8_4xb2-80k_cityscapes-769x769.py b/configs/deeplabv3/deeplabv3_r101-d8_4xb2-80k_cityscapes-769x769.py new file mode 100644 index 0000000000..78205468d7 --- /dev/null +++ b/configs/deeplabv3/deeplabv3_r101-d8_4xb2-80k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './deeplabv3_r50-d8_4xb2-80k_cityscapes-769x769.py' +model = 
dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/deeplabv3/deeplabv3_r101-d8_4xb2-amp-80k_cityscapes-512x1024.py b/configs/deeplabv3/deeplabv3_r101-d8_4xb2-amp-80k_cityscapes-512x1024.py
new file mode 100644
index 0000000000..84174166ce
--- /dev/null
+++ b/configs/deeplabv3/deeplabv3_r101-d8_4xb2-amp-80k_cityscapes-512x1024.py
@@ -0,0 +1,7 @@
+_base_ = './deeplabv3_r101-d8_4xb2-80k_cityscapes-512x1024.py'
+optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005)
+optim_wrapper = dict(
+    _delete_=True,
+    type='AmpOptimWrapper',
+    optimizer=optimizer,
+    loss_scale=512.)
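The hunk above replaces the old `fp16` config (whose deletion appears further down in this diff) with the new `amp` naming while keeping the same mmengine setup: `_delete_=True` discards the optimizer wrapper inherited through `_base_` and substitutes an `AmpOptimWrapper` with a static loss scale of 512 for mixed-precision training. A minimal sketch of how the merged config resolves, assuming mmengine is installed and the repository root is the working directory:

    from mmengine.config import Config

    # Loading the AMP config first merges the _base_ R-101 Cityscapes settings,
    # then swaps in the AmpOptimWrapper defined in the hunk above.
    cfg = Config.fromfile(
        'configs/deeplabv3/deeplabv3_r101-d8_4xb2-amp-80k_cityscapes-512x1024.py')
    print(cfg.optim_wrapper.type)        # AmpOptimWrapper
    print(cfg.optim_wrapper.loss_scale)  # 512.0
    print(cfg.optim_wrapper.optimizer)   # the SGD settings defined in this file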
diff --git a/configs/deeplabv3/deeplabv3_r101-d8_4xb4-160k_ade20k-512x512.py b/configs/deeplabv3/deeplabv3_r101-d8_4xb4-160k_ade20k-512x512.py
new file mode 100644
index 0000000000..0ed6eee833
--- /dev/null
+++ b/configs/deeplabv3/deeplabv3_r101-d8_4xb4-160k_ade20k-512x512.py
@@ -0,0 +1,2 @@
+_base_ = './deeplabv3_r50-d8_4xb4-160k_ade20k-512x512.py'
+model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/deeplabv3/deeplabv3_r101-d8_4xb4-160k_coco-stuff164k-512x512.py b/configs/deeplabv3/deeplabv3_r101-d8_4xb4-160k_coco-stuff164k-512x512.py
new file mode 100644
index 0000000000..add008345f
--- /dev/null
+++ b/configs/deeplabv3/deeplabv3_r101-d8_4xb4-160k_coco-stuff164k-512x512.py
@@ -0,0 +1,2 @@
+_base_ = './deeplabv3_r50-d8_4xb4-160k_coco-stuff164k-512x512.py'
+model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/deeplabv3/deeplabv3_r101-d8_4xb4-20k_coco-stuff10k-512x512.py b/configs/deeplabv3/deeplabv3_r101-d8_4xb4-20k_coco-stuff10k-512x512.py
new file mode 100644
index 0000000000..349cc88f0a
--- /dev/null
+++ b/configs/deeplabv3/deeplabv3_r101-d8_4xb4-20k_coco-stuff10k-512x512.py
@@ -0,0 +1,2 @@
+_base_ = './deeplabv3_r50-d8_4xb4-20k_coco-stuff10k-512x512.py'
+model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/deeplabv3/deeplabv3_r101-d8_4xb4-20k_voc12aug-512x512.py b/configs/deeplabv3/deeplabv3_r101-d8_4xb4-20k_voc12aug-512x512.py
new file mode 100644
index 0000000000..1c527e0c53
--- /dev/null
+++ b/configs/deeplabv3/deeplabv3_r101-d8_4xb4-20k_voc12aug-512x512.py
@@ -0,0 +1,2 @@
+_base_ = './deeplabv3_r50-d8_4xb4-20k_voc12aug-512x512.py'
+model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/deeplabv3/deeplabv3_r101-d8_4xb4-320k_coco-stuff164k-512x512.py b/configs/deeplabv3/deeplabv3_r101-d8_4xb4-320k_coco-stuff164k-512x512.py
new file mode 100644
index 0000000000..ea27bedc04
--- /dev/null
+++ b/configs/deeplabv3/deeplabv3_r101-d8_4xb4-320k_coco-stuff164k-512x512.py
@@ -0,0 +1,2 @@
+_base_ = './deeplabv3_r50-d8_4xb4-320k_coco-stuff164k-512x512.py'
+model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/deeplabv3/deeplabv3_r101-d8_4xb4-40k_coco-stuff10k-512x512.py b/configs/deeplabv3/deeplabv3_r101-d8_4xb4-40k_coco-stuff10k-512x512.py
new file mode 100644
index 0000000000..a43a786e0e
--- /dev/null
+++ b/configs/deeplabv3/deeplabv3_r101-d8_4xb4-40k_coco-stuff10k-512x512.py
@@ -0,0 +1,2 @@
+_base_ = './deeplabv3_r50-d8_4xb4-40k_coco-stuff10k-512x512.py'
+model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/deeplabv3/deeplabv3_r101-d8_4xb4-40k_pascal-context-480x480.py b/configs/deeplabv3/deeplabv3_r101-d8_4xb4-40k_pascal-context-480x480.py
new file mode 100644
index 0000000000..8879d5394f
--- /dev/null
+++ b/configs/deeplabv3/deeplabv3_r101-d8_4xb4-40k_pascal-context-480x480.py
@@ -0,0 +1,2 @@
+_base_ = './deeplabv3_r50-d8_4xb4-40k_pascal-context-480x480.py'
+model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/deeplabv3/deeplabv3_r101-d8_4xb4-40k_pascal-context-59-480x480.py b/configs/deeplabv3/deeplabv3_r101-d8_4xb4-40k_pascal-context-59-480x480.py
new file mode 100644
index 0000000000..54671d4dc6
--- /dev/null
+++ b/configs/deeplabv3/deeplabv3_r101-d8_4xb4-40k_pascal-context-59-480x480.py
@@ -0,0 +1,2 @@
+_base_ = './deeplabv3_r50-d8_4xb4-40k_pascal-context-59-480x480.py'
+model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/deeplabv3/deeplabv3_r101-d8_4xb4-40k_voc12aug-512x512.py b/configs/deeplabv3/deeplabv3_r101-d8_4xb4-40k_voc12aug-512x512.py
new file mode 100644
index 0000000000..1b2635d1c2
--- /dev/null
+++ b/configs/deeplabv3/deeplabv3_r101-d8_4xb4-40k_voc12aug-512x512.py
@@ -0,0 +1,2 @@
+_base_ = './deeplabv3_r50-d8_4xb4-40k_voc12aug-512x512.py'
+model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/deeplabv3/deeplabv3_r101-d8_4xb4-80k_ade20k-512x512.py b/configs/deeplabv3/deeplabv3_r101-d8_4xb4-80k_ade20k-512x512.py
new file mode 100644
index 0000000000..b7bb0b6448
--- /dev/null
+++ b/configs/deeplabv3/deeplabv3_r101-d8_4xb4-80k_ade20k-512x512.py
@@ -0,0 +1,2 @@
+_base_ = './deeplabv3_r50-d8_4xb4-80k_ade20k-512x512.py'
+model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/deeplabv3/deeplabv3_r101-d8_4xb4-80k_coco-stuff164k-512x512.py b/configs/deeplabv3/deeplabv3_r101-d8_4xb4-80k_coco-stuff164k-512x512.py
new file mode 100644
index 0000000000..2d4f6f747b
--- /dev/null
+++ b/configs/deeplabv3/deeplabv3_r101-d8_4xb4-80k_coco-stuff164k-512x512.py
@@ -0,0 +1,2 @@
+_base_ = './deeplabv3_r50-d8_4xb4-80k_coco-stuff164k-512x512.py'
+model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/deeplabv3/deeplabv3_r101-d8_4xb4-80k_pascal-context-480x480.py b/configs/deeplabv3/deeplabv3_r101-d8_4xb4-80k_pascal-context-480x480.py
new file mode 100644
index 0000000000..9d64ca29fe
--- /dev/null
+++ b/configs/deeplabv3/deeplabv3_r101-d8_4xb4-80k_pascal-context-480x480.py
@@ -0,0 +1,2 @@
+_base_ = './deeplabv3_r50-d8_4xb4-80k_pascal-context-480x480.py'
+model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/deeplabv3/deeplabv3_r101-d8_4xb4-80k_pascal-context-59-480x480.py b/configs/deeplabv3/deeplabv3_r101-d8_4xb4-80k_pascal-context-59-480x480.py
new file mode 100644
index 0000000000..54671d4dc6
--- /dev/null
+++ b/configs/deeplabv3/deeplabv3_r101-d8_4xb4-80k_pascal-context-59-480x480.py
@@ -0,0 +1,2 @@
+_base_ = './deeplabv3_r50-d8_4xb4-80k_pascal-context-59-480x480.py'
+model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/deeplabv3/deeplabv3_r101-d8_512x1024_40k_cityscapes.py b/configs/deeplabv3/deeplabv3_r101-d8_512x1024_40k_cityscapes.py
deleted file mode 100644
index 8c707c79d6..0000000000
--- a/configs/deeplabv3/deeplabv3_r101-d8_512x1024_40k_cityscapes.py
+++ /dev/null
@@ -1,2 +0,0 @@
-_base_ = './deeplabv3_r50-d8_512x1024_40k_cityscapes.py'
-model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/deeplabv3/deeplabv3_r101-d8_512x1024_80k_cityscapes.py
b/configs/deeplabv3/deeplabv3_r101-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index 6804a57813..0000000000 --- a/configs/deeplabv3/deeplabv3_r101-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './deeplabv3_r50-d8_512x1024_80k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3/deeplabv3_r101-d8_512x512_160k_ade20k.py b/configs/deeplabv3/deeplabv3_r101-d8_512x512_160k_ade20k.py deleted file mode 100644 index df6f36ef7c..0000000000 --- a/configs/deeplabv3/deeplabv3_r101-d8_512x512_160k_ade20k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './deeplabv3_r50-d8_512x512_160k_ade20k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3/deeplabv3_r101-d8_512x512_20k_voc12aug.py b/configs/deeplabv3/deeplabv3_r101-d8_512x512_20k_voc12aug.py deleted file mode 100644 index 40f5f62373..0000000000 --- a/configs/deeplabv3/deeplabv3_r101-d8_512x512_20k_voc12aug.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './deeplabv3_r50-d8_512x512_20k_voc12aug.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3/deeplabv3_r101-d8_512x512_40k_voc12aug.py b/configs/deeplabv3/deeplabv3_r101-d8_512x512_40k_voc12aug.py deleted file mode 100644 index fb2be22f8b..0000000000 --- a/configs/deeplabv3/deeplabv3_r101-d8_512x512_40k_voc12aug.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './deeplabv3_r50-d8_512x512_40k_voc12aug.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3/deeplabv3_r101-d8_512x512_4x4_160k_coco-stuff164k.py b/configs/deeplabv3/deeplabv3_r101-d8_512x512_4x4_160k_coco-stuff164k.py deleted file mode 100644 index 76b124248e..0000000000 --- a/configs/deeplabv3/deeplabv3_r101-d8_512x512_4x4_160k_coco-stuff164k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './deeplabv3_r50-d8_512x512_4x4_160k_coco-stuff164k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3/deeplabv3_r101-d8_512x512_4x4_20k_coco-stuff10k.py b/configs/deeplabv3/deeplabv3_r101-d8_512x512_4x4_20k_coco-stuff10k.py deleted file mode 100644 index d476c66f4e..0000000000 --- a/configs/deeplabv3/deeplabv3_r101-d8_512x512_4x4_20k_coco-stuff10k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './deeplabv3_r50-d8_512x512_4x4_20k_coco-stuff10k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3/deeplabv3_r101-d8_512x512_4x4_320k_coco-stuff164k.py b/configs/deeplabv3/deeplabv3_r101-d8_512x512_4x4_320k_coco-stuff164k.py deleted file mode 100644 index 50669c864a..0000000000 --- a/configs/deeplabv3/deeplabv3_r101-d8_512x512_4x4_320k_coco-stuff164k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './deeplabv3_r50-d8_512x512_4x4_320k_coco-stuff164k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3/deeplabv3_r101-d8_512x512_4x4_40k_coco-stuff10k.py b/configs/deeplabv3/deeplabv3_r101-d8_512x512_4x4_40k_coco-stuff10k.py deleted file mode 100644 index 37d09cf994..0000000000 --- a/configs/deeplabv3/deeplabv3_r101-d8_512x512_4x4_40k_coco-stuff10k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './deeplabv3_r50-d8_512x512_4x4_40k_coco-stuff10k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3/deeplabv3_r101-d8_512x512_4x4_80k_coco-stuff164k.py 
b/configs/deeplabv3/deeplabv3_r101-d8_512x512_4x4_80k_coco-stuff164k.py deleted file mode 100644 index a0eb3ddfed..0000000000 --- a/configs/deeplabv3/deeplabv3_r101-d8_512x512_4x4_80k_coco-stuff164k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './deeplabv3_r50-d8_512x512_4x4_80k_coco-stuff164k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3/deeplabv3_r101-d8_512x512_80k_ade20k.py b/configs/deeplabv3/deeplabv3_r101-d8_512x512_80k_ade20k.py deleted file mode 100644 index 796ba3fb14..0000000000 --- a/configs/deeplabv3/deeplabv3_r101-d8_512x512_80k_ade20k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './deeplabv3_r50-d8_512x512_80k_ade20k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3/deeplabv3_r101-d8_769x769_40k_cityscapes.py b/configs/deeplabv3/deeplabv3_r101-d8_769x769_40k_cityscapes.py deleted file mode 100644 index e6d58a67b3..0000000000 --- a/configs/deeplabv3/deeplabv3_r101-d8_769x769_40k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './deeplabv3_r50-d8_769x769_40k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3/deeplabv3_r101-d8_769x769_80k_cityscapes.py b/configs/deeplabv3/deeplabv3_r101-d8_769x769_80k_cityscapes.py deleted file mode 100644 index 13094a98ee..0000000000 --- a/configs/deeplabv3/deeplabv3_r101-d8_769x769_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './deeplabv3_r50-d8_769x769_80k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3/deeplabv3_r101-d8_fp16_512x1024_80k_cityscapes.py b/configs/deeplabv3/deeplabv3_r101-d8_fp16_512x1024_80k_cityscapes.py deleted file mode 100644 index 096c55b640..0000000000 --- a/configs/deeplabv3/deeplabv3_r101-d8_fp16_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,7 +0,0 @@ -_base_ = './deeplabv3_r101-d8_512x1024_80k_cityscapes.py' -optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) -optim_wrapper = dict( - _delete_=True, - type='AmpOptimWrapper', - optimizer=optimizer, - loss_scale=512.) 
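That completes the `fp16` to `amp` migration; the hunks that follow rename the R-18 and `b`-variant (torchvision-backbone) configs to the same scheme. Note that only the config files are renamed: the released checkpoints keep their old-style filenames, as the `Weights` URLs in the new metafile.yaml further below show. A sketch of running one renamed config against its existing checkpoint, assuming mmsegmentation 1.x with mmengine installed (`demo.png` stands in for any test image):

    from mmseg.apis import inference_model, init_model

    config = 'configs/deeplabv3/deeplabv3_r50-d8_4xb2-40k_cityscapes-512x1024.py'
    # Checkpoint URL taken from the new metafile.yaml; note the old-style name.
    checkpoint = (
        'https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/'
        'deeplabv3_r50-d8_512x1024_40k_cityscapes/'
        'deeplabv3_r50-d8_512x1024_40k_cityscapes_20200605_022449-acadc2f8.pth')
    model = init_model(config, checkpoint, device='cuda:0')
    result = inference_model(model, 'demo.png')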
diff --git a/configs/deeplabv3/deeplabv3_r101b-d8_4xb2-80k_cityscapes-512x1024.py b/configs/deeplabv3/deeplabv3_r101b-d8_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..708932da85 --- /dev/null +++ b/configs/deeplabv3/deeplabv3_r101b-d8_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,4 @@ +_base_ = './deeplabv3_r50-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict( + pretrained='torchvision://resnet101', + backbone=dict(type='ResNet', depth=101)) diff --git a/configs/deeplabv3/deeplabv3_r101b-d8_4xb2-80k_cityscapes-769x769.py b/configs/deeplabv3/deeplabv3_r101b-d8_4xb2-80k_cityscapes-769x769.py new file mode 100644 index 0000000000..a0f634d081 --- /dev/null +++ b/configs/deeplabv3/deeplabv3_r101b-d8_4xb2-80k_cityscapes-769x769.py @@ -0,0 +1,4 @@ +_base_ = './deeplabv3_r50-d8_4xb2-80k_cityscapes-769x769.py' +model = dict( + pretrained='torchvision://resnet101', + backbone=dict(type='ResNet', depth=101)) diff --git a/configs/deeplabv3/deeplabv3_r101b-d8_512x1024_80k_cityscapes.py b/configs/deeplabv3/deeplabv3_r101b-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index 5186bf614b..0000000000 --- a/configs/deeplabv3/deeplabv3_r101b-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,4 +0,0 @@ -_base_ = './deeplabv3_r50-d8_512x1024_80k_cityscapes.py' -model = dict( - pretrained='torchvision://resnet101', - backbone=dict(type='ResNet', depth=101)) diff --git a/configs/deeplabv3/deeplabv3_r101b-d8_769x769_80k_cityscapes.py b/configs/deeplabv3/deeplabv3_r101b-d8_769x769_80k_cityscapes.py deleted file mode 100644 index d185db95ad..0000000000 --- a/configs/deeplabv3/deeplabv3_r101b-d8_769x769_80k_cityscapes.py +++ /dev/null @@ -1,4 +0,0 @@ -_base_ = './deeplabv3_r50-d8_769x769_80k_cityscapes.py' -model = dict( - pretrained='torchvision://resnet101', - backbone=dict(type='ResNet', depth=101)) diff --git a/configs/deeplabv3/deeplabv3_r18-d8_4xb2-80k_cityscapes-512x1024.py b/configs/deeplabv3/deeplabv3_r18-d8_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..bc353bb564 --- /dev/null +++ b/configs/deeplabv3/deeplabv3_r18-d8_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,9 @@ +_base_ = './deeplabv3_r50-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict( + pretrained='open-mmlab://resnet18_v1c', + backbone=dict(depth=18), + decode_head=dict( + in_channels=512, + channels=128, + ), + auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/deeplabv3/deeplabv3_r18-d8_4xb2-80k_cityscapes-769x769.py b/configs/deeplabv3/deeplabv3_r18-d8_4xb2-80k_cityscapes-769x769.py new file mode 100644 index 0000000000..021c98c376 --- /dev/null +++ b/configs/deeplabv3/deeplabv3_r18-d8_4xb2-80k_cityscapes-769x769.py @@ -0,0 +1,9 @@ +_base_ = './deeplabv3_r50-d8_4xb2-80k_cityscapes-769x769.py' +model = dict( + pretrained='open-mmlab://resnet18_v1c', + backbone=dict(depth=18), + decode_head=dict( + in_channels=512, + channels=128, + ), + auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/deeplabv3/deeplabv3_r18-d8_512x1024_80k_cityscapes.py b/configs/deeplabv3/deeplabv3_r18-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index e084e95c70..0000000000 --- a/configs/deeplabv3/deeplabv3_r18-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './deeplabv3_r50-d8_512x1024_80k_cityscapes.py' -model = dict( - pretrained='open-mmlab://resnet18_v1c', - backbone=dict(depth=18), - decode_head=dict( - in_channels=512, - channels=128, - ), - auxiliary_head=dict(in_channels=256, channels=64)) diff --git 
a/configs/deeplabv3/deeplabv3_r18-d8_769x769_80k_cityscapes.py b/configs/deeplabv3/deeplabv3_r18-d8_769x769_80k_cityscapes.py deleted file mode 100644 index a990c07653..0000000000 --- a/configs/deeplabv3/deeplabv3_r18-d8_769x769_80k_cityscapes.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './deeplabv3_r50-d8_769x769_80k_cityscapes.py' -model = dict( - pretrained='open-mmlab://resnet18_v1c', - backbone=dict(depth=18), - decode_head=dict( - in_channels=512, - channels=128, - ), - auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/deeplabv3/deeplabv3_r18b-d8_4xb2-80k_cityscapes-512x1024.py b/configs/deeplabv3/deeplabv3_r18b-d8_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..c747cd74a2 --- /dev/null +++ b/configs/deeplabv3/deeplabv3_r18b-d8_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,9 @@ +_base_ = './deeplabv3_r50-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict( + pretrained='torchvision://resnet18', + backbone=dict(type='ResNet', depth=18), + decode_head=dict( + in_channels=512, + channels=128, + ), + auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/deeplabv3/deeplabv3_r18b-d8_4xb2-80k_cityscapes-769x769.py b/configs/deeplabv3/deeplabv3_r18b-d8_4xb2-80k_cityscapes-769x769.py new file mode 100644 index 0000000000..6506abf696 --- /dev/null +++ b/configs/deeplabv3/deeplabv3_r18b-d8_4xb2-80k_cityscapes-769x769.py @@ -0,0 +1,9 @@ +_base_ = './deeplabv3_r50-d8_4xb2-80k_cityscapes-769x769.py' +model = dict( + pretrained='torchvision://resnet18', + backbone=dict(type='ResNet', depth=18), + decode_head=dict( + in_channels=512, + channels=128, + ), + auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/deeplabv3/deeplabv3_r18b-d8_512x1024_80k_cityscapes.py b/configs/deeplabv3/deeplabv3_r18b-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index b25e725ed9..0000000000 --- a/configs/deeplabv3/deeplabv3_r18b-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './deeplabv3_r50-d8_512x1024_80k_cityscapes.py' -model = dict( - pretrained='torchvision://resnet18', - backbone=dict(type='ResNet', depth=18), - decode_head=dict( - in_channels=512, - channels=128, - ), - auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/deeplabv3/deeplabv3_r18b-d8_769x769_80k_cityscapes.py b/configs/deeplabv3/deeplabv3_r18b-d8_769x769_80k_cityscapes.py deleted file mode 100644 index fd920f0ca7..0000000000 --- a/configs/deeplabv3/deeplabv3_r18b-d8_769x769_80k_cityscapes.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './deeplabv3_r50-d8_769x769_80k_cityscapes.py' -model = dict( - pretrained='torchvision://resnet18', - backbone=dict(type='ResNet', depth=18), - decode_head=dict( - in_channels=512, - channels=128, - ), - auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/deeplabv3/deeplabv3_r50-d8_512x1024_40k_cityscapes.py b/configs/deeplabv3/deeplabv3_r50-d8_4xb2-40k_cityscapes-512x1024.py similarity index 100% rename from configs/deeplabv3/deeplabv3_r50-d8_512x1024_40k_cityscapes.py rename to configs/deeplabv3/deeplabv3_r50-d8_4xb2-40k_cityscapes-512x1024.py diff --git a/configs/deeplabv3/deeplabv3_r50-d8_769x769_40k_cityscapes.py b/configs/deeplabv3/deeplabv3_r50-d8_4xb2-40k_cityscapes-769x769.py similarity index 100% rename from configs/deeplabv3/deeplabv3_r50-d8_769x769_40k_cityscapes.py rename to configs/deeplabv3/deeplabv3_r50-d8_4xb2-40k_cityscapes-769x769.py diff --git a/configs/deeplabv3/deeplabv3_r50-d8_512x1024_80k_cityscapes.py 
b/configs/deeplabv3/deeplabv3_r50-d8_4xb2-80k_cityscapes-512x1024.py similarity index 100% rename from configs/deeplabv3/deeplabv3_r50-d8_512x1024_80k_cityscapes.py rename to configs/deeplabv3/deeplabv3_r50-d8_4xb2-80k_cityscapes-512x1024.py diff --git a/configs/deeplabv3/deeplabv3_r50-d8_769x769_80k_cityscapes.py b/configs/deeplabv3/deeplabv3_r50-d8_4xb2-80k_cityscapes-769x769.py similarity index 100% rename from configs/deeplabv3/deeplabv3_r50-d8_769x769_80k_cityscapes.py rename to configs/deeplabv3/deeplabv3_r50-d8_4xb2-80k_cityscapes-769x769.py diff --git a/configs/deeplabv3/deeplabv3_r50-d8_512x512_160k_ade20k.py b/configs/deeplabv3/deeplabv3_r50-d8_4xb4-160k_ade20k-512x512.py similarity index 100% rename from configs/deeplabv3/deeplabv3_r50-d8_512x512_160k_ade20k.py rename to configs/deeplabv3/deeplabv3_r50-d8_4xb4-160k_ade20k-512x512.py diff --git a/configs/deeplabv3/deeplabv3_r50-d8_512x512_4x4_160k_coco-stuff164k.py b/configs/deeplabv3/deeplabv3_r50-d8_4xb4-160k_coco-stuff164k-512x512.py similarity index 100% rename from configs/deeplabv3/deeplabv3_r50-d8_512x512_4x4_160k_coco-stuff164k.py rename to configs/deeplabv3/deeplabv3_r50-d8_4xb4-160k_coco-stuff164k-512x512.py diff --git a/configs/deeplabv3/deeplabv3_r50-d8_512x512_4x4_20k_coco-stuff10k.py b/configs/deeplabv3/deeplabv3_r50-d8_4xb4-20k_coco-stuff10k-512x512.py similarity index 100% rename from configs/deeplabv3/deeplabv3_r50-d8_512x512_4x4_20k_coco-stuff10k.py rename to configs/deeplabv3/deeplabv3_r50-d8_4xb4-20k_coco-stuff10k-512x512.py diff --git a/configs/deeplabv3/deeplabv3_r50-d8_512x512_20k_voc12aug.py b/configs/deeplabv3/deeplabv3_r50-d8_4xb4-20k_voc12aug-512x512.py similarity index 100% rename from configs/deeplabv3/deeplabv3_r50-d8_512x512_20k_voc12aug.py rename to configs/deeplabv3/deeplabv3_r50-d8_4xb4-20k_voc12aug-512x512.py diff --git a/configs/deeplabv3/deeplabv3_r50-d8_512x512_4x4_320k_coco-stuff164k.py b/configs/deeplabv3/deeplabv3_r50-d8_4xb4-320k_coco-stuff164k-512x512.py similarity index 100% rename from configs/deeplabv3/deeplabv3_r50-d8_512x512_4x4_320k_coco-stuff164k.py rename to configs/deeplabv3/deeplabv3_r50-d8_4xb4-320k_coco-stuff164k-512x512.py diff --git a/configs/deeplabv3/deeplabv3_r50-d8_512x512_4x4_40k_coco-stuff10k.py b/configs/deeplabv3/deeplabv3_r50-d8_4xb4-40k_coco-stuff10k-512x512.py similarity index 100% rename from configs/deeplabv3/deeplabv3_r50-d8_512x512_4x4_40k_coco-stuff10k.py rename to configs/deeplabv3/deeplabv3_r50-d8_4xb4-40k_coco-stuff10k-512x512.py diff --git a/configs/deeplabv3/deeplabv3_r50-d8_480x480_40k_pascal_context.py b/configs/deeplabv3/deeplabv3_r50-d8_4xb4-40k_pascal-context-480x480.py similarity index 100% rename from configs/deeplabv3/deeplabv3_r50-d8_480x480_40k_pascal_context.py rename to configs/deeplabv3/deeplabv3_r50-d8_4xb4-40k_pascal-context-480x480.py diff --git a/configs/deeplabv3/deeplabv3_r50-d8_480x480_40k_pascal_context_59.py b/configs/deeplabv3/deeplabv3_r50-d8_4xb4-40k_pascal-context-59-480x480.py similarity index 100% rename from configs/deeplabv3/deeplabv3_r50-d8_480x480_40k_pascal_context_59.py rename to configs/deeplabv3/deeplabv3_r50-d8_4xb4-40k_pascal-context-59-480x480.py diff --git a/configs/deeplabv3/deeplabv3_r50-d8_512x512_40k_voc12aug.py b/configs/deeplabv3/deeplabv3_r50-d8_4xb4-40k_voc12aug-512x512.py similarity index 100% rename from configs/deeplabv3/deeplabv3_r50-d8_512x512_40k_voc12aug.py rename to configs/deeplabv3/deeplabv3_r50-d8_4xb4-40k_voc12aug-512x512.py diff --git a/configs/deeplabv3/deeplabv3_r50-d8_512x512_80k_ade20k.py 
b/configs/deeplabv3/deeplabv3_r50-d8_4xb4-80k_ade20k-512x512.py similarity index 100% rename from configs/deeplabv3/deeplabv3_r50-d8_512x512_80k_ade20k.py rename to configs/deeplabv3/deeplabv3_r50-d8_4xb4-80k_ade20k-512x512.py diff --git a/configs/deeplabv3/deeplabv3_r50-d8_512x512_4x4_80k_coco-stuff164k.py b/configs/deeplabv3/deeplabv3_r50-d8_4xb4-80k_coco-stuff164k-512x512.py similarity index 100% rename from configs/deeplabv3/deeplabv3_r50-d8_512x512_4x4_80k_coco-stuff164k.py rename to configs/deeplabv3/deeplabv3_r50-d8_4xb4-80k_coco-stuff164k-512x512.py diff --git a/configs/deeplabv3/deeplabv3_r50-d8_480x480_80k_pascal_context.py b/configs/deeplabv3/deeplabv3_r50-d8_4xb4-80k_pascal-context-480x480.py similarity index 100% rename from configs/deeplabv3/deeplabv3_r50-d8_480x480_80k_pascal_context.py rename to configs/deeplabv3/deeplabv3_r50-d8_4xb4-80k_pascal-context-480x480.py diff --git a/configs/deeplabv3/deeplabv3_r50-d8_480x480_80k_pascal_context_59.py b/configs/deeplabv3/deeplabv3_r50-d8_4xb4-80k_pascal-context-59-480x480.py similarity index 100% rename from configs/deeplabv3/deeplabv3_r50-d8_480x480_80k_pascal_context_59.py rename to configs/deeplabv3/deeplabv3_r50-d8_4xb4-80k_pascal-context-59-480x480.py diff --git a/configs/deeplabv3/deeplabv3_r50b-d8_4xb2-80k_cityscapes-512x1024.py b/configs/deeplabv3/deeplabv3_r50b-d8_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..818519f263 --- /dev/null +++ b/configs/deeplabv3/deeplabv3_r50b-d8_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './deeplabv3_r50-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict(pretrained='torchvision://resnet50', backbone=dict(type='ResNet')) diff --git a/configs/deeplabv3/deeplabv3_r50b-d8_4xb2-80k_cityscapes-769x769.py b/configs/deeplabv3/deeplabv3_r50b-d8_4xb2-80k_cityscapes-769x769.py new file mode 100644 index 0000000000..07a234be75 --- /dev/null +++ b/configs/deeplabv3/deeplabv3_r50b-d8_4xb2-80k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './deeplabv3_r50-d8_4xb2-80k_cityscapes-769x769.py' +model = dict(pretrained='torchvision://resnet50', backbone=dict(type='ResNet')) diff --git a/configs/deeplabv3/deeplabv3_r50b-d8_512x1024_80k_cityscapes.py b/configs/deeplabv3/deeplabv3_r50b-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index e742d9a5ec..0000000000 --- a/configs/deeplabv3/deeplabv3_r50b-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './deeplabv3_r50-d8_512x1024_80k_cityscapes.py' -model = dict(pretrained='torchvision://resnet50', backbone=dict(type='ResNet')) diff --git a/configs/deeplabv3/deeplabv3_r50b-d8_769x769_80k_cityscapes.py b/configs/deeplabv3/deeplabv3_r50b-d8_769x769_80k_cityscapes.py deleted file mode 100644 index 332d9cfb79..0000000000 --- a/configs/deeplabv3/deeplabv3_r50b-d8_769x769_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './deeplabv3_r50-d8_769x769_80k_cityscapes.py' -model = dict(pretrained='torchvision://resnet50', backbone=dict(type='ResNet')) diff --git a/configs/deeplabv3/metafile.yaml b/configs/deeplabv3/metafile.yaml new file mode 100644 index 0000000000..650f7d695d --- /dev/null +++ b/configs/deeplabv3/metafile.yaml @@ -0,0 +1,985 @@ +Collections: +- Name: DeepLabV3 + License: Apache License 2.0 + Metadata: + Training Data: + - Cityscapes + - ADE20K + - Pascal VOC 2012 + Aug + - Pascal Context + - Pascal Context 59 + - COCO-Stuff 10k + - COCO-Stuff 164k + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + README: 
configs/deeplabv3/README.md + Frameworks: + - PyTorch +Models: +- Name: deeplabv3_r50-d8_4xb2-40k_cityscapes-512x1024 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.09 + mIoU(ms+flip): 80.45 + Config: configs/deeplabv3/deeplabv3_r50-d8_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 6.1 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x1024_40k_cityscapes/deeplabv3_r50-d8_512x1024_40k_cityscapes_20200605_022449-acadc2f8.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x1024_40k_cityscapes/deeplabv3_r50-d8_512x1024_40k_cityscapes_20200605_022449.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r101-d8_4xb2-40k_cityscapes-512x1024 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 77.12 + mIoU(ms+flip): 79.61 + Config: configs/deeplabv3/deeplabv3_r101-d8_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 9.6 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x1024_40k_cityscapes/deeplabv3_r101-d8_512x1024_40k_cityscapes_20200605_012241-7fd3f799.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x1024_40k_cityscapes/deeplabv3_r101-d8_512x1024_40k_cityscapes_20200605_012241.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r50-d8_4xb2-40k_cityscapes-769x769 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.58 + mIoU(ms+flip): 79.89 + Config: configs/deeplabv3/deeplabv3_r50-d8_4xb2-40k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 6.9 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_769x769_40k_cityscapes/deeplabv3_r50-d8_769x769_40k_cityscapes_20200606_113723-7eda553c.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_769x769_40k_cityscapes/deeplabv3_r50-d8_769x769_40k_cityscapes_20200606_113723.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r101-d8_4xb2-40k_cityscapes-769x769 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.27 + mIoU(ms+flip): 80.11 + Config: configs/deeplabv3/deeplabv3_r101-d8_4xb2-40k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - DeepLabV3 + 
Training Resources: 4x V100 GPUS + Memory (GB): 10.9 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_769x769_40k_cityscapes/deeplabv3_r101-d8_769x769_40k_cityscapes_20200606_113809-c64f889f.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_769x769_40k_cityscapes/deeplabv3_r101-d8_769x769_40k_cityscapes_20200606_113809.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r18-d8_4xb2-80k_cityscapes-512x1024 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 76.7 + mIoU(ms+flip): 78.27 + Config: configs/deeplabv3/deeplabv3_r18-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-18-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 1.7 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r18-d8_512x1024_80k_cityscapes/deeplabv3_r18-d8_512x1024_80k_cityscapes_20201225_021506-23dffbe2.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r18-d8_512x1024_80k_cityscapes/deeplabv3_r18-d8_512x1024_80k_cityscapes-20201225_021506.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r50-d8_4xb2-80k_cityscapes-512x1024 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.32 + mIoU(ms+flip): 80.57 + Config: configs/deeplabv3/deeplabv3_r50-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x1024_80k_cityscapes/deeplabv3_r50-d8_512x1024_80k_cityscapes_20200606_113404-b92cfdd4.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x1024_80k_cityscapes/deeplabv3_r50-d8_512x1024_80k_cityscapes_20200606_113404.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r101-d8_4xb2-80k_cityscapes-512x1024 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 80.2 + mIoU(ms+flip): 81.21 + Config: configs/deeplabv3/deeplabv3_r101-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x1024_80k_cityscapes/deeplabv3_r101-d8_512x1024_80k_cityscapes_20200606_113503-9e428899.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x1024_80k_cityscapes/deeplabv3_r101-d8_512x1024_80k_cityscapes_20200606_113503.log.json + Paper: + Title: Rethinking atrous convolution 
for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r101-d8_4xb2-amp-80k_cityscapes-512x1024 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 80.48 + Config: configs/deeplabv3/deeplabv3_r101-d8_4xb2-amp-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - DeepLabV3 + - (FP16) + Training Resources: 4x V100 GPUS + Memory (GB): 5.75 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_fp16_512x1024_80k_cityscapes/deeplabv3_r101-d8_fp16_512x1024_80k_cityscapes_20200717_230920-774d9cec.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_fp16_512x1024_80k_cityscapes/deeplabv3_r101-d8_fp16_512x1024_80k_cityscapes_20200717_230920.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r18-d8_4xb2-80k_cityscapes-769x769 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 76.6 + mIoU(ms+flip): 78.26 + Config: configs/deeplabv3/deeplabv3_r18-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-18-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 1.9 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r18-d8_769x769_80k_cityscapes/deeplabv3_r18-d8_769x769_80k_cityscapes_20201225_021506-6452126a.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r18-d8_769x769_80k_cityscapes/deeplabv3_r18-d8_769x769_80k_cityscapes-20201225_021506.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r50-d8_4xb2-80k_cityscapes-769x769 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.89 + mIoU(ms+flip): 81.06 + Config: configs/deeplabv3/deeplabv3_r50-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_769x769_80k_cityscapes/deeplabv3_r50-d8_769x769_80k_cityscapes_20200606_221338-788d6228.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_769x769_80k_cityscapes/deeplabv3_r50-d8_769x769_80k_cityscapes_20200606_221338.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r101-d8_4xb2-80k_cityscapes-769x769 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.67 + mIoU(ms+flip): 80.81 + Config: 
configs/deeplabv3/deeplabv3_r101-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_769x769_80k_cityscapes/deeplabv3_r101-d8_769x769_80k_cityscapes_20200607_013353-60e95418.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_769x769_80k_cityscapes/deeplabv3_r101-d8_769x769_80k_cityscapes_20200607_013353.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r101-d16-mg124_4xb2-40k_cityscapes-512x1024 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 76.71 + mIoU(ms+flip): 78.63 + Config: configs/deeplabv3/deeplabv3_r101-d16-mg124_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D16-MG124 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 4.7 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d16-mg124_512x1024_40k_cityscapes/deeplabv3_r101-d16-mg124_512x1024_40k_cityscapes_20200908_005644-67b0c992.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d16-mg124_512x1024_40k_cityscapes/deeplabv3_r101-d16-mg124_512x1024_40k_cityscapes-20200908_005644.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r101-d16-mg124_4xb2-80k_cityscapes-512x1024 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.36 + mIoU(ms+flip): 79.84 + Config: configs/deeplabv3/deeplabv3_r101-d16-mg124_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D16-MG124 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d16-mg124_512x1024_80k_cityscapes/deeplabv3_r101-d16-mg124_512x1024_80k_cityscapes_20200908_005644-57bb8425.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d16-mg124_512x1024_80k_cityscapes/deeplabv3_r101-d16-mg124_512x1024_80k_cityscapes-20200908_005644.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r18b-d8_4xb2-80k_cityscapes-512x1024 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 76.26 + mIoU(ms+flip): 77.88 + Config: configs/deeplabv3/deeplabv3_r18b-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-18b-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 1.6 + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r18b-d8_512x1024_80k_cityscapes/deeplabv3_r18b-d8_512x1024_80k_cityscapes_20201225_094144-46040cef.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r18b-d8_512x1024_80k_cityscapes/deeplabv3_r18b-d8_512x1024_80k_cityscapes-20201225_094144.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r50b-d8_4xb2-80k_cityscapes-512x1024 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.63 + mIoU(ms+flip): 80.98 + Config: configs/deeplabv3/deeplabv3_r50b-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50b-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 6.0 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50b-d8_512x1024_80k_cityscapes/deeplabv3_r50b-d8_512x1024_80k_cityscapes_20201225_155148-ec368954.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50b-d8_512x1024_80k_cityscapes/deeplabv3_r50b-d8_512x1024_80k_cityscapes-20201225_155148.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r101b-d8_4xb2-80k_cityscapes-512x1024 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 80.01 + mIoU(ms+flip): 81.21 + Config: configs/deeplabv3/deeplabv3_r101b-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101b-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 9.5 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101b-d8_512x1024_80k_cityscapes/deeplabv3_r101b-d8_512x1024_80k_cityscapes_20201226_171821-8fd49503.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101b-d8_512x1024_80k_cityscapes/deeplabv3_r101b-d8_512x1024_80k_cityscapes-20201226_171821.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r18b-d8_4xb2-80k_cityscapes-769x769 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 75.63 + mIoU(ms+flip): 77.51 + Config: configs/deeplabv3/deeplabv3_r18b-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-18b-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 1.8 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r18b-d8_769x769_80k_cityscapes/deeplabv3_r18b-d8_769x769_80k_cityscapes_20201225_094144-fdc985d9.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r18b-d8_769x769_80k_cityscapes/deeplabv3_r18b-d8_769x769_80k_cityscapes-20201225_094144.log.json + Paper: + Title: Rethinking atrous convolution for 
semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r50b-d8_4xb2-80k_cityscapes-769x769 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.8 + mIoU(ms+flip): 80.27 + Config: configs/deeplabv3/deeplabv3_r50b-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50b-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 6.8 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50b-d8_769x769_80k_cityscapes/deeplabv3_r50b-d8_769x769_80k_cityscapes_20201225_155404-87fb0cf4.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50b-d8_769x769_80k_cityscapes/deeplabv3_r50b-d8_769x769_80k_cityscapes-20201225_155404.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r101b-d8_4xb2-80k_cityscapes-769x769 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.41 + mIoU(ms+flip): 80.73 + Config: configs/deeplabv3/deeplabv3_r101b-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101b-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 10.7 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101b-d8_769x769_80k_cityscapes/deeplabv3_r101b-d8_769x769_80k_cityscapes_20201226_190843-9142ee57.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101b-d8_769x769_80k_cityscapes/deeplabv3_r101b-d8_769x769_80k_cityscapes-20201226_190843.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r50-d8_4xb4-80k_ade20k-512x512 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 42.42 + mIoU(ms+flip): 43.28 + Config: configs/deeplabv3/deeplabv3_r50-d8_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 8.9 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_80k_ade20k/deeplabv3_r50-d8_512x512_80k_ade20k_20200614_185028-0bb3f844.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_80k_ade20k/deeplabv3_r50-d8_512x512_80k_ade20k_20200614_185028.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r101-d8_4xb4-80k_ade20k-512x512 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 44.08 + mIoU(ms+flip): 45.19 + Config: configs/deeplabv3/deeplabv3_r101-d8_4xb4-80k_ade20k-512x512.py + 
Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-101-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 12.4 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_80k_ade20k/deeplabv3_r101-d8_512x512_80k_ade20k_20200615_021256-d89c7fa4.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_80k_ade20k/deeplabv3_r101-d8_512x512_80k_ade20k_20200615_021256.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r50-d8_4xb4-160k_ade20k-512x512 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 42.66 + mIoU(ms+flip): 44.09 + Config: configs/deeplabv3/deeplabv3_r50-d8_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_160k_ade20k/deeplabv3_r50-d8_512x512_160k_ade20k_20200615_123227-5d0ee427.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_160k_ade20k/deeplabv3_r50-d8_512x512_160k_ade20k_20200615_123227.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r101-d8_4xb4-160k_ade20k-512x512 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 45.0 + mIoU(ms+flip): 46.66 + Config: configs/deeplabv3/deeplabv3_r101-d8_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-101-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_160k_ade20k/deeplabv3_r101-d8_512x512_160k_ade20k_20200615_105816-b1f72b3b.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_160k_ade20k/deeplabv3_r101-d8_512x512_160k_ade20k_20200615_105816.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r50-d8_4xb4-20k_voc12aug-512x512 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 76.17 + mIoU(ms+flip): 77.42 + Config: configs/deeplabv3/deeplabv3_r50-d8_4xb4-20k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-50-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 6.1 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_20k_voc12aug/deeplabv3_r50-d8_512x512_20k_voc12aug_20200617_010906-596905ef.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_20k_voc12aug/deeplabv3_r50-d8_512x512_20k_voc12aug_20200617_010906.log.json + Paper: + Title: 
Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r101-d8_4xb4-20k_voc12aug-512x512 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 78.7 + mIoU(ms+flip): 79.95 + Config: configs/deeplabv3/deeplabv3_r101-d8_4xb4-20k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-101-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 9.6 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_20k_voc12aug/deeplabv3_r101-d8_512x512_20k_voc12aug_20200617_010932-8d13832f.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_20k_voc12aug/deeplabv3_r101-d8_512x512_20k_voc12aug_20200617_010932.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r50-d8_4xb4-40k_voc12aug-512x512 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 77.68 + mIoU(ms+flip): 78.78 + Config: configs/deeplabv3/deeplabv3_r50-d8_4xb4-40k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-50-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_40k_voc12aug/deeplabv3_r50-d8_512x512_40k_voc12aug_20200613_161546-2ae96e7e.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_40k_voc12aug/deeplabv3_r50-d8_512x512_40k_voc12aug_20200613_161546.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r101-d8_4xb4-40k_voc12aug-512x512 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 77.92 + mIoU(ms+flip): 79.18 + Config: configs/deeplabv3/deeplabv3_r101-d8_4xb4-40k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-101-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_40k_voc12aug/deeplabv3_r101-d8_512x512_40k_voc12aug_20200613_161432-0017d784.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_40k_voc12aug/deeplabv3_r101-d8_512x512_40k_voc12aug_20200613_161432.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r101-d8_4xb4-40k_pascal-context-480x480 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: Pascal Context + Metrics: + mIoU: 46.55 + mIoU(ms+flip): 47.81 + Config: 
configs/deeplabv3/deeplabv3_r101-d8_4xb4-40k_pascal-context-480x480.py + Metadata: + Training Data: Pascal Context + Batch Size: 16 + Architecture: + - R-101-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 9.2 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_480x480_40k_pascal_context/deeplabv3_r101-d8_480x480_40k_pascal_context_20200911_204118-1aa27336.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_480x480_40k_pascal_context/deeplabv3_r101-d8_480x480_40k_pascal_context-20200911_204118.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r101-d8_4xb4-80k_pascal-context-480x480 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: Pascal Context + Metrics: + mIoU: 46.42 + mIoU(ms+flip): 47.53 + Config: configs/deeplabv3/deeplabv3_r101-d8_4xb4-80k_pascal-context-480x480.py + Metadata: + Training Data: Pascal Context + Batch Size: 16 + Architecture: + - R-101-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_480x480_80k_pascal_context/deeplabv3_r101-d8_480x480_80k_pascal_context_20200911_170155-2a21fff3.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_480x480_80k_pascal_context/deeplabv3_r101-d8_480x480_80k_pascal_context-20200911_170155.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r101-d8_4xb4-40k_pascal-context-59-480x480 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: Pascal Context 59 + Metrics: + mIoU: 52.61 + mIoU(ms+flip): 54.28 + Config: configs/deeplabv3/deeplabv3_r101-d8_4xb4-40k_pascal-context-59-480x480.py + Metadata: + Training Data: Pascal Context 59 + Batch Size: 16 + Architecture: + - R-101-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_480x480_40k_pascal_context_59/deeplabv3_r101-d8_480x480_40k_pascal_context_59_20210416_110332-cb08ea46.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_480x480_40k_pascal_context_59/deeplabv3_r101-d8_480x480_40k_pascal_context_59-20210416_110332.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r101-d8_4xb4-80k_pascal-context-59-480x480 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: Pascal Context 59 + Metrics: + mIoU: 52.46 + mIoU(ms+flip): 54.09 + Config: configs/deeplabv3/deeplabv3_r101-d8_4xb4-80k_pascal-context-59-480x480.py + Metadata: + Training Data: Pascal Context 59 + Batch Size: 16 + Architecture: + - R-101-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_480x480_80k_pascal_context_59/deeplabv3_r101-d8_480x480_80k_pascal_context_59_20210416_113002-26303993.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_480x480_80k_pascal_context_59/deeplabv3_r101-d8_480x480_80k_pascal_context_59-20210416_113002.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r50-d8_4xb4-20k_coco-stuff10k-512x512 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: COCO-Stuff 10k + Metrics: + mIoU: 34.66 + mIoU(ms+flip): 36.08 + Config: configs/deeplabv3/deeplabv3_r50-d8_4xb4-20k_coco-stuff10k-512x512.py + Metadata: + Training Data: COCO-Stuff 10k + Batch Size: 16 + Architecture: + - R-50-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 9.6 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_4x4_20k_coco-stuff10k/deeplabv3_r50-d8_512x512_4x4_20k_coco-stuff10k_20210821_043025-b35f789d.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_4x4_20k_coco-stuff10k/deeplabv3_r50-d8_512x512_4x4_20k_coco-stuff10k_20210821_043025.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r101-d8_4xb4-20k_coco-stuff10k-512x512 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: COCO-Stuff 10k + Metrics: + mIoU: 37.3 + mIoU(ms+flip): 38.42 + Config: configs/deeplabv3/deeplabv3_r101-d8_4xb4-20k_coco-stuff10k-512x512.py + Metadata: + Training Data: COCO-Stuff 10k + Batch Size: 16 + Architecture: + - R-101-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 13.2 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_4x4_20k_coco-stuff10k/deeplabv3_r101-d8_512x512_4x4_20k_coco-stuff10k_20210821_043025-c49752cb.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_4x4_20k_coco-stuff10k/deeplabv3_r101-d8_512x512_4x4_20k_coco-stuff10k_20210821_043025.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r50-d8_4xb4-40k_coco-stuff10k-512x512 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: COCO-Stuff 10k + Metrics: + mIoU: 35.73 + mIoU(ms+flip): 37.09 + Config: configs/deeplabv3/deeplabv3_r50-d8_4xb4-40k_coco-stuff10k-512x512.py + Metadata: + Training Data: COCO-Stuff 10k + Batch Size: 16 + Architecture: + - R-50-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_4x4_40k_coco-stuff10k/deeplabv3_r50-d8_512x512_4x4_40k_coco-stuff10k_20210821_043305-dc76f3ff.pth + Training log: 
https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_4x4_40k_coco-stuff10k/deeplabv3_r50-d8_512x512_4x4_40k_coco-stuff10k_20210821_043305.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r101-d8_4xb4-40k_coco-stuff10k-512x512 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: COCO-Stuff 10k + Metrics: + mIoU: 37.81 + mIoU(ms+flip): 38.8 + Config: configs/deeplabv3/deeplabv3_r101-d8_4xb4-40k_coco-stuff10k-512x512.py + Metadata: + Training Data: COCO-Stuff 10k + Batch Size: 16 + Architecture: + - R-101-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_4x4_40k_coco-stuff10k/deeplabv3_r101-d8_512x512_4x4_40k_coco-stuff10k_20210821_043305-636cb433.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_4x4_40k_coco-stuff10k/deeplabv3_r101-d8_512x512_4x4_40k_coco-stuff10k_20210821_043305.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r50-d8_4xb4-80k_coco-stuff164k-512x512 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: COCO-Stuff 164k + Metrics: + mIoU: 39.38 + mIoU(ms+flip): 40.03 + Config: configs/deeplabv3/deeplabv3_r50-d8_4xb4-80k_coco-stuff164k-512x512.py + Metadata: + Training Data: COCO-Stuff 164k + Batch Size: 16 + Architecture: + - R-50-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 9.6 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_4x4_80k_coco-stuff164k/deeplabv3_r50-d8_512x512_4x4_80k_coco-stuff164k_20210709_163016-88675c24.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_4x4_80k_coco-stuff164k/deeplabv3_r50-d8_512x512_4x4_80k_coco-stuff164k_20210709_163016.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r101-d8_4xb4-80k_coco-stuff164k-512x512 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: COCO-Stuff 164k + Metrics: + mIoU: 40.87 + mIoU(ms+flip): 41.5 + Config: configs/deeplabv3/deeplabv3_r101-d8_4xb4-80k_coco-stuff164k-512x512.py + Metadata: + Training Data: COCO-Stuff 164k + Batch Size: 16 + Architecture: + - R-101-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 13.2 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_4x4_80k_coco-stuff164k/deeplabv3_r101-d8_512x512_4x4_80k_coco-stuff164k_20210709_201252-13600dc2.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_4x4_80k_coco-stuff164k/deeplabv3_r101-d8_512x512_4x4_80k_coco-stuff164k_20210709_201252.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: 
https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r50-d8_4xb4-160k_coco-stuff164k-512x512 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: COCO-Stuff 164k + Metrics: + mIoU: 41.09 + mIoU(ms+flip): 41.69 + Config: configs/deeplabv3/deeplabv3_r50-d8_4xb4-160k_coco-stuff164k-512x512.py + Metadata: + Training Data: COCO-Stuff 164k + Batch Size: 16 + Architecture: + - R-50-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_4x4_160k_coco-stuff164k/deeplabv3_r50-d8_512x512_4x4_160k_coco-stuff164k_20210709_163016-49f2812b.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_4x4_160k_coco-stuff164k/deeplabv3_r50-d8_512x512_4x4_160k_coco-stuff164k_20210709_163016.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r101-d8_4xb4-160k_coco-stuff164k-512x512 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: COCO-Stuff 164k + Metrics: + mIoU: 41.82 + mIoU(ms+flip): 42.49 + Config: configs/deeplabv3/deeplabv3_r101-d8_4xb4-160k_coco-stuff164k-512x512.py + Metadata: + Training Data: COCO-Stuff 164k + Batch Size: 16 + Architecture: + - R-101-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_4x4_160k_coco-stuff164k/deeplabv3_r101-d8_512x512_4x4_160k_coco-stuff164k_20210709_155402-f035acfd.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_4x4_160k_coco-stuff164k/deeplabv3_r101-d8_512x512_4x4_160k_coco-stuff164k_20210709_155402.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r50-d8_4xb4-320k_coco-stuff164k-512x512 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: COCO-Stuff 164k + Metrics: + mIoU: 41.37 + mIoU(ms+flip): 42.22 + Config: configs/deeplabv3/deeplabv3_r50-d8_4xb4-320k_coco-stuff164k-512x512.py + Metadata: + Training Data: COCO-Stuff 164k + Batch Size: 16 + Architecture: + - R-50-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_4x4_320k_coco-stuff164k/deeplabv3_r50-d8_512x512_4x4_320k_coco-stuff164k_20210709_155403-51b21115.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_4x4_320k_coco-stuff164k/deeplabv3_r50-d8_512x512_4x4_320k_coco-stuff164k_20210709_155403.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch +- Name: deeplabv3_r101-d8_4xb4-320k_coco-stuff164k-512x512 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: COCO-Stuff 164k + Metrics: + mIoU: 42.61 + mIoU(ms+flip): 43.42 + Config: 
configs/deeplabv3/deeplabv3_r101-d8_4xb4-320k_coco-stuff164k-512x512.py + Metadata: + Training Data: COCO-Stuff 164k + Batch Size: 16 + Architecture: + - R-101-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_4x4_320k_coco-stuff164k/deeplabv3_r101-d8_512x512_4x4_320k_coco-stuff164k_20210709_155402-3cbca14d.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r101-d8_512x512_4x4_320k_coco-stuff164k/deeplabv3_r101-d8_512x512_4x4_320k_coco-stuff164k_20210709_155402.log.json + Paper: + Title: Rethinking atrous convolution for semantic image segmentation + URL: https://arxiv.org/abs/1706.05587 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/aspp_head.py#L54 + Framework: PyTorch diff --git a/configs/deeplabv3plus/README.md b/configs/deeplabv3plus/README.md index 86b8bfb43d..04d01fa512 100644 --- a/configs/deeplabv3plus/README.md +++ b/configs/deeplabv3plus/README.md @@ -1,6 +1,6 @@ # DeepLabV3+ -[Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation](https://arxiv.org/abs/1802.02611) +> [Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation](https://arxiv.org/abs/1802.02611) ## Introduction @@ -22,107 +22,102 @@ Spatial pyramid pooling module or encode-decoder structure are used in deep neur
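The model-index entries above are plain YAML following the OpenMMLab model-index schema, so they can be queried programmatically rather than scanned by hand. Below is a minimal sketch, assuming PyYAML is installed and the metafile is saved as `configs/deeplabv3/metafile.yaml` (the path, and whether entries sit at the top level or under a `Models` key, are assumptions of this sketch):

```python
import yaml

# Load the model-index metafile; the path is an assumption for this sketch.
with open('configs/deeplabv3/metafile.yaml') as f:
    index = yaml.safe_load(f)

# Some metafiles nest entries under a 'Models' key; tolerate both layouts.
models = index['Models'] if isinstance(index, dict) and 'Models' in index else index

def single_scale_miou(entry, dataset):
    """Return the entry's single-scale mIoU on `dataset`, or None."""
    results = entry.get('Results', [])
    if isinstance(results, dict):  # a lone mapping instead of a list
        results = [results]
    for res in results:
        if res.get('Dataset') == dataset:
            return res.get('Metrics', {}).get('mIoU')
    return None

# Rank the Cityscapes checkpoints listed above by reported single-scale mIoU.
scored = [(single_scale_miou(m, 'Cityscapes'), m) for m in models]
best_miou, best = max(
    (s for s in scored if s[0] is not None), key=lambda s: s[0])
print(best['Name'], best_miou)
print(best['Weights'])  # URL of the released checkpoint
```

The same fields (`Config`, `Weights`, `Training log`) are what back the config/model/log links in the README tables that follow.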
-## Citation - -```bibtex -@inproceedings{deeplabv3plus2018, - title={Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation}, - author={Liang-Chieh Chen and Yukun Zhu and George Papandreou and Florian Schroff and Hartwig Adam}, - booktitle={ECCV}, - year={2018} -} -``` - ## Results and models ### Cityscapes -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ----------------- | --------------- | --------- | ------: | -------- | -------------- | ----: | ------------: | -------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| DeepLabV3+ | R-50-D8 | 512x1024 | 40000 | 7.5 | 3.94 | 79.61 | 81.01 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r50-d8_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x1024_40k_cityscapes/deeplabv3plus_r50-d8_512x1024_40k_cityscapes_20200605_094610-d222ffcd.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x1024_40k_cityscapes/deeplabv3plus_r50-d8_512x1024_40k_cityscapes_20200605_094610.log.json) | -| DeepLabV3+ | R-101-D8 | 512x1024 | 40000 | 11 | 2.60 | 80.21 | 81.82 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r101-d8_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x1024_40k_cityscapes/deeplabv3plus_r101-d8_512x1024_40k_cityscapes_20200605_094614-3769eecf.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x1024_40k_cityscapes/deeplabv3plus_r101-d8_512x1024_40k_cityscapes_20200605_094614.log.json) | -| DeepLabV3+ | R-50-D8 | 769x769 | 40000 | 8.5 | 1.72 | 78.97 | 80.46 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r50-d8_769x769_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_769x769_40k_cityscapes/deeplabv3plus_r50-d8_769x769_40k_cityscapes_20200606_114143-1dcb0e3c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_769x769_40k_cityscapes/deeplabv3plus_r50-d8_769x769_40k_cityscapes_20200606_114143.log.json) | -| DeepLabV3+ | R-101-D8 | 769x769 | 40000 | 12.5 | 1.15 | 79.46 | 80.50 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r101-d8_769x769_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_769x769_40k_cityscapes/deeplabv3plus_r101-d8_769x769_40k_cityscapes_20200606_114304-ff414b9e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_769x769_40k_cityscapes/deeplabv3plus_r101-d8_769x769_40k_cityscapes_20200606_114304.log.json) | -| DeepLabV3+ | 
R-18-D8 | 512x1024 | 80000 | 2.2 | 14.27 | 76.89 | 78.76 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r18-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_512x1024_80k_cityscapes/deeplabv3plus_r18-d8_512x1024_80k_cityscapes_20201226_080942-cff257fe.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_512x1024_80k_cityscapes/deeplabv3plus_r18-d8_512x1024_80k_cityscapes-20201226_080942.log.json) | -| DeepLabV3+ | R-50-D8 | 512x1024 | 80000 | - | - | 80.09 | 81.13 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r50-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x1024_80k_cityscapes/deeplabv3plus_r50-d8_512x1024_80k_cityscapes_20200606_114049-f9fb496d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x1024_80k_cityscapes/deeplabv3plus_r50-d8_512x1024_80k_cityscapes_20200606_114049.log.json) | -| DeepLabV3+ | R-101-D8 | 512x1024 | 80000 | - | - | 80.97 | 82.03 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r101-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x1024_80k_cityscapes/deeplabv3plus_r101-d8_512x1024_80k_cityscapes_20200606_114143-068fcfe9.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x1024_80k_cityscapes/deeplabv3plus_r101-d8_512x1024_80k_cityscapes_20200606_114143.log.json) | -| DeepLabV3+ (FP16) | R-101-D8 | 512x1024 | 80000 | 6.35 | 7.87 | 80.46 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r101-d8_fp16_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_fp16_512x1024_80k_cityscapes/deeplabv3plus_r101-d8_fp16_512x1024_80k_cityscapes_20200717_230920-f1104f4b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_fp16_512x1024_80k_cityscapes/deeplabv3plus_r101-d8_fp16_512x1024_80k_cityscapes_20200717_230920.log.json) | -| DeepLabV3+ | R-18-D8 | 769x769 | 80000 | 2.5 | 5.74 | 76.26 | 77.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r18-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_769x769_80k_cityscapes/deeplabv3plus_r18-d8_769x769_80k_cityscapes_20201226_083346-f326e06a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_769x769_80k_cityscapes/deeplabv3plus_r18-d8_769x769_80k_cityscapes-20201226_083346.log.json) | -| DeepLabV3+ | R-50-D8 | 769x769 | 80000 | - | - | 79.83 | 81.48 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r50-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_769x769_80k_cityscapes/deeplabv3plus_r50-d8_769x769_80k_cityscapes_20200606_210233-0e9dfdc4.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_769x769_80k_cityscapes/deeplabv3plus_r50-d8_769x769_80k_cityscapes_20200606_210233.log.json) | -| DeepLabV3+ | R-101-D8 | 769x769 | 80000 | - | - | 80.65 | 81.47 | [config\[1\]](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r101-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_769x769_80k_cityscapes/deeplabv3plus_r101-d8_769x769_80k_cityscapes_20220406_154720-dfcc0b68.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_769x769_80k_cityscapes/deeplabv3plus_r101-d8_769x769_80k_cityscapes_20220406_154720.log.json) | -| DeepLabV3+ | R-101-D16-MG124 | 512x1024 | 40000 | 5.8 | 7.48 | 79.09 | 80.36 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r101-d16-mg124_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d16-mg124_512x1024_40k_cityscapes/deeplabv3plus_r101-d16-mg124_512x1024_40k_cityscapes_20200908_005644-cf9ce186.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d16-mg124_512x1024_40k_cityscapes/deeplabv3plus_r101-d16-mg124_512x1024_40k_cityscapes-20200908_005644.log.json) | -| DeepLabV3+ | R-101-D16-MG124 | 512x1024 | 80000 | 9.9 | - | 79.90 | 81.33 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r101-d16-mg124_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d16-mg124_512x1024_80k_cityscapes/deeplabv3plus_r101-d16-mg124_512x1024_80k_cityscapes_20200908_005644-ee6158e0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d16-mg124_512x1024_80k_cityscapes/deeplabv3plus_r101-d16-mg124_512x1024_80k_cityscapes-20200908_005644.log.json) | -| DeepLabV3+ | R-18b-D8 | 512x1024 | 80000 | 2.1 | 14.95 | 75.87 | 77.52 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r18b-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18b-d8_512x1024_80k_cityscapes/deeplabv3plus_r18b-d8_512x1024_80k_cityscapes_20201226_090828-e451abd9.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18b-d8_512x1024_80k_cityscapes/deeplabv3plus_r18b-d8_512x1024_80k_cityscapes-20201226_090828.log.json) | -| DeepLabV3+ | R-50b-D8 | 512x1024 | 80000 | 7.4 | 3.94 | 80.28 | 81.44 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r50b-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50b-d8_512x1024_80k_cityscapes/deeplabv3plus_r50b-d8_512x1024_80k_cityscapes_20201225_213645-a97e4e43.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50b-d8_512x1024_80k_cityscapes/deeplabv3plus_r50b-d8_512x1024_80k_cityscapes-20201225_213645.log.json) | -| DeepLabV3+ | R-101b-D8 | 512x1024 | 80000 | 10.9 | 2.60 | 80.16 | 81.41 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r101b-d8_512x1024_80k_cityscapes.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101b-d8_512x1024_80k_cityscapes/deeplabv3plus_r101b-d8_512x1024_80k_cityscapes_20201226_190843-9c3c93a4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101b-d8_512x1024_80k_cityscapes/deeplabv3plus_r101b-d8_512x1024_80k_cityscapes-20201226_190843.log.json) | -| DeepLabV3+ | R-18b-D8 | 769x769 | 80000 | 2.4 | 5.96 | 76.36 | 78.24 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r18b-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18b-d8_769x769_80k_cityscapes/deeplabv3plus_r18b-d8_769x769_80k_cityscapes_20201226_151312-2c868aff.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18b-d8_769x769_80k_cityscapes/deeplabv3plus_r18b-d8_769x769_80k_cityscapes-20201226_151312.log.json) | -| DeepLabV3+ | R-50b-D8 | 769x769 | 80000 | 8.4 | 1.72 | 79.41 | 80.56 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r50b-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50b-d8_769x769_80k_cityscapes/deeplabv3plus_r50b-d8_769x769_80k_cityscapes_20201225_224655-8b596d1c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50b-d8_769x769_80k_cityscapes/deeplabv3plus_r50b-d8_769x769_80k_cityscapes-20201225_224655.log.json) | -| DeepLabV3+ | R-101b-D8 | 769x769 | 80000 | 12.3 | 1.10 | 79.88 | 81.46 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r101b-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101b-d8_769x769_80k_cityscapes/deeplabv3plus_r101b-d8_769x769_80k_cityscapes_20201226_205041-227cdf7c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101b-d8_769x769_80k_cityscapes/deeplabv3plus_r101b-d8_769x769_80k_cityscapes-20201226_205041.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ----------------- | --------------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ----------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| DeepLabV3+ | R-50-D8 | 512x1024 | 40000 | 7.5 | 3.94 | V100 | 79.61 | 81.01 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x1024_40k_cityscapes/deeplabv3plus_r50-d8_512x1024_40k_cityscapes_20200605_094610-d222ffcd.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x1024_40k_cityscapes/deeplabv3plus_r50-d8_512x1024_40k_cityscapes_20200605_094610.log.json) | +| DeepLabV3+ | R-101-D8 | 512x1024 | 40000 | 11 | 2.60 | V100 | 80.21 | 81.82 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x1024_40k_cityscapes/deeplabv3plus_r101-d8_512x1024_40k_cityscapes_20200605_094614-3769eecf.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x1024_40k_cityscapes/deeplabv3plus_r101-d8_512x1024_40k_cityscapes_20200605_094614.log.json) | +| DeepLabV3+ | R-50-D8 | 769x769 | 40000 | 8.5 | 1.72 | V100 | 78.97 | 80.46 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb2-40k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_769x769_40k_cityscapes/deeplabv3plus_r50-d8_769x769_40k_cityscapes_20200606_114143-1dcb0e3c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_769x769_40k_cityscapes/deeplabv3plus_r50-d8_769x769_40k_cityscapes_20200606_114143.log.json) | +| DeepLabV3+ | R-101-D8 | 769x769 | 40000 | 12.5 | 1.15 | V100 | 79.46 | 80.50 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb2-40k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_769x769_40k_cityscapes/deeplabv3plus_r101-d8_769x769_40k_cityscapes_20200606_114304-ff414b9e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_769x769_40k_cityscapes/deeplabv3plus_r101-d8_769x769_40k_cityscapes_20200606_114304.log.json) | +| DeepLabV3+ | R-18-D8 | 512x1024 | 80000 | 2.2 | 14.27 | V100 | 76.89 | 78.76 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r18-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_512x1024_80k_cityscapes/deeplabv3plus_r18-d8_512x1024_80k_cityscapes_20201226_080942-cff257fe.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_512x1024_80k_cityscapes/deeplabv3plus_r18-d8_512x1024_80k_cityscapes-20201226_080942.log.json) | +| DeepLabV3+ | R-50-D8 | 512x1024 | 80000 | - | - | V100 | 80.09 | 81.13 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x1024_80k_cityscapes/deeplabv3plus_r50-d8_512x1024_80k_cityscapes_20200606_114049-f9fb496d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x1024_80k_cityscapes/deeplabv3plus_r50-d8_512x1024_80k_cityscapes_20200606_114049.log.json) | +| DeepLabV3+ | R-101-D8 | 512x1024 | 80000 | - | - | V100 | 80.97 | 82.03 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb2-80k_cityscapes-512x1024.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x1024_80k_cityscapes/deeplabv3plus_r101-d8_512x1024_80k_cityscapes_20200606_114143-068fcfe9.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x1024_80k_cityscapes/deeplabv3plus_r101-d8_512x1024_80k_cityscapes_20200606_114143.log.json) | +| DeepLabV3+ (FP16) | R-101-D8 | 512x1024 | 80000 | 6.35 | 7.87 | V100 | 80.46 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb2-amp-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_fp16_512x1024_80k_cityscapes/deeplabv3plus_r101-d8_fp16_512x1024_80k_cityscapes_20200717_230920-f1104f4b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_fp16_512x1024_80k_cityscapes/deeplabv3plus_r101-d8_fp16_512x1024_80k_cityscapes_20200717_230920.log.json) | +| DeepLabV3+ | R-18-D8 | 769x769 | 80000 | 2.5 | 5.74 | V100 | 76.26 | 77.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r18-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_769x769_80k_cityscapes/deeplabv3plus_r18-d8_769x769_80k_cityscapes_20201226_083346-f326e06a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_769x769_80k_cityscapes/deeplabv3plus_r18-d8_769x769_80k_cityscapes-20201226_083346.log.json) | +| DeepLabV3+ | R-50-D8 | 769x769 | 80000 | - | - | V100 | 79.83 | 81.48 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_769x769_80k_cityscapes/deeplabv3plus_r50-d8_769x769_80k_cityscapes_20200606_210233-0e9dfdc4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_769x769_80k_cityscapes/deeplabv3plus_r50-d8_769x769_80k_cityscapes_20200606_210233.log.json) | +| DeepLabV3+ | R-101-D8 | 769x769 | 80000 | - | - | V100 | 80.65 | 81.47 | [config\[1\]](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_769x769_80k_cityscapes/deeplabv3plus_r101-d8_769x769_80k_cityscapes_20220406_154720-dfcc0b68.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_769x769_80k_cityscapes/deeplabv3plus_r101-d8_769x769_80k_cityscapes_20220406_154720.log.json) | +| DeepLabV3+ | R-101-D16-MG124 | 512x1024 | 40000 | 5.8 | 7.48 | V100 | 79.09 | 80.36 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r101-d16-mg124_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d16-mg124_512x1024_40k_cityscapes/deeplabv3plus_r101-d16-mg124_512x1024_40k_cityscapes_20200908_005644-cf9ce186.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d16-mg124_512x1024_40k_cityscapes/deeplabv3plus_r101-d16-mg124_512x1024_40k_cityscapes-20200908_005644.log.json) | +| DeepLabV3+ | R-101-D16-MG124 | 512x1024 | 80000 | 9.9 | 
- | V100 | 79.90 | 81.33 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r101-d16-mg124_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d16-mg124_512x1024_80k_cityscapes/deeplabv3plus_r101-d16-mg124_512x1024_80k_cityscapes_20200908_005644-ee6158e0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d16-mg124_512x1024_80k_cityscapes/deeplabv3plus_r101-d16-mg124_512x1024_80k_cityscapes-20200908_005644.log.json) | +| DeepLabV3+ | R-18b-D8 | 512x1024 | 80000 | 2.1 | 14.95 | V100 | 75.87 | 77.52 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r18b-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18b-d8_512x1024_80k_cityscapes/deeplabv3plus_r18b-d8_512x1024_80k_cityscapes_20201226_090828-e451abd9.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18b-d8_512x1024_80k_cityscapes/deeplabv3plus_r18b-d8_512x1024_80k_cityscapes-20201226_090828.log.json) | +| DeepLabV3+ | R-50b-D8 | 512x1024 | 80000 | 7.4 | 3.94 | V100 | 80.28 | 81.44 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r50b-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50b-d8_512x1024_80k_cityscapes/deeplabv3plus_r50b-d8_512x1024_80k_cityscapes_20201225_213645-a97e4e43.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50b-d8_512x1024_80k_cityscapes/deeplabv3plus_r50b-d8_512x1024_80k_cityscapes-20201225_213645.log.json) | +| DeepLabV3+ | R-101b-D8 | 512x1024 | 80000 | 10.9 | 2.60 | V100 | 80.16 | 81.41 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r101b-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101b-d8_512x1024_80k_cityscapes/deeplabv3plus_r101b-d8_512x1024_80k_cityscapes_20201226_190843-9c3c93a4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101b-d8_512x1024_80k_cityscapes/deeplabv3plus_r101b-d8_512x1024_80k_cityscapes-20201226_190843.log.json) | +| DeepLabV3+ | R-18b-D8 | 769x769 | 80000 | 2.4 | 5.96 | V100 | 76.36 | 78.24 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r18b-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18b-d8_769x769_80k_cityscapes/deeplabv3plus_r18b-d8_769x769_80k_cityscapes_20201226_151312-2c868aff.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18b-d8_769x769_80k_cityscapes/deeplabv3plus_r18b-d8_769x769_80k_cityscapes-20201226_151312.log.json) | +| DeepLabV3+ | R-50b-D8 | 769x769 | 80000 | 8.4 | 1.72 | V100 | 79.41 | 80.56 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r50b-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50b-d8_769x769_80k_cityscapes/deeplabv3plus_r50b-d8_769x769_80k_cityscapes_20201225_224655-8b596d1c.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50b-d8_769x769_80k_cityscapes/deeplabv3plus_r50b-d8_769x769_80k_cityscapes-20201225_224655.log.json) | +| DeepLabV3+ | R-101b-D8 | 769x769 | 80000 | 12.3 | 1.10 | V100 | 79.88 | 81.46 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r101b-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101b-d8_769x769_80k_cityscapes/deeplabv3plus_r101b-d8_769x769_80k_cityscapes_20201226_205041-227cdf7c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101b-d8_769x769_80k_cityscapes/deeplabv3plus_r101b-d8_769x769_80k_cityscapes-20201226_205041.log.json) | \[1\] The training of the model is sensitive to random seed, and the seed to train it is 1111. ### ADE20K -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ---------- | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| DeepLabV3+ | R-50-D8 | 512x512 | 80000 | 10.6 | 21.01 | 42.72 | 43.75 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r50-d8_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_80k_ade20k/deeplabv3plus_r50-d8_512x512_80k_ade20k_20200614_185028-bf1400d8.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_80k_ade20k/deeplabv3plus_r50-d8_512x512_80k_ade20k_20200614_185028.log.json) | -| DeepLabV3+ | R-101-D8 | 512x512 | 80000 | 14.1 | 14.16 | 44.60 | 46.06 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_ade20k/deeplabv3plus_r101-d8_512x512_80k_ade20k_20200615_014139-d5730af7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_ade20k/deeplabv3plus_r101-d8_512x512_80k_ade20k_20200615_014139.log.json) | -| DeepLabV3+ | R-50-D8 | 512x512 | 160000 | - | - | 43.95 | 44.93 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r50-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_160k_ade20k/deeplabv3plus_r50-d8_512x512_160k_ade20k_20200615_124504-6135c7e0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_160k_ade20k/deeplabv3plus_r50-d8_512x512_160k_ade20k_20200615_124504.log.json) | -| DeepLabV3+ | R-101-D8 | 512x512 | 160000 | - | - | 45.47 | 46.35 | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r101-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_160k_ade20k/deeplabv3plus_r101-d8_512x512_160k_ade20k_20200615_123232-38ed86bb.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_160k_ade20k/deeplabv3plus_r101-d8_512x512_160k_ade20k_20200615_123232.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ---------- | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ---------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| DeepLabV3+ | R-50-D8 | 512x512 | 80000 | 10.6 | 21.01 | V100 | 42.72 | 43.75 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_80k_ade20k/deeplabv3plus_r50-d8_512x512_80k_ade20k_20200614_185028-bf1400d8.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_80k_ade20k/deeplabv3plus_r50-d8_512x512_80k_ade20k_20200614_185028.log.json) | +| DeepLabV3+ | R-101-D8 | 512x512 | 80000 | 14.1 | 14.16 | V100 | 44.60 | 46.06 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_ade20k/deeplabv3plus_r101-d8_512x512_80k_ade20k_20200615_014139-d5730af7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_ade20k/deeplabv3plus_r101-d8_512x512_80k_ade20k_20200615_014139.log.json) | +| DeepLabV3+ | R-50-D8 | 512x512 | 160000 | - | - | V100 | 43.95 | 44.93 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_160k_ade20k/deeplabv3plus_r50-d8_512x512_160k_ade20k_20200615_124504-6135c7e0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_160k_ade20k/deeplabv3plus_r50-d8_512x512_160k_ade20k_20200615_124504.log.json) | +| DeepLabV3+ | R-101-D8 | 512x512 | 160000 | - | - | V100 | 45.47 | 46.35 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_160k_ade20k/deeplabv3plus_r101-d8_512x512_160k_ade20k_20200615_123232-38ed86bb.pth) \| 
### Pascal VOC 2012 + Aug

-| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download |
-| ---------- | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ------ | -------- |
-| DeepLabV3+ | R-50-D8 | 512x512 | 20000 | 7.6 | 21 | 75.93 | 77.50 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r50-d8_512x512_20k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_20k_voc12aug/deeplabv3plus_r50-d8_512x512_20k_voc12aug_20200617_102323-aad58ef1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_20k_voc12aug/deeplabv3plus_r50-d8_512x512_20k_voc12aug_20200617_102323.log.json) |
-| DeepLabV3+ | R-101-D8 | 512x512 | 20000 | 11 | 13.88 | 77.22 | 78.59 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r101-d8_512x512_20k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_20k_voc12aug/deeplabv3plus_r101-d8_512x512_20k_voc12aug_20200617_102345-c7ff3d56.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_20k_voc12aug/deeplabv3plus_r101-d8_512x512_20k_voc12aug_20200617_102345.log.json) |
-| DeepLabV3+ | R-50-D8 | 512x512 | 40000 | - | - | 76.81 | 77.57 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r50-d8_512x512_40k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_40k_voc12aug/deeplabv3plus_r50-d8_512x512_40k_voc12aug_20200613_161759-e1b43aa9.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_40k_voc12aug/deeplabv3plus_r50-d8_512x512_40k_voc12aug_20200613_161759.log.json) |
-| DeepLabV3+ | R-101-D8 | 512x512 | 40000 | - | - | 78.62 | 79.53 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r101-d8_512x512_40k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_40k_voc12aug/deeplabv3plus_r101-d8_512x512_40k_voc12aug_20200613_205333-faf03387.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_40k_voc12aug/deeplabv3plus_r101-d8_512x512_40k_voc12aug_20200613_205333.log.json) |
+| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download |
+| ---------- | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ------ | -------- |
+| DeepLabV3+ | R-50-D8 | 512x512 | 20000 | 7.6 | 21 | V100 | 75.93 | 77.50 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-20k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_20k_voc12aug/deeplabv3plus_r50-d8_512x512_20k_voc12aug_20200617_102323-aad58ef1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_20k_voc12aug/deeplabv3plus_r50-d8_512x512_20k_voc12aug_20200617_102323.log.json) |
+| DeepLabV3+ | R-101-D8 | 512x512 | 20000 | 11 | 13.88 | V100 | 77.22 | 78.59 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-20k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_20k_voc12aug/deeplabv3plus_r101-d8_512x512_20k_voc12aug_20200617_102345-c7ff3d56.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_20k_voc12aug/deeplabv3plus_r101-d8_512x512_20k_voc12aug_20200617_102345.log.json) |
+| DeepLabV3+ | R-50-D8 | 512x512 | 40000 | - | - | V100 | 76.81 | 77.57 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-40k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_40k_voc12aug/deeplabv3plus_r50-d8_512x512_40k_voc12aug_20200613_161759-e1b43aa9.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_40k_voc12aug/deeplabv3plus_r50-d8_512x512_40k_voc12aug_20200613_161759.log.json) |
+| DeepLabV3+ | R-101-D8 | 512x512 | 40000 | - | - | V100 | 78.62 | 79.53 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-40k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_40k_voc12aug/deeplabv3plus_r101-d8_512x512_40k_voc12aug_20200613_205333-faf03387.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_40k_voc12aug/deeplabv3plus_r101-d8_512x512_40k_voc12aug_20200613_205333.log.json) |
### Pascal Context

-| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download |
-| ---------- | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ------ | -------- |
-| DeepLabV3+ | R-101-D8 | 480x480 | 40000 | - | 9.09 | 47.30 | 48.47 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r101-d8_480x480_40k_pascal_context.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_480x480_40k_pascal_context/deeplabv3plus_r101-d8_480x480_40k_pascal_context_20200911_165459-d3c8a29e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_480x480_40k_pascal_context/deeplabv3plus_r101-d8_480x480_40k_pascal_context-20200911_165459.log.json) |
-| DeepLabV3+ | R-101-D8 | 480x480 | 80000 | - | - | 47.23 | 48.26 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r101-d8_480x480_80k_pascal_context.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_480x480_80k_pascal_context/deeplabv3plus_r101-d8_480x480_80k_pascal_context_20200911_155322-145d3ee8.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_480x480_80k_pascal_context/deeplabv3plus_r101-d8_480x480_80k_pascal_context-20200911_155322.log.json) |
+| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download |
+| ---------- | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ------ | -------- |
+| DeepLabV3+ | R-101-D8 | 480x480 | 40000 | - | 9.09 | V100 | 47.30 | 48.47 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-40k_pascal-context-480x480.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_480x480_40k_pascal_context/deeplabv3plus_r101-d8_480x480_40k_pascal_context_20200911_165459-d3c8a29e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_480x480_40k_pascal_context/deeplabv3plus_r101-d8_480x480_40k_pascal_context-20200911_165459.log.json) |
+| DeepLabV3+ | R-101-D8 | 480x480 | 80000 | - | - | V100 | 47.23 | 48.26 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-80k_pascal-context-480x480.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_480x480_80k_pascal_context/deeplabv3plus_r101-d8_480x480_80k_pascal_context_20200911_155322-145d3ee8.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_480x480_80k_pascal_context/deeplabv3plus_r101-d8_480x480_80k_pascal_context-20200911_155322.log.json) |
### Pascal Context 59

-| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download |
-| ---------- | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ------ | -------- |
-| DeepLabV3+ | R-101-D8 | 480x480 | 40000 | - | - | 52.86 | 54.54 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r101-d8_480x480_40k_pascal_context_59.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_480x480_40k_pascal_context_59/deeplabv3plus_r101-d8_480x480_40k_pascal_context_59_20210416_111233-ed937f15.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_480x480_40k_pascal_context_59/deeplabv3plus_r101-d8_480x480_40k_pascal_context_59-20210416_111233.log.json) |
-| DeepLabV3+ | R-101-D8 | 480x480 | 80000 | - | - | 53.2 | 54.67 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r101-d8_480x480_80k_pascal_context_59.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_480x480_80k_pascal_context_59/deeplabv3plus_r101-d8_480x480_80k_pascal_context_59_20210416_111127-7ca0331d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_480x480_80k_pascal_context_59/deeplabv3plus_r101-d8_480x480_80k_pascal_context_59-20210416_111127.log.json) |
+| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download |
+| ---------- | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ------ | -------- |
+| DeepLabV3+ | R-101-D8 | 480x480 | 40000 | - | - | V100 | 52.86 | 54.54 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-40k_pascal-context-59-480x480.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_480x480_40k_pascal_context_59/deeplabv3plus_r101-d8_480x480_40k_pascal_context_59_20210416_111233-ed937f15.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_480x480_40k_pascal_context_59/deeplabv3plus_r101-d8_480x480_40k_pascal_context_59-20210416_111233.log.json) |
+| DeepLabV3+ | R-101-D8 | 480x480 | 80000 | - | - | V100 | 53.2 | 54.67 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-80k_pascal-context-59-480x480.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_480x480_80k_pascal_context_59/deeplabv3plus_r101-d8_480x480_80k_pascal_context_59_20210416_111127-7ca0331d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_480x480_80k_pascal_context_59/deeplabv3plus_r101-d8_480x480_80k_pascal_context_59-20210416_111127.log.json) |
### LoveDA

-| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download |
-| ---------- | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ------ | -------- |
-| DeepLabV3+ | R-18-D8 | 512x512 | 80000 | 1.93 | 25.57 | 50.28 | 50.47 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r18-d8_512x512_80k_loveda.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_512x512_80k_loveda/deeplabv3plus_r18-d8_512x512_80k_loveda_20211104_132800-ce0fa0ca.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_512x512_80k_loveda/deeplabv3plus_r18-d8_512x512_80k_loveda_20211104_132800.log.json) |
-| DeepLabV3+ | R-50-D8 | 512x512 | 80000 | 7.37 | 6.00 | 50.99 | 50.65 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r50-d8_512x512_80k_loveda.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_80k_loveda/deeplabv3plus_r50-d8_512x512_80k_loveda_20211105_080442-f0720392.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_80k_loveda/deeplabv3plus_r50-d8_512x512_80k_loveda_20211105_080442.log.json) |
-| DeepLabV3+ | R-101-D8 | 512x512 | 80000 | 10.84 | 4.33 | 51.47 | 51.32 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_loveda.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_loveda/deeplabv3plus_r101-d8_512x512_80k_loveda_20211105_110759-4c1f297e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_loveda/deeplabv3plus_r101-d8_512x512_80k_loveda_20211105_110759.log.json) |
+| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download |
+| ---------- | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ------ | -------- |
+| DeepLabV3+ | R-18-D8 | 512x512 | 80000 | 1.93 | 25.57 | V100 | 50.28 | 50.47 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r18-d8_4xb4-80k_loveda-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_512x512_80k_loveda/deeplabv3plus_r18-d8_512x512_80k_loveda_20211104_132800-ce0fa0ca.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_512x512_80k_loveda/deeplabv3plus_r18-d8_512x512_80k_loveda_20211104_132800.log.json) |
+| DeepLabV3+ | R-50-D8 | 512x512 | 80000 | 7.37 | 6.00 | V100 | 50.99 | 50.65 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-80k_loveda-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_80k_loveda/deeplabv3plus_r50-d8_512x512_80k_loveda_20211105_080442-f0720392.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_80k_loveda/deeplabv3plus_r50-d8_512x512_80k_loveda_20211105_080442.log.json) |
+| DeepLabV3+ | R-101-D8 | 512x512 | 80000 | 10.84 | 4.33 | V100 | 51.47 | 51.32 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-80k_loveda-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_loveda/deeplabv3plus_r101-d8_512x512_80k_loveda_20211105_110759-4c1f297e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_loveda/deeplabv3plus_r101-d8_512x512_80k_loveda_20211105_110759.log.json) |
### Potsdam

-| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download |
-| ---------- | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ------ | -------- |
-| DeepLabV3+ | R-18-D8 | 512x512 | 80000 | 1.91 | 81.68 | 77.09 | 78.44 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r18-d8_512x512_80k_potsdam.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_512x512_80k_potsdam/deeplabv3plus_r18-d8_512x512_80k_potsdam_20211219_020601-75fd5bc3.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_512x512_80k_potsdam/deeplabv3plus_r18-d8_512x512_80k_potsdam_20211219_020601.log.json) |
-| DeepLabV3+ | R-50-D8 | 512x512 | 80000 | 7.36 | 26.44 | 78.33 | 79.27 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r50-d8_512x512_80k_potsdam.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_80k_potsdam/deeplabv3plus_r50-d8_512x512_80k_potsdam_20211219_031508-7e7a2b24.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_80k_potsdam/deeplabv3plus_r50-d8_512x512_80k_potsdam_20211219_031508.log.json) |
-| DeepLabV3+ | R-101-D8 | 512x512 | 80000 | 10.83 | 17.56 | 78.7 | 79.47 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_potsdam.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_potsdam/deeplabv3plus_r101-d8_512x512_80k_potsdam_20211219_031508-8b112708.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_potsdam/deeplabv3plus_r101-d8_512x512_80k_potsdam_20211219_031508.log.json) |
+| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download |
+| ---------- | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ------ | -------- |
+| DeepLabV3+ | R-18-D8 | 512x512 | 80000 | 1.91 | 81.68 | V100 | 77.09 | 78.44 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r18-d8_4xb4-80k_potsdam-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_512x512_80k_potsdam/deeplabv3plus_r18-d8_512x512_80k_potsdam_20211219_020601-75fd5bc3.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_512x512_80k_potsdam/deeplabv3plus_r18-d8_512x512_80k_potsdam_20211219_020601.log.json) |
+| DeepLabV3+ | R-50-D8 | 512x512 | 80000 | 7.36 | 26.44 | V100 | 78.33 | 79.27 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-80k_potsdam-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_80k_potsdam/deeplabv3plus_r50-d8_512x512_80k_potsdam_20211219_031508-7e7a2b24.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_80k_potsdam/deeplabv3plus_r50-d8_512x512_80k_potsdam_20211219_031508.log.json) |
+| DeepLabV3+ | R-101-D8 | 512x512 | 80000 | 10.83 | 17.56 | V100 | 78.7 | 79.47 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-80k_potsdam-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_potsdam/deeplabv3plus_r101-d8_512x512_80k_potsdam_20211219_031508-8b112708.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_potsdam/deeplabv3plus_r101-d8_512x512_80k_potsdam_20211219_031508.log.json) |
### Vaihingen

-| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download |
-| ---------- | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ------ | -------- |
-| DeepLabV3+ | R-18-D8 | 512x512 | 80000 | 1.91 | 72.79 | 72.50 | 74.13 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r18-d8_4x4_512x512_80k_vaihingen.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_4x4_512x512_80k_vaihingen/deeplabv3plus_r18-d8_4x4_512x512_80k_vaihingen_20211231_230805-7626a263.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_4x4_512x512_80k_vaihingen/deeplabv3plus_r18-d8_4x4_512x512_80k_vaihingen_20211231_230805.log.json) |
-| DeepLabV3+ | R-50-D8 | 512x512 | 80000 | 7.36 | 26.91 | 73.97 | 75.05 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r50-d8_4x4_512x512_80k_vaihingen.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_4x4_512x512_80k_vaihingen/deeplabv3plus_r50-d8_4x4_512x512_80k_vaihingen_20211231_230816-5040938d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_4x4_512x512_80k_vaihingen/deeplabv3plus_r50-d8_4x4_512x512_80k_vaihingen_20211231_230816.log.json) |
-| DeepLabV3+ | R-101-D8 | 512x512 | 80000 | 10.83 | 18.59 | 73.06 | 74.14 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r101-d8_4x4_512x512_80k_vaihingen.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_4x4_512x512_80k_vaihingen/deeplabv3plus_r101-d8_4x4_512x512_80k_vaihingen_20211231_230816-8a095afa.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_4x4_512x512_80k_vaihingen/deeplabv3plus_r101-d8_4x4_512x512_80k_vaihingen_20211231_230816.log.json) |
+| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download |
+| ---------- | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ------ | -------- |
+| DeepLabV3+ | R-18-D8 | 512x512 | 80000 | 1.91 | 72.79 | V100 | 72.50 | 74.13 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r18-d8_4xb4-80k_vaihingen-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_4x4_512x512_80k_vaihingen/deeplabv3plus_r18-d8_4x4_512x512_80k_vaihingen_20211231_230805-7626a263.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_4x4_512x512_80k_vaihingen/deeplabv3plus_r18-d8_4x4_512x512_80k_vaihingen_20211231_230805.log.json) |
+| DeepLabV3+ | R-50-D8 | 512x512 | 80000 | 7.36 | 26.91 | V100 | 73.97 | 75.05 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-80k_vaihingen-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_4x4_512x512_80k_vaihingen/deeplabv3plus_r50-d8_4x4_512x512_80k_vaihingen_20211231_230816-5040938d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_4x4_512x512_80k_vaihingen/deeplabv3plus_r50-d8_4x4_512x512_80k_vaihingen_20211231_230816.log.json) |
+| DeepLabV3+ | R-101-D8 | 512x512 | 80000 | 10.83 | 18.59 | V100 | 73.06 | 74.14 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-80k_vaihingen-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_4x4_512x512_80k_vaihingen/deeplabv3plus_r101-d8_4x4_512x512_80k_vaihingen_20211231_230816-8a095afa.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_4x4_512x512_80k_vaihingen/deeplabv3plus_r101-d8_4x4_512x512_80k_vaihingen_20211231_230816.log.json) |
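For a quick sanity check, the config/checkpoint pairs listed in these tables can be fed straight to the mmseg Python API. A minimal sketch, assuming the mmseg 1.x entry points `init_model`/`inference_model` and using the ADE20K R-50-D8 pair from above; `demo.png` is a placeholder input path:

```python
from mmseg.apis import inference_model, init_model

# One config/checkpoint pair copied from the ADE20K table.
config = 'configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-80k_ade20k-512x512.py'
checkpoint = ('https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/'
              'deeplabv3plus_r50-d8_512x512_80k_ade20k/'
              'deeplabv3plus_r50-d8_512x512_80k_ade20k_20200614_185028-bf1400d8.pth')

model = init_model(config, checkpoint, device='cuda:0')  # downloads the .pth on first use
result = inference_model(model, 'demo.png')              # placeholder image
print(result.pred_sem_seg.data.shape)                    # expected: a (1, H, W) class-index map
```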
### iSAID

-| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download |
-| ---------- | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ------ | -------- |
-| DeepLabV3+ | R-18-D8 | 896x896 | 80000 | 6.19 | 24.81 | 61.35 | 62.61 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r18-d8_4x4_896x896_80k_isaid.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_4x4_896x896_80k_isaid/deeplabv3plus_r18-d8_4x4_896x896_80k_isaid_20220110_180526-7059991d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_4x4_896x896_80k_isaid/deeplabv3plus_r18-d8_4x4_896x896_80k_isaid_20220110_180526.log.json) |
-| DeepLabV3+ | R-50-D8 | 896x896 | 80000 | 21.45 | 8.42 | 67.06 | 68.02 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus/deeplabv3plus_r50-d8_4x4_896x896_80k_isaid.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_4x4_896x896_80k_isaid/deeplabv3plus_r50-d8_4x4_896x896_80k_isaid_20220110_180526-598be439.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_4x4_896x896_80k_isaid/deeplabv3plus_r50-d8_4x4_896x896_80k_isaid_20220110_180526.log.json) |
+| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download |
+| ---------- | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ------ | -------- |
+| DeepLabV3+ | R-18-D8 | 896x896 | 80000 | 6.19 | 24.81 | V100 | 61.35 | 62.61 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r18-d8_4xb4-80k_isaid-896x896.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_4x4_896x896_80k_isaid/deeplabv3plus_r18-d8_4x4_896x896_80k_isaid_20220110_180526-7059991d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_4x4_896x896_80k_isaid/deeplabv3plus_r18-d8_4x4_896x896_80k_isaid_20220110_180526.log.json) |
+| DeepLabV3+ | R-50-D8 | 896x896 | 80000 | 21.45 | 8.42 | V100 | 67.06 | 68.02 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-80k_isaid-896x896.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_4x4_896x896_80k_isaid/deeplabv3plus_r50-d8_4x4_896x896_80k_isaid_20220110_180526-598be439.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_4x4_896x896_80k_isaid/deeplabv3plus_r50-d8_4x4_896x896_80k_isaid_20220110_180526.log.json) |
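The `deeplabv3plus.yml` metafile deleted further below is plain YAML with exactly the structure visible in its removed lines, so scripts that depended on it are easy to port. A minimal sketch that walks `Models`, recovers the README's fps column from the recorded `inference time (ms/im)` (fps = 1000 / ms, e.g. 1000 / 253.81 ≈ 3.94), and prints each checkpoint URL; assumes PyYAML and a checkout that still contains the file:

```python
import yaml  # PyYAML

with open('configs/deeplabv3plus/deeplabv3plus.yml') as f:
    metafile = yaml.safe_load(f)

for model in metafile['Models']:
    times = model['Metadata'].get('inference time (ms/im)', [])
    # The README's "Inf time (fps)" column is just 1000 / (ms per image).
    fps = 1000 / times[0]['value'] if times else None
    print(model['Name'], fps, model['Weights'])
```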
+
+### Mapillary Vistas v1.2
+
+| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download |
+| ---------- | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ------ | -------- |
+| DeepLabV3+ | R-50-D8 | 1280x1280 | 300000 | 24.04 | 17.92 | A100 | 47.35 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb2-300k_mapillay_v1_65-1280x1280.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_4xb2-300k_mapillay_v1_65-1280x1280/deeplabv3plus_r50-d8_4xb2-300k_mapillay_v1_65-1280x1280_20230301_110504-655f8e43.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_4xb2-300k_mapillay_v1_65-1280x1280/deeplabv3plus_r50-d8_4xb2-300k_mapillay_v1_65-1280x1280_20230301_110504.json) |

Note:

@@ -130,3 +125,14 @@ Note:
- `MG-124` stands for multi-grid dilation in the last stage of ResNet.
- `FP16` means Mixed Precision (FP16) is adopted in training.
- `896x896` is the Crop Size of iSAID dataset, which follows the implementation of [PointFlow: Flowing Semantics Through Points for Aerial Image Segmentation](https://arxiv.org/pdf/2103.06564.pdf)
+
+## Citation
+
+```bibtex
+@inproceedings{deeplabv3plus2018,
+  title={Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation},
+  author={Liang-Chieh Chen and Yukun Zhu and George Papandreou and Florian Schroff and Hartwig Adam},
+  booktitle={ECCV},
+  year={2018}
+}
+```
diff --git a/configs/deeplabv3plus/deeplabv3plus.yml b/configs/deeplabv3plus/deeplabv3plus.yml
deleted file mode 100644
index 56790c8428..0000000000
--- a/configs/deeplabv3plus/deeplabv3plus.yml
+++ /dev/null
@@ -1,850 +0,0 @@
-Collections:
-- Name: DeepLabV3+
-  Metadata:
-    Training Data:
-    - Cityscapes
-    - ADE20K
-    - Pascal VOC 2012 + Aug
-    - Pascal Context
-    - Pascal Context 59
-    - LoveDA
-    - Potsdam
-    - Vaihingen
-    - iSAID
-  Paper:
-    URL: https://arxiv.org/abs/1802.02611
-    Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation
-  README: configs/deeplabv3plus/README.md
-  Code:
-    URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30
-    Version: v0.17.0
-  Converted From:
-    Code: https://github.com/tensorflow/models/tree/master/research/deeplab
-Models:
-- Name: deeplabv3plus_r50-d8_512x1024_40k_cityscapes
-  In Collection: DeepLabV3+
-  Metadata:
-    backbone: R-50-D8
-    crop size: (512,1024)
-    lr schd: 40000
-    inference time (ms/im):
-    - value: 253.81
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (512,1024)
-    Training Memory (GB): 7.5
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 79.61
-      mIoU(ms+flip): 81.01
-  Config: configs/deeplabv3plus/deeplabv3plus_r50-d8_512x1024_40k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x1024_40k_cityscapes/deeplabv3plus_r50-d8_512x1024_40k_cityscapes_20200605_094610-d222ffcd.pth
-- Name: deeplabv3plus_r101-d8_512x1024_40k_cityscapes
-  In Collection: DeepLabV3+
-  Metadata:
-    backbone: R-101-D8
-    crop size: (512,1024)
-    lr schd: 40000
-    inference time (ms/im):
-    - value: 384.62
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (512,1024)
-    Training Memory (GB): 11.0
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 80.21
-      mIoU(ms+flip): 81.82
-  Config: configs/deeplabv3plus/deeplabv3plus_r101-d8_512x1024_40k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x1024_40k_cityscapes/deeplabv3plus_r101-d8_512x1024_40k_cityscapes_20200605_094614-3769eecf.pth
-- Name: deeplabv3plus_r50-d8_769x769_40k_cityscapes
-  In Collection: DeepLabV3+
-  Metadata:
-    backbone: R-50-D8
-    crop size: (769,769)
-    lr schd: 40000
-    inference time (ms/im):
-    - value: 581.4
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (769,769)
-    Training Memory (GB): 8.5
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 78.97
-      mIoU(ms+flip): 80.46
-  Config: configs/deeplabv3plus/deeplabv3plus_r50-d8_769x769_40k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_769x769_40k_cityscapes/deeplabv3plus_r50-d8_769x769_40k_cityscapes_20200606_114143-1dcb0e3c.pth
-- Name: deeplabv3plus_r101-d8_769x769_40k_cityscapes
-  In Collection: DeepLabV3+
-  Metadata:
-    backbone: R-101-D8
-    crop size: (769,769)
-    lr schd: 40000
-    inference time (ms/im):
-    - value: 869.57
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (769,769)
-    Training Memory (GB): 12.5
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 79.46
-      mIoU(ms+flip): 80.5
-  Config: configs/deeplabv3plus/deeplabv3plus_r101-d8_769x769_40k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_769x769_40k_cityscapes/deeplabv3plus_r101-d8_769x769_40k_cityscapes_20200606_114304-ff414b9e.pth
-- Name: deeplabv3plus_r18-d8_512x1024_80k_cityscapes
-  In Collection: DeepLabV3+
-  Metadata:
-    backbone: R-18-D8
-    crop size: (512,1024)
-    lr schd: 80000
-    inference time (ms/im):
-    - value: 70.08
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (512,1024)
-    Training Memory (GB): 2.2
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 76.89
-      mIoU(ms+flip): 78.76
-  Config: configs/deeplabv3plus/deeplabv3plus_r18-d8_512x1024_80k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_512x1024_80k_cityscapes/deeplabv3plus_r18-d8_512x1024_80k_cityscapes_20201226_080942-cff257fe.pth
-- Name: deeplabv3plus_r50-d8_512x1024_80k_cityscapes
-  In Collection: DeepLabV3+
-  Metadata:
-    backbone: R-50-D8
-    crop size: (512,1024)
-    lr schd: 80000
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 80.09
-      mIoU(ms+flip): 81.13
-  Config: configs/deeplabv3plus/deeplabv3plus_r50-d8_512x1024_80k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x1024_80k_cityscapes/deeplabv3plus_r50-d8_512x1024_80k_cityscapes_20200606_114049-f9fb496d.pth
-- Name: deeplabv3plus_r101-d8_512x1024_80k_cityscapes
-  In Collection: DeepLabV3+
-  Metadata:
-    backbone: R-101-D8
-    crop size: (512,1024)
-    lr schd: 80000
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 80.97
-      mIoU(ms+flip): 82.03
-  Config: configs/deeplabv3plus/deeplabv3plus_r101-d8_512x1024_80k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x1024_80k_cityscapes/deeplabv3plus_r101-d8_512x1024_80k_cityscapes_20200606_114143-068fcfe9.pth
-- Name: deeplabv3plus_r101-d8_fp16_512x1024_80k_cityscapes
-  In Collection: DeepLabV3+
-  Metadata:
-    backbone: R-101-D8
-    crop size: (512,1024)
-    lr schd: 80000
-    inference time (ms/im):
-    - value: 127.06
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP16
-      resolution: (512,1024)
-    Training Memory (GB): 6.35
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 80.46
-  Config: configs/deeplabv3plus/deeplabv3plus_r101-d8_fp16_512x1024_80k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_fp16_512x1024_80k_cityscapes/deeplabv3plus_r101-d8_fp16_512x1024_80k_cityscapes_20200717_230920-f1104f4b.pth
-- Name: deeplabv3plus_r18-d8_769x769_80k_cityscapes
-  In Collection: DeepLabV3+
-  Metadata:
-    backbone: R-18-D8
-    crop size: (769,769)
-    lr schd: 80000
-    inference time (ms/im):
-    - value: 174.22
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (769,769)
-    Training Memory (GB): 2.5
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 76.26
-      mIoU(ms+flip): 77.91
-  Config: configs/deeplabv3plus/deeplabv3plus_r18-d8_769x769_80k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_769x769_80k_cityscapes/deeplabv3plus_r18-d8_769x769_80k_cityscapes_20201226_083346-f326e06a.pth
-- Name: deeplabv3plus_r50-d8_769x769_80k_cityscapes
-  In Collection: DeepLabV3+
-  Metadata:
-    backbone: R-50-D8
-    crop size: (769,769)
-    lr schd: 80000
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 79.83
-      mIoU(ms+flip): 81.48
-  Config: configs/deeplabv3plus/deeplabv3plus_r50-d8_769x769_80k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_769x769_80k_cityscapes/deeplabv3plus_r50-d8_769x769_80k_cityscapes_20200606_210233-0e9dfdc4.pth
-- Name: deeplabv3plus_r101-d8_769x769_80k_cityscapes
-  In Collection: DeepLabV3+
-  Metadata:
-    backbone: R-101-D8
-    crop size: (769,769)
-    lr schd: 80000
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 80.65
-      mIoU(ms+flip): 81.47
-  Config: configs/deeplabv3plus/deeplabv3plus_r101-d8_769x769_80k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_769x769_80k_cityscapes/deeplabv3plus_r101-d8_769x769_80k_cityscapes_20220406_154720-dfcc0b68.pth
-- Name: deeplabv3plus_r101-d16-mg124_512x1024_40k_cityscapes
-  In Collection: DeepLabV3+
-  Metadata:
-    backbone: R-101-D16-MG124
-    crop size: (512,1024)
-    lr schd: 40000
-    inference time (ms/im):
-    - value: 133.69
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (512,1024)
-    Training Memory (GB): 5.8
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 79.09
-      mIoU(ms+flip): 80.36
-  Config: configs/deeplabv3plus/deeplabv3plus_r101-d16-mg124_512x1024_40k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d16-mg124_512x1024_40k_cityscapes/deeplabv3plus_r101-d16-mg124_512x1024_40k_cityscapes_20200908_005644-cf9ce186.pth
-- Name: deeplabv3plus_r101-d16-mg124_512x1024_80k_cityscapes
-  In Collection: DeepLabV3+
-  Metadata:
-    backbone: R-101-D16-MG124
-    crop size: (512,1024)
-    lr schd: 80000
-    Training Memory (GB): 9.9
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 79.9
-      mIoU(ms+flip): 81.33
-  Config: configs/deeplabv3plus/deeplabv3plus_r101-d16-mg124_512x1024_80k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d16-mg124_512x1024_80k_cityscapes/deeplabv3plus_r101-d16-mg124_512x1024_80k_cityscapes_20200908_005644-ee6158e0.pth
-- Name: deeplabv3plus_r18b-d8_512x1024_80k_cityscapes
-  In Collection: DeepLabV3+
-  Metadata:
-    backbone: R-18b-D8
-    crop size: (512,1024)
-    lr schd: 80000
-    inference time (ms/im):
-    - value: 66.89
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (512,1024)
-    Training Memory (GB): 2.1
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 75.87
-      mIoU(ms+flip): 77.52
-  Config: configs/deeplabv3plus/deeplabv3plus_r18b-d8_512x1024_80k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18b-d8_512x1024_80k_cityscapes/deeplabv3plus_r18b-d8_512x1024_80k_cityscapes_20201226_090828-e451abd9.pth
-- Name: deeplabv3plus_r50b-d8_512x1024_80k_cityscapes
-  In Collection: DeepLabV3+
-  Metadata:
-    backbone: R-50b-D8
-    crop size: (512,1024)
-    lr schd: 80000
-    inference time (ms/im):
-    - value: 253.81
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (512,1024)
-    Training Memory (GB): 7.4
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 80.28
-      mIoU(ms+flip): 81.44
-  Config: configs/deeplabv3plus/deeplabv3plus_r50b-d8_512x1024_80k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50b-d8_512x1024_80k_cityscapes/deeplabv3plus_r50b-d8_512x1024_80k_cityscapes_20201225_213645-a97e4e43.pth
-- Name: deeplabv3plus_r101b-d8_512x1024_80k_cityscapes
-  In Collection: DeepLabV3+
-  Metadata:
-    backbone: R-101b-D8
-    crop size: (512,1024)
-    lr schd: 80000
-    inference time (ms/im):
-    - value: 384.62
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (512,1024)
-    Training Memory (GB): 10.9
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 80.16
-      mIoU(ms+flip): 81.41
-  Config: configs/deeplabv3plus/deeplabv3plus_r101b-d8_512x1024_80k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101b-d8_512x1024_80k_cityscapes/deeplabv3plus_r101b-d8_512x1024_80k_cityscapes_20201226_190843-9c3c93a4.pth
-- Name: deeplabv3plus_r18b-d8_769x769_80k_cityscapes
-  In Collection: DeepLabV3+
-  Metadata:
-    backbone: R-18b-D8
-    crop size: (769,769)
-    lr schd: 80000
-    inference time (ms/im):
-    - value: 167.79
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (769,769)
-    Training Memory (GB): 2.4
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 76.36
-      mIoU(ms+flip): 78.24
-  Config: configs/deeplabv3plus/deeplabv3plus_r18b-d8_769x769_80k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18b-d8_769x769_80k_cityscapes/deeplabv3plus_r18b-d8_769x769_80k_cityscapes_20201226_151312-2c868aff.pth
-- Name: deeplabv3plus_r50b-d8_769x769_80k_cityscapes
-  In Collection: DeepLabV3+
-  Metadata:
-    backbone: R-50b-D8
-    crop size: (769,769)
-    lr schd: 80000
-    inference time (ms/im):
-    - value: 581.4
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (769,769)
-    Training Memory (GB): 8.4
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 79.41
-      mIoU(ms+flip): 80.56
-  Config: configs/deeplabv3plus/deeplabv3plus_r50b-d8_769x769_80k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50b-d8_769x769_80k_cityscapes/deeplabv3plus_r50b-d8_769x769_80k_cityscapes_20201225_224655-8b596d1c.pth
-- Name: deeplabv3plus_r101b-d8_769x769_80k_cityscapes
-  In Collection: DeepLabV3+
-  Metadata:
-    backbone: R-101b-D8
-    crop size: (769,769)
-    lr schd: 80000
-    inference time (ms/im):
-    - value: 909.09
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (769,769)
-    Training Memory (GB): 12.3
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 79.88
-      mIoU(ms+flip): 81.46
-  Config: configs/deeplabv3plus/deeplabv3plus_r101b-d8_769x769_80k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101b-d8_769x769_80k_cityscapes/deeplabv3plus_r101b-d8_769x769_80k_cityscapes_20201226_205041-227cdf7c.pth
-- Name: deeplabv3plus_r50-d8_512x512_80k_ade20k
-  In Collection: DeepLabV3+
-  Metadata:
-    backbone: R-50-D8
-    crop size: (512,512)
-    lr schd: 80000
-    inference time (ms/im):
-    - value: 47.6
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (512,512)
-    Training Memory (GB): 10.6
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: ADE20K
-    Metrics:
-      mIoU: 42.72
-      mIoU(ms+flip): 43.75
-  Config: configs/deeplabv3plus/deeplabv3plus_r50-d8_512x512_80k_ade20k.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_80k_ade20k/deeplabv3plus_r50-d8_512x512_80k_ade20k_20200614_185028-bf1400d8.pth
-- Name: deeplabv3plus_r101-d8_512x512_80k_ade20k
-  In Collection: DeepLabV3+
-  Metadata:
-    backbone: R-101-D8
-    crop size: (512,512)
-    lr schd: 80000
-    inference time (ms/im):
-    - value: 70.62
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (512,512)
-    Training Memory (GB): 14.1
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: ADE20K
-    Metrics:
-      mIoU: 44.6
-      mIoU(ms+flip): 46.06
-  Config: configs/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_ade20k.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_ade20k/deeplabv3plus_r101-d8_512x512_80k_ade20k_20200615_014139-d5730af7.pth
-- Name: deeplabv3plus_r50-d8_512x512_160k_ade20k
-  In Collection: DeepLabV3+
-  Metadata:
-    backbone: R-50-D8
-    crop size: (512,512)
-    lr schd: 160000
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: ADE20K
-    Metrics:
-      mIoU: 43.95
-      mIoU(ms+flip): 44.93
-  Config: configs/deeplabv3plus/deeplabv3plus_r50-d8_512x512_160k_ade20k.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_160k_ade20k/deeplabv3plus_r50-d8_512x512_160k_ade20k_20200615_124504-6135c7e0.pth
-- Name: deeplabv3plus_r101-d8_512x512_160k_ade20k
-  In Collection: DeepLabV3+
-  Metadata:
-    backbone: R-101-D8
-    crop size: (512,512)
-    lr schd: 160000
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: ADE20K
-    Metrics:
-      mIoU: 45.47
-      mIoU(ms+flip): 46.35
-  Config: configs/deeplabv3plus/deeplabv3plus_r101-d8_512x512_160k_ade20k.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_160k_ade20k/deeplabv3plus_r101-d8_512x512_160k_ade20k_20200615_123232-38ed86bb.pth
-- Name: deeplabv3plus_r50-d8_512x512_20k_voc12aug
-  In Collection: DeepLabV3+
-  Metadata:
-    backbone: R-50-D8
-    crop size: (512,512)
-    lr schd: 20000
-    inference time (ms/im):
-    - value: 47.62
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (512,512)
-    Training Memory (GB): 7.6
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Pascal VOC 2012 + Aug
-    Metrics:
-      mIoU: 75.93
-      mIoU(ms+flip): 77.5
-  Config: configs/deeplabv3plus/deeplabv3plus_r50-d8_512x512_20k_voc12aug.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_20k_voc12aug/deeplabv3plus_r50-d8_512x512_20k_voc12aug_20200617_102323-aad58ef1.pth
-- Name: deeplabv3plus_r101-d8_512x512_20k_voc12aug
-  In Collection: DeepLabV3+
-  Metadata:
-    backbone: R-101-D8
-    crop size: (512,512)
-    lr schd: 20000
-    inference time (ms/im):
-    - value: 72.05
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (512,512)
-    Training Memory (GB): 11.0
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Pascal VOC 2012 + Aug
-    Metrics:
-      mIoU: 77.22
-      mIoU(ms+flip): 78.59
-  Config: configs/deeplabv3plus/deeplabv3plus_r101-d8_512x512_20k_voc12aug.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_20k_voc12aug/deeplabv3plus_r101-d8_512x512_20k_voc12aug_20200617_102345-c7ff3d56.pth
-- Name: deeplabv3plus_r50-d8_512x512_40k_voc12aug
-  In Collection: DeepLabV3+
-  Metadata:
-    backbone: R-50-D8
-    crop size: (512,512)
-    lr schd: 40000
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Pascal VOC 2012 + Aug
-    Metrics:
-      mIoU: 76.81
-      mIoU(ms+flip): 77.57
-  Config: configs/deeplabv3plus/deeplabv3plus_r50-d8_512x512_40k_voc12aug.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_40k_voc12aug/deeplabv3plus_r50-d8_512x512_40k_voc12aug_20200613_161759-e1b43aa9.pth
-- Name: deeplabv3plus_r101-d8_512x512_40k_voc12aug
-  In Collection: DeepLabV3+
-  Metadata:
-    backbone: R-101-D8
-    crop size: (512,512)
-    lr schd: 40000
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Pascal VOC 2012 + Aug
-    Metrics:
-      mIoU: 78.62
-      mIoU(ms+flip): 79.53
-  Config: configs/deeplabv3plus/deeplabv3plus_r101-d8_512x512_40k_voc12aug.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_40k_voc12aug/deeplabv3plus_r101-d8_512x512_40k_voc12aug_20200613_205333-faf03387.pth
-- Name: deeplabv3plus_r101-d8_480x480_40k_pascal_context
-  In Collection: DeepLabV3+
-  Metadata:
-    backbone: R-101-D8
-    crop size: (480,480)
-    lr schd: 40000
-    inference time (ms/im):
-    - value: 110.01
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (480,480)
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Pascal Context
-    Metrics:
-      mIoU: 47.3
-      mIoU(ms+flip): 48.47
-  Config: configs/deeplabv3plus/deeplabv3plus_r101-d8_480x480_40k_pascal_context.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_480x480_40k_pascal_context/deeplabv3plus_r101-d8_480x480_40k_pascal_context_20200911_165459-d3c8a29e.pth
-- Name: deeplabv3plus_r101-d8_480x480_80k_pascal_context
-  In Collection: DeepLabV3+
-  Metadata:
-    backbone: R-101-D8
-    crop size: (480,480)
-    lr schd: 80000
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Pascal Context
-    Metrics:
-      mIoU: 47.23
-      mIoU(ms+flip): 48.26
-  Config: configs/deeplabv3plus/deeplabv3plus_r101-d8_480x480_80k_pascal_context.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_480x480_80k_pascal_context/deeplabv3plus_r101-d8_480x480_80k_pascal_context_20200911_155322-145d3ee8.pth
-- Name: deeplabv3plus_r101-d8_480x480_40k_pascal_context_59
-  In Collection: DeepLabV3+
-  Metadata:
-    backbone: R-101-D8
-    crop size: (480,480)
-    lr schd: 40000
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Pascal Context 59
-    Metrics:
-      mIoU: 52.86
-      mIoU(ms+flip): 54.54
-  Config: configs/deeplabv3plus/deeplabv3plus_r101-d8_480x480_40k_pascal_context_59.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_480x480_40k_pascal_context_59/deeplabv3plus_r101-d8_480x480_40k_pascal_context_59_20210416_111233-ed937f15.pth
-- Name: deeplabv3plus_r101-d8_480x480_80k_pascal_context_59
-  In Collection: DeepLabV3+
-  Metadata:
-    backbone: R-101-D8
-    crop size: (480,480)
-    lr schd: 80000
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Pascal Context 59
-    Metrics:
-      mIoU: 53.2
-      mIoU(ms+flip): 54.67
-  Config: configs/deeplabv3plus/deeplabv3plus_r101-d8_480x480_80k_pascal_context_59.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_480x480_80k_pascal_context_59/deeplabv3plus_r101-d8_480x480_80k_pascal_context_59_20210416_111127-7ca0331d.pth
-- Name: deeplabv3plus_r18-d8_512x512_80k_loveda
-  In Collection: DeepLabV3+
-  Metadata:
-    backbone: R-18-D8
-    crop size: (512,512)
-    lr schd: 80000
-    inference time (ms/im):
-    - value: 39.11
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (512,512)
-    Training Memory (GB): 1.93
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: LoveDA
-    Metrics:
-      mIoU: 50.28
-      mIoU(ms+flip): 50.47
-  Config: configs/deeplabv3plus/deeplabv3plus_r18-d8_512x512_80k_loveda.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_512x512_80k_loveda/deeplabv3plus_r18-d8_512x512_80k_loveda_20211104_132800-ce0fa0ca.pth
-- Name: deeplabv3plus_r50-d8_512x512_80k_loveda
-  In Collection: DeepLabV3+
-  Metadata:
-    backbone: R-50-D8
-    crop size: (512,512)
-    lr schd: 80000
-    inference time (ms/im):
-    - value: 166.67
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (512,512)
-    Training Memory (GB): 7.37
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: LoveDA
-    Metrics:
-      mIoU: 50.99
-      mIoU(ms+flip): 50.65
-  Config: configs/deeplabv3plus/deeplabv3plus_r50-d8_512x512_80k_loveda.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_80k_loveda/deeplabv3plus_r50-d8_512x512_80k_loveda_20211105_080442-f0720392.pth
-- Name: deeplabv3plus_r101-d8_512x512_80k_loveda
-  In Collection: DeepLabV3+
-  Metadata:
-    backbone: R-101-D8
-    crop size: (512,512)
-    lr schd: 80000
-    inference time (ms/im):
-    - value: 230.95
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (512,512)
-    Training Memory (GB): 10.84
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: LoveDA
-    Metrics:
-      mIoU: 51.47
-      mIoU(ms+flip): 51.32
-  Config: configs/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_loveda.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_loveda/deeplabv3plus_r101-d8_512x512_80k_loveda_20211105_110759-4c1f297e.pth
-- Name: deeplabv3plus_r18-d8_512x512_80k_potsdam
-  In Collection: DeepLabV3+
-  Metadata:
-    backbone: R-18-D8
-    crop size: (512,512)
-    lr schd: 80000
-    inference time (ms/im):
-    - value: 12.24
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (512,512)
-    Training Memory (GB): 1.91
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Potsdam
-    Metrics:
-      mIoU: 77.09
-      mIoU(ms+flip): 78.44
-  Config: configs/deeplabv3plus/deeplabv3plus_r18-d8_512x512_80k_potsdam.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_512x512_80k_potsdam/deeplabv3plus_r18-d8_512x512_80k_potsdam_20211219_020601-75fd5bc3.pth
Dataset: Potsdam - Metrics: - mIoU: 77.09 - mIoU(ms+flip): 78.44 - Config: configs/deeplabv3plus/deeplabv3plus_r18-d8_512x512_80k_potsdam.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_512x512_80k_potsdam/deeplabv3plus_r18-d8_512x512_80k_potsdam_20211219_020601-75fd5bc3.pth -- Name: deeplabv3plus_r50-d8_512x512_80k_potsdam - In Collection: DeepLabV3+ - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 37.82 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 7.36 - Results: - - Task: Semantic Segmentation - Dataset: Potsdam - Metrics: - mIoU: 78.33 - mIoU(ms+flip): 79.27 - Config: configs/deeplabv3plus/deeplabv3plus_r50-d8_512x512_80k_potsdam.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_80k_potsdam/deeplabv3plus_r50-d8_512x512_80k_potsdam_20211219_031508-7e7a2b24.pth -- Name: deeplabv3plus_r101-d8_512x512_80k_potsdam - In Collection: DeepLabV3+ - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 56.95 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 10.83 - Results: - - Task: Semantic Segmentation - Dataset: Potsdam - Metrics: - mIoU: 78.7 - mIoU(ms+flip): 79.47 - Config: configs/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_potsdam.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_potsdam/deeplabv3plus_r101-d8_512x512_80k_potsdam_20211219_031508-8b112708.pth -- Name: deeplabv3plus_r18-d8_4x4_512x512_80k_vaihingen - In Collection: DeepLabV3+ - Metadata: - backbone: R-18-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 13.74 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 1.91 - Results: - - Task: Semantic Segmentation - Dataset: Vaihingen - Metrics: - mIoU: 72.5 - mIoU(ms+flip): 74.13 - Config: configs/deeplabv3plus/deeplabv3plus_r18-d8_4x4_512x512_80k_vaihingen.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_4x4_512x512_80k_vaihingen/deeplabv3plus_r18-d8_4x4_512x512_80k_vaihingen_20211231_230805-7626a263.pth -- Name: deeplabv3plus_r50-d8_4x4_512x512_80k_vaihingen - In Collection: DeepLabV3+ - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 37.16 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 7.36 - Results: - - Task: Semantic Segmentation - Dataset: Vaihingen - Metrics: - mIoU: 73.97 - mIoU(ms+flip): 75.05 - Config: configs/deeplabv3plus/deeplabv3plus_r50-d8_4x4_512x512_80k_vaihingen.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_4x4_512x512_80k_vaihingen/deeplabv3plus_r50-d8_4x4_512x512_80k_vaihingen_20211231_230816-5040938d.pth -- Name: deeplabv3plus_r101-d8_4x4_512x512_80k_vaihingen - In Collection: DeepLabV3+ - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 53.79 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 10.83 - Results: - - Task: Semantic Segmentation - Dataset: Vaihingen - Metrics: - mIoU: 73.06 - mIoU(ms+flip): 74.14 - 
Config: configs/deeplabv3plus/deeplabv3plus_r101-d8_4x4_512x512_80k_vaihingen.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_4x4_512x512_80k_vaihingen/deeplabv3plus_r101-d8_4x4_512x512_80k_vaihingen_20211231_230816-8a095afa.pth -- Name: deeplabv3plus_r18-d8_4x4_896x896_80k_isaid - In Collection: DeepLabV3+ - Metadata: - backbone: R-18-D8 - crop size: (896,896) - lr schd: 80000 - inference time (ms/im): - - value: 40.31 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (896,896) - Training Memory (GB): 6.19 - Results: - - Task: Semantic Segmentation - Dataset: iSAID - Metrics: - mIoU: 61.35 - mIoU(ms+flip): 62.61 - Config: configs/deeplabv3plus/deeplabv3plus_r18-d8_4x4_896x896_80k_isaid.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_4x4_896x896_80k_isaid/deeplabv3plus_r18-d8_4x4_896x896_80k_isaid_20220110_180526-7059991d.pth -- Name: deeplabv3plus_r50-d8_4x4_896x896_80k_isaid - In Collection: DeepLabV3+ - Metadata: - backbone: R-50-D8 - crop size: (896,896) - lr schd: 80000 - inference time (ms/im): - - value: 118.76 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (896,896) - Training Memory (GB): 21.45 - Results: - - Task: Semantic Segmentation - Dataset: iSAID - Metrics: - mIoU: 67.06 - mIoU(ms+flip): 68.02 - Config: configs/deeplabv3plus/deeplabv3plus_r50-d8_4x4_896x896_80k_isaid.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_4x4_896x896_80k_isaid/deeplabv3plus_r50-d8_4x4_896x896_80k_isaid_20220110_180526-598be439.pth diff --git a/configs/deeplabv3plus/deeplabv3plus_r101-d16-mg124_4xb2-40k_cityscapes-512x1024.py b/configs/deeplabv3plus/deeplabv3plus_r101-d16-mg124_4xb2-40k_cityscapes-512x1024.py new file mode 100644 index 0000000000..71c9118e1d --- /dev/null +++ b/configs/deeplabv3plus/deeplabv3plus_r101-d16-mg124_4xb2-40k_cityscapes-512x1024.py @@ -0,0 +1,11 @@ +_base_ = './deeplabv3plus_r50-d8_4xb2-40k_cityscapes-512x1024.py' +model = dict( + pretrained='open-mmlab://resnet101_v1c', + backbone=dict( + depth=101, + dilations=(1, 1, 1, 2), + strides=(1, 2, 2, 1), + multi_grid=(1, 2, 4)), + decode_head=dict( + dilations=(1, 6, 12, 18), + sampler=dict(type='OHEMPixelSampler', min_kept=100000))) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101-d16-mg124_4xb2-80k_cityscapes-512x1024.py b/configs/deeplabv3plus/deeplabv3plus_r101-d16-mg124_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..7d1ccf0b30 --- /dev/null +++ b/configs/deeplabv3plus/deeplabv3plus_r101-d16-mg124_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,11 @@ +_base_ = './deeplabv3plus_r50-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict( + pretrained='open-mmlab://resnet101_v1c', + backbone=dict( + depth=101, + dilations=(1, 1, 1, 2), + strides=(1, 2, 2, 1), + multi_grid=(1, 2, 4)), + decode_head=dict( + dilations=(1, 6, 12, 18), + sampler=dict(type='OHEMPixelSampler', min_kept=100000))) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101-d16-mg124_512x1024_40k_cityscapes.py b/configs/deeplabv3plus/deeplabv3plus_r101-d16-mg124_512x1024_40k_cityscapes.py deleted file mode 100644 index bf39d2f12b..0000000000 --- a/configs/deeplabv3plus/deeplabv3plus_r101-d16-mg124_512x1024_40k_cityscapes.py +++ /dev/null @@ -1,11 +0,0 @@ -_base_ = './deeplabv3plus_r50-d8_512x1024_40k_cityscapes.py' -model = dict( - pretrained='open-mmlab://resnet101_v1c', - backbone=dict( - depth=101, - dilations=(1, 1, 1, 
2), - strides=(1, 2, 2, 1), - multi_grid=(1, 2, 4)), - decode_head=dict( - dilations=(1, 6, 12, 18), - sampler=dict(type='OHEMPixelSampler', min_kept=100000))) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101-d16-mg124_512x1024_80k_cityscapes.py b/configs/deeplabv3plus/deeplabv3plus_r101-d16-mg124_512x1024_80k_cityscapes.py deleted file mode 100644 index c53ec41baf..0000000000 --- a/configs/deeplabv3plus/deeplabv3plus_r101-d16-mg124_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,11 +0,0 @@ -_base_ = './deeplabv3plus_r50-d8_512x1024_80k_cityscapes.py' -model = dict( - pretrained='open-mmlab://resnet101_v1c', - backbone=dict( - depth=101, - dilations=(1, 1, 1, 2), - strides=(1, 2, 2, 1), - multi_grid=(1, 2, 4)), - decode_head=dict( - dilations=(1, 6, 12, 18), - sampler=dict(type='OHEMPixelSampler', min_kept=100000))) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101-d8_480x480_40k_pascal_context.py b/configs/deeplabv3plus/deeplabv3plus_r101-d8_480x480_40k_pascal_context.py deleted file mode 100644 index 68e2b072e4..0000000000 --- a/configs/deeplabv3plus/deeplabv3plus_r101-d8_480x480_40k_pascal_context.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './deeplabv3plus_r50-d8_480x480_40k_pascal_context.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101-d8_480x480_40k_pascal_context_59.py b/configs/deeplabv3plus/deeplabv3plus_r101-d8_480x480_40k_pascal_context_59.py deleted file mode 100644 index 36a510ff41..0000000000 --- a/configs/deeplabv3plus/deeplabv3plus_r101-d8_480x480_40k_pascal_context_59.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './deeplabv3plus_r50-d8_480x480_40k_pascal_context_59.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101-d8_480x480_80k_pascal_context.py b/configs/deeplabv3plus/deeplabv3plus_r101-d8_480x480_80k_pascal_context.py deleted file mode 100644 index 3a46c28608..0000000000 --- a/configs/deeplabv3plus/deeplabv3plus_r101-d8_480x480_80k_pascal_context.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './deeplabv3plus_r50-d8_480x480_80k_pascal_context.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101-d8_480x480_80k_pascal_context_59.py b/configs/deeplabv3plus/deeplabv3plus_r101-d8_480x480_80k_pascal_context_59.py deleted file mode 100644 index a6a7688c7a..0000000000 --- a/configs/deeplabv3plus/deeplabv3plus_r101-d8_480x480_80k_pascal_context_59.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './deeplabv3plus_r50-d8_480x480_80k_pascal_context_59.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101-d8_4x4_512x512_80k_vaihingen.py b/configs/deeplabv3plus/deeplabv3plus_r101-d8_4x4_512x512_80k_vaihingen.py deleted file mode 100644 index 4bddf4f8bf..0000000000 --- a/configs/deeplabv3plus/deeplabv3plus_r101-d8_4x4_512x512_80k_vaihingen.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './deeplabv3plus_r50-d8_4x4_512x512_80k_vaihingen.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb2-40k_cityscapes-512x1024.py b/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb2-40k_cityscapes-512x1024.py new file mode 100644 index 0000000000..884b526d48 --- /dev/null +++ b/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb2-40k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ 
= './deeplabv3plus_r50-d8_4xb2-40k_cityscapes-512x1024.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb2-40k_cityscapes-769x769.py b/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb2-40k_cityscapes-769x769.py new file mode 100644 index 0000000000..debb0255fc --- /dev/null +++ b/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb2-40k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './deeplabv3plus_r50-d8_4xb2-40k_cityscapes-769x769.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb2-80k_cityscapes-512x1024.py b/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..bc9334e67d --- /dev/null +++ b/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './deeplabv3plus_r50-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb2-80k_cityscapes-769x769.py b/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb2-80k_cityscapes-769x769.py new file mode 100644 index 0000000000..4af9aa2682 --- /dev/null +++ b/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb2-80k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './deeplabv3plus_r50-d8_4xb2-80k_cityscapes-769x769.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb2-amp-80k_cityscapes-512x1024.py b/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb2-amp-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..9c9883dc4f --- /dev/null +++ b/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb2-amp-80k_cityscapes-512x1024.py @@ -0,0 +1,7 @@ +_base_ = './deeplabv3plus_r101-d8_4xb2-80k_cityscapes-512x1024.py' +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=optimizer, + loss_scale=512.) 
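The new `amp` config above replaces the inherited `OptimWrapper` with MMEngine's `AmpOptimWrapper` via `_delete_=True` and pins the loss scale to 512 (a float here means a static scale; `loss_scale='dynamic'` would adapt it during training). As a rough sketch of what static loss scaling does — written in plain PyTorch rather than mmseg's own training loop, with `model`, `optimizer`, `criterion`, and the tensors as placeholders that are not part of this diff:

```python
import torch
from torch.cuda.amp import GradScaler, autocast

# Static scale of 512, mirroring loss_scale=512. in the config above;
# the huge growth_interval keeps GradScaler from raising the scale.
scaler = GradScaler(init_scale=512.0, growth_interval=10**9)

def amp_step(model, optimizer, criterion, images, labels):
    optimizer.zero_grad()
    with autocast():  # forward pass runs in mixed precision
        loss = criterion(model(images), labels)
    scaler.scale(loss).backward()  # scale loss so FP16 grads don't underflow
    scaler.step(optimizer)         # unscales grads, then steps the optimizer
    scaler.update()
    return loss.detach()
```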
diff --git a/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-160k_ade20k-512x512.py b/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-160k_ade20k-512x512.py new file mode 100644 index 0000000000..c38a802e10 --- /dev/null +++ b/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-160k_ade20k-512x512.py @@ -0,0 +1,2 @@ +_base_ = './deeplabv3plus_r50-d8_4xb4-160k_ade20k-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-20k_voc12aug-512x512.py b/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-20k_voc12aug-512x512.py new file mode 100644 index 0000000000..97bb827722 --- /dev/null +++ b/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-20k_voc12aug-512x512.py @@ -0,0 +1,2 @@ +_base_ = './deeplabv3plus_r50-d8_4xb4-20k_voc12aug-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-40k_pascal-context-480x480.py b/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-40k_pascal-context-480x480.py new file mode 100644 index 0000000000..e4b401162d --- /dev/null +++ b/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-40k_pascal-context-480x480.py @@ -0,0 +1,2 @@ +_base_ = './deeplabv3plus_r50-d8_4xb4-40k_pascal-context-480x480.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-40k_pascal-context-59-480x480.py b/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-40k_pascal-context-59-480x480.py new file mode 100644 index 0000000000..eeefae4927 --- /dev/null +++ b/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-40k_pascal-context-59-480x480.py @@ -0,0 +1,2 @@ +_base_ = './deeplabv3plus_r50-d8_4xb4-40k_pascal-context-59-480x480.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-40k_voc12aug-512x512.py b/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-40k_voc12aug-512x512.py new file mode 100644 index 0000000000..0755c53aae --- /dev/null +++ b/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-40k_voc12aug-512x512.py @@ -0,0 +1,2 @@ +_base_ = './deeplabv3plus_r50-d8_4xb4-40k_voc12aug-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-80k_ade20k-512x512.py b/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-80k_ade20k-512x512.py new file mode 100644 index 0000000000..844ac9613b --- /dev/null +++ b/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-80k_ade20k-512x512.py @@ -0,0 +1,2 @@ +_base_ = './deeplabv3plus_r50-d8_4xb4-80k_ade20k-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-80k_loveda-512x512.py b/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-80k_loveda-512x512.py new file mode 100644 index 0000000000..87c6da9d6a --- /dev/null +++ b/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-80k_loveda-512x512.py @@ -0,0 +1,2 @@ +_base_ = './deeplabv3plus_r50-d8_4xb4-80k_loveda-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-80k_pascal-context-480x480.py b/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-80k_pascal-context-480x480.py new file mode 100644 index 0000000000..115b1c9058 --- /dev/null +++ 
b/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-80k_pascal-context-480x480.py @@ -0,0 +1,2 @@ +_base_ = './deeplabv3plus_r50-d8_4xb4-80k_pascal-context-480x480.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-80k_pascal-context-59-480x480.py b/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-80k_pascal-context-59-480x480.py new file mode 100644 index 0000000000..9aaa653822 --- /dev/null +++ b/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-80k_pascal-context-59-480x480.py @@ -0,0 +1,2 @@ +_base_ = './deeplabv3plus_r50-d8_4xb4-80k_pascal-context-59-480x480.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-80k_potsdam-512x512.py b/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-80k_potsdam-512x512.py new file mode 100644 index 0000000000..5063b1332c --- /dev/null +++ b/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-80k_potsdam-512x512.py @@ -0,0 +1,2 @@ +_base_ = './deeplabv3plus_r50-d8_4xb4-80k_potsdam-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-80k_vaihingen-512x512.py b/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-80k_vaihingen-512x512.py new file mode 100644 index 0000000000..b99c2c7ee0 --- /dev/null +++ b/configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-80k_vaihingen-512x512.py @@ -0,0 +1,2 @@ +_base_ = './deeplabv3plus_r50-d8_4xb4-80k_vaihingen-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101-d8_512x1024_40k_cityscapes.py b/configs/deeplabv3plus/deeplabv3plus_r101-d8_512x1024_40k_cityscapes.py deleted file mode 100644 index d6ce85aea5..0000000000 --- a/configs/deeplabv3plus/deeplabv3plus_r101-d8_512x1024_40k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './deeplabv3plus_r50-d8_512x1024_40k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101-d8_512x1024_80k_cityscapes.py b/configs/deeplabv3plus/deeplabv3plus_r101-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index 0ebbd3c70e..0000000000 --- a/configs/deeplabv3plus/deeplabv3plus_r101-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './deeplabv3plus_r50-d8_512x1024_80k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101-d8_512x512_160k_ade20k.py b/configs/deeplabv3plus/deeplabv3plus_r101-d8_512x512_160k_ade20k.py deleted file mode 100644 index a75c9d3019..0000000000 --- a/configs/deeplabv3plus/deeplabv3plus_r101-d8_512x512_160k_ade20k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './deeplabv3plus_r50-d8_512x512_160k_ade20k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101-d8_512x512_20k_voc12aug.py b/configs/deeplabv3plus/deeplabv3plus_r101-d8_512x512_20k_voc12aug.py deleted file mode 100644 index ebb1a8eaee..0000000000 --- a/configs/deeplabv3plus/deeplabv3plus_r101-d8_512x512_20k_voc12aug.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './deeplabv3plus_r50-d8_512x512_20k_voc12aug.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git 
a/configs/deeplabv3plus/deeplabv3plus_r101-d8_512x512_40k_voc12aug.py b/configs/deeplabv3plus/deeplabv3plus_r101-d8_512x512_40k_voc12aug.py deleted file mode 100644 index 3caa6cf8ae..0000000000 --- a/configs/deeplabv3plus/deeplabv3plus_r101-d8_512x512_40k_voc12aug.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './deeplabv3plus_r50-d8_512x512_40k_voc12aug.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_ade20k.py b/configs/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_ade20k.py deleted file mode 100644 index 53fd3a9095..0000000000 --- a/configs/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_ade20k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './deeplabv3plus_r50-d8_512x512_80k_ade20k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_loveda.py b/configs/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_loveda.py deleted file mode 100644 index b3ad3cae2b..0000000000 --- a/configs/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_loveda.py +++ /dev/null @@ -1,6 +0,0 @@ -_base_ = './deeplabv3plus_r50-d8_512x512_80k_loveda.py' -model = dict( - backbone=dict( - depth=101, - init_cfg=dict( - type='Pretrained', checkpoint='open-mmlab://resnet101_v1c'))) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_potsdam.py b/configs/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_potsdam.py deleted file mode 100644 index d89491440a..0000000000 --- a/configs/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_potsdam.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './deeplabv3plus_r50-d8_512x512_80k_potsdam.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101-d8_769x769_40k_cityscapes.py b/configs/deeplabv3plus/deeplabv3plus_r101-d8_769x769_40k_cityscapes.py deleted file mode 100644 index c3c92eb26f..0000000000 --- a/configs/deeplabv3plus/deeplabv3plus_r101-d8_769x769_40k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './deeplabv3plus_r50-d8_769x769_40k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101-d8_769x769_80k_cityscapes.py b/configs/deeplabv3plus/deeplabv3plus_r101-d8_769x769_80k_cityscapes.py deleted file mode 100644 index 5ea9cdb5b6..0000000000 --- a/configs/deeplabv3plus/deeplabv3plus_r101-d8_769x769_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './deeplabv3plus_r50-d8_769x769_80k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101-d8_fp16_512x1024_80k_cityscapes.py b/configs/deeplabv3plus/deeplabv3plus_r101-d8_fp16_512x1024_80k_cityscapes.py deleted file mode 100644 index f92cf030e8..0000000000 --- a/configs/deeplabv3plus/deeplabv3plus_r101-d8_fp16_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,7 +0,0 @@ -_base_ = './deeplabv3plus_r101-d8_512x1024_80k_cityscapes.py' -optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) -optim_wrapper = dict( - _delete_=True, - type='AmpOptimWrapper', - optimizer=optimizer, - loss_scale=512.) 
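Note that the `fp16` config deleted here carries the same body as the new `amp`-named file above; only the naming convention changed. Both rely on `_delete_=True`, which tells the config system to discard the base file's `optim_wrapper` outright instead of merging into it. A toy sketch of that merge rule over plain dicts — a simplification for illustration, not MMEngine's actual `Config` implementation:

```python
def merge_cfg(base: dict, child: dict) -> dict:
    """Simplified _base_ merge: nested dicts merge key by key, unless the
    child dict carries _delete_=True, in which case it replaces the base."""
    if child.get('_delete_', False):
        return {k: v for k, v in child.items() if k != '_delete_'}
    merged = dict(base)
    for key, value in child.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = merge_cfg(merged[key], value)
        else:
            merged[key] = value
    return merged

base = {'optim_wrapper': {'type': 'OptimWrapper',
                          'optimizer': {'type': 'SGD', 'lr': 0.01}}}
child = {'optim_wrapper': {'_delete_': True, 'type': 'AmpOptimWrapper',
                           'optimizer': {'type': 'SGD', 'lr': 0.01},
                           'loss_scale': 512.0}}
assert merge_cfg(base, child)['optim_wrapper']['type'] == 'AmpOptimWrapper'
# Without _delete_, leftover keys from the base wrapper would survive the
# merge and leak into the AmpOptimWrapper settings.
```

This is also why the config re-declares the full `optimizer` dict rather than only the keys that change.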
diff --git a/configs/deeplabv3plus/deeplabv3plus_r101b-d8_4xb2-80k_cityscapes-512x1024.py b/configs/deeplabv3plus/deeplabv3plus_r101b-d8_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..d1bcb09144 --- /dev/null +++ b/configs/deeplabv3plus/deeplabv3plus_r101b-d8_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,4 @@ +_base_ = './deeplabv3plus_r50-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict( + pretrained='torchvision://resnet101', + backbone=dict(type='ResNet', depth=101)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101b-d8_4xb2-80k_cityscapes-769x769.py b/configs/deeplabv3plus/deeplabv3plus_r101b-d8_4xb2-80k_cityscapes-769x769.py new file mode 100644 index 0000000000..c78fc1e209 --- /dev/null +++ b/configs/deeplabv3plus/deeplabv3plus_r101b-d8_4xb2-80k_cityscapes-769x769.py @@ -0,0 +1,4 @@ +_base_ = './deeplabv3plus_r50-d8_4xb2-80k_cityscapes-769x769.py' +model = dict( + pretrained='torchvision://resnet101', + backbone=dict(type='ResNet', depth=101)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101b-d8_512x1024_80k_cityscapes.py b/configs/deeplabv3plus/deeplabv3plus_r101b-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index 398d9759ca..0000000000 --- a/configs/deeplabv3plus/deeplabv3plus_r101b-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,4 +0,0 @@ -_base_ = './deeplabv3plus_r50-d8_512x1024_80k_cityscapes.py' -model = dict( - pretrained='torchvision://resnet101', - backbone=dict(type='ResNet', depth=101)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r101b-d8_769x769_80k_cityscapes.py b/configs/deeplabv3plus/deeplabv3plus_r101b-d8_769x769_80k_cityscapes.py deleted file mode 100644 index 136449083f..0000000000 --- a/configs/deeplabv3plus/deeplabv3plus_r101b-d8_769x769_80k_cityscapes.py +++ /dev/null @@ -1,4 +0,0 @@ -_base_ = './deeplabv3plus_r50-d8_769x769_80k_cityscapes.py' -model = dict( - pretrained='torchvision://resnet101', - backbone=dict(type='ResNet', depth=101)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r18-d8_4x4_512x512_80k_vaihingen.py b/configs/deeplabv3plus/deeplabv3plus_r18-d8_4x4_512x512_80k_vaihingen.py deleted file mode 100644 index 879e941f29..0000000000 --- a/configs/deeplabv3plus/deeplabv3plus_r18-d8_4x4_512x512_80k_vaihingen.py +++ /dev/null @@ -1,11 +0,0 @@ -_base_ = './deeplabv3plus_r50-d8_4x4_512x512_80k_vaihingen.py' -model = dict( - pretrained='open-mmlab://resnet18_v1c', - backbone=dict(depth=18), - decode_head=dict( - c1_in_channels=64, - c1_channels=12, - in_channels=512, - channels=128, - ), - auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r18-d8_4x4_896x896_80k_isaid.py b/configs/deeplabv3plus/deeplabv3plus_r18-d8_4x4_896x896_80k_isaid.py deleted file mode 100644 index 892a8a30e9..0000000000 --- a/configs/deeplabv3plus/deeplabv3plus_r18-d8_4x4_896x896_80k_isaid.py +++ /dev/null @@ -1,11 +0,0 @@ -_base_ = './deeplabv3plus_r50-d8_4x4_896x896_80k_isaid.py' -model = dict( - pretrained='open-mmlab://resnet18_v1c', - backbone=dict(depth=18), - decode_head=dict( - c1_in_channels=64, - c1_channels=12, - in_channels=512, - channels=128, - ), - auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r18-d8_4xb2-80k_cityscapes-512x1024.py b/configs/deeplabv3plus/deeplabv3plus_r18-d8_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..5f54913e94 --- /dev/null +++ b/configs/deeplabv3plus/deeplabv3plus_r18-d8_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,11 @@ +_base_ = 
'./deeplabv3plus_r50-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict( + pretrained='open-mmlab://resnet18_v1c', + backbone=dict(depth=18), + decode_head=dict( + c1_in_channels=64, + c1_channels=12, + in_channels=512, + channels=128, + ), + auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r18-d8_4xb2-80k_cityscapes-769x769.py b/configs/deeplabv3plus/deeplabv3plus_r18-d8_4xb2-80k_cityscapes-769x769.py new file mode 100644 index 0000000000..1b361d6d7a --- /dev/null +++ b/configs/deeplabv3plus/deeplabv3plus_r18-d8_4xb2-80k_cityscapes-769x769.py @@ -0,0 +1,11 @@ +_base_ = './deeplabv3plus_r50-d8_4xb2-80k_cityscapes-769x769.py' +model = dict( + pretrained='open-mmlab://resnet18_v1c', + backbone=dict(depth=18), + decode_head=dict( + c1_in_channels=64, + c1_channels=12, + in_channels=512, + channels=128, + ), + auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r18-d8_4xb4-80k_isaid-896x896.py b/configs/deeplabv3plus/deeplabv3plus_r18-d8_4xb4-80k_isaid-896x896.py new file mode 100644 index 0000000000..3a1a753b26 --- /dev/null +++ b/configs/deeplabv3plus/deeplabv3plus_r18-d8_4xb4-80k_isaid-896x896.py @@ -0,0 +1,11 @@ +_base_ = './deeplabv3plus_r50-d8_4xb4-80k_isaid-896x896.py' +model = dict( + pretrained='open-mmlab://resnet18_v1c', + backbone=dict(depth=18), + decode_head=dict( + c1_in_channels=64, + c1_channels=12, + in_channels=512, + channels=128, + ), + auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r18-d8_4xb4-80k_loveda-512x512.py b/configs/deeplabv3plus/deeplabv3plus_r18-d8_4xb4-80k_loveda-512x512.py new file mode 100644 index 0000000000..01bbf9bca9 --- /dev/null +++ b/configs/deeplabv3plus/deeplabv3plus_r18-d8_4xb4-80k_loveda-512x512.py @@ -0,0 +1,11 @@ +_base_ = './deeplabv3plus_r50-d8_4xb4-80k_loveda-512x512.py' +model = dict( + pretrained='open-mmlab://resnet18_v1c', + backbone=dict(depth=18), + decode_head=dict( + c1_in_channels=64, + c1_channels=12, + in_channels=512, + channels=128, + ), + auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r18-d8_4xb4-80k_potsdam-512x512.py b/configs/deeplabv3plus/deeplabv3plus_r18-d8_4xb4-80k_potsdam-512x512.py new file mode 100644 index 0000000000..134f2cfc2a --- /dev/null +++ b/configs/deeplabv3plus/deeplabv3plus_r18-d8_4xb4-80k_potsdam-512x512.py @@ -0,0 +1,11 @@ +_base_ = './deeplabv3plus_r50-d8_4xb4-80k_potsdam-512x512.py' +model = dict( + pretrained='open-mmlab://resnet18_v1c', + backbone=dict(depth=18), + decode_head=dict( + c1_in_channels=64, + c1_channels=12, + in_channels=512, + channels=128, + ), + auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r18-d8_4xb4-80k_vaihingen-512x512.py b/configs/deeplabv3plus/deeplabv3plus_r18-d8_4xb4-80k_vaihingen-512x512.py new file mode 100644 index 0000000000..2194838510 --- /dev/null +++ b/configs/deeplabv3plus/deeplabv3plus_r18-d8_4xb4-80k_vaihingen-512x512.py @@ -0,0 +1,11 @@ +_base_ = './deeplabv3plus_r50-d8_4xb4-80k_vaihingen-512x512.py' +model = dict( + pretrained='open-mmlab://resnet18_v1c', + backbone=dict(depth=18), + decode_head=dict( + c1_in_channels=64, + c1_channels=12, + in_channels=512, + channels=128, + ), + auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r18-d8_512x1024_80k_cityscapes.py b/configs/deeplabv3plus/deeplabv3plus_r18-d8_512x1024_80k_cityscapes.py deleted file mode 100644 
index aff70c93e6..0000000000 --- a/configs/deeplabv3plus/deeplabv3plus_r18-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,11 +0,0 @@ -_base_ = './deeplabv3plus_r50-d8_512x1024_80k_cityscapes.py' -model = dict( - pretrained='open-mmlab://resnet18_v1c', - backbone=dict(depth=18), - decode_head=dict( - c1_in_channels=64, - c1_channels=12, - in_channels=512, - channels=128, - ), - auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r18-d8_512x512_80k_loveda.py b/configs/deeplabv3plus/deeplabv3plus_r18-d8_512x512_80k_loveda.py deleted file mode 100644 index 11fe640234..0000000000 --- a/configs/deeplabv3plus/deeplabv3plus_r18-d8_512x512_80k_loveda.py +++ /dev/null @@ -1,13 +0,0 @@ -_base_ = './deeplabv3plus_r50-d8_512x512_80k_loveda.py' -model = dict( - backbone=dict( - depth=18, - init_cfg=dict( - type='Pretrained', checkpoint='open-mmlab://resnet18_v1c')), - decode_head=dict( - c1_in_channels=64, - c1_channels=12, - in_channels=512, - channels=128, - ), - auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r18-d8_512x512_80k_potsdam.py b/configs/deeplabv3plus/deeplabv3plus_r18-d8_512x512_80k_potsdam.py deleted file mode 100644 index ffb20df727..0000000000 --- a/configs/deeplabv3plus/deeplabv3plus_r18-d8_512x512_80k_potsdam.py +++ /dev/null @@ -1,11 +0,0 @@ -_base_ = './deeplabv3plus_r50-d8_512x512_80k_potsdam.py' -model = dict( - pretrained='open-mmlab://resnet18_v1c', - backbone=dict(depth=18), - decode_head=dict( - c1_in_channels=64, - c1_channels=12, - in_channels=512, - channels=128, - ), - auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r18-d8_769x769_80k_cityscapes.py b/configs/deeplabv3plus/deeplabv3plus_r18-d8_769x769_80k_cityscapes.py deleted file mode 100644 index 0172d9a87d..0000000000 --- a/configs/deeplabv3plus/deeplabv3plus_r18-d8_769x769_80k_cityscapes.py +++ /dev/null @@ -1,11 +0,0 @@ -_base_ = './deeplabv3plus_r50-d8_769x769_80k_cityscapes.py' -model = dict( - pretrained='open-mmlab://resnet18_v1c', - backbone=dict(depth=18), - decode_head=dict( - c1_in_channels=64, - c1_channels=12, - in_channels=512, - channels=128, - ), - auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r18b-d8_4xb2-80k_cityscapes-512x1024.py b/configs/deeplabv3plus/deeplabv3plus_r18b-d8_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..ea86219692 --- /dev/null +++ b/configs/deeplabv3plus/deeplabv3plus_r18b-d8_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,11 @@ +_base_ = './deeplabv3plus_r50-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict( + pretrained='torchvision://resnet18', + backbone=dict(type='ResNet', depth=18), + decode_head=dict( + c1_in_channels=64, + c1_channels=12, + in_channels=512, + channels=128, + ), + auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r18b-d8_4xb2-80k_cityscapes-769x769.py b/configs/deeplabv3plus/deeplabv3plus_r18b-d8_4xb2-80k_cityscapes-769x769.py new file mode 100644 index 0000000000..34ee7ed3df --- /dev/null +++ b/configs/deeplabv3plus/deeplabv3plus_r18b-d8_4xb2-80k_cityscapes-769x769.py @@ -0,0 +1,11 @@ +_base_ = './deeplabv3plus_r50-d8_4xb2-80k_cityscapes-769x769.py' +model = dict( + pretrained='torchvision://resnet18', + backbone=dict(type='ResNet', depth=18), + decode_head=dict( + c1_in_channels=64, + c1_channels=12, + in_channels=512, + channels=128, + ), + auxiliary_head=dict(in_channels=256, 
channels=64)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r18b-d8_512x1024_80k_cityscapes.py b/configs/deeplabv3plus/deeplabv3plus_r18b-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index b90b292b03..0000000000 --- a/configs/deeplabv3plus/deeplabv3plus_r18b-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,11 +0,0 @@ -_base_ = './deeplabv3plus_r50-d8_512x1024_80k_cityscapes.py' -model = dict( - pretrained='torchvision://resnet18', - backbone=dict(type='ResNet', depth=18), - decode_head=dict( - c1_in_channels=64, - c1_channels=12, - in_channels=512, - channels=128, - ), - auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r18b-d8_769x769_80k_cityscapes.py b/configs/deeplabv3plus/deeplabv3plus_r18b-d8_769x769_80k_cityscapes.py deleted file mode 100644 index b49da3581d..0000000000 --- a/configs/deeplabv3plus/deeplabv3plus_r18b-d8_769x769_80k_cityscapes.py +++ /dev/null @@ -1,11 +0,0 @@ -_base_ = './deeplabv3plus_r50-d8_769x769_80k_cityscapes.py' -model = dict( - pretrained='torchvision://resnet18', - backbone=dict(type='ResNet', depth=18), - decode_head=dict( - c1_in_channels=64, - c1_channels=12, - in_channels=512, - channels=128, - ), - auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb2-300k_mapillay_v1_65-1280x1280.py b/configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb2-300k_mapillay_v1_65-1280x1280.py new file mode 100644 index 0000000000..133c45ae1d --- /dev/null +++ b/configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb2-300k_mapillay_v1_65-1280x1280.py @@ -0,0 +1,58 @@ +_base_ = [ + '../_base_/models/deeplabv3plus_r50-d8.py', + '../_base_/datasets/mapillary_v1_65.py', + '../_base_/default_runtime.py', +] + +crop_size = (1280, 1280) +data_preprocessor = dict(size=crop_size) +model = dict( + data_preprocessor=data_preprocessor, + pretrained='open-mmlab://resnet50_v1c', + backbone=dict(depth=50), + decode_head=dict(num_classes=65), + auxiliary_head=dict(num_classes=65)) + +iters = 300000 +# optimizer +optimizer = dict( + type='AdamW', lr=0.0001, betas=(0.9, 0.999), weight_decay=0.0001) +# optimizer +optim_wrapper = dict( + type='OptimWrapper', + optimizer=optimizer, + clip_grad=dict(max_norm=0.01, norm_type=2), + paramwise_cfg=dict( + custom_keys={'backbone': dict(lr_mult=0.1, decay_mult=1.0)})) +param_scheduler = [ + dict( + type='PolyLR', + eta_min=0, + power=0.9, + begin=0, + end=iters, + by_epoch=False) +] + +# training schedule for 300k +train_cfg = dict( + type='IterBasedTrainLoop', max_iters=iters, val_interval=iters // 10) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') + +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict( + type='CheckpointHook', by_epoch=False, interval=iters // 10), + sampler_seed=dict(type='DistSamplerSeedHook'), + visualization=dict(type='SegVisualizationHook')) + +train_dataloader = dict(batch_size=2) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (4 GPUs) x (2 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=8) diff --git a/configs/deeplabv3plus/deeplabv3plus_r50-d8_512x1024_40k_cityscapes.py b/configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb2-40k_cityscapes-512x1024.py similarity index 100% rename from configs/deeplabv3plus/deeplabv3plus_r50-d8_512x1024_40k_cityscapes.py rename to configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb2-40k_cityscapes-512x1024.py diff --git a/configs/deeplabv3plus/deeplabv3plus_r50-d8_769x769_40k_cityscapes.py b/configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb2-40k_cityscapes-769x769.py similarity index 100% rename from configs/deeplabv3plus/deeplabv3plus_r50-d8_769x769_40k_cityscapes.py rename to configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb2-40k_cityscapes-769x769.py diff --git a/configs/deeplabv3plus/deeplabv3plus_r50-d8_512x1024_80k_cityscapes.py b/configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb2-80k_cityscapes-512x1024.py similarity index 100% rename from configs/deeplabv3plus/deeplabv3plus_r50-d8_512x1024_80k_cityscapes.py rename to configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb2-80k_cityscapes-512x1024.py diff --git a/configs/deeplabv3plus/deeplabv3plus_r50-d8_769x769_80k_cityscapes.py b/configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb2-80k_cityscapes-769x769.py similarity index 100% rename from configs/deeplabv3plus/deeplabv3plus_r50-d8_769x769_80k_cityscapes.py rename to configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb2-80k_cityscapes-769x769.py diff --git a/configs/deeplabv3plus/deeplabv3plus_r50-d8_512x512_160k_ade20k.py b/configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-160k_ade20k-512x512.py similarity index 100% rename from configs/deeplabv3plus/deeplabv3plus_r50-d8_512x512_160k_ade20k.py rename to configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-160k_ade20k-512x512.py diff --git a/configs/deeplabv3plus/deeplabv3plus_r50-d8_512x512_20k_voc12aug.py b/configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-20k_voc12aug-512x512.py similarity index 100% rename from configs/deeplabv3plus/deeplabv3plus_r50-d8_512x512_20k_voc12aug.py rename to configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-20k_voc12aug-512x512.py diff --git a/configs/deeplabv3plus/deeplabv3plus_r50-d8_480x480_40k_pascal_context.py b/configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-40k_pascal-context-480x480.py similarity index 100% rename from configs/deeplabv3plus/deeplabv3plus_r50-d8_480x480_40k_pascal_context.py rename to configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-40k_pascal-context-480x480.py diff --git a/configs/deeplabv3plus/deeplabv3plus_r50-d8_480x480_40k_pascal_context_59.py b/configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-40k_pascal-context-59-480x480.py similarity index 100% rename from configs/deeplabv3plus/deeplabv3plus_r50-d8_480x480_40k_pascal_context_59.py rename to configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-40k_pascal-context-59-480x480.py diff --git a/configs/deeplabv3plus/deeplabv3plus_r50-d8_512x512_40k_voc12aug.py b/configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-40k_voc12aug-512x512.py similarity index 100% rename from configs/deeplabv3plus/deeplabv3plus_r50-d8_512x512_40k_voc12aug.py rename to configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-40k_voc12aug-512x512.py diff --git a/configs/deeplabv3plus/deeplabv3plus_r50-d8_512x512_80k_ade20k.py b/configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-80k_ade20k-512x512.py similarity index 100% rename from configs/deeplabv3plus/deeplabv3plus_r50-d8_512x512_80k_ade20k.py rename to configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-80k_ade20k-512x512.py diff --git 
a/configs/deeplabv3plus/deeplabv3plus_r50-d8_4x4_896x896_80k_isaid.py b/configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-80k_isaid-896x896.py similarity index 100% rename from configs/deeplabv3plus/deeplabv3plus_r50-d8_4x4_896x896_80k_isaid.py rename to configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-80k_isaid-896x896.py diff --git a/configs/deeplabv3plus/deeplabv3plus_r50-d8_512x512_80k_loveda.py b/configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-80k_loveda-512x512.py similarity index 100% rename from configs/deeplabv3plus/deeplabv3plus_r50-d8_512x512_80k_loveda.py rename to configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-80k_loveda-512x512.py diff --git a/configs/deeplabv3plus/deeplabv3plus_r50-d8_480x480_80k_pascal_context.py b/configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-80k_pascal-context-480x480.py similarity index 100% rename from configs/deeplabv3plus/deeplabv3plus_r50-d8_480x480_80k_pascal_context.py rename to configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-80k_pascal-context-480x480.py diff --git a/configs/deeplabv3plus/deeplabv3plus_r50-d8_480x480_80k_pascal_context_59.py b/configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-80k_pascal-context-59-480x480.py similarity index 100% rename from configs/deeplabv3plus/deeplabv3plus_r50-d8_480x480_80k_pascal_context_59.py rename to configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-80k_pascal-context-59-480x480.py diff --git a/configs/deeplabv3plus/deeplabv3plus_r50-d8_512x512_80k_potsdam.py b/configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-80k_potsdam-512x512.py similarity index 100% rename from configs/deeplabv3plus/deeplabv3plus_r50-d8_512x512_80k_potsdam.py rename to configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-80k_potsdam-512x512.py diff --git a/configs/deeplabv3plus/deeplabv3plus_r50-d8_4x4_512x512_80k_vaihingen.py b/configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-80k_vaihingen-512x512.py similarity index 100% rename from configs/deeplabv3plus/deeplabv3plus_r50-d8_4x4_512x512_80k_vaihingen.py rename to configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-80k_vaihingen-512x512.py diff --git a/configs/deeplabv3plus/deeplabv3plus_r50b-d8_4xb2-80k_cityscapes-512x1024.py b/configs/deeplabv3plus/deeplabv3plus_r50b-d8_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..3e2813534d --- /dev/null +++ b/configs/deeplabv3plus/deeplabv3plus_r50b-d8_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './deeplabv3plus_r50-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict(pretrained='torchvision://resnet50', backbone=dict(type='ResNet')) diff --git a/configs/deeplabv3plus/deeplabv3plus_r50b-d8_4xb2-80k_cityscapes-769x769.py b/configs/deeplabv3plus/deeplabv3plus_r50b-d8_4xb2-80k_cityscapes-769x769.py new file mode 100644 index 0000000000..6366bd4e3a --- /dev/null +++ b/configs/deeplabv3plus/deeplabv3plus_r50b-d8_4xb2-80k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './deeplabv3plus_r50-d8_4xb2-80k_cityscapes-769x769.py' +model = dict(pretrained='torchvision://resnet50', backbone=dict(type='ResNet')) diff --git a/configs/deeplabv3plus/deeplabv3plus_r50b-d8_512x1024_80k_cityscapes.py b/configs/deeplabv3plus/deeplabv3plus_r50b-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index dd8e1da9c7..0000000000 --- a/configs/deeplabv3plus/deeplabv3plus_r50b-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './deeplabv3plus_r50-d8_512x1024_80k_cityscapes.py' -model = dict(pretrained='torchvision://resnet50', backbone=dict(type='ResNet')) diff --git 
a/configs/deeplabv3plus/deeplabv3plus_r50b-d8_769x769_80k_cityscapes.py b/configs/deeplabv3plus/deeplabv3plus_r50b-d8_769x769_80k_cityscapes.py deleted file mode 100644 index c0ba019136..0000000000 --- a/configs/deeplabv3plus/deeplabv3plus_r50b-d8_769x769_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './deeplabv3plus_r50-d8_769x769_80k_cityscapes.py' -model = dict(pretrained='torchvision://resnet50', backbone=dict(type='ResNet')) diff --git a/configs/deeplabv3plus/metafile.yaml b/configs/deeplabv3plus/metafile.yaml new file mode 100644 index 0000000000..b41de4dee2 --- /dev/null +++ b/configs/deeplabv3plus/metafile.yaml @@ -0,0 +1,1041 @@ +Collections: +- Name: DeepLabV3+ + License: Apache License 2.0 + Metadata: + Training Data: + - Cityscapes + - ADE20K + - Pascal VOC 2012 + Aug + - Pascal Context + - Pascal Context 59 + - LoveDA + - Potsdam + - Vaihingen + - iSAID + - Mapillary Vistas v1.2 + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + README: configs/deeplabv3plus/README.md + Frameworks: + - PyTorch +Models: +- Name: deeplabv3plus_r50-d8_4xb2-40k_cityscapes-512x1024 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.61 + mIoU(ms+flip): 81.01 + Config: configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Memory (GB): 7.5 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x1024_40k_cityscapes/deeplabv3plus_r50-d8_512x1024_40k_cityscapes_20200605_094610-d222ffcd.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x1024_40k_cityscapes/deeplabv3plus_r50-d8_512x1024_40k_cityscapes_20200605_094610.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r101-d8_4xb2-40k_cityscapes-512x1024 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 80.21 + mIoU(ms+flip): 81.82 + Config: configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Memory (GB): 11.0 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x1024_40k_cityscapes/deeplabv3plus_r101-d8_512x1024_40k_cityscapes_20200605_094614-3769eecf.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x1024_40k_cityscapes/deeplabv3plus_r101-d8_512x1024_40k_cityscapes_20200605_094614.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r50-d8_4xb2-40k_cityscapes-769x769 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.97 + mIoU(ms+flip): 80.46 + Config: 
configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb2-40k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Memory (GB): 8.5 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_769x769_40k_cityscapes/deeplabv3plus_r50-d8_769x769_40k_cityscapes_20200606_114143-1dcb0e3c.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_769x769_40k_cityscapes/deeplabv3plus_r50-d8_769x769_40k_cityscapes_20200606_114143.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r101-d8_4xb2-40k_cityscapes-769x769 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.46 + mIoU(ms+flip): 80.5 + Config: configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb2-40k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Memory (GB): 12.5 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_769x769_40k_cityscapes/deeplabv3plus_r101-d8_769x769_40k_cityscapes_20200606_114304-ff414b9e.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_769x769_40k_cityscapes/deeplabv3plus_r101-d8_769x769_40k_cityscapes_20200606_114304.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r18-d8_4xb2-80k_cityscapes-512x1024 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 76.89 + mIoU(ms+flip): 78.76 + Config: configs/deeplabv3plus/deeplabv3plus_r18-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-18-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Memory (GB): 2.2 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_512x1024_80k_cityscapes/deeplabv3plus_r18-d8_512x1024_80k_cityscapes_20201226_080942-cff257fe.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_512x1024_80k_cityscapes/deeplabv3plus_r18-d8_512x1024_80k_cityscapes-20201226_080942.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r50-d8_4xb2-80k_cityscapes-512x1024 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 80.09 + mIoU(ms+flip): 81.13 + Config: configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x1024_80k_cityscapes/deeplabv3plus_r50-d8_512x1024_80k_cityscapes_20200606_114049-f9fb496d.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x1024_80k_cityscapes/deeplabv3plus_r50-d8_512x1024_80k_cityscapes_20200606_114049.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r101-d8_4xb2-80k_cityscapes-512x1024 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 80.97 + mIoU(ms+flip): 82.03 + Config: configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x1024_80k_cityscapes/deeplabv3plus_r101-d8_512x1024_80k_cityscapes_20200606_114143-068fcfe9.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x1024_80k_cityscapes/deeplabv3plus_r101-d8_512x1024_80k_cityscapes_20200606_114143.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r101-d8_4xb2-amp-80k_cityscapes-512x1024 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 80.46 + Config: configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb2-amp-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - DeepLabV3+ + - (FP16) + Training Resources: 4x V100 GPUS + Memory (GB): 6.35 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_fp16_512x1024_80k_cityscapes/deeplabv3plus_r101-d8_fp16_512x1024_80k_cityscapes_20200717_230920-f1104f4b.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_fp16_512x1024_80k_cityscapes/deeplabv3plus_r101-d8_fp16_512x1024_80k_cityscapes_20200717_230920.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r18-d8_4xb2-80k_cityscapes-769x769 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 76.26 + mIoU(ms+flip): 77.91 + Config: configs/deeplabv3plus/deeplabv3plus_r18-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-18-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Memory (GB): 2.5 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_769x769_80k_cityscapes/deeplabv3plus_r18-d8_769x769_80k_cityscapes_20201226_083346-f326e06a.pth + Training log: 
https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_769x769_80k_cityscapes/deeplabv3plus_r18-d8_769x769_80k_cityscapes-20201226_083346.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r50-d8_4xb2-80k_cityscapes-769x769 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.83 + mIoU(ms+flip): 81.48 + Config: configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_769x769_80k_cityscapes/deeplabv3plus_r50-d8_769x769_80k_cityscapes_20200606_210233-0e9dfdc4.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_769x769_80k_cityscapes/deeplabv3plus_r50-d8_769x769_80k_cityscapes_20200606_210233.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r101-d8_4xb2-80k_cityscapes-769x769 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 80.65 + mIoU(ms+flip): 81.47 + Config: configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_769x769_80k_cityscapes/deeplabv3plus_r101-d8_769x769_80k_cityscapes_20220406_154720-dfcc0b68.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_769x769_80k_cityscapes/deeplabv3plus_r101-d8_769x769_80k_cityscapes_20220406_154720.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r101-d16-mg124_4xb2-40k_cityscapes-512x1024 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.09 + mIoU(ms+flip): 80.36 + Config: configs/deeplabv3plus/deeplabv3plus_r101-d16-mg124_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D16-MG124 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Memory (GB): 5.8 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d16-mg124_512x1024_40k_cityscapes/deeplabv3plus_r101-d16-mg124_512x1024_40k_cityscapes_20200908_005644-cf9ce186.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d16-mg124_512x1024_40k_cityscapes/deeplabv3plus_r101-d16-mg124_512x1024_40k_cityscapes-20200908_005644.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable 
Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r101-d16-mg124_4xb2-80k_cityscapes-512x1024 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.9 + mIoU(ms+flip): 81.33 + Config: configs/deeplabv3plus/deeplabv3plus_r101-d16-mg124_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D16-MG124 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Memory (GB): 9.9 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d16-mg124_512x1024_80k_cityscapes/deeplabv3plus_r101-d16-mg124_512x1024_80k_cityscapes_20200908_005644-ee6158e0.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d16-mg124_512x1024_80k_cityscapes/deeplabv3plus_r101-d16-mg124_512x1024_80k_cityscapes-20200908_005644.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r18b-d8_4xb2-80k_cityscapes-512x1024 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 75.87 + mIoU(ms+flip): 77.52 + Config: configs/deeplabv3plus/deeplabv3plus_r18b-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-18b-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Memory (GB): 2.1 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18b-d8_512x1024_80k_cityscapes/deeplabv3plus_r18b-d8_512x1024_80k_cityscapes_20201226_090828-e451abd9.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18b-d8_512x1024_80k_cityscapes/deeplabv3plus_r18b-d8_512x1024_80k_cityscapes-20201226_090828.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r50b-d8_4xb2-80k_cityscapes-512x1024 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 80.28 + mIoU(ms+flip): 81.44 + Config: configs/deeplabv3plus/deeplabv3plus_r50b-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50b-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Memory (GB): 7.4 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50b-d8_512x1024_80k_cityscapes/deeplabv3plus_r50b-d8_512x1024_80k_cityscapes_20201225_213645-a97e4e43.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50b-d8_512x1024_80k_cityscapes/deeplabv3plus_r50b-d8_512x1024_80k_cityscapes-20201225_213645.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: 
https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r101b-d8_4xb2-80k_cityscapes-512x1024 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 80.16 + mIoU(ms+flip): 81.41 + Config: configs/deeplabv3plus/deeplabv3plus_r101b-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101b-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Memory (GB): 10.9 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101b-d8_512x1024_80k_cityscapes/deeplabv3plus_r101b-d8_512x1024_80k_cityscapes_20201226_190843-9c3c93a4.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101b-d8_512x1024_80k_cityscapes/deeplabv3plus_r101b-d8_512x1024_80k_cityscapes-20201226_190843.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r18b-d8_4xb2-80k_cityscapes-769x769 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 76.36 + mIoU(ms+flip): 78.24 + Config: configs/deeplabv3plus/deeplabv3plus_r18b-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-18b-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Memory (GB): 2.4 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18b-d8_769x769_80k_cityscapes/deeplabv3plus_r18b-d8_769x769_80k_cityscapes_20201226_151312-2c868aff.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18b-d8_769x769_80k_cityscapes/deeplabv3plus_r18b-d8_769x769_80k_cityscapes-20201226_151312.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r50b-d8_4xb2-80k_cityscapes-769x769 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.41 + mIoU(ms+flip): 80.56 + Config: configs/deeplabv3plus/deeplabv3plus_r50b-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50b-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Memory (GB): 8.4 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50b-d8_769x769_80k_cityscapes/deeplabv3plus_r50b-d8_769x769_80k_cityscapes_20201225_224655-8b596d1c.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50b-d8_769x769_80k_cityscapes/deeplabv3plus_r50b-d8_769x769_80k_cityscapes-20201225_224655.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r101b-d8_4xb2-80k_cityscapes-769x769 + In Collection: DeepLabV3+ + Results: + Task: 
Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.88 + mIoU(ms+flip): 81.46 + Config: configs/deeplabv3plus/deeplabv3plus_r101b-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101b-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Memory (GB): 12.3 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101b-d8_769x769_80k_cityscapes/deeplabv3plus_r101b-d8_769x769_80k_cityscapes_20201226_205041-227cdf7c.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101b-d8_769x769_80k_cityscapes/deeplabv3plus_r101b-d8_769x769_80k_cityscapes-20201226_205041.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r50-d8_4xb4-80k_ade20k-512x512 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 42.72 + mIoU(ms+flip): 43.75 + Config: configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Memory (GB): 10.6 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_80k_ade20k/deeplabv3plus_r50-d8_512x512_80k_ade20k_20200614_185028-bf1400d8.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_80k_ade20k/deeplabv3plus_r50-d8_512x512_80k_ade20k_20200614_185028.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r101-d8_4xb4-80k_ade20k-512x512 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 44.6 + mIoU(ms+flip): 46.06 + Config: configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-101-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Memory (GB): 14.1 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_ade20k/deeplabv3plus_r101-d8_512x512_80k_ade20k_20200615_014139-d5730af7.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_ade20k/deeplabv3plus_r101-d8_512x512_80k_ade20k_20200615_014139.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r50-d8_4xb4-160k_ade20k-512x512 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 43.95 + mIoU(ms+flip): 44.93 + Config: configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_160k_ade20k/deeplabv3plus_r50-d8_512x512_160k_ade20k_20200615_124504-6135c7e0.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_160k_ade20k/deeplabv3plus_r50-d8_512x512_160k_ade20k_20200615_124504.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r101-d8_4xb4-160k_ade20k-512x512 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 45.47 + mIoU(ms+flip): 46.35 + Config: configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-101-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_160k_ade20k/deeplabv3plus_r101-d8_512x512_160k_ade20k_20200615_123232-38ed86bb.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_160k_ade20k/deeplabv3plus_r101-d8_512x512_160k_ade20k_20200615_123232.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r50-d8_4xb4-20k_voc12aug-512x512 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 75.93 + mIoU(ms+flip): 77.5 + Config: configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-20k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-50-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Memory (GB): 7.6 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_20k_voc12aug/deeplabv3plus_r50-d8_512x512_20k_voc12aug_20200617_102323-aad58ef1.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_20k_voc12aug/deeplabv3plus_r50-d8_512x512_20k_voc12aug_20200617_102323.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r101-d8_4xb4-20k_voc12aug-512x512 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 77.22 + mIoU(ms+flip): 78.59 + Config: configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-20k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-101-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Memory (GB): 11.0 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_20k_voc12aug/deeplabv3plus_r101-d8_512x512_20k_voc12aug_20200617_102345-c7ff3d56.pth + Training log: 
https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_20k_voc12aug/deeplabv3plus_r101-d8_512x512_20k_voc12aug_20200617_102345.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r50-d8_4xb4-40k_voc12aug-512x512 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 76.81 + mIoU(ms+flip): 77.57 + Config: configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-40k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-50-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_40k_voc12aug/deeplabv3plus_r50-d8_512x512_40k_voc12aug_20200613_161759-e1b43aa9.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_40k_voc12aug/deeplabv3plus_r50-d8_512x512_40k_voc12aug_20200613_161759.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r101-d8_4xb4-40k_voc12aug-512x512 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 78.62 + mIoU(ms+flip): 79.53 + Config: configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-40k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-101-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_40k_voc12aug/deeplabv3plus_r101-d8_512x512_40k_voc12aug_20200613_205333-faf03387.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_40k_voc12aug/deeplabv3plus_r101-d8_512x512_40k_voc12aug_20200613_205333.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r101-d8_4xb4-40k_pascal-context-480x480 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: Pascal Context + Metrics: + mIoU: 47.3 + mIoU(ms+flip): 48.47 + Config: configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-40k_pascal-context-480x480.py + Metadata: + Training Data: Pascal Context + Batch Size: 16 + Architecture: + - R-101-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_480x480_40k_pascal_context/deeplabv3plus_r101-d8_480x480_40k_pascal_context_20200911_165459-d3c8a29e.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_480x480_40k_pascal_context/deeplabv3plus_r101-d8_480x480_40k_pascal_context-20200911_165459.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic 
Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r101-d8_4xb4-80k_pascal-context-480x480 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: Pascal Context + Metrics: + mIoU: 47.23 + mIoU(ms+flip): 48.26 + Config: configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-80k_pascal-context-480x480.py + Metadata: + Training Data: Pascal Context + Batch Size: 16 + Architecture: + - R-101-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_480x480_80k_pascal_context/deeplabv3plus_r101-d8_480x480_80k_pascal_context_20200911_155322-145d3ee8.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_480x480_80k_pascal_context/deeplabv3plus_r101-d8_480x480_80k_pascal_context-20200911_155322.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r101-d8_4xb4-40k_pascal-context-59-480x480 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: Pascal Context 59 + Metrics: + mIoU: 52.86 + mIoU(ms+flip): 54.54 + Config: configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-40k_pascal-context-59-480x480.py + Metadata: + Training Data: Pascal Context 59 + Batch Size: 16 + Architecture: + - R-101-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_480x480_40k_pascal_context_59/deeplabv3plus_r101-d8_480x480_40k_pascal_context_59_20210416_111233-ed937f15.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_480x480_40k_pascal_context_59/deeplabv3plus_r101-d8_480x480_40k_pascal_context_59-20210416_111233.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r101-d8_4xb4-80k_pascal-context-59-480x480 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: Pascal Context 59 + Metrics: + mIoU: 53.2 + mIoU(ms+flip): 54.67 + Config: configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-80k_pascal-context-59-480x480.py + Metadata: + Training Data: Pascal Context 59 + Batch Size: 16 + Architecture: + - R-101-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_480x480_80k_pascal_context_59/deeplabv3plus_r101-d8_480x480_80k_pascal_context_59_20210416_111127-7ca0331d.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_480x480_80k_pascal_context_59/deeplabv3plus_r101-d8_480x480_80k_pascal_context_59-20210416_111127.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: 
https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r18-d8_4xb4-80k_loveda-512x512 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: LoveDA + Metrics: + mIoU: 50.28 + mIoU(ms+flip): 50.47 + Config: configs/deeplabv3plus/deeplabv3plus_r18-d8_4xb4-80k_loveda-512x512.py + Metadata: + Training Data: LoveDA + Batch Size: 16 + Architecture: + - R-18-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Memory (GB): 1.93 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_512x512_80k_loveda/deeplabv3plus_r18-d8_512x512_80k_loveda_20211104_132800-ce0fa0ca.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_512x512_80k_loveda/deeplabv3plus_r18-d8_512x512_80k_loveda_20211104_132800.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r50-d8_4xb4-80k_loveda-512x512 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: LoveDA + Metrics: + mIoU: 50.99 + mIoU(ms+flip): 50.65 + Config: configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-80k_loveda-512x512.py + Metadata: + Training Data: LoveDA + Batch Size: 16 + Architecture: + - R-50-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Memory (GB): 7.37 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_80k_loveda/deeplabv3plus_r50-d8_512x512_80k_loveda_20211105_080442-f0720392.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_80k_loveda/deeplabv3plus_r50-d8_512x512_80k_loveda_20211105_080442.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r101-d8_4xb4-80k_loveda-512x512 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: LoveDA + Metrics: + mIoU: 51.47 + mIoU(ms+flip): 51.32 + Config: configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-80k_loveda-512x512.py + Metadata: + Training Data: LoveDA + Batch Size: 16 + Architecture: + - R-101-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Memory (GB): 10.84 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_loveda/deeplabv3plus_r101-d8_512x512_80k_loveda_20211105_110759-4c1f297e.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_loveda/deeplabv3plus_r101-d8_512x512_80k_loveda_20211105_110759.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r18-d8_4xb4-80k_potsdam-512x512 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: Potsdam + Metrics: + mIoU: 77.09 + mIoU(ms+flip): 78.44 + Config: 
configs/deeplabv3plus/deeplabv3plus_r18-d8_4xb4-80k_potsdam-512x512.py + Metadata: + Training Data: Potsdam + Batch Size: 16 + Architecture: + - R-18-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Memory (GB): 1.91 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_512x512_80k_potsdam/deeplabv3plus_r18-d8_512x512_80k_potsdam_20211219_020601-75fd5bc3.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_512x512_80k_potsdam/deeplabv3plus_r18-d8_512x512_80k_potsdam_20211219_020601.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r50-d8_4xb4-80k_potsdam-512x512 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: Potsdam + Metrics: + mIoU: 78.33 + mIoU(ms+flip): 79.27 + Config: configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-80k_potsdam-512x512.py + Metadata: + Training Data: Potsdam + Batch Size: 16 + Architecture: + - R-50-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Memory (GB): 7.36 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_80k_potsdam/deeplabv3plus_r50-d8_512x512_80k_potsdam_20211219_031508-7e7a2b24.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_512x512_80k_potsdam/deeplabv3plus_r50-d8_512x512_80k_potsdam_20211219_031508.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r101-d8_4xb4-80k_potsdam-512x512 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: Potsdam + Metrics: + mIoU: 78.7 + mIoU(ms+flip): 79.47 + Config: configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-80k_potsdam-512x512.py + Metadata: + Training Data: Potsdam + Batch Size: 16 + Architecture: + - R-101-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Memory (GB): 10.83 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_potsdam/deeplabv3plus_r101-d8_512x512_80k_potsdam_20211219_031508-8b112708.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_512x512_80k_potsdam/deeplabv3plus_r101-d8_512x512_80k_potsdam_20211219_031508.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r18-d8_4xb4-80k_vaihingen-512x512 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: Vaihingen + Metrics: + mIoU: 72.5 + mIoU(ms+flip): 74.13 + Config: configs/deeplabv3plus/deeplabv3plus_r18-d8_4xb4-80k_vaihingen-512x512.py + Metadata: + Training Data: Vaihingen + Batch Size: 16 + Architecture: + - R-18-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Memory (GB): 1.91 + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_4x4_512x512_80k_vaihingen/deeplabv3plus_r18-d8_4x4_512x512_80k_vaihingen_20211231_230805-7626a263.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_4x4_512x512_80k_vaihingen/deeplabv3plus_r18-d8_4x4_512x512_80k_vaihingen_20211231_230805.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r50-d8_4xb4-80k_vaihingen-512x512 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: Vaihingen + Metrics: + mIoU: 73.97 + mIoU(ms+flip): 75.05 + Config: configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-80k_vaihingen-512x512.py + Metadata: + Training Data: Vaihingen + Batch Size: 16 + Architecture: + - R-50-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Memory (GB): 7.36 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_4x4_512x512_80k_vaihingen/deeplabv3plus_r50-d8_4x4_512x512_80k_vaihingen_20211231_230816-5040938d.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_4x4_512x512_80k_vaihingen/deeplabv3plus_r50-d8_4x4_512x512_80k_vaihingen_20211231_230816.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r101-d8_4xb4-80k_vaihingen-512x512 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: Vaihingen + Metrics: + mIoU: 73.06 + mIoU(ms+flip): 74.14 + Config: configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-80k_vaihingen-512x512.py + Metadata: + Training Data: Vaihingen + Batch Size: 16 + Architecture: + - R-101-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Memory (GB): 10.83 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_4x4_512x512_80k_vaihingen/deeplabv3plus_r101-d8_4x4_512x512_80k_vaihingen_20211231_230816-8a095afa.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r101-d8_4x4_512x512_80k_vaihingen/deeplabv3plus_r101-d8_4x4_512x512_80k_vaihingen_20211231_230816.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r18-d8_4xb4-80k_isaid-896x896 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: iSAID + Metrics: + mIoU: 61.35 + mIoU(ms+flip): 62.61 + Config: configs/deeplabv3plus/deeplabv3plus_r18-d8_4xb4-80k_isaid-896x896.py + Metadata: + Training Data: iSAID + Batch Size: 16 + Architecture: + - R-18-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Memory (GB): 6.19 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_4x4_896x896_80k_isaid/deeplabv3plus_r18-d8_4x4_896x896_80k_isaid_20220110_180526-7059991d.pth + Training log: 
https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r18-d8_4x4_896x896_80k_isaid/deeplabv3plus_r18-d8_4x4_896x896_80k_isaid_20220110_180526.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r50-d8_4xb4-80k_isaid-896x896 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: iSAID + Metrics: + mIoU: 67.06 + mIoU(ms+flip): 68.02 + Config: configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb4-80k_isaid-896x896.py + Metadata: + Training Data: iSAID + Batch Size: 16 + Architecture: + - R-50-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Memory (GB): 21.45 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_4x4_896x896_80k_isaid/deeplabv3plus_r50-d8_4x4_896x896_80k_isaid_20220110_180526-598be439.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_4x4_896x896_80k_isaid/deeplabv3plus_r50-d8_4x4_896x896_80k_isaid_20220110_180526.log.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch +- Name: deeplabv3plus_r50-d8_4xb2-300k_mapillay_v1_65-1280x1280 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: Mapillary Vistas v1.2 + Metrics: + mIoU: 47.35 + Config: configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb2-300k_mapillay_v1_65-1280x1280.py + Metadata: + Training Data: Mapillary Vistas v1.2 + Batch Size: 8 + Architecture: + - R-50-D8 + - DeepLabV3+ + Training Resources: 4x A100 GPUS + Memory (GB): 24.04 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_4xb2-300k_mapillay_v1_65-1280x1280/deeplabv3plus_r50-d8_4xb2-300k_mapillay_v1_65-1280x1280_20230301_110504-655f8e43.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3plus/deeplabv3plus_r50-d8_4xb2-300k_mapillay_v1_65-1280x1280/deeplabv3plus_r50-d8_4xb2-300k_mapillay_v1_65-1280x1280_20230301_110504.json + Paper: + Title: Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation + URL: https://arxiv.org/abs/1802.02611 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/sep_aspp_head.py#L30 + Framework: PyTorch diff --git a/configs/dmnet/README.md b/configs/dmnet/README.md index 301bd4599f..b0cf94455e 100644 --- a/configs/dmnet/README.md +++ b/configs/dmnet/README.md @@ -1,6 +1,6 @@ # DMNet -[Dynamic Multi-scale Filters for Semantic Segmentation](https://openaccess.thecvf.com/content_ICCV_2019/papers/He_Dynamic_Multi-Scale_Filters_for_Semantic_Segmentation_ICCV_2019_paper.pdf) +> [Dynamic Multi-scale Filters for Semantic Segmentation](https://openaccess.thecvf.com/content_ICCV_2019/papers/He_Dynamic_Multi-Scale_Filters_for_Semantic_Segmentation_ICCV_2019_paper.pdf) ## Introduction @@ -22,6 +22,30 @@ Multi-scale representation provides an effective way toaddress scale variation o +## Results and models + +### Cityscapes + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | 
--------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ---------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| DMNet | R-50-D8 | 512x1024 | 40000 | 7.0 | 3.66 | V100 | 77.78 | 79.14 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/dmnet/dmnet_r50-d8_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_512x1024_40k_cityscapes/dmnet_r50-d8_512x1024_40k_cityscapes_20201215_042326-615373cf.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_512x1024_40k_cityscapes/dmnet_r50-d8_512x1024_40k_cityscapes-20201215_042326.log.json) | +| DMNet | R-101-D8 | 512x1024 | 40000 | 10.6 | 2.54 | V100 | 78.37 | 79.72 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/dmnet/dmnet_r101-d8_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_512x1024_40k_cityscapes/dmnet_r101-d8_512x1024_40k_cityscapes_20201215_043100-8291e976.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_512x1024_40k_cityscapes/dmnet_r101-d8_512x1024_40k_cityscapes-20201215_043100.log.json) | +| DMNet | R-50-D8 | 769x769 | 40000 | 7.9 | 1.57 | V100 | 78.49 | 80.27 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/dmnet/dmnet_r50-d8_4xb2-40k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_769x769_40k_cityscapes/dmnet_r50-d8_769x769_40k_cityscapes_20201215_093706-e7f0e23e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_769x769_40k_cityscapes/dmnet_r50-d8_769x769_40k_cityscapes-20201215_093706.log.json) | +| DMNet | R-101-D8 | 769x769 | 40000 | 12.0 | 1.01 | V100 | 77.62 | 78.94 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/dmnet/dmnet_r101-d8_4xb2-40k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_769x769_40k_cityscapes/dmnet_r101-d8_769x769_40k_cityscapes_20201215_081348-a74261f6.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_769x769_40k_cityscapes/dmnet_r101-d8_769x769_40k_cityscapes-20201215_081348.log.json) | +| DMNet | R-50-D8 | 512x1024 | 80000 | - | - | V100 | 79.07 | 80.22 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/dmnet/dmnet_r50-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_512x1024_80k_cityscapes/dmnet_r50-d8_512x1024_80k_cityscapes_20201215_053728-3c8893b9.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_512x1024_80k_cityscapes/dmnet_r50-d8_512x1024_80k_cityscapes-20201215_053728.log.json) | +| DMNet | R-101-D8 | 512x1024 | 80000 | - | - | V100 | 79.64 | 80.67 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/dmnet/dmnet_r101-d8_4xb2-80k_cityscapes-512x1024.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_512x1024_80k_cityscapes/dmnet_r101-d8_512x1024_80k_cityscapes_20201215_031718-fa081cb8.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_512x1024_80k_cityscapes/dmnet_r101-d8_512x1024_80k_cityscapes-20201215_031718.log.json) | +| DMNet | R-50-D8 | 769x769 | 80000 | - | - | V100 | 79.22 | 80.55 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/dmnet/dmnet_r50-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_769x769_80k_cityscapes/dmnet_r50-d8_769x769_80k_cityscapes_20201215_034006-6060840e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_769x769_80k_cityscapes/dmnet_r50-d8_769x769_80k_cityscapes-20201215_034006.log.json) | +| DMNet | R-101-D8 | 769x769 | 80000 | - | - | V100 | 79.19 | 80.65 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/dmnet/dmnet_r101-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_769x769_80k_cityscapes/dmnet_r101-d8_769x769_80k_cityscapes_20201215_082810-7f0de59a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_769x769_80k_cityscapes/dmnet_r101-d8_769x769_80k_cityscapes-20201215_082810.log.json) | + +### ADE20K + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| DMNet | R-50-D8 | 512x512 | 80000 | 9.4 | 20.95 | V100 | 42.37 | 43.62 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/dmnet/dmnet_r50-d8_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_512x512_80k_ade20k/dmnet_r50-d8_512x512_80k_ade20k_20201215_144744-f89092a6.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_512x512_80k_ade20k/dmnet_r50-d8_512x512_80k_ade20k-20201215_144744.log.json) | +| DMNet | R-101-D8 | 512x512 | 80000 | 13.0 | 13.88 | V100 | 45.34 | 46.13 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/dmnet/dmnet_r101-d8_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_512x512_80k_ade20k/dmnet_r101-d8_512x512_80k_ade20k_20201215_104812-bfa45311.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_512x512_80k_ade20k/dmnet_r101-d8_512x512_80k_ade20k-20201215_104812.log.json) | +| DMNet | R-50-D8 | 512x512 | 160000 | - | - | V100 | 43.15 | 44.17 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/dmnet/dmnet_r50-d8_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_512x512_160k_ade20k/dmnet_r50-d8_512x512_160k_ade20k_20201215_115313-025ab3f9.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_512x512_160k_ade20k/dmnet_r50-d8_512x512_160k_ade20k-20201215_115313.log.json) | +| DMNet | R-101-D8 | 512x512 | 160000 | - | - | V100 | 45.42 | 46.76 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/dmnet/dmnet_r101-d8_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_512x512_160k_ade20k/dmnet_r101-d8_512x512_160k_ade20k_20201215_111145-a0bc02ef.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_512x512_160k_ade20k/dmnet_r101-d8_512x512_160k_ade20k-20201215_111145.log.json) | + ## Citation ```bibtex @@ -33,27 +57,3 @@ month = {October}, year = {2019} } ``` - -## Results and models - -### Cityscapes - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| DMNet | R-50-D8 | 512x1024 | 40000 | 7.0 | 3.66 | 77.78 | 79.14 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dmnet/dmnet_r50-d8_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_512x1024_40k_cityscapes/dmnet_r50-d8_512x1024_40k_cityscapes_20201215_042326-615373cf.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_512x1024_40k_cityscapes/dmnet_r50-d8_512x1024_40k_cityscapes-20201215_042326.log.json) | -| DMNet | R-101-D8 | 512x1024 | 40000 | 10.6 | 2.54 | 78.37 | 79.72 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dmnet/dmnet_r101-d8_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_512x1024_40k_cityscapes/dmnet_r101-d8_512x1024_40k_cityscapes_20201215_043100-8291e976.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_512x1024_40k_cityscapes/dmnet_r101-d8_512x1024_40k_cityscapes-20201215_043100.log.json) | -| DMNet | R-50-D8 | 769x769 | 40000 | 7.9 | 1.57 | 78.49 | 80.27 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dmnet/dmnet_r50-d8_769x769_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_769x769_40k_cityscapes/dmnet_r50-d8_769x769_40k_cityscapes_20201215_093706-e7f0e23e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_769x769_40k_cityscapes/dmnet_r50-d8_769x769_40k_cityscapes-20201215_093706.log.json) | -| DMNet | R-101-D8 | 769x769 | 40000 | 12.0 | 1.01 | 77.62 | 78.94 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dmnet/dmnet_r101-d8_769x769_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_769x769_40k_cityscapes/dmnet_r101-d8_769x769_40k_cityscapes_20201215_081348-a74261f6.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_769x769_40k_cityscapes/dmnet_r101-d8_769x769_40k_cityscapes-20201215_081348.log.json) | -| DMNet | R-50-D8 | 512x1024 | 80000 | - | - | 79.07 | 80.22 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dmnet/dmnet_r50-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_512x1024_80k_cityscapes/dmnet_r50-d8_512x1024_80k_cityscapes_20201215_053728-3c8893b9.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_512x1024_80k_cityscapes/dmnet_r50-d8_512x1024_80k_cityscapes-20201215_053728.log.json) | -| DMNet | R-101-D8 | 512x1024 | 80000 | - | - | 79.64 | 80.67 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dmnet/dmnet_r101-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_512x1024_80k_cityscapes/dmnet_r101-d8_512x1024_80k_cityscapes_20201215_031718-fa081cb8.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_512x1024_80k_cityscapes/dmnet_r101-d8_512x1024_80k_cityscapes-20201215_031718.log.json) | -| DMNet | R-50-D8 | 769x769 | 80000 | - | - | 79.22 | 80.55 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dmnet/dmnet_r50-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_769x769_80k_cityscapes/dmnet_r50-d8_769x769_80k_cityscapes_20201215_034006-6060840e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_769x769_80k_cityscapes/dmnet_r50-d8_769x769_80k_cityscapes-20201215_034006.log.json) | -| DMNet | R-101-D8 | 769x769 | 80000 | - | - | 79.19 | 80.65 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dmnet/dmnet_r101-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_769x769_80k_cityscapes/dmnet_r101-d8_769x769_80k_cityscapes_20201215_082810-7f0de59a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_769x769_80k_cityscapes/dmnet_r101-d8_769x769_80k_cityscapes-20201215_082810.log.json) | - -### ADE20K - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | --------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| DMNet | R-50-D8 | 512x512 | 80000 | 9.4 | 20.95 | 42.37 | 43.62 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dmnet/dmnet_r50-d8_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_512x512_80k_ade20k/dmnet_r50-d8_512x512_80k_ade20k_20201215_144744-f89092a6.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_512x512_80k_ade20k/dmnet_r50-d8_512x512_80k_ade20k-20201215_144744.log.json) | -| DMNet | R-101-D8 | 512x512 | 80000 | 13.0 | 13.88 | 45.34 | 46.13 | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dmnet/dmnet_r101-d8_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_512x512_80k_ade20k/dmnet_r101-d8_512x512_80k_ade20k_20201215_104812-bfa45311.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_512x512_80k_ade20k/dmnet_r101-d8_512x512_80k_ade20k-20201215_104812.log.json) | -| DMNet | R-50-D8 | 512x512 | 160000 | - | - | 43.15 | 44.17 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dmnet/dmnet_r50-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_512x512_160k_ade20k/dmnet_r50-d8_512x512_160k_ade20k_20201215_115313-025ab3f9.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_512x512_160k_ade20k/dmnet_r50-d8_512x512_160k_ade20k-20201215_115313.log.json) | -| DMNet | R-101-D8 | 512x512 | 160000 | - | - | 45.42 | 46.76 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dmnet/dmnet_r101-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_512x512_160k_ade20k/dmnet_r101-d8_512x512_160k_ade20k_20201215_111145-a0bc02ef.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_512x512_160k_ade20k/dmnet_r101-d8_512x512_160k_ade20k-20201215_111145.log.json) | diff --git a/configs/dmnet/dmnet.yml b/configs/dmnet/dmnet.yml deleted file mode 100644 index 1fab2dc7a7..0000000000 --- a/configs/dmnet/dmnet.yml +++ /dev/null @@ -1,232 +0,0 @@ -Collections: -- Name: DMNet - Metadata: - Training Data: - - Cityscapes - - ADE20K - Paper: - URL: https://openaccess.thecvf.com/content_ICCV_2019/papers/He_Dynamic_Multi-Scale_Filters_for_Semantic_Segmentation_ICCV_2019_paper.pdf - Title: Dynamic Multi-scale Filters for Semantic Segmentation - README: configs/dmnet/README.md - Code: - URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/dm_head.py#L93 - Version: v0.17.0 - Converted From: - Code: https://github.com/Junjun2016/DMNet -Models: -- Name: dmnet_r50-d8_512x1024_40k_cityscapes - In Collection: DMNet - Metadata: - backbone: R-50-D8 - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 273.22 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 7.0 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 77.78 - mIoU(ms+flip): 79.14 - Config: configs/dmnet/dmnet_r50-d8_512x1024_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_512x1024_40k_cityscapes/dmnet_r50-d8_512x1024_40k_cityscapes_20201215_042326-615373cf.pth -- Name: dmnet_r101-d8_512x1024_40k_cityscapes - In Collection: DMNet - Metadata: - backbone: R-101-D8 - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 393.7 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 10.6 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.37 - mIoU(ms+flip): 79.72 - Config: configs/dmnet/dmnet_r101-d8_512x1024_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_512x1024_40k_cityscapes/dmnet_r101-d8_512x1024_40k_cityscapes_20201215_043100-8291e976.pth -- Name: dmnet_r50-d8_769x769_40k_cityscapes - In Collection: DMNet - Metadata: 
-    backbone: R-50-D8
-    crop size: (769,769)
-    lr schd: 40000
-    inference time (ms/im):
-    - value: 636.94
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (769,769)
-    Training Memory (GB): 7.9
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 78.49
-      mIoU(ms+flip): 80.27
-  Config: configs/dmnet/dmnet_r50-d8_769x769_40k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_769x769_40k_cityscapes/dmnet_r50-d8_769x769_40k_cityscapes_20201215_093706-e7f0e23e.pth
-- Name: dmnet_r101-d8_769x769_40k_cityscapes
-  In Collection: DMNet
-  Metadata:
-    backbone: R-101-D8
-    crop size: (769,769)
-    lr schd: 40000
-    inference time (ms/im):
-    - value: 990.1
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (769,769)
-    Training Memory (GB): 12.0
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 77.62
-      mIoU(ms+flip): 78.94
-  Config: configs/dmnet/dmnet_r101-d8_769x769_40k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_769x769_40k_cityscapes/dmnet_r101-d8_769x769_40k_cityscapes_20201215_081348-a74261f6.pth
-- Name: dmnet_r50-d8_512x1024_80k_cityscapes
-  In Collection: DMNet
-  Metadata:
-    backbone: R-50-D8
-    crop size: (512,1024)
-    lr schd: 80000
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 79.07
-      mIoU(ms+flip): 80.22
-  Config: configs/dmnet/dmnet_r50-d8_512x1024_80k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_512x1024_80k_cityscapes/dmnet_r50-d8_512x1024_80k_cityscapes_20201215_053728-3c8893b9.pth
-- Name: dmnet_r101-d8_512x1024_80k_cityscapes
-  In Collection: DMNet
-  Metadata:
-    backbone: R-101-D8
-    crop size: (512,1024)
-    lr schd: 80000
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 79.64
-      mIoU(ms+flip): 80.67
-  Config: configs/dmnet/dmnet_r101-d8_512x1024_80k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_512x1024_80k_cityscapes/dmnet_r101-d8_512x1024_80k_cityscapes_20201215_031718-fa081cb8.pth
-- Name: dmnet_r50-d8_769x769_80k_cityscapes
-  In Collection: DMNet
-  Metadata:
-    backbone: R-50-D8
-    crop size: (769,769)
-    lr schd: 80000
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 79.22
-      mIoU(ms+flip): 80.55
-  Config: configs/dmnet/dmnet_r50-d8_769x769_80k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_769x769_80k_cityscapes/dmnet_r50-d8_769x769_80k_cityscapes_20201215_034006-6060840e.pth
-- Name: dmnet_r101-d8_769x769_80k_cityscapes
-  In Collection: DMNet
-  Metadata:
-    backbone: R-101-D8
-    crop size: (769,769)
-    lr schd: 80000
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 79.19
-      mIoU(ms+flip): 80.65
-  Config: configs/dmnet/dmnet_r101-d8_769x769_80k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_769x769_80k_cityscapes/dmnet_r101-d8_769x769_80k_cityscapes_20201215_082810-7f0de59a.pth
-- Name: dmnet_r50-d8_512x512_80k_ade20k
-  In Collection: DMNet
-  Metadata:
-    backbone: R-50-D8
-    crop size: (512,512)
-    lr schd: 80000
-    inference time (ms/im):
-    - value: 47.73
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (512,512)
-    Training Memory (GB): 9.4
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: ADE20K
-    Metrics:
-      mIoU: 42.37
-      mIoU(ms+flip): 43.62
-  Config: configs/dmnet/dmnet_r50-d8_512x512_80k_ade20k.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_512x512_80k_ade20k/dmnet_r50-d8_512x512_80k_ade20k_20201215_144744-f89092a6.pth
-- Name: dmnet_r101-d8_512x512_80k_ade20k
-  In Collection: DMNet
-  Metadata:
-    backbone: R-101-D8
-    crop size: (512,512)
-    lr schd: 80000
-    inference time (ms/im):
-    - value: 72.05
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (512,512)
-    Training Memory (GB): 13.0
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: ADE20K
-    Metrics:
-      mIoU: 45.34
-      mIoU(ms+flip): 46.13
-  Config: configs/dmnet/dmnet_r101-d8_512x512_80k_ade20k.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_512x512_80k_ade20k/dmnet_r101-d8_512x512_80k_ade20k_20201215_104812-bfa45311.pth
-- Name: dmnet_r50-d8_512x512_160k_ade20k
-  In Collection: DMNet
-  Metadata:
-    backbone: R-50-D8
-    crop size: (512,512)
-    lr schd: 160000
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: ADE20K
-    Metrics:
-      mIoU: 43.15
-      mIoU(ms+flip): 44.17
-  Config: configs/dmnet/dmnet_r50-d8_512x512_160k_ade20k.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_512x512_160k_ade20k/dmnet_r50-d8_512x512_160k_ade20k_20201215_115313-025ab3f9.pth
-- Name: dmnet_r101-d8_512x512_160k_ade20k
-  In Collection: DMNet
-  Metadata:
-    backbone: R-101-D8
-    crop size: (512,512)
-    lr schd: 160000
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: ADE20K
-    Metrics:
-      mIoU: 45.42
-      mIoU(ms+flip): 46.76
-  Config: configs/dmnet/dmnet_r101-d8_512x512_160k_ade20k.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_512x512_160k_ade20k/dmnet_r101-d8_512x512_160k_ade20k_20201215_111145-a0bc02ef.pth
diff --git a/configs/dmnet/dmnet_r101-d8_4xb2-40k_cityscapes-512x1024.py b/configs/dmnet/dmnet_r101-d8_4xb2-40k_cityscapes-512x1024.py
new file mode 100644
index 0000000000..9832b62a29
--- /dev/null
+++ b/configs/dmnet/dmnet_r101-d8_4xb2-40k_cityscapes-512x1024.py
@@ -0,0 +1,2 @@
+_base_ = './dmnet_r50-d8_4xb2-40k_cityscapes-512x1024.py'
+model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/dmnet/dmnet_r101-d8_4xb2-40k_cityscapes-769x769.py b/configs/dmnet/dmnet_r101-d8_4xb2-40k_cityscapes-769x769.py
new file mode 100644
index 0000000000..03346c5d9b
--- /dev/null
+++ b/configs/dmnet/dmnet_r101-d8_4xb2-40k_cityscapes-769x769.py
@@ -0,0 +1,2 @@
+_base_ = './dmnet_r50-d8_4xb2-40k_cityscapes-769x769.py'
+model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/dmnet/dmnet_r101-d8_4xb2-80k_cityscapes-512x1024.py b/configs/dmnet/dmnet_r101-d8_4xb2-80k_cityscapes-512x1024.py
new file mode 100644
index 0000000000..fd7e9acd1c
--- /dev/null
+++ b/configs/dmnet/dmnet_r101-d8_4xb2-80k_cityscapes-512x1024.py
@@ -0,0 +1,2 @@
+_base_ = './dmnet_r50-d8_4xb2-80k_cityscapes-512x1024.py'
+model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/dmnet/dmnet_r101-d8_4xb2-80k_cityscapes-769x769.py b/configs/dmnet/dmnet_r101-d8_4xb2-80k_cityscapes-769x769.py
new file mode 100644
index 0000000000..2205e601ce
--- /dev/null
+++ b/configs/dmnet/dmnet_r101-d8_4xb2-80k_cityscapes-769x769.py
@@ -0,0 +1,2 @@
+_base_ = './dmnet_r50-d8_4xb2-80k_cityscapes-769x769.py'
+model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/dmnet/dmnet_r101-d8_4xb4-160k_ade20k-512x512.py b/configs/dmnet/dmnet_r101-d8_4xb4-160k_ade20k-512x512.py
new file mode 100644
index 0000000000..23e215bf2f
--- /dev/null
+++ b/configs/dmnet/dmnet_r101-d8_4xb4-160k_ade20k-512x512.py
@@ -0,0 +1,2 @@
+_base_ = './dmnet_r50-d8_4xb4-160k_ade20k-512x512.py'
+model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/dmnet/dmnet_r101-d8_4xb4-80k_ade20k-512x512.py b/configs/dmnet/dmnet_r101-d8_4xb4-80k_ade20k-512x512.py
new file mode 100644
index 0000000000..5c25587e64
--- /dev/null
+++ b/configs/dmnet/dmnet_r101-d8_4xb4-80k_ade20k-512x512.py
@@ -0,0 +1,2 @@
+_base_ = './dmnet_r50-d8_4xb4-80k_ade20k-512x512.py'
+model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/dmnet/dmnet_r101-d8_512x1024_40k_cityscapes.py b/configs/dmnet/dmnet_r101-d8_512x1024_40k_cityscapes.py
deleted file mode 100644
index fd6897691d..0000000000
--- a/configs/dmnet/dmnet_r101-d8_512x1024_40k_cityscapes.py
+++ /dev/null
@@ -1,2 +0,0 @@
-_base_ = './dmnet_r50-d8_512x1024_40k_cityscapes.py'
-model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/dmnet/dmnet_r101-d8_512x1024_80k_cityscapes.py b/configs/dmnet/dmnet_r101-d8_512x1024_80k_cityscapes.py
deleted file mode 100644
index 116cbdcede..0000000000
--- a/configs/dmnet/dmnet_r101-d8_512x1024_80k_cityscapes.py
+++ /dev/null
@@ -1,2 +0,0 @@
-_base_ = './dmnet_r50-d8_512x1024_80k_cityscapes.py'
-model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/dmnet/dmnet_r101-d8_512x512_160k_ade20k.py b/configs/dmnet/dmnet_r101-d8_512x512_160k_ade20k.py
deleted file mode 100644
index d78d46c040..0000000000
--- a/configs/dmnet/dmnet_r101-d8_512x512_160k_ade20k.py
+++ /dev/null
@@ -1,2 +0,0 @@
-_base_ = './dmnet_r50-d8_512x512_160k_ade20k.py'
-model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/dmnet/dmnet_r101-d8_512x512_80k_ade20k.py b/configs/dmnet/dmnet_r101-d8_512x512_80k_ade20k.py
deleted file mode 100644
index 9713b731a4..0000000000
--- a/configs/dmnet/dmnet_r101-d8_512x512_80k_ade20k.py
+++ /dev/null
@@ -1,2 +0,0 @@
-_base_ = './dmnet_r50-d8_512x512_80k_ade20k.py'
-model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/dmnet/dmnet_r101-d8_769x769_40k_cityscapes.py b/configs/dmnet/dmnet_r101-d8_769x769_40k_cityscapes.py
deleted file mode 100644
index 6b222e7300..0000000000
--- a/configs/dmnet/dmnet_r101-d8_769x769_40k_cityscapes.py
+++ /dev/null
@@ -1,2 +0,0 @@
-_base_ = './dmnet_r50-d8_769x769_40k_cityscapes.py'
-model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/dmnet/dmnet_r101-d8_769x769_80k_cityscapes.py b/configs/dmnet/dmnet_r101-d8_769x769_80k_cityscapes.py
deleted file mode 100644
index f36d490e9c..0000000000
--- a/configs/dmnet/dmnet_r101-d8_769x769_80k_cityscapes.py
+++ /dev/null
@@ -1,2 +0,0 @@
-_base_ = './dmnet_r50-d8_769x769_80k_cityscapes.py'
-model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/dmnet/dmnet_r50-d8_512x1024_40k_cityscapes.py b/configs/dmnet/dmnet_r50-d8_4xb2-40k_cityscapes-512x1024.py
similarity index 100%
rename from configs/dmnet/dmnet_r50-d8_512x1024_40k_cityscapes.py
rename to configs/dmnet/dmnet_r50-d8_4xb2-40k_cityscapes-512x1024.py
diff --git a/configs/dmnet/dmnet_r50-d8_769x769_40k_cityscapes.py b/configs/dmnet/dmnet_r50-d8_4xb2-40k_cityscapes-769x769.py
similarity index 100%
rename from configs/dmnet/dmnet_r50-d8_769x769_40k_cityscapes.py
rename to configs/dmnet/dmnet_r50-d8_4xb2-40k_cityscapes-769x769.py
diff --git a/configs/dmnet/dmnet_r50-d8_512x1024_80k_cityscapes.py b/configs/dmnet/dmnet_r50-d8_4xb2-80k_cityscapes-512x1024.py
similarity index 100%
rename from configs/dmnet/dmnet_r50-d8_512x1024_80k_cityscapes.py
rename to configs/dmnet/dmnet_r50-d8_4xb2-80k_cityscapes-512x1024.py
diff --git a/configs/dmnet/dmnet_r50-d8_769x769_80k_cityscapes.py b/configs/dmnet/dmnet_r50-d8_4xb2-80k_cityscapes-769x769.py
similarity index 100%
rename from configs/dmnet/dmnet_r50-d8_769x769_80k_cityscapes.py
rename to configs/dmnet/dmnet_r50-d8_4xb2-80k_cityscapes-769x769.py
diff --git a/configs/dmnet/dmnet_r50-d8_512x512_160k_ade20k.py b/configs/dmnet/dmnet_r50-d8_4xb4-160k_ade20k-512x512.py
similarity index 100%
rename from configs/dmnet/dmnet_r50-d8_512x512_160k_ade20k.py
rename to configs/dmnet/dmnet_r50-d8_4xb4-160k_ade20k-512x512.py
diff --git a/configs/dmnet/dmnet_r50-d8_512x512_80k_ade20k.py b/configs/dmnet/dmnet_r50-d8_4xb4-80k_ade20k-512x512.py
similarity index 100%
rename from configs/dmnet/dmnet_r50-d8_512x512_80k_ade20k.py
rename to configs/dmnet/dmnet_r50-d8_4xb4-80k_ade20k-512x512.py
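Editor's note: the renames above encode the run setup in the file name; `4xb2` reads as 4 GPUs with 2 samples each, which matches the `Batch Size: 8` fields in the metafile below, and each R-101 config is just a two-line override of its R-50 base. A minimal sketch of how that inheritance resolves, assuming an mmengine installation and a repository checkout (the config path is taken from this patch):

```python
# Minimal sketch: resolve a two-line R-101 config against its R-50 base.
from mmengine.config import Config

cfg = Config.fromfile(
    'configs/dmnet/dmnet_r101-d8_4xb2-40k_cityscapes-512x1024.py')

# Everything is inherited from the _base_ R-50 config; the two overridden
# keys are dict-merged into (not substituted for) the base model dict.
assert cfg.model['backbone']['depth'] == 101
assert cfg.model['pretrained'] == 'open-mmlab://resnet101_v1c'
```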
diff --git a/configs/dmnet/metafile.yaml b/configs/dmnet/metafile.yaml
new file mode 100644
index 0000000000..7f5e536753
--- /dev/null
+++ b/configs/dmnet/metafile.yaml
@@ -0,0 +1,296 @@
+Collections:
+- Name: DMNet
+  License: Apache License 2.0
+  Metadata:
+    Training Data:
+    - Cityscapes
+    - ADE20K
+  Paper:
+    Title: Dynamic Multi-Scale Filters for Semantic Segmentation
+    URL: https://openaccess.thecvf.com/content_ICCV_2019/papers/He_Dynamic_Multi-Scale_Filters_for_Semantic_Segmentation_ICCV_2019_paper.pdf
+  README: configs/dmnet/README.md
+  Frameworks:
+  - PyTorch
+Models:
+- Name: dmnet_r50-d8_4xb2-40k_cityscapes-512x1024
+  In Collection: DMNet
+  Results:
+    Task: Semantic Segmentation
+    Dataset: Cityscapes
+    Metrics:
+      mIoU: 77.78
+      mIoU(ms+flip): 79.14
+  Config: configs/dmnet/dmnet_r50-d8_4xb2-40k_cityscapes-512x1024.py
+  Metadata:
+    Training Data: Cityscapes
+    Batch Size: 8
+    Architecture:
+    - R-50-D8
+    - DMNet
+    Training Resources: 4x V100 GPUs
+    Memory (GB): 7.0
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_512x1024_40k_cityscapes/dmnet_r50-d8_512x1024_40k_cityscapes_20201215_042326-615373cf.pth
+  Training log: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_512x1024_40k_cityscapes/dmnet_r50-d8_512x1024_40k_cityscapes-20201215_042326.log.json
+  Paper:
+    Title: Dynamic Multi-Scale Filters for Semantic Segmentation
+    URL: https://openaccess.thecvf.com/content_ICCV_2019/papers/He_Dynamic_Multi-Scale_Filters_for_Semantic_Segmentation_ICCV_2019_paper.pdf
+  Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/dm_head.py#L93
+  Framework: PyTorch
+- Name: dmnet_r101-d8_4xb2-40k_cityscapes-512x1024
+  In Collection: DMNet
+  Results:
+    Task: Semantic Segmentation
+    Dataset: Cityscapes
+    Metrics:
+      mIoU: 78.37
+      mIoU(ms+flip): 79.72
+  Config: configs/dmnet/dmnet_r101-d8_4xb2-40k_cityscapes-512x1024.py
+  Metadata:
+    Training Data: Cityscapes
+    Batch Size: 8
+    Architecture:
+    - R-101-D8
+    - DMNet
+    Training Resources: 4x V100 GPUs
+    Memory (GB): 10.6
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_512x1024_40k_cityscapes/dmnet_r101-d8_512x1024_40k_cityscapes_20201215_043100-8291e976.pth
+  Training log: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_512x1024_40k_cityscapes/dmnet_r101-d8_512x1024_40k_cityscapes-20201215_043100.log.json
+  Paper:
+    Title: Dynamic Multi-Scale Filters for Semantic Segmentation
+    URL: https://openaccess.thecvf.com/content_ICCV_2019/papers/He_Dynamic_Multi-Scale_Filters_for_Semantic_Segmentation_ICCV_2019_paper.pdf
+  Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/dm_head.py#L93
+  Framework: PyTorch
+- Name: dmnet_r50-d8_4xb2-40k_cityscapes-769x769
+  In Collection: DMNet
+  Results:
+    Task: Semantic Segmentation
+    Dataset: Cityscapes
+    Metrics:
+      mIoU: 78.49
+      mIoU(ms+flip): 80.27
+  Config: configs/dmnet/dmnet_r50-d8_4xb2-40k_cityscapes-769x769.py
+  Metadata:
+    Training Data: Cityscapes
+    Batch Size: 8
+    Architecture:
+    - R-50-D8
+    - DMNet
+    Training Resources: 4x V100 GPUs
+    Memory (GB): 7.9
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_769x769_40k_cityscapes/dmnet_r50-d8_769x769_40k_cityscapes_20201215_093706-e7f0e23e.pth
+  Training log: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_769x769_40k_cityscapes/dmnet_r50-d8_769x769_40k_cityscapes-20201215_093706.log.json
+  Paper:
+    Title: Dynamic Multi-Scale Filters for Semantic Segmentation
+    URL: https://openaccess.thecvf.com/content_ICCV_2019/papers/He_Dynamic_Multi-Scale_Filters_for_Semantic_Segmentation_ICCV_2019_paper.pdf
+  Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/dm_head.py#L93
+  Framework: PyTorch
+- Name: dmnet_r101-d8_4xb2-40k_cityscapes-769x769
+  In Collection: DMNet
+  Results:
+    Task: Semantic Segmentation
+    Dataset: Cityscapes
+    Metrics:
+      mIoU: 77.62
+      mIoU(ms+flip): 78.94
+  Config: configs/dmnet/dmnet_r101-d8_4xb2-40k_cityscapes-769x769.py
+  Metadata:
+    Training Data: Cityscapes
+    Batch Size: 8
+    Architecture:
+    - R-101-D8
+    - DMNet
+    Training Resources: 4x V100 GPUs
+    Memory (GB): 12.0
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_769x769_40k_cityscapes/dmnet_r101-d8_769x769_40k_cityscapes_20201215_081348-a74261f6.pth
+  Training log: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_769x769_40k_cityscapes/dmnet_r101-d8_769x769_40k_cityscapes-20201215_081348.log.json
+  Paper:
+    Title: Dynamic Multi-Scale Filters for Semantic Segmentation
+    URL: https://openaccess.thecvf.com/content_ICCV_2019/papers/He_Dynamic_Multi-Scale_Filters_for_Semantic_Segmentation_ICCV_2019_paper.pdf
+  Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/dm_head.py#L93
+  Framework: PyTorch
+- Name: dmnet_r50-d8_4xb2-80k_cityscapes-512x1024
+  In Collection: DMNet
+  Results:
+    Task: Semantic Segmentation
+    Dataset: Cityscapes
+    Metrics:
+      mIoU: 79.07
+      mIoU(ms+flip): 80.22
+  Config: configs/dmnet/dmnet_r50-d8_4xb2-80k_cityscapes-512x1024.py
+  Metadata:
+    Training Data: Cityscapes
+    Batch Size: 8
+    Architecture:
+    - R-50-D8
+    - DMNet
+    Training Resources: 4x V100 GPUs
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_512x1024_80k_cityscapes/dmnet_r50-d8_512x1024_80k_cityscapes_20201215_053728-3c8893b9.pth
+  Training log: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_512x1024_80k_cityscapes/dmnet_r50-d8_512x1024_80k_cityscapes-20201215_053728.log.json
+  Paper:
+    Title: Dynamic Multi-Scale Filters for Semantic Segmentation
+    URL: https://openaccess.thecvf.com/content_ICCV_2019/papers/He_Dynamic_Multi-Scale_Filters_for_Semantic_Segmentation_ICCV_2019_paper.pdf
+  Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/dm_head.py#L93
+  Framework: PyTorch
+- Name: dmnet_r101-d8_4xb2-80k_cityscapes-512x1024
+  In Collection: DMNet
+  Results:
+    Task: Semantic Segmentation
+    Dataset: Cityscapes
+    Metrics:
+      mIoU: 79.64
+      mIoU(ms+flip): 80.67
+  Config: configs/dmnet/dmnet_r101-d8_4xb2-80k_cityscapes-512x1024.py
+  Metadata:
+    Training Data: Cityscapes
+    Batch Size: 8
+    Architecture:
+    - R-101-D8
+    - DMNet
+    Training Resources: 4x V100 GPUs
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_512x1024_80k_cityscapes/dmnet_r101-d8_512x1024_80k_cityscapes_20201215_031718-fa081cb8.pth
+  Training log: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_512x1024_80k_cityscapes/dmnet_r101-d8_512x1024_80k_cityscapes-20201215_031718.log.json
+  Paper:
+    Title: Dynamic Multi-Scale Filters for Semantic Segmentation
+    URL: https://openaccess.thecvf.com/content_ICCV_2019/papers/He_Dynamic_Multi-Scale_Filters_for_Semantic_Segmentation_ICCV_2019_paper.pdf
+  Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/dm_head.py#L93
+  Framework: PyTorch
+- Name: dmnet_r50-d8_4xb2-80k_cityscapes-769x769
+  In Collection: DMNet
+  Results:
+    Task: Semantic Segmentation
+    Dataset: Cityscapes
+    Metrics:
+      mIoU: 79.22
+      mIoU(ms+flip): 80.55
+  Config: configs/dmnet/dmnet_r50-d8_4xb2-80k_cityscapes-769x769.py
+  Metadata:
+    Training Data: Cityscapes
+    Batch Size: 8
+    Architecture:
+    - R-50-D8
+    - DMNet
+    Training Resources: 4x V100 GPUs
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_769x769_80k_cityscapes/dmnet_r50-d8_769x769_80k_cityscapes_20201215_034006-6060840e.pth
+  Training log: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_769x769_80k_cityscapes/dmnet_r50-d8_769x769_80k_cityscapes-20201215_034006.log.json
+  Paper:
+    Title: Dynamic Multi-Scale Filters for Semantic Segmentation
+    URL: https://openaccess.thecvf.com/content_ICCV_2019/papers/He_Dynamic_Multi-Scale_Filters_for_Semantic_Segmentation_ICCV_2019_paper.pdf
+  Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/dm_head.py#L93
+  Framework: PyTorch
+- Name: dmnet_r101-d8_4xb2-80k_cityscapes-769x769
+  In Collection: DMNet
+  Results:
+    Task: Semantic Segmentation
+    Dataset: Cityscapes
+    Metrics:
+      mIoU: 79.19
+      mIoU(ms+flip): 80.65
+  Config: configs/dmnet/dmnet_r101-d8_4xb2-80k_cityscapes-769x769.py
+  Metadata:
+    Training Data: Cityscapes
+    Batch Size: 8
+    Architecture:
+    - R-101-D8
+    - DMNet
+    Training Resources: 4x V100 GPUs
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_769x769_80k_cityscapes/dmnet_r101-d8_769x769_80k_cityscapes_20201215_082810-7f0de59a.pth
+  Training log: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_769x769_80k_cityscapes/dmnet_r101-d8_769x769_80k_cityscapes-20201215_082810.log.json
+  Paper:
+    Title: Dynamic Multi-Scale Filters for Semantic Segmentation
+    URL: https://openaccess.thecvf.com/content_ICCV_2019/papers/He_Dynamic_Multi-Scale_Filters_for_Semantic_Segmentation_ICCV_2019_paper.pdf
+  Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/dm_head.py#L93
+  Framework: PyTorch
+- Name: dmnet_r50-d8_4xb4-80k_ade20k-512x512
+  In Collection: DMNet
+  Results:
+    Task: Semantic Segmentation
+    Dataset: ADE20K
+    Metrics:
+      mIoU: 42.37
+      mIoU(ms+flip): 43.62
+  Config: configs/dmnet/dmnet_r50-d8_4xb4-80k_ade20k-512x512.py
+  Metadata:
+    Training Data: ADE20K
+    Batch Size: 16
+    Architecture:
+    - R-50-D8
+    - DMNet
+    Training Resources: 4x V100 GPUs
+    Memory (GB): 9.4
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_512x512_80k_ade20k/dmnet_r50-d8_512x512_80k_ade20k_20201215_144744-f89092a6.pth
+  Training log: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_512x512_80k_ade20k/dmnet_r50-d8_512x512_80k_ade20k-20201215_144744.log.json
+  Paper:
+    Title: Dynamic Multi-Scale Filters for Semantic Segmentation
+    URL: https://openaccess.thecvf.com/content_ICCV_2019/papers/He_Dynamic_Multi-Scale_Filters_for_Semantic_Segmentation_ICCV_2019_paper.pdf
+  Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/dm_head.py#L93
+  Framework: PyTorch
+- Name: dmnet_r101-d8_4xb4-80k_ade20k-512x512
+  In Collection: DMNet
+  Results:
+    Task: Semantic Segmentation
+    Dataset: ADE20K
+    Metrics:
+      mIoU: 45.34
+      mIoU(ms+flip): 46.13
+  Config: configs/dmnet/dmnet_r101-d8_4xb4-80k_ade20k-512x512.py
+  Metadata:
+    Training Data: ADE20K
+    Batch Size: 16
+    Architecture:
+    - R-101-D8
+    - DMNet
+    Training Resources: 4x V100 GPUs
+    Memory (GB): 13.0
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_512x512_80k_ade20k/dmnet_r101-d8_512x512_80k_ade20k_20201215_104812-bfa45311.pth
+  Training log: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_512x512_80k_ade20k/dmnet_r101-d8_512x512_80k_ade20k-20201215_104812.log.json
+  Paper:
+    Title: Dynamic Multi-Scale Filters for Semantic Segmentation
+    URL: https://openaccess.thecvf.com/content_ICCV_2019/papers/He_Dynamic_Multi-Scale_Filters_for_Semantic_Segmentation_ICCV_2019_paper.pdf
+  Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/dm_head.py#L93
+  Framework: PyTorch
+- Name: dmnet_r50-d8_4xb4-160k_ade20k-512x512
+  In Collection: DMNet
+  Results:
+    Task: Semantic Segmentation
+    Dataset: ADE20K
+    Metrics:
+      mIoU: 43.15
+      mIoU(ms+flip): 44.17
+  Config: configs/dmnet/dmnet_r50-d8_4xb4-160k_ade20k-512x512.py
+  Metadata:
+    Training Data: ADE20K
+    Batch Size: 16
+    Architecture:
+    - R-50-D8
+    - DMNet
+    Training Resources: 4x V100 GPUs
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_512x512_160k_ade20k/dmnet_r50-d8_512x512_160k_ade20k_20201215_115313-025ab3f9.pth
+  Training log: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r50-d8_512x512_160k_ade20k/dmnet_r50-d8_512x512_160k_ade20k-20201215_115313.log.json
+  Paper:
+    Title: Dynamic Multi-Scale Filters for Semantic Segmentation
+    URL: https://openaccess.thecvf.com/content_ICCV_2019/papers/He_Dynamic_Multi-Scale_Filters_for_Semantic_Segmentation_ICCV_2019_paper.pdf
+  Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/dm_head.py#L93
+  Framework: PyTorch
+- Name: dmnet_r101-d8_4xb4-160k_ade20k-512x512
+  In Collection: DMNet
+  Results:
+    Task: Semantic Segmentation
+    Dataset: ADE20K
+    Metrics:
+      mIoU: 45.42
+      mIoU(ms+flip): 46.76
+  Config: configs/dmnet/dmnet_r101-d8_4xb4-160k_ade20k-512x512.py
+  Metadata:
+    Training Data: ADE20K
+    Batch Size: 16
+    Architecture:
+    - R-101-D8
+    - DMNet
+    Training Resources: 4x V100 GPUs
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_512x512_160k_ade20k/dmnet_r101-d8_512x512_160k_ade20k_20201215_111145-a0bc02ef.pth
+  Training log: https://download.openmmlab.com/mmsegmentation/v0.5/dmnet/dmnet_r101-d8_512x512_160k_ade20k/dmnet_r101-d8_512x512_160k_ade20k-20201215_111145.log.json
+  Paper:
+    Title: Dynamic Multi-Scale Filters for Semantic Segmentation
+    URL: https://openaccess.thecvf.com/content_ICCV_2019/papers/He_Dynamic_Multi-Scale_Filters_for_Semantic_Segmentation_ICCV_2019_paper.pdf
+  Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/dm_head.py#L93
+  Framework: PyTorch
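Editor's note: the new `metafile.yaml` replaces the old `.yml` model-zoo index with a flatter schema (`Results` is now a single mapping rather than a list). A minimal sketch of consuming it, assuming PyYAML and a checkout containing the file added above:

```python
# Minimal sketch: iterate the model-zoo entries in the new metafile schema.
import yaml

with open('configs/dmnet/metafile.yaml') as f:
    meta = yaml.safe_load(f)

for model in meta['Models']:
    metrics = model['Results']['Metrics']          # e.g. {'mIoU': 77.78, ...}
    memory = model['Metadata'].get('Memory (GB)')  # absent for some entries
    print(model['Name'], metrics['mIoU'], memory)
```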
diff --git a/configs/dnlnet/README.md b/configs/dnlnet/README.md
index 975c4b08b0..6835ffd1ed 100644
--- a/configs/dnlnet/README.md
+++ b/configs/dnlnet/README.md
@@ -1,6 +1,6 @@
 # DNLNet
 
-[Disentangled Non-Local Neural Networks](https://arxiv.org/abs/2006.06668)
+> [Disentangled Non-Local Neural Networks](https://arxiv.org/abs/2006.06668)
 
 ## Introduction
 
@@ -22,7 +22,31 @@ The non-local block is a popular module for strengthening the context modeling a
 
-## Citation
+## Results and models (in progress)
+
+### Cityscapes
+
+| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download |
+| ------ | -------- | --------- | ------: | -------: | -------------- | ------ | ----: | ------------- | ------ | -------- |
+| DNLNet | R-50-D8 | 512x1024 | 40000 | 7.3 | 2.56 | V100 | 78.61 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/dnlnet/dnl_r50-d8_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_512x1024_40k_cityscapes/dnl_r50-d8_512x1024_40k_cityscapes_20200904_233629-53d4ea93.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_512x1024_40k_cityscapes/dnl_r50-d8_512x1024_40k_cityscapes-20200904_233629.log.json) |
+| DNLNet | R-101-D8 | 512x1024 | 40000 | 10.9 | 1.96 | V100 | 78.31 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/dnlnet/dnl_r101-d8_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_512x1024_40k_cityscapes/dnl_r101-d8_512x1024_40k_cityscapes_20200904_233629-9928ffef.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_512x1024_40k_cityscapes/dnl_r101-d8_512x1024_40k_cityscapes-20200904_233629.log.json) |
+| DNLNet | R-50-D8 | 769x769 | 40000 | 9.2 | 1.50 | V100 | 78.44 | 80.27 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/dnlnet/dnl_r50-d8_4xb2-40k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_769x769_40k_cityscapes/dnl_r50-d8_769x769_40k_cityscapes_20200820_232206-0f283785.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_769x769_40k_cityscapes/dnl_r50-d8_769x769_40k_cityscapes-20200820_232206.log.json) |
+| DNLNet | R-101-D8 | 769x769 | 40000 | 12.6 | 1.02 | V100 | 76.39 | 77.77 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/dnlnet/dnl_r101-d8_4xb2-40k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_769x769_40k_cityscapes/dnl_r101-d8_769x769_40k_cityscapes_20200820_171256-76c596df.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_769x769_40k_cityscapes/dnl_r101-d8_769x769_40k_cityscapes-20200820_171256.log.json) |
+| DNLNet | R-50-D8 | 512x1024 | 80000 | - | - | V100 | 79.33 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/dnlnet/dnl_r50-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_512x1024_80k_cityscapes/dnl_r50-d8_512x1024_80k_cityscapes_20200904_233629-58b2f778.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_512x1024_80k_cityscapes/dnl_r50-d8_512x1024_80k_cityscapes-20200904_233629.log.json) |
+| DNLNet | R-101-D8 | 512x1024 | 80000 | - | - | V100 | 80.41 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/dnlnet/dnl_r101-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_512x1024_80k_cityscapes/dnl_r101-d8_512x1024_80k_cityscapes_20200904_233629-758e2dd4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_512x1024_80k_cityscapes/dnl_r101-d8_512x1024_80k_cityscapes-20200904_233629.log.json) |
+| DNLNet | R-50-D8 | 769x769 | 80000 | - | - | V100 | 79.36 | 80.70 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/dnlnet/dnl_r50-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_769x769_80k_cityscapes/dnl_r50-d8_769x769_80k_cityscapes_20200820_011925-366bc4c7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_769x769_80k_cityscapes/dnl_r50-d8_769x769_80k_cityscapes-20200820_011925.log.json) |
+| DNLNet | R-101-D8 | 769x769 | 80000 | - | - | V100 | 79.41 | 80.68 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/dnlnet/dnl_r101-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_769x769_80k_cityscapes/dnl_r101-d8_769x769_80k_cityscapes_20200821_051111-95ff84ab.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_769x769_80k_cityscapes/dnl_r101-d8_769x769_80k_cityscapes-20200821_051111.log.json) |
+
+### ADE20K
+
+| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download |
+| ------ | -------- | --------- | ------: | -------: | -------------- | ------ | ----: | ------------- | ------ | -------- |
+| DNLNet | R-50-D8 | 512x512 | 80000 | 8.8 | 20.66 | V100 | 41.76 | 42.99 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/dnlnet/dnl_r50-d8_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_512x512_80k_ade20k/dnl_r50-d8_512x512_80k_ade20k_20200826_183354-1cf6e0c1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_512x512_80k_ade20k/dnl_r50-d8_512x512_80k_ade20k-20200826_183354.log.json) |
+| DNLNet | R-101-D8 | 512x512 | 80000 | 12.8 | 12.54 | V100 | 43.76 | 44.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/dnlnet/dnl_r101-d8_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_512x512_80k_ade20k/dnl_r101-d8_512x512_80k_ade20k_20200826_183354-d820d6ea.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_512x512_80k_ade20k/dnl_r101-d8_512x512_80k_ade20k-20200826_183354.log.json) |
+| DNLNet | R-50-D8 | 512x512 | 160000 | - | - | V100 | 41.87 | 43.01 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/dnlnet/dnl_r50-d8_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_512x512_160k_ade20k/dnl_r50-d8_512x512_160k_ade20k_20200826_183350-37837798.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_512x512_160k_ade20k/dnl_r50-d8_512x512_160k_ade20k-20200826_183350.log.json) |
+| DNLNet | R-101-D8 | 512x512 | 160000 | - | - | V100 | 44.25 | 45.78 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/dnlnet/dnl_r101-d8_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_512x512_160k_ade20k/dnl_r101-d8_512x512_160k_ade20k_20200826_183350-ed522c61.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_512x512_160k_ade20k/dnl_r101-d8_512x512_160k_ade20k-20200826_183350.log.json) |
+
+## Notes
 
 This example is to reproduce ["Disentangled Non-Local Neural Networks"](https://arxiv.org/abs/2006.06668) for semantic segmentation. It is still in progress.
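Editor's note: one way to smoke-test a checkpoint from the tables above is the mmseg 1.x Python API. A sketch, assuming `init_model`/`inference_model` from `mmseg.apis` and the repository's bundled demo image; the config path and weight URL are taken from the first Cityscapes row:

```python
# Hedged sketch: run single-image inference with a released checkpoint.
from mmseg.apis import init_model, inference_model

config = 'configs/dnlnet/dnl_r50-d8_4xb2-40k_cityscapes-512x1024.py'
checkpoint = ('https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/'
              'dnl_r50-d8_512x1024_40k_cityscapes/'
              'dnl_r50-d8_512x1024_40k_cityscapes_20200904_233629-53d4ea93.pth')

model = init_model(config, checkpoint, device='cuda:0')
result = inference_model(model, 'demo/demo.png')  # per-pixel class ids
```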
@@ -36,27 +60,3 @@ This example is to reproduce ["Disentangled Non-Local Neural Networks"](https://
 booktitle={ECCV}
 }
 ```
-
-## Results and models (in progress)
-
-### Cityscapes
-
-| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download |
-| ------ | -------- | --------- | ------: | -------: | -------------- | ----: | ------------- | ------ | -------- |
-| DNLNet | R-50-D8 | 512x1024 | 40000 | 7.3 | 2.56 | 78.61 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dnlnet/dnl_r50-d8_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_512x1024_40k_cityscapes/dnl_r50-d8_512x1024_40k_cityscapes_20200904_233629-53d4ea93.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_512x1024_40k_cityscapes/dnl_r50-d8_512x1024_40k_cityscapes-20200904_233629.log.json) |
-| DNLNet | R-101-D8 | 512x1024 | 40000 | 10.9 | 1.96 | 78.31 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dnlnet/dnl_r101-d8_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_512x1024_40k_cityscapes/dnl_r101-d8_512x1024_40k_cityscapes_20200904_233629-9928ffef.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_512x1024_40k_cityscapes/dnl_r101-d8_512x1024_40k_cityscapes-20200904_233629.log.json) |
-| DNLNet | R-50-D8 | 769x769 | 40000 | 9.2 | 1.50 | 78.44 | 80.27 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dnlnet/dnl_r50-d8_769x769_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_769x769_40k_cityscapes/dnl_r50-d8_769x769_40k_cityscapes_20200820_232206-0f283785.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_769x769_40k_cityscapes/dnl_r50-d8_769x769_40k_cityscapes-20200820_232206.log.json) |
-| DNLNet | R-101-D8 | 769x769 | 40000 | 12.6 | 1.02 | 76.39 | 77.77 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dnlnet/dnl_r101-d8_769x769_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_769x769_40k_cityscapes/dnl_r101-d8_769x769_40k_cityscapes_20200820_171256-76c596df.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_769x769_40k_cityscapes/dnl_r101-d8_769x769_40k_cityscapes-20200820_171256.log.json) |
-| DNLNet | R-50-D8 | 512x1024 | 80000 | - | - | 79.33 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dnlnet/dnl_r50-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_512x1024_80k_cityscapes/dnl_r50-d8_512x1024_80k_cityscapes_20200904_233629-58b2f778.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_512x1024_80k_cityscapes/dnl_r50-d8_512x1024_80k_cityscapes-20200904_233629.log.json) |
-| DNLNet | R-101-D8 | 512x1024 | 80000 | - | - | 80.41 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dnlnet/dnl_r101-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_512x1024_80k_cityscapes/dnl_r101-d8_512x1024_80k_cityscapes_20200904_233629-758e2dd4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_512x1024_80k_cityscapes/dnl_r101-d8_512x1024_80k_cityscapes-20200904_233629.log.json) |
-| DNLNet | R-50-D8 | 769x769 | 80000 | - | - | 79.36 | 80.70 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dnlnet/dnl_r50-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_769x769_80k_cityscapes/dnl_r50-d8_769x769_80k_cityscapes_20200820_011925-366bc4c7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_769x769_80k_cityscapes/dnl_r50-d8_769x769_80k_cityscapes-20200820_011925.log.json) |
-| DNLNet | R-101-D8 | 769x769 | 80000 | - | - | 79.41 | 80.68 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dnlnet/dnl_r101-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_769x769_80k_cityscapes/dnl_r101-d8_769x769_80k_cityscapes_20200821_051111-95ff84ab.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_769x769_80k_cityscapes/dnl_r101-d8_769x769_80k_cityscapes-20200821_051111.log.json) |
-
-### ADE20K
-
-| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download |
-| ------ | -------- | --------- | ------: | -------: | -------------- | ----: | ------------- | ------ | -------- |
-| DNLNet | R-50-D8 | 512x512 | 80000 | 8.8 | 20.66 | 41.76 | 42.99 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dnlnet/dnl_r50-d8_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_512x512_80k_ade20k/dnl_r50-d8_512x512_80k_ade20k_20200826_183354-1cf6e0c1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_512x512_80k_ade20k/dnl_r50-d8_512x512_80k_ade20k-20200826_183354.log.json) |
-| DNLNet | R-101-D8 | 512x512 | 80000 | 12.8 | 12.54 | 43.76 | 44.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dnlnet/dnl_r101-d8_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_512x512_80k_ade20k/dnl_r101-d8_512x512_80k_ade20k_20200826_183354-d820d6ea.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_512x512_80k_ade20k/dnl_r101-d8_512x512_80k_ade20k-20200826_183354.log.json) |
-| DNLNet | R-50-D8 | 512x512 | 160000 | - | - | 41.87 | 43.01 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dnlnet/dnl_r50-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_512x512_160k_ade20k/dnl_r50-d8_512x512_160k_ade20k_20200826_183350-37837798.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_512x512_160k_ade20k/dnl_r50-d8_512x512_160k_ade20k-20200826_183350.log.json) |
-| DNLNet | R-101-D8 | 512x512 | 160000 | - | - | 44.25 | 45.78 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dnlnet/dnl_r101-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_512x512_160k_ade20k/dnl_r101-d8_512x512_160k_ade20k_20200826_183350-ed522c61.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_512x512_160k_ade20k/dnl_r101-d8_512x512_160k_ade20k-20200826_183350.log.json) |
diff --git a/configs/dnlnet/dnl_r101-d8_4xb2-40k_cityscapes-512x1024.py b/configs/dnlnet/dnl_r101-d8_4xb2-40k_cityscapes-512x1024.py
new file mode 100644
index 0000000000..310d84e574
--- /dev/null
+++ b/configs/dnlnet/dnl_r101-d8_4xb2-40k_cityscapes-512x1024.py
@@ -0,0 +1,2 @@
+_base_ = './dnl_r50-d8_4xb2-40k_cityscapes-512x1024.py'
+model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/dnlnet/dnl_r101-d8_4xb2-40k_cityscapes-769x769.py b/configs/dnlnet/dnl_r101-d8_4xb2-40k_cityscapes-769x769.py
new file mode 100644
index 0000000000..a94dbb89b3
--- /dev/null
+++ b/configs/dnlnet/dnl_r101-d8_4xb2-40k_cityscapes-769x769.py
@@ -0,0 +1,2 @@
+_base_ = './dnl_r50-d8_4xb2-40k_cityscapes-769x769.py'
+model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/dnlnet/dnl_r101-d8_4xb2-80k_cityscapes-512x1024.py b/configs/dnlnet/dnl_r101-d8_4xb2-80k_cityscapes-512x1024.py
new file mode 100644
index 0000000000..f9b6d5ee3d
--- /dev/null
+++ b/configs/dnlnet/dnl_r101-d8_4xb2-80k_cityscapes-512x1024.py
@@ -0,0 +1,2 @@
+_base_ = './dnl_r50-d8_4xb2-80k_cityscapes-512x1024.py'
+model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/dnlnet/dnl_r101-d8_4xb2-80k_cityscapes-769x769.py b/configs/dnlnet/dnl_r101-d8_4xb2-80k_cityscapes-769x769.py
new file mode 100644
index 0000000000..9c7d557d02
--- /dev/null
+++ b/configs/dnlnet/dnl_r101-d8_4xb2-80k_cityscapes-769x769.py
@@ -0,0 +1,2 @@
+_base_ = './dnl_r50-d8_4xb2-80k_cityscapes-769x769.py'
+model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/dnlnet/dnl_r101-d8_4xb4-160k_ade20k-512x512.py b/configs/dnlnet/dnl_r101-d8_4xb4-160k_ade20k-512x512.py
new file mode 100644
index 0000000000..1edc26fd8c
--- /dev/null
+++ b/configs/dnlnet/dnl_r101-d8_4xb4-160k_ade20k-512x512.py
@@ -0,0 +1,2 @@
+_base_ = './dnl_r50-d8_4xb4-160k_ade20k-512x512.py'
+model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/dnlnet/dnl_r101-d8_4xb4-80k_ade20k-512x512.py b/configs/dnlnet/dnl_r101-d8_4xb4-80k_ade20k-512x512.py
new file mode 100644
index 0000000000..d29c17ef5b
--- /dev/null
+++ b/configs/dnlnet/dnl_r101-d8_4xb4-80k_ade20k-512x512.py
@@ -0,0 +1,2 @@
+_base_ = './dnl_r50-d8_4xb4-80k_ade20k-512x512.py'
+model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/dnlnet/dnl_r101-d8_512x1024_40k_cityscapes.py b/configs/dnlnet/dnl_r101-d8_512x1024_40k_cityscapes.py
deleted file mode 100644
index 1a36e3c80a..0000000000
--- a/configs/dnlnet/dnl_r101-d8_512x1024_40k_cityscapes.py
+++ /dev/null
@@ -1,2 +0,0 @@
-_base_ = './dnl_r50-d8_512x1024_40k_cityscapes.py'
-model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/dnlnet/dnl_r101-d8_512x1024_80k_cityscapes.py b/configs/dnlnet/dnl_r101-d8_512x1024_80k_cityscapes.py
deleted file mode 100644
index 0f2e1b6da7..0000000000
--- a/configs/dnlnet/dnl_r101-d8_512x1024_80k_cityscapes.py
+++ /dev/null
@@ -1,2 +0,0 @@
-_base_ = './dnl_r50-d8_512x1024_80k_cityscapes.py'
-model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/dnlnet/dnl_r101-d8_512x512_160k_ade20k.py b/configs/dnlnet/dnl_r101-d8_512x512_160k_ade20k.py
deleted file mode 100644
index aca44e478b..0000000000
--- a/configs/dnlnet/dnl_r101-d8_512x512_160k_ade20k.py
+++ /dev/null
@@ -1,2 +0,0 @@
-_base_ = './dnl_r50-d8_512x512_160k_ade20k.py'
-model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/dnlnet/dnl_r101-d8_512x512_80k_ade20k.py b/configs/dnlnet/dnl_r101-d8_512x512_80k_ade20k.py
deleted file mode 100644
index ebd27a1d1c..0000000000
--- a/configs/dnlnet/dnl_r101-d8_512x512_80k_ade20k.py
+++ /dev/null
@@ -1,2 +0,0 @@
-_base_ = './dnl_r50-d8_512x512_80k_ade20k.py'
-model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/dnlnet/dnl_r101-d8_769x769_40k_cityscapes.py b/configs/dnlnet/dnl_r101-d8_769x769_40k_cityscapes.py
deleted file mode 100644
index 575e9d0134..0000000000
--- a/configs/dnlnet/dnl_r101-d8_769x769_40k_cityscapes.py
+++ /dev/null
@@ -1,2 +0,0 @@
-_base_ = './dnl_r50-d8_769x769_40k_cityscapes.py'
-model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/dnlnet/dnl_r101-d8_769x769_80k_cityscapes.py b/configs/dnlnet/dnl_r101-d8_769x769_80k_cityscapes.py
deleted file mode 100644
index 4f1b9e1941..0000000000
--- a/configs/dnlnet/dnl_r101-d8_769x769_80k_cityscapes.py
+++ /dev/null
@@ -1,2 +0,0 @@
-_base_ = './dnl_r50-d8_769x769_80k_cityscapes.py'
-model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101))
diff --git a/configs/dnlnet/dnl_r50-d8_512x1024_40k_cityscapes.py b/configs/dnlnet/dnl_r50-d8_4xb2-40k_cityscapes-512x1024.py
similarity index 100%
rename from configs/dnlnet/dnl_r50-d8_512x1024_40k_cityscapes.py
rename to configs/dnlnet/dnl_r50-d8_4xb2-40k_cityscapes-512x1024.py
diff --git a/configs/dnlnet/dnl_r50-d8_769x769_40k_cityscapes.py b/configs/dnlnet/dnl_r50-d8_4xb2-40k_cityscapes-769x769.py
similarity index 100%
rename from configs/dnlnet/dnl_r50-d8_769x769_40k_cityscapes.py
rename to configs/dnlnet/dnl_r50-d8_4xb2-40k_cityscapes-769x769.py
diff --git a/configs/dnlnet/dnl_r50-d8_512x1024_80k_cityscapes.py b/configs/dnlnet/dnl_r50-d8_4xb2-80k_cityscapes-512x1024.py
similarity index 100%
rename from configs/dnlnet/dnl_r50-d8_512x1024_80k_cityscapes.py
rename to configs/dnlnet/dnl_r50-d8_4xb2-80k_cityscapes-512x1024.py
diff --git a/configs/dnlnet/dnl_r50-d8_769x769_80k_cityscapes.py b/configs/dnlnet/dnl_r50-d8_4xb2-80k_cityscapes-769x769.py
similarity index 100%
rename from configs/dnlnet/dnl_r50-d8_769x769_80k_cityscapes.py
rename to configs/dnlnet/dnl_r50-d8_4xb2-80k_cityscapes-769x769.py
diff --git a/configs/dnlnet/dnl_r50-d8_512x512_160k_ade20k.py b/configs/dnlnet/dnl_r50-d8_4xb4-160k_ade20k-512x512.py
similarity index 100%
rename from configs/dnlnet/dnl_r50-d8_512x512_160k_ade20k.py
rename to configs/dnlnet/dnl_r50-d8_4xb4-160k_ade20k-512x512.py
diff --git a/configs/dnlnet/dnl_r50-d8_512x512_80k_ade20k.py b/configs/dnlnet/dnl_r50-d8_4xb4-80k_ade20k-512x512.py
similarity index 100%
rename from configs/dnlnet/dnl_r50-d8_512x512_80k_ade20k.py
rename to configs/dnlnet/dnl_r50-d8_4xb4-80k_ade20k-512x512.py
diff --git a/configs/dnlnet/dnlnet.yml b/configs/dnlnet/dnlnet.yml
deleted file mode 100644
index 8ee7b54861..0000000000
--- a/configs/dnlnet/dnlnet.yml
+++ /dev/null
@@ -1,228 +0,0 @@
-Collections:
-- Name: DNLNet
-  Metadata:
-    Training Data:
-    - Cityscapes
-    - ADE20K
-  Paper:
-    URL: https://arxiv.org/abs/2006.06668
-    Title: Disentangled Non-Local Neural Networks
-  README: configs/dnlnet/README.md
-  Code:
-    URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/dnl_head.py#L88
-    Version: v0.17.0
-  Converted From:
-    Code: https://github.com/yinmh17/DNL-Semantic-Segmentation
-Models:
-- Name: dnl_r50-d8_512x1024_40k_cityscapes
-  In Collection: DNLNet
-  Metadata:
-    backbone: R-50-D8
-    crop size: (512,1024)
-    lr schd: 40000
-    inference time (ms/im):
-    - value: 390.62
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (512,1024)
-    Training Memory (GB): 7.3
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 78.61
-  Config: configs/dnlnet/dnl_r50-d8_512x1024_40k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_512x1024_40k_cityscapes/dnl_r50-d8_512x1024_40k_cityscapes_20200904_233629-53d4ea93.pth
-- Name: dnl_r101-d8_512x1024_40k_cityscapes
-  In Collection: DNLNet
-  Metadata:
-    backbone: R-101-D8
-    crop size: (512,1024)
-    lr schd: 40000
-    inference time (ms/im):
-    - value: 510.2
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (512,1024)
-    Training Memory (GB): 10.9
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 78.31
-  Config: configs/dnlnet/dnl_r101-d8_512x1024_40k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_512x1024_40k_cityscapes/dnl_r101-d8_512x1024_40k_cityscapes_20200904_233629-9928ffef.pth
-- Name: dnl_r50-d8_769x769_40k_cityscapes
-  In Collection: DNLNet
-  Metadata:
-    backbone: R-50-D8
-    crop size: (769,769)
-    lr schd: 40000
-    inference time (ms/im):
-    - value: 666.67
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (769,769)
-    Training Memory (GB): 9.2
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 78.44
-      mIoU(ms+flip): 80.27
-  Config: configs/dnlnet/dnl_r50-d8_769x769_40k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_769x769_40k_cityscapes/dnl_r50-d8_769x769_40k_cityscapes_20200820_232206-0f283785.pth
-- Name: dnl_r101-d8_769x769_40k_cityscapes
-  In Collection: DNLNet
-  Metadata:
-    backbone: R-101-D8
-    crop size: (769,769)
-    lr schd: 40000
-    inference time (ms/im):
-    - value: 980.39
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (769,769)
-    Training Memory (GB): 12.6
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 76.39
-      mIoU(ms+flip): 77.77
-  Config: configs/dnlnet/dnl_r101-d8_769x769_40k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_769x769_40k_cityscapes/dnl_r101-d8_769x769_40k_cityscapes_20200820_171256-76c596df.pth
-- Name: dnl_r50-d8_512x1024_80k_cityscapes
-  In Collection: DNLNet
-  Metadata:
-    backbone: R-50-D8
-    crop size: (512,1024)
-    lr schd: 80000
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 79.33
-  Config: configs/dnlnet/dnl_r50-d8_512x1024_80k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_512x1024_80k_cityscapes/dnl_r50-d8_512x1024_80k_cityscapes_20200904_233629-58b2f778.pth
-- Name: dnl_r101-d8_512x1024_80k_cityscapes
-  In Collection: DNLNet
-  Metadata:
-    backbone: R-101-D8
-    crop size: (512,1024)
-    lr schd: 80000
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 80.41
-  Config: configs/dnlnet/dnl_r101-d8_512x1024_80k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_512x1024_80k_cityscapes/dnl_r101-d8_512x1024_80k_cityscapes_20200904_233629-758e2dd4.pth
-- Name: dnl_r50-d8_769x769_80k_cityscapes
-  In Collection: DNLNet
-  Metadata:
-    backbone: R-50-D8
-    crop size: (769,769)
-    lr schd: 80000
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 79.36
-      mIoU(ms+flip): 80.7
-  Config: configs/dnlnet/dnl_r50-d8_769x769_80k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_769x769_80k_cityscapes/dnl_r50-d8_769x769_80k_cityscapes_20200820_011925-366bc4c7.pth
-- Name: dnl_r101-d8_769x769_80k_cityscapes
-  In Collection: DNLNet
-  Metadata:
-    backbone: R-101-D8
-    crop size: (769,769)
-    lr schd: 80000
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 79.41
-      mIoU(ms+flip): 80.68
-  Config: configs/dnlnet/dnl_r101-d8_769x769_80k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_769x769_80k_cityscapes/dnl_r101-d8_769x769_80k_cityscapes_20200821_051111-95ff84ab.pth
-- Name: dnl_r50-d8_512x512_80k_ade20k
-  In Collection: DNLNet
-  Metadata:
-    backbone: R-50-D8
-    crop size: (512,512)
-    lr schd: 80000
-    inference time (ms/im):
-    - value: 48.4
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (512,512)
-    Training Memory (GB): 8.8
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: ADE20K
-    Metrics:
-      mIoU: 41.76
-      mIoU(ms+flip): 42.99
-  Config: configs/dnlnet/dnl_r50-d8_512x512_80k_ade20k.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_512x512_80k_ade20k/dnl_r50-d8_512x512_80k_ade20k_20200826_183354-1cf6e0c1.pth
-- Name: dnl_r101-d8_512x512_80k_ade20k
-  In Collection: DNLNet
-  Metadata:
-    backbone: R-101-D8
-    crop size: (512,512)
-    lr schd: 80000
-    inference time (ms/im):
-    - value: 79.74
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (512,512)
-    Training Memory (GB): 12.8
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: ADE20K
-    Metrics:
-      mIoU: 43.76
-      mIoU(ms+flip): 44.91
-  Config: configs/dnlnet/dnl_r101-d8_512x512_80k_ade20k.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_512x512_80k_ade20k/dnl_r101-d8_512x512_80k_ade20k_20200826_183354-d820d6ea.pth
-- Name: dnl_r50-d8_512x512_160k_ade20k
-  In Collection: DNLNet
-  Metadata:
-    backbone: R-50-D8
-    crop size: (512,512)
-    lr schd: 160000
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: ADE20K
-    Metrics:
-      mIoU: 41.87
-      mIoU(ms+flip): 43.01
-  Config: configs/dnlnet/dnl_r50-d8_512x512_160k_ade20k.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_512x512_160k_ade20k/dnl_r50-d8_512x512_160k_ade20k_20200826_183350-37837798.pth
-- Name: dnl_r101-d8_512x512_160k_ade20k
-  In Collection: DNLNet
-  Metadata:
-    backbone: R-101-D8
-    crop size: (512,512)
-    lr schd: 160000
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: ADE20K
-    Metrics:
-      mIoU: 44.25
-      mIoU(ms+flip): 45.78
-  Config: configs/dnlnet/dnl_r101-d8_512x512_160k_ade20k.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_512x512_160k_ade20k/dnl_r101-d8_512x512_160k_ade20k_20200826_183350-ed522c61.pth
diff --git a/configs/dnlnet/metafile.yaml b/configs/dnlnet/metafile.yaml
new file mode 100644
index 0000000000..22e48d3dc5
--- /dev/null
+++ b/configs/dnlnet/metafile.yaml
@@ -0,0 +1,292 @@
+Collections:
+- Name: DNLNet
+  License: Apache License 2.0
+  Metadata:
+    Training Data:
+    - Cityscapes
+    - ADE20K
+  Paper:
+    Title: Disentangled Non-Local Neural Networks
+    URL: https://arxiv.org/abs/2006.06668
+  README: configs/dnlnet/README.md
+  Frameworks:
+  - PyTorch
+Models:
+- Name: dnl_r50-d8_4xb2-40k_cityscapes-512x1024
+  In Collection: DNLNet
+  Results:
+    Task: Semantic Segmentation
+    Dataset: Cityscapes
+    Metrics:
+      mIoU: 78.61
+  Config: configs/dnlnet/dnl_r50-d8_4xb2-40k_cityscapes-512x1024.py
+  Metadata:
+    Training Data: Cityscapes
+    Batch Size: 8
+    Architecture:
+    - R-50-D8
+    - DNLNet
+    Training Resources: 4x V100 GPUs
+    Memory (GB): 7.3
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_512x1024_40k_cityscapes/dnl_r50-d8_512x1024_40k_cityscapes_20200904_233629-53d4ea93.pth
+  Training log: https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_512x1024_40k_cityscapes/dnl_r50-d8_512x1024_40k_cityscapes-20200904_233629.log.json
+  Paper:
+    Title: Disentangled Non-Local Neural Networks
+    URL: https://arxiv.org/abs/2006.06668
+  Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/dnl_head.py#L88
+  Framework: PyTorch
+- Name: dnl_r101-d8_4xb2-40k_cityscapes-512x1024
+  In Collection: DNLNet
+  Results:
+    Task: Semantic Segmentation
+    Dataset: Cityscapes
+    Metrics:
+      mIoU: 78.31
+  Config: configs/dnlnet/dnl_r101-d8_4xb2-40k_cityscapes-512x1024.py
+  Metadata:
+    Training Data: Cityscapes
+    Batch Size: 8
+    Architecture:
+    - R-101-D8
+    - DNLNet
+    Training Resources: 4x V100 GPUs
+    Memory (GB): 10.9
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_512x1024_40k_cityscapes/dnl_r101-d8_512x1024_40k_cityscapes_20200904_233629-9928ffef.pth
+  Training log: https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_512x1024_40k_cityscapes/dnl_r101-d8_512x1024_40k_cityscapes-20200904_233629.log.json
+  Paper:
+    Title: Disentangled Non-Local Neural Networks
+    URL: https://arxiv.org/abs/2006.06668
+  Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/dnl_head.py#L88
+  Framework: PyTorch
+- Name: dnl_r50-d8_4xb2-40k_cityscapes-769x769
+  In Collection: DNLNet
+  Results:
+    Task: Semantic Segmentation
+    Dataset: Cityscapes
+    Metrics:
+      mIoU: 78.44
+      mIoU(ms+flip): 80.27
+  Config: configs/dnlnet/dnl_r50-d8_4xb2-40k_cityscapes-769x769.py
+  Metadata:
+    Training Data: Cityscapes
+    Batch Size: 8
+    Architecture:
+    - R-50-D8
+    - DNLNet
+    Training Resources: 4x V100 GPUs
+    Memory (GB): 9.2
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_769x769_40k_cityscapes/dnl_r50-d8_769x769_40k_cityscapes_20200820_232206-0f283785.pth
+  Training log: https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_769x769_40k_cityscapes/dnl_r50-d8_769x769_40k_cityscapes-20200820_232206.log.json
+  Paper:
+    Title: Disentangled Non-Local Neural Networks
+    URL: https://arxiv.org/abs/2006.06668
+  Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/dnl_head.py#L88
+  Framework: PyTorch
+- Name: dnl_r101-d8_4xb2-40k_cityscapes-769x769
+  In Collection: DNLNet
+  Results:
+    Task: Semantic Segmentation
+    Dataset: Cityscapes
+    Metrics:
+      mIoU: 76.39
+      mIoU(ms+flip): 77.77
+  Config: configs/dnlnet/dnl_r101-d8_4xb2-40k_cityscapes-769x769.py
+  Metadata:
+    Training Data: Cityscapes
+    Batch Size: 8
+    Architecture:
+    - R-101-D8
+    - DNLNet
+    Training Resources: 4x V100 GPUs
+    Memory (GB): 12.6
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_769x769_40k_cityscapes/dnl_r101-d8_769x769_40k_cityscapes_20200820_171256-76c596df.pth
+  Training log: https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_769x769_40k_cityscapes/dnl_r101-d8_769x769_40k_cityscapes-20200820_171256.log.json
+  Paper:
+    Title: Disentangled Non-Local Neural Networks
+    URL: https://arxiv.org/abs/2006.06668
+  Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/dnl_head.py#L88
+  Framework: PyTorch
+- Name: dnl_r50-d8_4xb2-80k_cityscapes-512x1024
+  In Collection: DNLNet
+  Results:
+    Task: Semantic Segmentation
+    Dataset: Cityscapes
+    Metrics:
+      mIoU: 79.33
+  Config: configs/dnlnet/dnl_r50-d8_4xb2-80k_cityscapes-512x1024.py
+  Metadata:
+    Training Data: Cityscapes
+    Batch Size: 8
+    Architecture:
+    - R-50-D8
+    - DNLNet
+    Training Resources: 4x V100 GPUs
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_512x1024_80k_cityscapes/dnl_r50-d8_512x1024_80k_cityscapes_20200904_233629-58b2f778.pth
+  Training log: https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_512x1024_80k_cityscapes/dnl_r50-d8_512x1024_80k_cityscapes-20200904_233629.log.json
+  Paper:
+    Title: Disentangled Non-Local Neural Networks
+    URL: https://arxiv.org/abs/2006.06668
+  Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/dnl_head.py#L88
+  Framework: PyTorch
+- Name: dnl_r101-d8_4xb2-80k_cityscapes-512x1024
+  In Collection: DNLNet
+  Results:
+    Task: Semantic Segmentation
+    Dataset: Cityscapes
+    Metrics:
+      mIoU: 80.41
+  Config: configs/dnlnet/dnl_r101-d8_4xb2-80k_cityscapes-512x1024.py
+  Metadata:
+    Training Data: Cityscapes
+    Batch Size: 8
+    Architecture:
+    - R-101-D8
+    - DNLNet
+    Training Resources: 4x V100 GPUs
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_512x1024_80k_cityscapes/dnl_r101-d8_512x1024_80k_cityscapes_20200904_233629-758e2dd4.pth
+  Training log: https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_512x1024_80k_cityscapes/dnl_r101-d8_512x1024_80k_cityscapes-20200904_233629.log.json
+  Paper:
+    Title: Disentangled Non-Local Neural Networks
+    URL: https://arxiv.org/abs/2006.06668
+  Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/dnl_head.py#L88
+  Framework: PyTorch
+- Name: dnl_r50-d8_4xb2-80k_cityscapes-769x769
+  In Collection: DNLNet
+  Results:
+    Task: Semantic Segmentation
+    Dataset: Cityscapes
+    Metrics:
+      mIoU: 79.36
+      mIoU(ms+flip): 80.7
+  Config: configs/dnlnet/dnl_r50-d8_4xb2-80k_cityscapes-769x769.py
+  Metadata:
+    Training Data: Cityscapes
+    Batch Size: 8
+    Architecture:
+    - R-50-D8
+    - DNLNet
+    Training Resources: 4x V100 GPUs
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_769x769_80k_cityscapes/dnl_r50-d8_769x769_80k_cityscapes_20200820_011925-366bc4c7.pth
+  Training log: https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_769x769_80k_cityscapes/dnl_r50-d8_769x769_80k_cityscapes-20200820_011925.log.json
+  Paper:
+    Title: Disentangled Non-Local Neural Networks
+    URL: https://arxiv.org/abs/2006.06668
+  Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/dnl_head.py#L88
+  Framework: PyTorch
+- Name: dnl_r101-d8_4xb2-80k_cityscapes-769x769
+  In Collection: DNLNet
+  Results:
+    Task: Semantic Segmentation
+    Dataset: Cityscapes
+    Metrics:
+      mIoU: 79.41
+      mIoU(ms+flip): 80.68
+  Config: configs/dnlnet/dnl_r101-d8_4xb2-80k_cityscapes-769x769.py
+  Metadata:
+    Training Data: Cityscapes
+    Batch Size: 8
+    Architecture:
+    - R-101-D8
+    - DNLNet
+    Training Resources: 4x V100 GPUs
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_769x769_80k_cityscapes/dnl_r101-d8_769x769_80k_cityscapes_20200821_051111-95ff84ab.pth
+  Training log: https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_769x769_80k_cityscapes/dnl_r101-d8_769x769_80k_cityscapes-20200821_051111.log.json
+  Paper:
+    Title: Disentangled Non-Local Neural Networks
+    URL: https://arxiv.org/abs/2006.06668
+  Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/dnl_head.py#L88
+  Framework: PyTorch
+- Name: dnl_r50-d8_4xb4-80k_ade20k-512x512
+  In Collection: DNLNet
+  Results:
+    Task: Semantic Segmentation
+    Dataset: ADE20K
+    Metrics:
+      mIoU: 41.76
+      mIoU(ms+flip): 42.99
+  Config: configs/dnlnet/dnl_r50-d8_4xb4-80k_ade20k-512x512.py
+  Metadata:
+    Training Data: ADE20K
+    Batch Size: 16
+    Architecture:
+    - R-50-D8
+    - DNLNet
+    Training Resources: 4x V100 GPUs
+    Memory (GB): 8.8
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_512x512_80k_ade20k/dnl_r50-d8_512x512_80k_ade20k_20200826_183354-1cf6e0c1.pth
+  Training log: https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_512x512_80k_ade20k/dnl_r50-d8_512x512_80k_ade20k-20200826_183354.log.json
+  Paper:
+    Title: Disentangled Non-Local Neural Networks
+    URL: https://arxiv.org/abs/2006.06668
+  Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/dnl_head.py#L88
+  Framework: PyTorch
+- Name: dnl_r101-d8_4xb4-80k_ade20k-512x512
+  In Collection: DNLNet
+  Results:
+    Task: Semantic Segmentation
+    Dataset: ADE20K
+    Metrics:
+      mIoU: 43.76
+      mIoU(ms+flip): 44.91
+  Config: configs/dnlnet/dnl_r101-d8_4xb4-80k_ade20k-512x512.py
+  Metadata:
+    Training Data: ADE20K
+    Batch Size: 16
+    Architecture:
+    - R-101-D8
+    - DNLNet
+    Training Resources: 4x V100 GPUs
+    Memory (GB): 12.8
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_512x512_80k_ade20k/dnl_r101-d8_512x512_80k_ade20k_20200826_183354-d820d6ea.pth
+  Training log: https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_512x512_80k_ade20k/dnl_r101-d8_512x512_80k_ade20k-20200826_183354.log.json
+  Paper:
+    Title: Disentangled Non-Local Neural Networks
+    URL: https://arxiv.org/abs/2006.06668
+  Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/dnl_head.py#L88
+  Framework: PyTorch
+- Name: dnl_r50-d8_4xb4-160k_ade20k-512x512
+  In Collection: DNLNet
+  Results:
+    Task: Semantic Segmentation
+    Dataset: ADE20K
+    Metrics:
+      mIoU: 41.87
+      mIoU(ms+flip): 43.01
+  Config: configs/dnlnet/dnl_r50-d8_4xb4-160k_ade20k-512x512.py
+  Metadata:
+    Training Data: ADE20K
+    Batch Size: 16
+    Architecture:
+    - R-50-D8
+    - DNLNet
+    Training Resources: 4x V100 GPUs
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_512x512_160k_ade20k/dnl_r50-d8_512x512_160k_ade20k_20200826_183350-37837798.pth
https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r50-d8_512x512_160k_ade20k/dnl_r50-d8_512x512_160k_ade20k-20200826_183350.log.json + Paper: + Title: Disentangled Non-Local Neural Networks + URL: https://arxiv.org/abs/2006.06668 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/dnl_head.py#L88 + Framework: PyTorch +- Name: dnl_r101-d8_4xb4-160k_ade20k-512x512 + In Collection: DNLNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 44.25 + mIoU(ms+flip): 45.78 + Config: configs/dnlnet/dnl_r101-d8_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-101-D8 + - DNLNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_512x512_160k_ade20k/dnl_r101-d8_512x512_160k_ade20k_20200826_183350-ed522c61.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/dnlnet/dnl_r101-d8_512x512_160k_ade20k/dnl_r101-d8_512x512_160k_ade20k-20200826_183350.log.json + Paper: + Title: Disentangled Non-Local Neural Networks + URL: https://arxiv.org/abs/2006.06668 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/dnl_head.py#L88 + Framework: PyTorch diff --git a/configs/dpt/README.md b/configs/dpt/README.md index 5e6257711f..b3a5573a65 100644 --- a/configs/dpt/README.md +++ b/configs/dpt/README.md @@ -1,6 +1,6 @@ # DPT -[Vision Transformer for Dense Prediction](https://arxiv.org/abs/2103.13413) +> [Vision Transformers for Dense Prediction](https://arxiv.org/abs/2103.13413) ## Introduction @@ -22,24 +22,6 @@ We introduce dense vision transformers, an architecture that leverages vision tr -## Citation - -```bibtex -@article{dosoViTskiy2020, - title={An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale}, - author={DosoViTskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and Gelly, Sylvain and Uszkoreit, Jakob and Houlsby, Neil}, - journal={arXiv preprint arXiv:2010.11929}, - year={2020} -} - -@article{Ranftl2021, - author = {Ren\'{e} Ranftl and Alexey Bochkovskiy and Vladlen Koltun}, - title = {Vision Transformers for Dense Prediction}, - journal = {ArXiv preprint}, - year = {2021}, -} -``` - ## Usage To use other repositories' pre-trained models, it is necessary to convert keys.
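As a rough illustration of what such a key conversion does (a minimal sketch under stated assumptions, not the repository's actual converter — the real scripts live under `tools/model_converters/`, and the `blocks.` to `layers.` remapping below is purely hypothetical):

```python
# Sketch of checkpoint key conversion: load a checkpoint, rename its
# parameter keys to the target naming scheme, and save the result.
import torch


def convert_keys(pretrain_path: str, store_path: str) -> None:
    ckpt = torch.load(pretrain_path, map_location='cpu')
    # Some checkpoints nest their weights under 'state_dict' or 'model'.
    state_dict = ckpt.get('state_dict', ckpt.get('model', ckpt))
    converted = {
        key.replace('blocks.', 'layers.'): value  # hypothetical remapping
        for key, value in state_dict.items()
    }
    torch.save(converted, store_path)


convert_keys('pretrain/vit-b16.pth', 'pretrain/vit-b16_converted.pth')
```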
@@ -62,6 +44,24 @@ This script convert model from `PRETRAIN_PATH` and store the converted model in ### ADE20K -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ----------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| DPT | ViT-B | 512x512 | 160000 | 8.09 | 10.41 | 46.97 | 48.34 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dpt/dpt_vit-b16_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dpt/dpt_vit-b16_512x512_160k_ade20k/dpt_vit-b16_512x512_160k_ade20k-db31cf52.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dpt/dpt_vit-b16_512x512_160k_ade20k/dpt_vit-b16_512x512_160k_ade20k-20210809_172025.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | -------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| DPT | ViT-B | 512x512 | 160000 | 8.09 | 10.41 | V100 | 46.97 | 48.34 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/dpt/dpt_vit-b16_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/dpt/dpt_vit-b16_512x512_160k_ade20k/dpt_vit-b16_512x512_160k_ade20k-db31cf52.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/dpt/dpt_vit-b16_512x512_160k_ade20k/dpt_vit-b16_512x512_160k_ade20k-20210809_172025.log.json) | + +## Citation + +```bibtex +@article{dosovitskiy2020, + title={An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale}, + author={Dosovitskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and Gelly, Sylvain and Uszkoreit, Jakob and Houlsby, Neil}, + journal={arXiv preprint arXiv:2010.11929}, + year={2020} +} + +@article{Ranftl2021, + author = {Ren\'{e} Ranftl and Alexey Bochkovskiy and Vladlen Koltun}, + title = {Vision Transformers for Dense Prediction}, + journal = {ArXiv preprint}, + year = {2021}, +} +``` diff --git a/configs/dpt/dpt.yml b/configs/dpt/dpt.yml deleted file mode 100644 index a4f9c65b79..0000000000 --- a/configs/dpt/dpt.yml +++ /dev/null @@ -1,37 +0,0 @@ -Collections: -- Name: DPT - Metadata: - Training Data: - - ADE20K - Paper: - URL: https://arxiv.org/abs/2103.13413 - Title: Vision Transformer for Dense Prediction - README: configs/dpt/README.md - Code: - URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/dpt_head.py#L215 - Version: v0.17.0 - Converted From: - Code:
https://github.com/isl-org/DPT -Models: -- Name: dpt_vit-b16_512x512_160k_ade20k - In Collection: DPT - Metadata: - backbone: ViT-B - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 96.06 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 8.09 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 46.97 - mIoU(ms+flip): 48.34 - Config: configs/dpt/dpt_vit-b16_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dpt/dpt_vit-b16_512x512_160k_ade20k/dpt_vit-b16_512x512_160k_ade20k-db31cf52.pth diff --git a/configs/dpt/dpt_vit-b16_512x512_160k_ade20k.py b/configs/dpt/dpt_vit-b16_8xb2-160k_ade20k-512x512.py similarity index 100% rename from configs/dpt/dpt_vit-b16_512x512_160k_ade20k.py rename to configs/dpt/dpt_vit-b16_8xb2-160k_ade20k-512x512.py diff --git a/configs/dpt/metafile.yaml b/configs/dpt/metafile.yaml new file mode 100644 index 0000000000..b721e041b6 --- /dev/null +++ b/configs/dpt/metafile.yaml @@ -0,0 +1,37 @@ +Collections: +- Name: DPT + License: Apache License 2.0 + Metadata: + Training Data: + - ADE20K + Paper: + Title: Vision Transformers for Dense Prediction + URL: https://arxiv.org/abs/2103.13413 + README: configs/dpt/README.md + Frameworks: + - PyTorch +Models: +- Name: dpt_vit-b16_8xb2-160k_ade20k-512x512 + In Collection: DPT + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 46.97 + mIoU(ms+flip): 48.34 + Config: configs/dpt/dpt_vit-b16_8xb2-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - ViT-B + - DPT + Training Resources: 8x V100 GPUS + Memory (GB): 8.09 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/dpt/dpt_vit-b16_512x512_160k_ade20k/dpt_vit-b16_512x512_160k_ade20k-db31cf52.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/dpt/dpt_vit-b16_512x512_160k_ade20k/dpt_vit-b16_512x512_160k_ade20k-20210809_172025.log.json + Paper: + Title: Vision Transformers for Dense Prediction + URL: https://arxiv.org/abs/2103.13413 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/dpt_head.py#L215 + Framework: PyTorch diff --git a/configs/dsdl/README.md b/configs/dsdl/README.md new file mode 100644 index 0000000000..e564cffdb2 --- /dev/null +++ b/configs/dsdl/README.md @@ -0,0 +1,103 @@ +# DSDL: Standard Description Language for DataSet + + + +## Abstract + + + +Data is the cornerstone of artificial intelligence. The efficiency of data acquisition, exchange, and application directly impacts the advances in technologies and applications. Over the long history of AI, a vast quantity of datasets has been developed and distributed. However, these datasets are defined in very different forms, which incurs significant overhead when it comes to exchange, integration, and utilization -- it is often the case that one needs to develop a new customized tool or script in order to incorporate a new dataset into a workflow. + +To overcome such difficulties, we develop **Data Set Description Language (DSDL)**. For more details, please visit our [official documents](https://opendatalab.github.io/dsdl-docs/getting_started/overview/); DSDL datasets can be downloaded from our platform [OpenDataLab](https://opendatalab.com/).
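+Before walking through the steps below, note that the set files shipped with a DSDL dataset (e.g. `set-train/train.yaml`) are plain YAML, so a downloaded dataset can be sanity-checked in a few lines. A minimal sketch, assuming the voc2012 layout described in the steps below and that PyYAML is installed:
+
+```python
+# Hedged sanity check of a downloaded DSDL set file; the exact schema is
+# defined by DSDL, so this only confirms the file parses as YAML.
+import yaml  # pip install pyyaml
+
+set_file = 'data/PASCAL_VOC2012/dsdl/dsdl_SemSeg_full/set-train/train.yaml'
+with open(set_file) as f:
+    set_info = yaml.safe_load(f)
+
+# Print the top-level structure so a bad download fails loudly here,
+# not halfway through training.
+print(type(set_info).__name__)
+print(list(set_info) if isinstance(set_info, dict) else set_info[:5])
+```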
+ + + +## Steps + +- install dsdl and opendatalab: + + ``` + pip install dsdl + pip install opendatalab + ``` + +- install mmseg and pytorch: + please refer to the [installation documents](https://mmsegmentation.readthedocs.io/en/latest/get_started.html). + +- prepare dsdl dataset (take voc2012 as an example) + + - download the dsdl dataset (you will need an opendatalab account to do so; [register one now](https://opendatalab.com/)) + + ``` + cd data + + odl login + odl get PASCAL_VOC2012 + ``` + + datasets are usually compressed on the opendatalab platform, so the downloaded voc2012 dataset should look like this: + + ``` + data/ + ├── PASCAL_VOC2012 + │   ├── dsdl + │   │   ├── dsdl_Det_full.zip + │   │   └── dsdl_SemSeg_full.zip + │   ├── raw + │   │   ├── VOC2012test.tar + │   │   ├── VOCdevkit_18-May-2011.tar + │   │   └── VOCtrainval_11-May-2012.tar + │   └── README.md + └── ... + ``` + + - decompress dataset + + ``` + cd dsdl + unzip dsdl_SemSeg_full.zip + ``` + + as we do not need detection dsdl files, we only decompress the semantic segmentation files here. + + ``` + cd ../raw + tar -xvf VOCtrainval_11-May-2012.tar + tar -xvf VOC2012test.tar + + cd ../../ + ``` + +- change training config + + open the [voc config file](voc.py) and set some file paths as below: + + ``` + data_root = 'data/PASCAL_VOC2012' + img_prefix = 'raw/VOCdevkit/VOC2012' + train_ann = 'dsdl/dsdl_SemSeg_full/set-train/train.yaml' + val_ann = 'dsdl/dsdl_SemSeg_full/set-val/val.yaml' + ``` + + as dsdl datasets use one dataloader per task, we can simply change these file paths to train a model on a different dataset. + +- train: + + - using a single gpu: + + ``` + python tools/train.py {config_file} + ``` + + - using slurm: + + ``` + ./tools/slurm_train.sh {partition} {job_name} {config_file} {work_dir} {gpu_nums} + ``` + +## Test Results + +| Datasets | Model | mIoU(%) | Config | +| :--------: | :----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-----: | :-----------------------: | +| voc2012 | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x512_20k_voc12aug/deeplabv3_r50-d8_512x512_20k_voc12aug_20200617_010906-596905ef.pth) | 76.73 | [config](./voc.py) | +| cityscapes | [model](https://download.openmmlab.com/mmsegmentation/v0.5/deeplabv3/deeplabv3_r50-d8_512x1024_40k_cityscapes/deeplabv3_r50-d8_512x1024_40k_cityscapes_20200605_022449-acadc2f8.pth) | 79.01 | [config](./cityscapes.py) | diff --git a/configs/dsdl/cityscapes.py b/configs/dsdl/cityscapes.py new file mode 100644 index 0000000000..94ccc068e0 --- /dev/null +++ b/configs/dsdl/cityscapes.py @@ -0,0 +1,70 @@ +_base_ = [ + '../_base_/models/deeplabv3_r50-d8.py', '../_base_/default_runtime.py', + '../_base_/schedules/schedule_40k.py' +] + +crop_size = (512, 1024) +data_preprocessor = dict(size=crop_size) +model = dict(data_preprocessor=data_preprocessor) +# dataset settings +dataset_type = 'DSDLSegDataset' +data_root = 'data/CityScapes' +img_prefix = 'raw/CityScapes' +train_ann = 'dsdl/dsdl_SemSeg_full/set-train/train.yaml' +val_ann = 'dsdl/dsdl_SemSeg_full/set-val/val.yaml' + +used_labels = [ + 'road', 'sidewalk', 'building', 'wall', 'fence', 'pole', 'traffic_light', + 'traffic_sign', 'vegetation', 'terrain', 'sky', 'person', 'rider', 'car', + 'truck', 'bus', 'train', 'motorcycle', 'bicycle' +] + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(
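+ # Multi-scale training: RandomResize samples a scale around (2048, 1024) within ratio_range before RandomCrop and RandomFlip.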
type='RandomResize', + scale=(2048, 1024), + ratio_range=(0.5, 2.0), + keep_ratio=True), + dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(2048, 1024), keep_ratio=True), + # load annotations after ``Resize`` because the ground truth + # does not need the resize data transform + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict(img_path=img_prefix, seg_map_path=img_prefix), + ann_file=train_ann, + used_labels=used_labels, + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict(img_path=img_prefix, seg_map_path=img_prefix), + ann_file=val_ann, + used_labels=used_labels, + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) +test_evaluator = val_evaluator diff --git a/configs/dsdl/voc.py b/configs/dsdl/voc.py new file mode 100644 index 0000000000..c1895f7c7d --- /dev/null +++ b/configs/dsdl/voc.py @@ -0,0 +1,65 @@ +_base_ = [ + '../_base_/models/deeplabv3_r50-d8.py', '../_base_/default_runtime.py', + '../_base_/schedules/schedule_20k.py' +] + +# dataset settings +dataset_type = 'DSDLSegDataset' +data_root = 'data/PASCAL_VOC2012' +img_prefix = 'raw/VOCdevkit/VOC2012' +train_ann = 'dsdl/dsdl_SemSeg_full/set-train/train.yaml' +val_ann = 'dsdl/dsdl_SemSeg_full/set-val/val.yaml' +crop_size = (512, 512) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict( + type='RandomResize', + scale=(2048, 512), + ratio_range=(0.5, 2.0), + keep_ratio=True), + dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(2048, 512), keep_ratio=True), + # load annotations after ``Resize`` because the ground truth + # does not need the resize data transform + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +train_dataloader = dict( + batch_size=4, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict(img_path=img_prefix, seg_map_path=img_prefix), + ann_file=train_ann, + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict(img_path=img_prefix, seg_map_path=img_prefix), + ann_file=val_ann, + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) +test_evaluator = val_evaluator + +data_preprocessor = dict(size=crop_size) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=21), + auxiliary_head=dict(num_classes=21)) diff --git a/configs/emanet/README.md
b/configs/emanet/README.md index 3e5752b3b2..8ffaf471ca 100644 --- a/configs/emanet/README.md +++ b/configs/emanet/README.md @@ -1,6 +1,6 @@ # EMANet -[Expectation-Maximization Attention Networks for Semantic Segmentation](https://arxiv.org/abs/1907.13426) +> [Expectation-Maximization Attention Networks for Semantic Segmentation](https://arxiv.org/abs/1907.13426) ## Introduction @@ -22,6 +22,17 @@ +## Results and models + +### Cityscapes + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------: | -------------- | ------ | ----: | ------------- | -------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| EMANet | R-50-D8 | 512x1024 | 80000 | 5.4 | 4.58 | V100 | 77.59 | 79.44 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/emanet/emanet_r50-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/emanet/emanet_r50-d8_512x1024_80k_cityscapes/emanet_r50-d8_512x1024_80k_cityscapes_20200901_100301-c43fcef1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/emanet/emanet_r50-d8_512x1024_80k_cityscapes/emanet_r50-d8_512x1024_80k_cityscapes-20200901_100301.log.json) | +| EMANet | R-101-D8 | 512x1024 | 80000 | 6.2 | 2.87 | V100 | 79.10 | 81.21 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/emanet/emanet_r101-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/emanet/emanet_r101-d8_512x1024_80k_cityscapes/emanet_r101-d8_512x1024_80k_cityscapes_20200901_100301-2d970745.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/emanet/emanet_r101-d8_512x1024_80k_cityscapes/emanet_r101-d8_512x1024_80k_cityscapes-20200901_100301.log.json) | +| EMANet | R-50-D8 | 769x769 | 80000 | 8.9 | 1.97 | V100 | 79.33 | 80.49 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/emanet/emanet_r50-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/emanet/emanet_r50-d8_769x769_80k_cityscapes/emanet_r50-d8_769x769_80k_cityscapes_20200901_100301-16f8de52.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/emanet/emanet_r50-d8_769x769_80k_cityscapes/emanet_r50-d8_769x769_80k_cityscapes-20200901_100301.log.json) | +| EMANet | R-101-D8 | 769x769 | 80000 | 10.1 | 1.22 | V100 | 79.62 | 81.00 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/emanet/emanet_r101-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/emanet/emanet_r101-d8_769x769_80k_cityscapes/emanet_r101-d8_769x769_80k_cityscapes_20200901_100301-47a324ce.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/emanet/emanet_r101-d8_769x769_80k_cityscapes/emanet_r101-d8_769x769_80k_cityscapes-20200901_100301.log.json) | + ## Citation ```bibtex @@ -33,14 +44,3 @@ Self-attention mechanism has been widely used for various tasks.
It is designed year={2019} } ``` - -## Results and models - -### Cityscapes - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------: | -------------- | ----: | ------------- | --------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| EMANet | R-50-D8 | 512x1024 | 80000 | 5.4 | 4.58 | 77.59 | 79.44 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/emanet/emanet_r50-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/emanet/emanet_r50-d8_512x1024_80k_cityscapes/emanet_r50-d8_512x1024_80k_cityscapes_20200901_100301-c43fcef1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/emanet/emanet_r50-d8_512x1024_80k_cityscapes/emanet_r50-d8_512x1024_80k_cityscapes-20200901_100301.log.json) | -| EMANet | R-101-D8 | 512x1024 | 80000 | 6.2 | 2.87 | 79.10 | 81.21 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/emanet/emanet_r101-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/emanet/emanet_r101-d8_512x1024_80k_cityscapes/emanet_r101-d8_512x1024_80k_cityscapes_20200901_100301-2d970745.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/emanet/emanet_r101-d8_512x1024_80k_cityscapes/emanet_r101-d8_512x1024_80k_cityscapes-20200901_100301.log.json) | -| EMANet | R-50-D8 | 769x769 | 80000 | 8.9 | 1.97 | 79.33 | 80.49 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/emanet/emanet_r50-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/emanet/emanet_r50-d8_769x769_80k_cityscapes/emanet_r50-d8_769x769_80k_cityscapes_20200901_100301-16f8de52.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/emanet/emanet_r50-d8_769x769_80k_cityscapes/emanet_r50-d8_769x769_80k_cityscapes-20200901_100301.log.json) | -| EMANet | R-101-D8 | 769x769 | 80000 | 10.1 | 1.22 | 79.62 | 81.00 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/emanet/emanet_r101-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/emanet/emanet_r101-d8_769x769_80k_cityscapes/emanet_r101-d8_769x769_80k_cityscapes_20200901_100301-47a324ce.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/emanet/emanet_r101-d8_769x769_80k_cityscapes/emanet_r101-d8_769x769_80k_cityscapes-20200901_100301.log.json) | diff --git a/configs/emanet/emanet.yml b/configs/emanet/emanet.yml deleted file mode 100644 index 22ebcdb62a..0000000000 --- a/configs/emanet/emanet.yml +++ /dev/null @@ -1,103 +0,0 @@ -Collections: -- Name: EMANet - Metadata: - Training Data: - - Cityscapes - Paper: - URL: https://arxiv.org/abs/1907.13426 - Title: Expectation-Maximization Attention Networks for Semantic Segmentation - README: configs/emanet/README.md - Code: - URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ema_head.py#L80 - Version: v0.17.0 - Converted From: - Code: https://xialipku.github.io/EMANet -Models: -- Name: 
emanet_r50-d8_512x1024_80k_cityscapes - In Collection: EMANet - Metadata: - backbone: R-50-D8 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 218.34 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 5.4 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 77.59 - mIoU(ms+flip): 79.44 - Config: configs/emanet/emanet_r50-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/emanet/emanet_r50-d8_512x1024_80k_cityscapes/emanet_r50-d8_512x1024_80k_cityscapes_20200901_100301-c43fcef1.pth -- Name: emanet_r101-d8_512x1024_80k_cityscapes - In Collection: EMANet - Metadata: - backbone: R-101-D8 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 348.43 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 6.2 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.1 - mIoU(ms+flip): 81.21 - Config: configs/emanet/emanet_r101-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/emanet/emanet_r101-d8_512x1024_80k_cityscapes/emanet_r101-d8_512x1024_80k_cityscapes_20200901_100301-2d970745.pth -- Name: emanet_r50-d8_769x769_80k_cityscapes - In Collection: EMANet - Metadata: - backbone: R-50-D8 - crop size: (769,769) - lr schd: 80000 - inference time (ms/im): - - value: 507.61 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 8.9 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.33 - mIoU(ms+flip): 80.49 - Config: configs/emanet/emanet_r50-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/emanet/emanet_r50-d8_769x769_80k_cityscapes/emanet_r50-d8_769x769_80k_cityscapes_20200901_100301-16f8de52.pth -- Name: emanet_r101-d8_769x769_80k_cityscapes - In Collection: EMANet - Metadata: - backbone: R-101-D8 - crop size: (769,769) - lr schd: 80000 - inference time (ms/im): - - value: 819.67 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 10.1 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.62 - mIoU(ms+flip): 81.0 - Config: configs/emanet/emanet_r101-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/emanet/emanet_r101-d8_769x769_80k_cityscapes/emanet_r101-d8_769x769_80k_cityscapes_20200901_100301-47a324ce.pth diff --git a/configs/emanet/emanet_r101-d8_4xb2-80k_cityscapes-512x1024.py b/configs/emanet/emanet_r101-d8_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..ee3a3b5167 --- /dev/null +++ b/configs/emanet/emanet_r101-d8_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './emanet_r50-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/emanet/emanet_r101-d8_4xb2-80k_cityscapes-769x769.py b/configs/emanet/emanet_r101-d8_4xb2-80k_cityscapes-769x769.py new file mode 100644 index 0000000000..7319a3e4b6 --- /dev/null +++ b/configs/emanet/emanet_r101-d8_4xb2-80k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './emanet_r50-d8_4xb2-80k_cityscapes-769x769.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/emanet/emanet_r101-d8_512x1024_80k_cityscapes.py 
b/configs/emanet/emanet_r101-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index 58f28b43f5..0000000000 --- a/configs/emanet/emanet_r101-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './emanet_r50-d8_512x1024_80k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/emanet/emanet_r101-d8_769x769_80k_cityscapes.py b/configs/emanet/emanet_r101-d8_769x769_80k_cityscapes.py deleted file mode 100644 index c5dbf20b0f..0000000000 --- a/configs/emanet/emanet_r101-d8_769x769_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './emanet_r50-d8_769x769_80k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/emanet/emanet_r50-d8_512x1024_80k_cityscapes.py b/configs/emanet/emanet_r50-d8_4xb2-80k_cityscapes-512x1024.py similarity index 100% rename from configs/emanet/emanet_r50-d8_512x1024_80k_cityscapes.py rename to configs/emanet/emanet_r50-d8_4xb2-80k_cityscapes-512x1024.py diff --git a/configs/emanet/emanet_r50-d8_769x769_80k_cityscapes.py b/configs/emanet/emanet_r50-d8_4xb2-80k_cityscapes-769x769.py similarity index 100% rename from configs/emanet/emanet_r50-d8_769x769_80k_cityscapes.py rename to configs/emanet/emanet_r50-d8_4xb2-80k_cityscapes-769x769.py diff --git a/configs/emanet/metafile.yaml b/configs/emanet/metafile.yaml new file mode 100644 index 0000000000..b2a6b09ed7 --- /dev/null +++ b/configs/emanet/metafile.yaml @@ -0,0 +1,109 @@ +Collections: +- Name: EMANet + License: Apache License 2.0 + Metadata: + Training Data: + - Cityscapes + Paper: + Title: Expectation-Maximization Attention Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1907.13426 + README: configs/emanet/README.md + Frameworks: + - PyTorch +Models: +- Name: emanet_r50-d8_4xb2-80k_cityscapes-512x1024 + In Collection: EMANet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 77.59 + mIoU(ms+flip): 79.44 + Config: configs/emanet/emanet_r50-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - EMANet + Training Resources: 4x V100 GPUS + Memory (GB): 5.4 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/emanet/emanet_r50-d8_512x1024_80k_cityscapes/emanet_r50-d8_512x1024_80k_cityscapes_20200901_100301-c43fcef1.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/emanet/emanet_r50-d8_512x1024_80k_cityscapes/emanet_r50-d8_512x1024_80k_cityscapes-20200901_100301.log.json + Paper: + Title: Expectation-Maximization Attention Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1907.13426 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ema_head.py#L80 + Framework: PyTorch +- Name: emanet_r101-d8_4xb2-80k_cityscapes-512x1024 + In Collection: EMANet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.1 + mIoU(ms+flip): 81.21 + Config: configs/emanet/emanet_r101-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - EMANet + Training Resources: 4x V100 GPUS + Memory (GB): 6.2 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/emanet/emanet_r101-d8_512x1024_80k_cityscapes/emanet_r101-d8_512x1024_80k_cityscapes_20200901_100301-2d970745.pth + Training log:
https://download.openmmlab.com/mmsegmentation/v0.5/emanet/emanet_r101-d8_512x1024_80k_cityscapes/emanet_r101-d8_512x1024_80k_cityscapes-20200901_100301.log.json + Paper: + Title: Expectation-Maximization Attention Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1907.13426 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ema_head.py#L80 + Framework: PyTorch +- Name: emanet_r50-d8_4xb2-80k_cityscapes-769x769 + In Collection: EMANet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.33 + mIoU(ms+flip): 80.49 + Config: configs/emanet/emanet_r50-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - EMANet + Training Resources: 4x V100 GPUS + Memory (GB): 8.9 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/emanet/emanet_r50-d8_769x769_80k_cityscapes/emanet_r50-d8_769x769_80k_cityscapes_20200901_100301-16f8de52.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/emanet/emanet_r50-d8_769x769_80k_cityscapes/emanet_r50-d8_769x769_80k_cityscapes-20200901_100301.log.json + Paper: + Title: Expectation-Maximization Attention Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1907.13426 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ema_head.py#L80 + Framework: PyTorch +- Name: emanet_r101-d8_4xb2-80k_cityscapes-769x769 + In Collection: EMANet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.62 + mIoU(ms+flip): 81.0 + Config: configs/emanet/emanet_r101-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - EMANet + Training Resources: 4x V100 GPUS + Memory (GB): 10.1 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/emanet/emanet_r101-d8_769x769_80k_cityscapes/emanet_r101-d8_769x769_80k_cityscapes_20200901_100301-47a324ce.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/emanet/emanet_r101-d8_769x769_80k_cityscapes/emanet_r101-d8_769x769_80k_cityscapes-20200901_100301.log.json + Paper: + Title: Expectation-Maximization Attention Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1907.13426 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ema_head.py#L80 + Framework: PyTorch diff --git a/configs/encnet/README.md b/configs/encnet/README.md index c191943a30..ff09bc32f8 100644 --- a/configs/encnet/README.md +++ b/configs/encnet/README.md @@ -1,6 +1,6 @@ # EncNet -[Context Encoding for Semantic Segmentation](https://arxiv.org/abs/1803.08904) +> [Context Encoding for Semantic Segmentation](https://arxiv.org/abs/1803.08904) ## Introduction @@ -22,6 +22,30 @@ Recent work has made significant progress in improving spatial resolution for pi +## Results and models + +### Cityscapes + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------------ | 
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| EncNet | R-50-D8 | 512x1024 | 40000 | 8.6 | 4.58 | V100 | 75.67 | 77.08 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/encnet/encnet_r50-d8_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_512x1024_40k_cityscapes/encnet_r50-d8_512x1024_40k_cityscapes_20200621_220958-68638a47.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_512x1024_40k_cityscapes/encnet_r50-d8_512x1024_40k_cityscapes-20200621_220958.log.json) | +| EncNet | R-101-D8 | 512x1024 | 40000 | 12.1 | 2.66 | V100 | 75.81 | 77.21 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/encnet/encnet_r101-d8_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_512x1024_40k_cityscapes/encnet_r101-d8_512x1024_40k_cityscapes_20200621_220933-35e0a3e8.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_512x1024_40k_cityscapes/encnet_r101-d8_512x1024_40k_cityscapes-20200621_220933.log.json) | +| EncNet | R-50-D8 | 769x769 | 40000 | 9.8 | 1.82 | V100 | 76.24 | 77.85 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/encnet/encnet_r50-d8_4xb2-40k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_769x769_40k_cityscapes/encnet_r50-d8_769x769_40k_cityscapes_20200621_220958-3bcd2884.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_769x769_40k_cityscapes/encnet_r50-d8_769x769_40k_cityscapes-20200621_220958.log.json) | +| EncNet | R-101-D8 | 769x769 | 40000 | 13.7 | 1.26 | V100 | 74.25 | 76.25 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/encnet/encnet_r101-d8_4xb2-40k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_769x769_40k_cityscapes/encnet_r101-d8_769x769_40k_cityscapes_20200621_220933-2fafed55.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_769x769_40k_cityscapes/encnet_r101-d8_769x769_40k_cityscapes-20200621_220933.log.json) | +| EncNet | R-50-D8 | 512x1024 | 80000 | - | - | V100 | 77.94 | 79.13 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/encnet/encnet_r50-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_512x1024_80k_cityscapes/encnet_r50-d8_512x1024_80k_cityscapes_20200622_003554-fc5c5624.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_512x1024_80k_cityscapes/encnet_r50-d8_512x1024_80k_cityscapes-20200622_003554.log.json) | +| EncNet | R-101-D8 | 512x1024 | 80000 | - | - | V100 | 78.55 | 79.47 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/encnet/encnet_r101-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_512x1024_80k_cityscapes/encnet_r101-d8_512x1024_80k_cityscapes_20200622_003555-1de64bec.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_512x1024_80k_cityscapes/encnet_r101-d8_512x1024_80k_cityscapes-20200622_003555.log.json) | +| EncNet | R-50-D8 | 769x769 | 80000 | - | - | V100 | 77.44 | 78.72 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/encnet/encnet_r50-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_769x769_80k_cityscapes/encnet_r50-d8_769x769_80k_cityscapes_20200622_003554-55096dcb.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_769x769_80k_cityscapes/encnet_r50-d8_769x769_80k_cityscapes-20200622_003554.log.json) | +| EncNet | R-101-D8 | 769x769 | 80000 | - | - | V100 | 76.10 | 76.97 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/encnet/encnet_r101-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_769x769_80k_cityscapes/encnet_r101-d8_769x769_80k_cityscapes_20200622_003555-470ef79d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_769x769_80k_cityscapes/encnet_r101-d8_769x769_80k_cityscapes-20200622_003555.log.json) | + +### ADE20K + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | -------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| EncNet | R-50-D8 | 512x512 | 80000 | 10.1 | 22.81 | V100 | 39.53 | 41.17 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/encnet/encnet_r50-d8_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_512x512_80k_ade20k/encnet_r50-d8_512x512_80k_ade20k_20200622_042412-44b46b04.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_512x512_80k_ade20k/encnet_r50-d8_512x512_80k_ade20k-20200622_042412.log.json) | +| EncNet | R-101-D8 | 512x512 | 80000 | 13.6 | 14.87 | V100 | 42.11 | 43.61 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/encnet/encnet_r101-d8_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_512x512_80k_ade20k/encnet_r101-d8_512x512_80k_ade20k_20200622_101128-dd35e237.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_512x512_80k_ade20k/encnet_r101-d8_512x512_80k_ade20k-20200622_101128.log.json) | +| EncNet | R-50-D8 | 512x512 | 160000 | - | - | V100 | 40.10 | 41.71 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/encnet/encnet_r50-d8_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_512x512_160k_ade20k/encnet_r50-d8_512x512_160k_ade20k_20200622_101059-b2db95e0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_512x512_160k_ade20k/encnet_r50-d8_512x512_160k_ade20k-20200622_101059.log.json) | +| EncNet | R-101-D8 | 
512x512 | 160000 | - | - | V100 | 42.61 | 44.01 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/encnet/encnet_r101-d8_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_512x512_160k_ade20k/encnet_r101-d8_512x512_160k_ade20k_20200622_073348-7989641f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_512x512_160k_ade20k/encnet_r101-d8_512x512_160k_ade20k-20200622_073348.log.json) | + ## Citation ```bibtex @@ -33,27 +57,3 @@ month = {June}, year = {2018} } ``` - -## Results and models - -### Cityscapes - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | --------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| EncNet | R-50-D8 | 512x1024 | 40000 | 8.6 | 4.58 | 75.67 | 77.08 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/encnet/encnet_r50-d8_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_512x1024_40k_cityscapes/encnet_r50-d8_512x1024_40k_cityscapes_20200621_220958-68638a47.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_512x1024_40k_cityscapes/encnet_r50-d8_512x1024_40k_cityscapes-20200621_220958.log.json) | -| EncNet | R-101-D8 | 512x1024 | 40000 | 12.1 | 2.66 | 75.81 | 77.21 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/encnet/encnet_r101-d8_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_512x1024_40k_cityscapes/encnet_r101-d8_512x1024_40k_cityscapes_20200621_220933-35e0a3e8.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_512x1024_40k_cityscapes/encnet_r101-d8_512x1024_40k_cityscapes-20200621_220933.log.json) | -| EncNet | R-50-D8 | 769x769 | 40000 | 9.8 | 1.82 | 76.24 | 77.85 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/encnet/encnet_r50-d8_769x769_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_769x769_40k_cityscapes/encnet_r50-d8_769x769_40k_cityscapes_20200621_220958-3bcd2884.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_769x769_40k_cityscapes/encnet_r50-d8_769x769_40k_cityscapes-20200621_220958.log.json) | -| EncNet | R-101-D8 | 769x769 | 40000 | 13.7 | 1.26 | 74.25 | 76.25 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/encnet/encnet_r101-d8_769x769_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_769x769_40k_cityscapes/encnet_r101-d8_769x769_40k_cityscapes_20200621_220933-2fafed55.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_769x769_40k_cityscapes/encnet_r101-d8_769x769_40k_cityscapes-20200621_220933.log.json) | -| EncNet | R-50-D8 | 512x1024 | 80000 | - | - | 77.94 | 79.13 | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/encnet/encnet_r50-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_512x1024_80k_cityscapes/encnet_r50-d8_512x1024_80k_cityscapes_20200622_003554-fc5c5624.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_512x1024_80k_cityscapes/encnet_r50-d8_512x1024_80k_cityscapes-20200622_003554.log.json) | -| EncNet | R-101-D8 | 512x1024 | 80000 | - | - | 78.55 | 79.47 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/encnet/encnet_r101-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_512x1024_80k_cityscapes/encnet_r101-d8_512x1024_80k_cityscapes_20200622_003555-1de64bec.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_512x1024_80k_cityscapes/encnet_r101-d8_512x1024_80k_cityscapes-20200622_003555.log.json) | -| EncNet | R-50-D8 | 769x769 | 80000 | - | - | 77.44 | 78.72 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/encnet/encnet_r50-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_769x769_80k_cityscapes/encnet_r50-d8_769x769_80k_cityscapes_20200622_003554-55096dcb.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_769x769_80k_cityscapes/encnet_r50-d8_769x769_80k_cityscapes-20200622_003554.log.json) | -| EncNet | R-101-D8 | 769x769 | 80000 | - | - | 76.10 | 76.97 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/encnet/encnet_r101-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_769x769_80k_cityscapes/encnet_r101-d8_769x769_80k_cityscapes_20200622_003555-470ef79d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_769x769_80k_cityscapes/encnet_r101-d8_769x769_80k_cityscapes-20200622_003555.log.json) | - -### ADE20K - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ----------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| EncNet | R-50-D8 | 512x512 | 80000 | 10.1 | 22.81 | 39.53 | 41.17 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/encnet/encnet_r50-d8_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_512x512_80k_ade20k/encnet_r50-d8_512x512_80k_ade20k_20200622_042412-44b46b04.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_512x512_80k_ade20k/encnet_r50-d8_512x512_80k_ade20k-20200622_042412.log.json) | -| EncNet | R-101-D8 | 512x512 | 80000 | 13.6 | 14.87 | 42.11 | 43.61 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/encnet/encnet_r101-d8_512x512_80k_ade20k.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_512x512_80k_ade20k/encnet_r101-d8_512x512_80k_ade20k_20200622_101128-dd35e237.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_512x512_80k_ade20k/encnet_r101-d8_512x512_80k_ade20k-20200622_101128.log.json) | -| EncNet | R-50-D8 | 512x512 | 160000 | - | - | 40.10 | 41.71 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/encnet/encnet_r50-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_512x512_160k_ade20k/encnet_r50-d8_512x512_160k_ade20k_20200622_101059-b2db95e0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_512x512_160k_ade20k/encnet_r50-d8_512x512_160k_ade20k-20200622_101059.log.json) | -| EncNet | R-101-D8 | 512x512 | 160000 | - | - | 42.61 | 44.01 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/encnet/encnet_r101-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_512x512_160k_ade20k/encnet_r101-d8_512x512_160k_ade20k_20200622_073348-7989641f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_512x512_160k_ade20k/encnet_r101-d8_512x512_160k_ade20k-20200622_073348.log.json) | diff --git a/configs/encnet/encnet.yml b/configs/encnet/encnet.yml deleted file mode 100644 index 18fb32a395..0000000000 --- a/configs/encnet/encnet.yml +++ /dev/null @@ -1,232 +0,0 @@ -Collections: -- Name: EncNet - Metadata: - Training Data: - - Cityscapes - - ADE20K - Paper: - URL: https://arxiv.org/abs/1803.08904 - Title: Context Encoding for Semantic Segmentation - README: configs/encnet/README.md - Code: - URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/enc_head.py#L63 - Version: v0.17.0 - Converted From: - Code: https://github.com/zhanghang1989/PyTorch-Encoding -Models: -- Name: encnet_r50-d8_512x1024_40k_cityscapes - In Collection: EncNet - Metadata: - backbone: R-50-D8 - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 218.34 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 8.6 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 75.67 - mIoU(ms+flip): 77.08 - Config: configs/encnet/encnet_r50-d8_512x1024_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_512x1024_40k_cityscapes/encnet_r50-d8_512x1024_40k_cityscapes_20200621_220958-68638a47.pth -- Name: encnet_r101-d8_512x1024_40k_cityscapes - In Collection: EncNet - Metadata: - backbone: R-101-D8 - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 375.94 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 12.1 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 75.81 - mIoU(ms+flip): 77.21 - Config: configs/encnet/encnet_r101-d8_512x1024_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_512x1024_40k_cityscapes/encnet_r101-d8_512x1024_40k_cityscapes_20200621_220933-35e0a3e8.pth -- Name: encnet_r50-d8_769x769_40k_cityscapes - In Collection: EncNet - Metadata: - backbone: R-50-D8 - crop size: (769,769) - lr schd: 40000 - inference time (ms/im): - - value: 549.45 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: 
FP32 - resolution: (769,769) - Training Memory (GB): 9.8 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 76.24 - mIoU(ms+flip): 77.85 - Config: configs/encnet/encnet_r50-d8_769x769_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_769x769_40k_cityscapes/encnet_r50-d8_769x769_40k_cityscapes_20200621_220958-3bcd2884.pth -- Name: encnet_r101-d8_769x769_40k_cityscapes - In Collection: EncNet - Metadata: - backbone: R-101-D8 - crop size: (769,769) - lr schd: 40000 - inference time (ms/im): - - value: 793.65 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 13.7 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 74.25 - mIoU(ms+flip): 76.25 - Config: configs/encnet/encnet_r101-d8_769x769_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_769x769_40k_cityscapes/encnet_r101-d8_769x769_40k_cityscapes_20200621_220933-2fafed55.pth -- Name: encnet_r50-d8_512x1024_80k_cityscapes - In Collection: EncNet - Metadata: - backbone: R-50-D8 - crop size: (512,1024) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 77.94 - mIoU(ms+flip): 79.13 - Config: configs/encnet/encnet_r50-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_512x1024_80k_cityscapes/encnet_r50-d8_512x1024_80k_cityscapes_20200622_003554-fc5c5624.pth -- Name: encnet_r101-d8_512x1024_80k_cityscapes - In Collection: EncNet - Metadata: - backbone: R-101-D8 - crop size: (512,1024) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.55 - mIoU(ms+flip): 79.47 - Config: configs/encnet/encnet_r101-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_512x1024_80k_cityscapes/encnet_r101-d8_512x1024_80k_cityscapes_20200622_003555-1de64bec.pth -- Name: encnet_r50-d8_769x769_80k_cityscapes - In Collection: EncNet - Metadata: - backbone: R-50-D8 - crop size: (769,769) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 77.44 - mIoU(ms+flip): 78.72 - Config: configs/encnet/encnet_r50-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_769x769_80k_cityscapes/encnet_r50-d8_769x769_80k_cityscapes_20200622_003554-55096dcb.pth -- Name: encnet_r101-d8_769x769_80k_cityscapes - In Collection: EncNet - Metadata: - backbone: R-101-D8 - crop size: (769,769) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 76.1 - mIoU(ms+flip): 76.97 - Config: configs/encnet/encnet_r101-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_769x769_80k_cityscapes/encnet_r101-d8_769x769_80k_cityscapes_20200622_003555-470ef79d.pth -- Name: encnet_r50-d8_512x512_80k_ade20k - In Collection: EncNet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 43.84 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 10.1 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 39.53 - mIoU(ms+flip): 41.17 - Config: configs/encnet/encnet_r50-d8_512x512_80k_ade20k.py - Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_512x512_80k_ade20k/encnet_r50-d8_512x512_80k_ade20k_20200622_042412-44b46b04.pth -- Name: encnet_r101-d8_512x512_80k_ade20k - In Collection: EncNet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 67.25 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 13.6 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 42.11 - mIoU(ms+flip): 43.61 - Config: configs/encnet/encnet_r101-d8_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_512x512_80k_ade20k/encnet_r101-d8_512x512_80k_ade20k_20200622_101128-dd35e237.pth -- Name: encnet_r50-d8_512x512_160k_ade20k - In Collection: EncNet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 40.1 - mIoU(ms+flip): 41.71 - Config: configs/encnet/encnet_r50-d8_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_512x512_160k_ade20k/encnet_r50-d8_512x512_160k_ade20k_20200622_101059-b2db95e0.pth -- Name: encnet_r101-d8_512x512_160k_ade20k - In Collection: EncNet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 42.61 - mIoU(ms+flip): 44.01 - Config: configs/encnet/encnet_r101-d8_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_512x512_160k_ade20k/encnet_r101-d8_512x512_160k_ade20k_20200622_073348-7989641f.pth diff --git a/configs/encnet/encnet_r101-d8_4xb2-40k_cityscapes-512x1024.py b/configs/encnet/encnet_r101-d8_4xb2-40k_cityscapes-512x1024.py new file mode 100644 index 0000000000..13ab367be5 --- /dev/null +++ b/configs/encnet/encnet_r101-d8_4xb2-40k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './encnet_r50-d8_4xb2-40k_cityscapes-512x1024.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/encnet/encnet_r101-d8_4xb2-40k_cityscapes-769x769.py b/configs/encnet/encnet_r101-d8_4xb2-40k_cityscapes-769x769.py new file mode 100644 index 0000000000..7810ac440d --- /dev/null +++ b/configs/encnet/encnet_r101-d8_4xb2-40k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './encnet_r50-d8_4xb2-40k_cityscapes-769x769.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/encnet/encnet_r101-d8_4xb2-80k_cityscapes-512x1024.py b/configs/encnet/encnet_r101-d8_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..bec6bd907d --- /dev/null +++ b/configs/encnet/encnet_r101-d8_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './encnet_r50-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/encnet/encnet_r101-d8_4xb2-80k_cityscapes-769x769.py b/configs/encnet/encnet_r101-d8_4xb2-80k_cityscapes-769x769.py new file mode 100644 index 0000000000..e1f6409e63 --- /dev/null +++ b/configs/encnet/encnet_r101-d8_4xb2-80k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './encnet_r50-d8_4xb2-80k_cityscapes-769x769.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/encnet/encnet_r101-d8_4xb4-160k_ade20k-512x512.py 
b/configs/encnet/encnet_r101-d8_4xb4-160k_ade20k-512x512.py new file mode 100644 index 0000000000..9599f9c0d3 --- /dev/null +++ b/configs/encnet/encnet_r101-d8_4xb4-160k_ade20k-512x512.py @@ -0,0 +1,2 @@ +_base_ = './encnet_r50-d8_4xb4-160k_ade20k-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/encnet/encnet_r101-d8_4xb4-20k_voc12aug-512x512.py b/configs/encnet/encnet_r101-d8_4xb4-20k_voc12aug-512x512.py new file mode 100644 index 0000000000..a9edfc28a2 --- /dev/null +++ b/configs/encnet/encnet_r101-d8_4xb4-20k_voc12aug-512x512.py @@ -0,0 +1,2 @@ +_base_ = './encnet_r50-d8_4xb4-20k_voc12aug-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/encnet/encnet_r101-d8_4xb4-40k_voc12aug-512x512.py b/configs/encnet/encnet_r101-d8_4xb4-40k_voc12aug-512x512.py new file mode 100644 index 0000000000..d2fbab59e3 --- /dev/null +++ b/configs/encnet/encnet_r101-d8_4xb4-40k_voc12aug-512x512.py @@ -0,0 +1,2 @@ +_base_ = './encnet_r50-d8_4xb4-40k_voc12aug-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/encnet/encnet_r101-d8_4xb4-80k_ade20k-512x512.py b/configs/encnet/encnet_r101-d8_4xb4-80k_ade20k-512x512.py new file mode 100644 index 0000000000..debe8c8331 --- /dev/null +++ b/configs/encnet/encnet_r101-d8_4xb4-80k_ade20k-512x512.py @@ -0,0 +1,2 @@ +_base_ = './encnet_r50-d8_4xb4-80k_ade20k-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/encnet/encnet_r101-d8_512x1024_40k_cityscapes.py b/configs/encnet/encnet_r101-d8_512x1024_40k_cityscapes.py deleted file mode 100644 index f34373d9eb..0000000000 --- a/configs/encnet/encnet_r101-d8_512x1024_40k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './encnet_r50-d8_512x1024_40k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/encnet/encnet_r101-d8_512x1024_80k_cityscapes.py b/configs/encnet/encnet_r101-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index 0b0207b314..0000000000 --- a/configs/encnet/encnet_r101-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './encnet_r50-d8_512x1024_80k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/encnet/encnet_r101-d8_512x512_160k_ade20k.py b/configs/encnet/encnet_r101-d8_512x512_160k_ade20k.py deleted file mode 100644 index 8fec6ba255..0000000000 --- a/configs/encnet/encnet_r101-d8_512x512_160k_ade20k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './encnet_r50-d8_512x512_160k_ade20k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/encnet/encnet_r101-d8_512x512_20k_voc12aug.py b/configs/encnet/encnet_r101-d8_512x512_20k_voc12aug.py deleted file mode 100644 index c264af998b..0000000000 --- a/configs/encnet/encnet_r101-d8_512x512_20k_voc12aug.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './encnet_r50-d8_512x512_20k_voc12aug.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/encnet/encnet_r101-d8_512x512_40k_voc12aug.py b/configs/encnet/encnet_r101-d8_512x512_40k_voc12aug.py deleted file mode 100644 index 8a6968ea58..0000000000 --- a/configs/encnet/encnet_r101-d8_512x512_40k_voc12aug.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './encnet_r50-d8_512x512_40k_voc12aug.py' -model = 
dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/encnet/encnet_r101-d8_512x512_80k_ade20k.py b/configs/encnet/encnet_r101-d8_512x512_80k_ade20k.py deleted file mode 100644 index 94151004ea..0000000000 --- a/configs/encnet/encnet_r101-d8_512x512_80k_ade20k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './encnet_r50-d8_512x512_80k_ade20k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/encnet/encnet_r101-d8_769x769_40k_cityscapes.py b/configs/encnet/encnet_r101-d8_769x769_40k_cityscapes.py deleted file mode 100644 index d6ade67b76..0000000000 --- a/configs/encnet/encnet_r101-d8_769x769_40k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './encnet_r50-d8_769x769_40k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/encnet/encnet_r101-d8_769x769_80k_cityscapes.py b/configs/encnet/encnet_r101-d8_769x769_80k_cityscapes.py deleted file mode 100644 index 55648c08b2..0000000000 --- a/configs/encnet/encnet_r101-d8_769x769_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './encnet_r50-d8_769x769_80k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/encnet/encnet_r50-d8_512x1024_40k_cityscapes.py b/configs/encnet/encnet_r50-d8_4xb2-40k_cityscapes-512x1024.py similarity index 100% rename from configs/encnet/encnet_r50-d8_512x1024_40k_cityscapes.py rename to configs/encnet/encnet_r50-d8_4xb2-40k_cityscapes-512x1024.py diff --git a/configs/encnet/encnet_r50-d8_769x769_40k_cityscapes.py b/configs/encnet/encnet_r50-d8_4xb2-40k_cityscapes-769x769.py similarity index 100% rename from configs/encnet/encnet_r50-d8_769x769_40k_cityscapes.py rename to configs/encnet/encnet_r50-d8_4xb2-40k_cityscapes-769x769.py diff --git a/configs/encnet/encnet_r50-d8_512x1024_80k_cityscapes.py b/configs/encnet/encnet_r50-d8_4xb2-80k_cityscapes-512x1024.py similarity index 100% rename from configs/encnet/encnet_r50-d8_512x1024_80k_cityscapes.py rename to configs/encnet/encnet_r50-d8_4xb2-80k_cityscapes-512x1024.py diff --git a/configs/encnet/encnet_r50-d8_769x769_80k_cityscapes.py b/configs/encnet/encnet_r50-d8_4xb2-80k_cityscapes-769x769.py similarity index 100% rename from configs/encnet/encnet_r50-d8_769x769_80k_cityscapes.py rename to configs/encnet/encnet_r50-d8_4xb2-80k_cityscapes-769x769.py diff --git a/configs/encnet/encnet_r50-d8_512x512_160k_ade20k.py b/configs/encnet/encnet_r50-d8_4xb4-160k_ade20k-512x512.py similarity index 100% rename from configs/encnet/encnet_r50-d8_512x512_160k_ade20k.py rename to configs/encnet/encnet_r50-d8_4xb4-160k_ade20k-512x512.py diff --git a/configs/encnet/encnet_r50-d8_512x512_20k_voc12aug.py b/configs/encnet/encnet_r50-d8_4xb4-20k_voc12aug-512x512.py similarity index 100% rename from configs/encnet/encnet_r50-d8_512x512_20k_voc12aug.py rename to configs/encnet/encnet_r50-d8_4xb4-20k_voc12aug-512x512.py diff --git a/configs/encnet/encnet_r50-d8_512x512_40k_voc12aug.py b/configs/encnet/encnet_r50-d8_4xb4-40k_voc12aug-512x512.py similarity index 100% rename from configs/encnet/encnet_r50-d8_512x512_40k_voc12aug.py rename to configs/encnet/encnet_r50-d8_4xb4-40k_voc12aug-512x512.py diff --git a/configs/encnet/encnet_r50-d8_512x512_80k_ade20k.py b/configs/encnet/encnet_r50-d8_4xb4-80k_ade20k-512x512.py similarity index 100% rename from configs/encnet/encnet_r50-d8_512x512_80k_ade20k.py rename to 
configs/encnet/encnet_r50-d8_4xb4-80k_ade20k-512x512.py diff --git a/configs/encnet/encnet_r50s-d8_512x512_80k_ade20k.py b/configs/encnet/encnet_r50s-d8_4xb4-80k_ade20k-512x512.py similarity index 100% rename from configs/encnet/encnet_r50s-d8_512x512_80k_ade20k.py rename to configs/encnet/encnet_r50s-d8_4xb4-80k_ade20k-512x512.py diff --git a/configs/encnet/metafile.yaml b/configs/encnet/metafile.yaml new file mode 100644 index 0000000000..0dbdcfaab3 --- /dev/null +++ b/configs/encnet/metafile.yaml @@ -0,0 +1,296 @@ +Collections: +- Name: EncNet + License: Apache License 2.0 + Metadata: + Training Data: + - Cityscapes + - ADE20K + Paper: + Title: Context Encoding for Semantic Segmentation + URL: https://arxiv.org/abs/1803.08904 + README: configs/encnet/README.md + Frameworks: + - PyTorch +Models: +- Name: encnet_r50-d8_4xb2-40k_cityscapes-512x1024 + In Collection: EncNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 75.67 + mIoU(ms+flip): 77.08 + Config: configs/encnet/encnet_r50-d8_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - EncNet + Training Resources: 4x V100 GPUS + Memory (GB): 8.6 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_512x1024_40k_cityscapes/encnet_r50-d8_512x1024_40k_cityscapes_20200621_220958-68638a47.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_512x1024_40k_cityscapes/encnet_r50-d8_512x1024_40k_cityscapes-20200621_220958.log.json + Paper: + Title: Context Encoding for Semantic Segmentation + URL: https://arxiv.org/abs/1803.08904 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/enc_head.py#L63 + Framework: PyTorch +- Name: encnet_r101-d8_4xb2-40k_cityscapes-512x1024 + In Collection: EncNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 75.81 + mIoU(ms+flip): 77.21 + Config: configs/encnet/encnet_r101-d8_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - EncNet + Training Resources: 4x V100 GPUS + Memory (GB): 12.1 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_512x1024_40k_cityscapes/encnet_r101-d8_512x1024_40k_cityscapes_20200621_220933-35e0a3e8.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_512x1024_40k_cityscapes/encnet_r101-d8_512x1024_40k_cityscapes-20200621_220933.log.json + Paper: + Title: Context Encoding for Semantic Segmentation + URL: https://arxiv.org/abs/1803.08904 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/enc_head.py#L63 + Framework: PyTorch +- Name: encnet_r50-d8_4xb2-40k_cityscapes-769x769 + In Collection: EncNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 76.24 + mIoU(ms+flip): 77.85 + Config: configs/encnet/encnet_r50-d8_4xb2-40k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - EncNet + Training Resources: 4x V100 GPUS + Memory (GB): 9.8 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_769x769_40k_cityscapes/encnet_r50-d8_769x769_40k_cityscapes_20200621_220958-3bcd2884.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_769x769_40k_cityscapes/encnet_r50-d8_769x769_40k_cityscapes-20200621_220958.log.json + Paper: + 
Title: Context Encoding for Semantic Segmentation + URL: https://arxiv.org/abs/1803.08904 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/enc_head.py#L63 + Framework: PyTorch +- Name: encnet_r101-d8_4xb2-40k_cityscapes-769x769 + In Collection: EncNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 74.25 + mIoU(ms+flip): 76.25 + Config: configs/encnet/encnet_r101-d8_4xb2-40k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - EncNet + Training Resources: 4x V100 GPUS + Memory (GB): 13.7 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_769x769_40k_cityscapes/encnet_r101-d8_769x769_40k_cityscapes_20200621_220933-2fafed55.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_769x769_40k_cityscapes/encnet_r101-d8_769x769_40k_cityscapes-20200621_220933.log.json + Paper: + Title: Context Encoding for Semantic Segmentation + URL: https://arxiv.org/abs/1803.08904 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/enc_head.py#L63 + Framework: PyTorch +- Name: encnet_r50-d8_4xb2-80k_cityscapes-512x1024 + In Collection: EncNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 77.94 + mIoU(ms+flip): 79.13 + Config: configs/encnet/encnet_r50-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - EncNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_512x1024_80k_cityscapes/encnet_r50-d8_512x1024_80k_cityscapes_20200622_003554-fc5c5624.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_512x1024_80k_cityscapes/encnet_r50-d8_512x1024_80k_cityscapes-20200622_003554.log.json + Paper: + Title: Context Encoding for Semantic Segmentation + URL: https://arxiv.org/abs/1803.08904 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/enc_head.py#L63 + Framework: PyTorch +- Name: encnet_r101-d8_4xb2-80k_cityscapes-512x1024 + In Collection: EncNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.55 + mIoU(ms+flip): 79.47 + Config: configs/encnet/encnet_r101-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - EncNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_512x1024_80k_cityscapes/encnet_r101-d8_512x1024_80k_cityscapes_20200622_003555-1de64bec.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_512x1024_80k_cityscapes/encnet_r101-d8_512x1024_80k_cityscapes-20200622_003555.log.json + Paper: + Title: Context Encoding for Semantic Segmentation + URL: https://arxiv.org/abs/1803.08904 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/enc_head.py#L63 + Framework: PyTorch +- Name: encnet_r50-d8_4xb2-80k_cityscapes-769x769 + In Collection: EncNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 77.44 + mIoU(ms+flip): 78.72 + Config: configs/encnet/encnet_r50-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - EncNet + Training Resources: 4x V100 GPUS + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_769x769_80k_cityscapes/encnet_r50-d8_769x769_80k_cityscapes_20200622_003554-55096dcb.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_769x769_80k_cityscapes/encnet_r50-d8_769x769_80k_cityscapes-20200622_003554.log.json + Paper: + Title: Context Encoding for Semantic Segmentation + URL: https://arxiv.org/abs/1803.08904 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/enc_head.py#L63 + Framework: PyTorch +- Name: encnet_r101-d8_4xb2-80k_cityscapes-769x769 + In Collection: EncNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 76.1 + mIoU(ms+flip): 76.97 + Config: configs/encnet/encnet_r101-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - EncNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_769x769_80k_cityscapes/encnet_r101-d8_769x769_80k_cityscapes_20200622_003555-470ef79d.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_769x769_80k_cityscapes/encnet_r101-d8_769x769_80k_cityscapes-20200622_003555.log.json + Paper: + Title: Context Encoding for Semantic Segmentation + URL: https://arxiv.org/abs/1803.08904 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/enc_head.py#L63 + Framework: PyTorch +- Name: encnet_r50-d8_4xb4-80k_ade20k-512x512 + In Collection: EncNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 39.53 + mIoU(ms+flip): 41.17 + Config: configs/encnet/encnet_r50-d8_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D8 + - EncNet + Training Resources: 4x V100 GPUS + Memory (GB): 10.1 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_512x512_80k_ade20k/encnet_r50-d8_512x512_80k_ade20k_20200622_042412-44b46b04.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_512x512_80k_ade20k/encnet_r50-d8_512x512_80k_ade20k-20200622_042412.log.json + Paper: + Title: Context Encoding for Semantic Segmentation + URL: https://arxiv.org/abs/1803.08904 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/enc_head.py#L63 + Framework: PyTorch +- Name: encnet_r101-d8_4xb4-80k_ade20k-512x512 + In Collection: EncNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 42.11 + mIoU(ms+flip): 43.61 + Config: configs/encnet/encnet_r101-d8_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-101-D8 + - EncNet + Training Resources: 4x V100 GPUS + Memory (GB): 13.6 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_512x512_80k_ade20k/encnet_r101-d8_512x512_80k_ade20k_20200622_101128-dd35e237.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_512x512_80k_ade20k/encnet_r101-d8_512x512_80k_ade20k-20200622_101128.log.json + Paper: + Title: Context Encoding for Semantic Segmentation + URL: https://arxiv.org/abs/1803.08904 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/enc_head.py#L63 + Framework: PyTorch +- Name: encnet_r50-d8_4xb4-160k_ade20k-512x512 + In Collection: EncNet + Results: + Task: Semantic Segmentation + 
Dataset: ADE20K + Metrics: + mIoU: 40.1 + mIoU(ms+flip): 41.71 + Config: configs/encnet/encnet_r50-d8_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D8 + - EncNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_512x512_160k_ade20k/encnet_r50-d8_512x512_160k_ade20k_20200622_101059-b2db95e0.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r50-d8_512x512_160k_ade20k/encnet_r50-d8_512x512_160k_ade20k-20200622_101059.log.json + Paper: + Title: Context Encoding for Semantic Segmentation + URL: https://arxiv.org/abs/1803.08904 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/enc_head.py#L63 + Framework: PyTorch +- Name: encnet_r101-d8_4xb4-160k_ade20k-512x512 + In Collection: EncNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 42.61 + mIoU(ms+flip): 44.01 + Config: configs/encnet/encnet_r101-d8_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-101-D8 + - EncNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_512x512_160k_ade20k/encnet_r101-d8_512x512_160k_ade20k_20200622_073348-7989641f.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/encnet/encnet_r101-d8_512x512_160k_ade20k/encnet_r101-d8_512x512_160k_ade20k-20200622_073348.log.json + Paper: + Title: Context Encoding for Semantic Segmentation + URL: https://arxiv.org/abs/1803.08904 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/enc_head.py#L63 + Framework: PyTorch diff --git a/configs/erfnet/README.md b/configs/erfnet/README.md index bcb61d3d6f..55d71973a3 100644 --- a/configs/erfnet/README.md +++ b/configs/erfnet/README.md @@ -1,6 +1,6 @@ # ERFNet -[ERFNet: Efficient Residual Factorized ConvNet for Real-time Semantic Segmentation](http://www.robesafe.uah.es/personal/eduardo.romera/pdfs/Romera17tits.pdf) +> [ERFNet: Efficient Residual Factorized ConvNet for Real-time Semantic Segmentation](http://www.robesafe.uah.es/personal/eduardo.romera/pdfs/Romera17tits.pdf) ## Introduction @@ -22,6 +22,22 @@ Semantic segmentation is a challenging task that addresses most of the perceptio +## Results and models + +### Cityscapes + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ---: | ------------- | --------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| ERFNet | ERFNet | 512x1024 | 160000 | 6.04 | 15.26 | V100 | 72.5 | 74.75 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/erfnet/erfnet_fcn_4xb4-160k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/erfnet/erfnet_fcn_4x4_512x1024_160k_cityscapes/erfnet_fcn_4x4_512x1024_160k_cityscapes_20220704_162145-dc90157a.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/erfnet/erfnet_fcn_4x4_512x1024_160k_cityscapes/erfnet_fcn_4x4_512x1024_160k_cityscapes_20220704_162145.log.json) | + +Note: + +- The model is trained from scratch. + +- The last deconvolution layer in the [original implementation](https://github.com/Eromera/erfnet_pytorch/blob/master/train/erfnet.py#L123) is replaced by a naive `FCNHead` decoder head followed by a bilinear upsampling layer, which we found more effective and efficient. + +- The performance of this model is sensitive to the random seed used; please refer to the log file for the exact seed setting. Training with a different seed may yield results that differ from those reported in the table. + ## Citation ```bibtex @@ -36,17 +52,3 @@ Semantic segmentation is a challenging task that addresses most of the perceptio publisher={IEEE} } ``` - -## Results and models - -### Cityscapes - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------- | ---------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| ERFNet | ERFNet | 512x1024 | 160000 | 6.04 | 15.26 | 71.08 | 72.6 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/erfnet/erfnet_fcn_4x4_512x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/erfnet/erfnet_fcn_4x4_512x1024_160k_cityscapes/erfnet_fcn_4x4_512x1024_160k_cityscapes_20211126_082056-03d333ed.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/erfnet/erfnet_fcn_4x4_512x1024_160k_cityscapes/erfnet_fcn_4x4_512x1024_160k_cityscapes_20211126_082056.log.json) | - -Note: - -- The model is trained from scratch. - -- Last deconvolution layer in the [original paper](https://github.com/Eromera/erfnet_pytorch/blob/master/train/erfnet.py#L123) is replaced by a naive `FCNHead` decoder head and a bilinear upsampling layer, found more effective and efficient.
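For reference, the ERFNet checkpoint listed in the updated table can be smoke-tested with the high-level `mmseg` Python API. The snippet below is a minimal sketch, not part of this patch; it assumes MMSegmentation 1.x (where `mmseg.apis` provides `init_model` and `inference_model`), an available CUDA device, and a local test image `demo.png`. The config path and checkpoint URL are copied from the table above.

```python
# Minimal inference sketch for the ERFNet checkpoint listed above.
# Assumptions: MMSegmentation 1.x is installed and `demo.png` exists locally.
from mmseg.apis import inference_model, init_model

config_file = 'configs/erfnet/erfnet_fcn_4xb4-160k_cityscapes-512x1024.py'
checkpoint_url = (
    'https://download.openmmlab.com/mmsegmentation/v0.5/erfnet/'
    'erfnet_fcn_4x4_512x1024_160k_cityscapes/'
    'erfnet_fcn_4x4_512x1024_160k_cityscapes_20220704_162145-dc90157a.pth')

# Build the model and load the released weights (downloaded on first use).
model = init_model(config_file, checkpoint_url, device='cuda:0')

# Run single-image inference; the returned `SegDataSample` carries the
# per-pixel class indices in `result.pred_sem_seg`.
result = inference_model(model, 'demo.png')
print(result.pred_sem_seg.data.shape)
```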
diff --git a/configs/erfnet/erfnet.yml b/configs/erfnet/erfnet.yml deleted file mode 100644 index e4c34f9c5b..0000000000 --- a/configs/erfnet/erfnet.yml +++ /dev/null @@ -1,37 +0,0 @@ -Collections: -- Name: ERFNet - Metadata: - Training Data: - - Cityscapes - Paper: - URL: http://www.robesafe.uah.es/personal/eduardo.romera/pdfs/Romera17tits.pdf - Title: 'ERFNet: Efficient Residual Factorized ConvNet for Real-time Semantic Segmentation' - README: configs/erfnet/README.md - Code: - URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.20.0/mmseg/models/backbones/erfnet.py#L321 - Version: v0.20.0 - Converted From: - Code: https://github.com/Eromera/erfnet_pytorch -Models: -- Name: erfnet_fcn_4x4_512x1024_160k_cityscapes - In Collection: ERFNet - Metadata: - backbone: ERFNet - crop size: (512,1024) - lr schd: 160000 - inference time (ms/im): - - value: 65.53 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 6.04 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 71.08 - mIoU(ms+flip): 72.6 - Config: configs/erfnet/erfnet_fcn_4x4_512x1024_160k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/erfnet/erfnet_fcn_4x4_512x1024_160k_cityscapes/erfnet_fcn_4x4_512x1024_160k_cityscapes_20211126_082056-03d333ed.pth diff --git a/configs/erfnet/erfnet_fcn_4x4_512x1024_160k_cityscapes.py b/configs/erfnet/erfnet_fcn_4xb4-160k_cityscapes-512x1024.py similarity index 100% rename from configs/erfnet/erfnet_fcn_4x4_512x1024_160k_cityscapes.py rename to configs/erfnet/erfnet_fcn_4xb4-160k_cityscapes-512x1024.py diff --git a/configs/erfnet/metafile.yaml b/configs/erfnet/metafile.yaml new file mode 100644 index 0000000000..bf514124ee --- /dev/null +++ b/configs/erfnet/metafile.yaml @@ -0,0 +1,37 @@ +Collections: +- Name: ERFNet + License: Apache License 2.0 + Metadata: + Training Data: + - Cityscapes + Paper: + Title: 'ERFNet: Efficient Residual Factorized ConvNet for Real-time Semantic Segmentation' + URL: http://www.robesafe.uah.es/personal/eduardo.romera/pdfs/Romera17tits.pdf + README: configs/erfnet/README.md + Frameworks: + - PyTorch +Models: +- Name: erfnet_fcn_4xb4-160k_cityscapes-512x1024 + In Collection: ERFNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 72.5 + mIoU(ms+flip): 74.75 + Config: configs/erfnet/erfnet_fcn_4xb4-160k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 16 + Architecture: + - ERFNet + - ERFNet + Training Resources: 4x V100 GPUS + Memory (GB): 6.04 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/erfnet/erfnet_fcn_4x4_512x1024_160k_cityscapes/erfnet_fcn_4x4_512x1024_160k_cityscapes_20220704_162145-dc90157a.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/erfnet/erfnet_fcn_4x4_512x1024_160k_cityscapes/erfnet_fcn_4x4_512x1024_160k_cityscapes_20220704_162145.log.json + Paper: + Title: 'ERFNet: Efficient Residual Factorized ConvNet for Real-time Semantic Segmentation' + URL: http://www.robesafe.uah.es/personal/eduardo.romera/pdfs/Romera17tits.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.20.0/mmseg/models/backbones/erfnet.py#L321 + Framework: PyTorch diff --git a/configs/fastfcn/README.md b/configs/fastfcn/README.md index d772bd2402..48644e57e3 100644 --- a/configs/fastfcn/README.md +++ b/configs/fastfcn/README.md @@ -1,6 +1,6 @@ # FastFCN -[FastFCN: Rethinking Dilated Convolution in the Backbone for Semantic 
Segmentation](https://arxiv.org/abs/1903.11816) +> [FastFCN: Rethinking Dilated Convolution in the Backbone for Semantic Segmentation](https://arxiv.org/abs/1903.11816) ## Introduction @@ -22,42 +22,42 @@ Modern approaches for semantic segmentation usually employ dilated convolutions -## Citation - -```bibtex -@article{wu2019fastfcn, -title={Fastfcn: Rethinking dilated convolution in the backbone for semantic segmentation}, -author={Wu, Huikai and Zhang, Junge and Huang, Kaiqi and Liang, Kongming and Yu, Yizhou}, -journal={arXiv preprint arXiv:1903.11816}, -year={2019} -} -``` - ## Results and models ### Cityscapes -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------------------------- | -------- | --------- | ------: | -------- | -------------- | ----: | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| FastFCN + DeepLabV3 | R-50-D32 | 512x1024 | 80000 | 5.67 | 2.64 | 79.12 | 80.58 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fastfcn/fastfcn_r50-d32_jpu_aspp_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_aspp_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_aspp_512x1024_80k_cityscapes_20210928_053722-5d1a2648.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_aspp_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_aspp_512x1024_80k_cityscapes_20210928_053722.log.json) | -| FastFCN + DeepLabV3 (4x4) | R-50-D32 | 512x1024 | 80000 | 9.79 | - | 79.52 | 80.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fastfcn/fastfcn_r50-d32_jpu_aspp_4x4_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_aspp_4x4_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_aspp_4x4_512x1024_80k_cityscapes_20210924_214357-72220849.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_aspp_4x4_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_aspp_4x4_512x1024_80k_cityscapes_20210924_214357.log.json) | -| FastFCN + PSPNet | R-50-D32 | 512x1024 | 80000 | 5.67 | 4.40 | 79.26 | 80.86 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fastfcn/fastfcn_r50-d32_jpu_psp_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_psp_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_psp_512x1024_80k_cityscapes_20210928_053722-57749bed.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_psp_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_psp_512x1024_80k_cityscapes_20210928_053722.log.json) | -| FastFCN + PSPNet (4x4) | R-50-D32 | 512x1024 | 80000 | 9.94 | - | 78.76 | 80.03 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fastfcn/fastfcn_r50-d32_jpu_psp_4x4_512x1024_80k_cityscapes.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_psp_4x4_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_psp_4x4_512x1024_80k_cityscapes_20210925_061841-77e87b0a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_psp_4x4_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_psp_4x4_512x1024_80k_cityscapes_20210925_061841.log.json) | -| FastFCN + EncNet | R-50-D32 | 512x1024 | 80000 | 8.15 | 4.77 | 77.97 | 79.92 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fastfcn/fastfcn_r50-d32_jpu_enc_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_enc_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_enc_512x1024_80k_cityscapes_20210928_030036-78da5046.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_enc_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_enc_512x1024_80k_cityscapes_20210928_030036.log.json) | -| FastFCN + EncNet (4x4) | R-50-D32 | 512x1024 | 80000 | 15.45 | - | 78.6 | 80.25 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fastfcn/fastfcn_r50-d32_jpu_enc_4x4_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_enc_4x4_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_enc_4x4_512x1024_80k_cityscapes_20210926_093217-e1eb6dbb.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_enc_4x4_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_enc_4x4_512x1024_80k_cityscapes_20210926_093217.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------------------- | -------------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------- | ----------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| FastFCN + DeepLabV3 | R-50-D32 | 512x1024 | 80000 | 5.67 | 2.64 | V100 | 79.12 | 80.58 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fastfcn/fastfcn_r50-d32_jpu_aspp_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_aspp_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_aspp_512x1024_80k_cityscapes_20210928_053722-5d1a2648.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_aspp_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_aspp_512x1024_80k_cityscapes_20210928_053722.log.json) | +| FastFCN + DeepLabV3 | R-50-D32 (4x4) | 512x1024 | 80000 | 9.79 | - | V100 | 79.52 | 80.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fastfcn/fastfcn_r50-d32_jpu_aspp_4xb4-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_aspp_4x4_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_aspp_4x4_512x1024_80k_cityscapes_20210924_214357-72220849.pth) \|
[log](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_aspp_4x4_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_aspp_4x4_512x1024_80k_cityscapes_20210924_214357.log.json) | +| FastFCN + PSPNet | R-50-D32 | 512x1024 | 80000 | 5.67 | 4.40 | V100 | 79.26 | 80.86 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fastfcn/fastfcn_r50-d32_jpu_psp_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_psp_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_psp_512x1024_80k_cityscapes_20210928_053722-57749bed.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_psp_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_psp_512x1024_80k_cityscapes_20210928_053722.log.json) | +| FastFCN + PSPNet | R-50-D32 (4x4) | 512x1024 | 80000 | 9.94 | - | V100 | 78.76 | 80.03 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fastfcn/fastfcn_r50-d32_jpu_psp_4xb4-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_psp_4x4_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_psp_4x4_512x1024_80k_cityscapes_20210925_061841-77e87b0a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_psp_4x4_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_psp_4x4_512x1024_80k_cityscapes_20210925_061841.log.json) | +| FastFCN + EncNet | R-50-D32 | 512x1024 | 80000 | 8.15 | 4.77 | V100 | 77.97 | 79.92 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fastfcn/fastfcn_r50-d32_jpu_enc_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_enc_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_enc_512x1024_80k_cityscapes_20210928_030036-78da5046.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_enc_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_enc_512x1024_80k_cityscapes_20210928_030036.log.json) | +| FastFCN + EncNet | R-50-D32 (4x4) | 512x1024 | 80000 | 15.45 | - | V100 | 78.6 | 80.25 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fastfcn/fastfcn_r50-d32_jpu_enc_4xb4-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_enc_4x4_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_enc_4x4_512x1024_80k_cityscapes_20210926_093217-e1eb6dbb.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_enc_4x4_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_enc_4x4_512x1024_80k_cityscapes_20210926_093217.log.json) | ### ADE20K -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------------------- | -------- | --------- | ------: | -------- | -------------- | ----: | ------------- | ---------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| FastFCN + DeepLabV3 | R-50-D32 | 512x1024 | 80000 | 8.46 | 12.06 | 41.88 | 42.91 |
[config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fastfcn/fastfcn_r50-d32_jpu_aspp_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_aspp_512x512_80k_ade20k/fastfcn_r50-d32_jpu_aspp_512x512_80k_ade20k_20211013_190619-3aa40f2d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_aspp_512x512_80k_ade20k/fastfcn_r50-d32_jpu_aspp_512x512_80k_ade20k_20211013_190619.log.json) | -| FastFCN + DeepLabV3 | R-50-D32 | 512x1024 | 160000 | - | - | 43.58 | 44.92 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fastfcn/fastfcn_r50-d32_jpu_aspp_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_aspp_512x512_160k_ade20k/fastfcn_r50-d32_jpu_aspp_512x512_160k_ade20k_20211008_152246-27036aee.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_aspp_512x512_160k_ade20k/fastfcn_r50-d32_jpu_aspp_512x512_160k_ade20k_20211008_152246.log.json) | -| FastFCN + PSPNet | R-50-D32 | 512x1024 | 80000 | 8.02 | 19.21 | 41.40 | 42.12 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fastfcn/fastfcn_r50-d32_jpu_psp_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_psp_512x512_80k_ade20k/fastfcn_r50-d32_jpu_psp_512x512_80k_ade20k_20210930_225137-993d07c8.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_psp_512x512_80k_ade20k/fastfcn_r50-d32_jpu_psp_512x512_80k_ade20k_20210930_225137.log.json) | -| FastFCN + PSPNet | R-50-D32 | 512x1024 | 160000 | - | - | 42.63 | 43.71 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fastfcn/fastfcn_r50-d32_jpu_psp_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_psp_512x512_160k_ade20k/fastfcn_r50-d32_jpu_psp_512x512_160k_ade20k_20211008_105455-e8f5a2fd.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_psp_512x512_160k_ade20k/fastfcn_r50-d32_jpu_psp_512x512_160k_ade20k_20211008_105455.log.json) | -| FastFCN + EncNet | R-50-D32 | 512x1024 | 80000 | 9.67 | 17.23 | 40.88 | 42.36 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fastfcn/fastfcn_r50-d32_jpu_enc_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_enc_512x512_80k_ade20k/fastfcn_r50-d32_jpu_enc_512x512_80k_ade20k_20210930_225214-65aef6dd.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_enc_512x512_80k_ade20k/fastfcn_r50-d32_jpu_enc_512x512_80k_ade20k_20210930_225214.log.json) | -| FastFCN + EncNet | R-50-D32 | 512x1024 | 160000 | - | - | 42.50 | 44.21 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fastfcn/fastfcn_r50-d32_jpu_enc_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_enc_512x512_160k_ade20k/fastfcn_r50-d32_jpu_enc_512x512_160k_ade20k_20211008_105456-d875ce3c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_enc_512x512_160k_ade20k/fastfcn_r50-d32_jpu_enc_512x512_160k_ade20k_20211008_105456.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------------------- | 
-------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------- | ------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| FastFCN + DeepLabV3 | R-50-D32 | 512x512 | 80000 | 8.46 | 12.06 | V100 | 41.88 | 42.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fastfcn/fastfcn_r50-d32_jpu_aspp_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_aspp_512x512_80k_ade20k/fastfcn_r50-d32_jpu_aspp_512x512_80k_ade20k_20211013_190619-3aa40f2d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_aspp_512x512_80k_ade20k/fastfcn_r50-d32_jpu_aspp_512x512_80k_ade20k_20211013_190619.log.json) | +| FastFCN + DeepLabV3 | R-50-D32 | 512x512 | 160000 | - | - | V100 | 43.58 | 44.92 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fastfcn/fastfcn_r50-d32_jpu_aspp_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_aspp_512x512_160k_ade20k/fastfcn_r50-d32_jpu_aspp_512x512_160k_ade20k_20211008_152246-27036aee.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_aspp_512x512_160k_ade20k/fastfcn_r50-d32_jpu_aspp_512x512_160k_ade20k_20211008_152246.log.json) | +| FastFCN + PSPNet | R-50-D32 | 512x512 | 80000 | 8.02 | 19.21 | V100 | 41.40 | 42.12 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fastfcn/fastfcn_r50-d32_jpu_psp_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_psp_512x512_80k_ade20k/fastfcn_r50-d32_jpu_psp_512x512_80k_ade20k_20210930_225137-993d07c8.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_psp_512x512_80k_ade20k/fastfcn_r50-d32_jpu_psp_512x512_80k_ade20k_20210930_225137.log.json) | +| FastFCN + PSPNet | R-50-D32 | 512x512 | 160000 | - | - | V100 | 42.63 | 43.71 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fastfcn/fastfcn_r50-d32_jpu_psp_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_psp_512x512_160k_ade20k/fastfcn_r50-d32_jpu_psp_512x512_160k_ade20k_20211008_105455-e8f5a2fd.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_psp_512x512_160k_ade20k/fastfcn_r50-d32_jpu_psp_512x512_160k_ade20k_20211008_105455.log.json) | +| FastFCN + EncNet | R-50-D32 | 512x512 | 80000 | 9.67 | 17.23 | V100 | 40.88 | 42.36 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fastfcn/fastfcn_r50-d32_jpu_enc_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_enc_512x512_80k_ade20k/fastfcn_r50-d32_jpu_enc_512x512_80k_ade20k_20210930_225214-65aef6dd.pth) \|
[log](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_enc_512x512_80k_ade20k/fastfcn_r50-d32_jpu_enc_512x512_80k_ade20k_20210930_225214.log.json) | +| FastFCN + EncNet | R-50-D32 | 512x512 | 160000 | - | - | V100 | 42.50 | 44.21 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fastfcn/fastfcn_r50-d32_jpu_enc_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_enc_512x512_160k_ade20k/fastfcn_r50-d32_jpu_enc_512x512_160k_ade20k_20211008_105456-d875ce3c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_enc_512x512_160k_ade20k/fastfcn_r50-d32_jpu_enc_512x512_160k_ade20k_20211008_105456.log.json) | Note: - `4x4` means 4 GPUs with 4 samples per GPU during training; the default setting is 4 GPUs with 2 samples per GPU. -- Results of [DeepLabV3 (mIoU: 79.32)](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3), [PSPNet (mIoU: 78.55)](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet) and [ENCNet (mIoU: 77.94)](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/encnet) can be found in each original repository. +- Results of [DeepLabV3 (mIoU: 79.32)](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3), [PSPNet (mIoU: 78.55)](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet) and [EncNet (mIoU: 77.94)](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/encnet) can be found in each original repository. + +## Citation + +```bibtex +@article{wu2019fastfcn, +title={Fastfcn: Rethinking dilated convolution in the backbone for semantic segmentation}, +author={Wu, Huikai and Zhang, Junge and Huang, Kaiqi and Liang, Kongming and Yu, Yizhou}, +journal={arXiv preprint arXiv:1903.11816}, +year={2019} +} +``` diff --git a/configs/fastfcn/fastfcn.yml deleted file mode 100644 index 6fdc556588..0000000000 --- a/configs/fastfcn/fastfcn.yml +++ /dev/null @@ -1,235 +0,0 @@ -Collections: -- Name: FastFCN - Metadata: - Training Data: - - Cityscapes - - ADE20K - Paper: - URL: https://arxiv.org/abs/1903.11816 - Title: 'FastFCN: Rethinking Dilated Convolution in the Backbone for Semantic Segmentation' - README: configs/fastfcn/README.md - Code: - URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/necks/jpu.py#L12 - Version: v0.18.0 - Converted From: - Code: https://github.com/wuhuikai/FastFCN -Models: -- Name: fastfcn_r50-d32_jpu_aspp_512x1024_80k_cityscapes - In Collection: FastFCN - Metadata: - backbone: R-50-D32 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 378.79 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 5.67 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.12 - mIoU(ms+flip): 80.58 - Config: configs/fastfcn/fastfcn_r50-d32_jpu_aspp_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_aspp_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_aspp_512x1024_80k_cityscapes_20210928_053722-5d1a2648.pth -- Name: fastfcn_r50-d32_jpu_aspp_4x4_512x1024_80k_cityscapes - In Collection: FastFCN - Metadata: - backbone: R-50-D32 - crop size: (512,1024) - lr schd: 80000 - Training Memory (GB): 9.79 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.52 - mIoU(ms+flip):
80.91 - Config: configs/fastfcn/fastfcn_r50-d32_jpu_aspp_4x4_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_aspp_4x4_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_aspp_4x4_512x1024_80k_cityscapes_20210924_214357-72220849.pth -- Name: fastfcn_r50-d32_jpu_psp_512x1024_80k_cityscapes - In Collection: FastFCN - Metadata: - backbone: R-50-D32 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 227.27 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 5.67 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.26 - mIoU(ms+flip): 80.86 - Config: configs/fastfcn/fastfcn_r50-d32_jpu_psp_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_psp_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_psp_512x1024_80k_cityscapes_20210928_053722-57749bed.pth -- Name: fastfcn_r50-d32_jpu_psp_4x4_512x1024_80k_cityscapes - In Collection: FastFCN - Metadata: - backbone: R-50-D32 - crop size: (512,1024) - lr schd: 80000 - Training Memory (GB): 9.94 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.76 - mIoU(ms+flip): 80.03 - Config: configs/fastfcn/fastfcn_r50-d32_jpu_psp_4x4_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_psp_4x4_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_psp_4x4_512x1024_80k_cityscapes_20210925_061841-77e87b0a.pth -- Name: fastfcn_r50-d32_jpu_enc_512x1024_80k_cityscapes - In Collection: FastFCN - Metadata: - backbone: R-50-D32 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 209.64 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 8.15 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 77.97 - mIoU(ms+flip): 79.92 - Config: configs/fastfcn/fastfcn_r50-d32_jpu_enc_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_enc_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_enc_512x1024_80k_cityscapes_20210928_030036-78da5046.pth -- Name: fastfcn_r50-d32_jpu_enc_4x4_512x1024_80k_cityscapes - In Collection: FastFCN - Metadata: - backbone: R-50-D32 - crop size: (512,1024) - lr schd: 80000 - Training Memory (GB): 15.45 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.6 - mIoU(ms+flip): 80.25 - Config: configs/fastfcn/fastfcn_r50-d32_jpu_enc_4x4_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_enc_4x4_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_enc_4x4_512x1024_80k_cityscapes_20210926_093217-e1eb6dbb.pth -- Name: fastfcn_r50-d32_jpu_aspp_512x512_80k_ade20k - In Collection: FastFCN - Metadata: - backbone: R-50-D32 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 82.92 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 8.46 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 41.88 - mIoU(ms+flip): 42.91 - Config: configs/fastfcn/fastfcn_r50-d32_jpu_aspp_512x512_80k_ade20k.py - Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_aspp_512x512_80k_ade20k/fastfcn_r50-d32_jpu_aspp_512x512_80k_ade20k_20211013_190619-3aa40f2d.pth -- Name: fastfcn_r50-d32_jpu_aspp_512x512_160k_ade20k - In Collection: FastFCN - Metadata: - backbone: R-50-D32 - crop size: (512,1024) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 43.58 - mIoU(ms+flip): 44.92 - Config: configs/fastfcn/fastfcn_r50-d32_jpu_aspp_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_aspp_512x512_160k_ade20k/fastfcn_r50-d32_jpu_aspp_512x512_160k_ade20k_20211008_152246-27036aee.pth -- Name: fastfcn_r50-d32_jpu_psp_512x512_80k_ade20k - In Collection: FastFCN - Metadata: - backbone: R-50-D32 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 52.06 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 8.02 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 41.4 - mIoU(ms+flip): 42.12 - Config: configs/fastfcn/fastfcn_r50-d32_jpu_psp_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_psp_512x512_80k_ade20k/fastfcn_r50-d32_jpu_psp_512x512_80k_ade20k_20210930_225137-993d07c8.pth -- Name: fastfcn_r50-d32_jpu_psp_512x512_160k_ade20k - In Collection: FastFCN - Metadata: - backbone: R-50-D32 - crop size: (512,1024) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 42.63 - mIoU(ms+flip): 43.71 - Config: configs/fastfcn/fastfcn_r50-d32_jpu_psp_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_psp_512x512_160k_ade20k/fastfcn_r50-d32_jpu_psp_512x512_160k_ade20k_20211008_105455-e8f5a2fd.pth -- Name: fastfcn_r50-d32_jpu_enc_512x512_80k_ade20k - In Collection: FastFCN - Metadata: - backbone: R-50-D32 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 58.04 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 9.67 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 40.88 - mIoU(ms+flip): 42.36 - Config: configs/fastfcn/fastfcn_r50-d32_jpu_enc_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_enc_512x512_80k_ade20k/fastfcn_r50-d32_jpu_enc_512x512_80k_ade20k_20210930_225214-65aef6dd.pth -- Name: fastfcn_r50-d32_jpu_enc_512x512_160k_ade20k - In Collection: FastFCN - Metadata: - backbone: R-50-D32 - crop size: (512,1024) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 42.5 - mIoU(ms+flip): 44.21 - Config: configs/fastfcn/fastfcn_r50-d32_jpu_enc_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_enc_512x512_160k_ade20k/fastfcn_r50-d32_jpu_enc_512x512_160k_ade20k_20211008_105456-d875ce3c.pth diff --git a/configs/fastfcn/fastfcn_r50-d32_jpu_aspp_4x4_512x1024_80k_cityscapes.py b/configs/fastfcn/fastfcn_r50-d32_jpu_aspp_4x4_512x1024_80k_cityscapes.py deleted file mode 100644 index 6fbca14bac..0000000000 --- a/configs/fastfcn/fastfcn_r50-d32_jpu_aspp_4x4_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,5 +0,0 @@ -# model settings -_base_ = './fastfcn_r50-d32_jpu_aspp_512x1024_80k_cityscapes.py' -train_dataloader = dict(batch_size=4, 
num_workers=4) -val_dataloader = dict(batch_size=1, num_workers=4) -test_dataloader = val_dataloader diff --git a/configs/fastfcn/fastfcn_r50-d32_jpu_aspp_4xb2-80k_cityscapes-512x1024.py b/configs/fastfcn/fastfcn_r50-d32_jpu_aspp_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..39e6e236b7 --- /dev/null +++ b/configs/fastfcn/fastfcn_r50-d32_jpu_aspp_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,20 @@ +# model settings +_base_ = './fastfcn_r50-d32_jpu_psp_4xb2-80k_cityscapes-512x1024.py' +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + decode_head=dict( + _delete_=True, + type='ASPPHead', + in_channels=2048, + in_index=2, + channels=512, + dilations=(1, 12, 24, 36), + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/configs/fastfcn/fastfcn_r50-d32_jpu_aspp_4xb4-160k_ade20k-512x512.py b/configs/fastfcn/fastfcn_r50-d32_jpu_aspp_4xb4-160k_ade20k-512x512.py new file mode 100644 index 0000000000..1913544cfb --- /dev/null +++ b/configs/fastfcn/fastfcn_r50-d32_jpu_aspp_4xb4-160k_ade20k-512x512.py @@ -0,0 +1,20 @@ +# model settings +_base_ = './fastfcn_r50-d32_jpu_psp_4xb4-160k_ade20k-512x512.py' +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + decode_head=dict( + _delete_=True, + type='ASPPHead', + in_channels=2048, + in_index=2, + channels=512, + dilations=(1, 12, 24, 36), + dropout_ratio=0.1, + num_classes=150, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/configs/fastfcn/fastfcn_r50-d32_jpu_aspp_4xb4-80k_ade20k-512x512.py b/configs/fastfcn/fastfcn_r50-d32_jpu_aspp_4xb4-80k_ade20k-512x512.py new file mode 100644 index 0000000000..751689599d --- /dev/null +++ b/configs/fastfcn/fastfcn_r50-d32_jpu_aspp_4xb4-80k_ade20k-512x512.py @@ -0,0 +1,20 @@ +# model settings +_base_ = './fastfcn_r50-d32_jpu_psp_4xb4-80k_ade20k-512x512.py' +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + decode_head=dict( + _delete_=True, + type='ASPPHead', + in_channels=2048, + in_index=2, + channels=512, + dilations=(1, 12, 24, 36), + dropout_ratio=0.1, + num_classes=150, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/configs/fastfcn/fastfcn_r50-d32_jpu_aspp_4xb4-80k_cityscapes-512x1024.py b/configs/fastfcn/fastfcn_r50-d32_jpu_aspp_4xb4-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..a8c5dc3232 --- /dev/null +++ b/configs/fastfcn/fastfcn_r50-d32_jpu_aspp_4xb4-80k_cityscapes-512x1024.py @@ -0,0 +1,5 @@ +# model settings +_base_ = './fastfcn_r50-d32_jpu_aspp_4xb2-80k_cityscapes-512x1024.py' +train_dataloader = dict(batch_size=4, num_workers=4) +val_dataloader = dict(batch_size=1, num_workers=4) +test_dataloader = val_dataloader diff --git a/configs/fastfcn/fastfcn_r50-d32_jpu_aspp_512x1024_80k_cityscapes.py b/configs/fastfcn/fastfcn_r50-d32_jpu_aspp_512x1024_80k_cityscapes.py deleted file mode 100644 index dc86da3b6f..0000000000 --- a/configs/fastfcn/fastfcn_r50-d32_jpu_aspp_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,20 +0,0 @@ -# model settings 
-_base_ = './fastfcn_r50-d32_jpu_psp_512x1024_80k_cityscapes.py' -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - decode_head=dict( - _delete_=True, - type='ASPPHead', - in_channels=2048, - in_index=2, - channels=512, - dilations=(1, 12, 24, 36), - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) diff --git a/configs/fastfcn/fastfcn_r50-d32_jpu_aspp_512x512_160k_ade20k.py b/configs/fastfcn/fastfcn_r50-d32_jpu_aspp_512x512_160k_ade20k.py deleted file mode 100644 index dbf9f80272..0000000000 --- a/configs/fastfcn/fastfcn_r50-d32_jpu_aspp_512x512_160k_ade20k.py +++ /dev/null @@ -1,20 +0,0 @@ -# model settings -_base_ = './fastfcn_r50-d32_jpu_psp_512x512_160k_ade20k.py' -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - decode_head=dict( - _delete_=True, - type='ASPPHead', - in_channels=2048, - in_index=2, - channels=512, - dilations=(1, 12, 24, 36), - dropout_ratio=0.1, - num_classes=150, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) diff --git a/configs/fastfcn/fastfcn_r50-d32_jpu_aspp_512x512_80k_ade20k.py b/configs/fastfcn/fastfcn_r50-d32_jpu_aspp_512x512_80k_ade20k.py deleted file mode 100644 index b14b1f68c7..0000000000 --- a/configs/fastfcn/fastfcn_r50-d32_jpu_aspp_512x512_80k_ade20k.py +++ /dev/null @@ -1,20 +0,0 @@ -# model settings -_base_ = './fastfcn_r50-d32_jpu_psp_512x512_80k_ade20k.py' -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - decode_head=dict( - _delete_=True, - type='ASPPHead', - in_channels=2048, - in_index=2, - channels=512, - dilations=(1, 12, 24, 36), - dropout_ratio=0.1, - num_classes=150, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) diff --git a/configs/fastfcn/fastfcn_r50-d32_jpu_enc_4x4_512x1024_80k_cityscapes.py b/configs/fastfcn/fastfcn_r50-d32_jpu_enc_4x4_512x1024_80k_cityscapes.py deleted file mode 100644 index 839d540377..0000000000 --- a/configs/fastfcn/fastfcn_r50-d32_jpu_enc_4x4_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,5 +0,0 @@ -# model settings -_base_ = './fastfcn_r50-d32_jpu_enc_512x1024_80k_cityscapes.py' -train_dataloader = dict(batch_size=4, num_workers=4) -val_dataloader = dict(batch_size=1, num_workers=4) -test_dataloader = val_dataloader diff --git a/configs/fastfcn/fastfcn_r50-d32_jpu_enc_4xb2-80k_cityscapes-512x1024.py b/configs/fastfcn/fastfcn_r50-d32_jpu_enc_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..4840dd0287 --- /dev/null +++ b/configs/fastfcn/fastfcn_r50-d32_jpu_enc_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,24 @@ +# model settings +_base_ = './fastfcn_r50-d32_jpu_psp_4xb2-80k_cityscapes-512x1024.py' +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + decode_head=dict( + _delete_=True, + type='EncHead', + in_channels=[512, 1024, 2048], + in_index=(0, 1, 2), + channels=512, + num_codes=32, + use_se_loss=True, + add_lateral=False, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, 
loss_weight=1.0), + loss_se_decode=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.2)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/configs/fastfcn/fastfcn_r50-d32_jpu_enc_4xb4-160k_ade20k-512x512.py b/configs/fastfcn/fastfcn_r50-d32_jpu_enc_4xb4-160k_ade20k-512x512.py new file mode 100644 index 0000000000..619d0862f1 --- /dev/null +++ b/configs/fastfcn/fastfcn_r50-d32_jpu_enc_4xb4-160k_ade20k-512x512.py @@ -0,0 +1,24 @@ +# model settings +_base_ = './fastfcn_r50-d32_jpu_psp_4xb4-160k_ade20k-512x512.py' +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + decode_head=dict( + _delete_=True, + type='EncHead', + in_channels=[512, 1024, 2048], + in_index=(0, 1, 2), + channels=512, + num_codes=32, + use_se_loss=True, + add_lateral=False, + dropout_ratio=0.1, + num_classes=150, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_se_decode=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.2)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/configs/fastfcn/fastfcn_r50-d32_jpu_enc_4xb4-80k_ade20k-512x512.py b/configs/fastfcn/fastfcn_r50-d32_jpu_enc_4xb4-80k_ade20k-512x512.py new file mode 100644 index 0000000000..a76b026b6a --- /dev/null +++ b/configs/fastfcn/fastfcn_r50-d32_jpu_enc_4xb4-80k_ade20k-512x512.py @@ -0,0 +1,24 @@ +# model settings +_base_ = './fastfcn_r50-d32_jpu_psp_4xb4-80k_ade20k-512x512.py' +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + decode_head=dict( + _delete_=True, + type='EncHead', + in_channels=[512, 1024, 2048], + in_index=(0, 1, 2), + channels=512, + num_codes=32, + use_se_loss=True, + add_lateral=False, + dropout_ratio=0.1, + num_classes=150, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + loss_se_decode=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.2)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/configs/fastfcn/fastfcn_r50-d32_jpu_enc_4xb4-80k_cityscapes-512x1024.py b/configs/fastfcn/fastfcn_r50-d32_jpu_enc_4xb4-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..6df1527272 --- /dev/null +++ b/configs/fastfcn/fastfcn_r50-d32_jpu_enc_4xb4-80k_cityscapes-512x1024.py @@ -0,0 +1,5 @@ +# model settings +_base_ = './fastfcn_r50-d32_jpu_enc_4xb2-80k_cityscapes-512x1024.py' +train_dataloader = dict(batch_size=4, num_workers=4) +val_dataloader = dict(batch_size=1, num_workers=4) +test_dataloader = val_dataloader diff --git a/configs/fastfcn/fastfcn_r50-d32_jpu_enc_512x1024_80k_cityscapes.py b/configs/fastfcn/fastfcn_r50-d32_jpu_enc_512x1024_80k_cityscapes.py deleted file mode 100644 index cc68edfe5b..0000000000 --- a/configs/fastfcn/fastfcn_r50-d32_jpu_enc_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,24 +0,0 @@ -# model settings -_base_ = './fastfcn_r50-d32_jpu_psp_512x1024_80k_cityscapes.py' -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - decode_head=dict( - _delete_=True, - type='EncHead', - in_channels=[512, 1024, 2048], - in_index=(0, 1, 2), - channels=512, - num_codes=32, - use_se_loss=True, - add_lateral=False, - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), - loss_se_decode=dict( - 
type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.2)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) diff --git a/configs/fastfcn/fastfcn_r50-d32_jpu_enc_512x512_160k_ade20k.py b/configs/fastfcn/fastfcn_r50-d32_jpu_enc_512x512_160k_ade20k.py deleted file mode 100644 index 12f0add5ad..0000000000 --- a/configs/fastfcn/fastfcn_r50-d32_jpu_enc_512x512_160k_ade20k.py +++ /dev/null @@ -1,24 +0,0 @@ -# model settings -_base_ = './fastfcn_r50-d32_jpu_psp_512x512_160k_ade20k.py' -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - decode_head=dict( - _delete_=True, - type='EncHead', - in_channels=[512, 1024, 2048], - in_index=(0, 1, 2), - channels=512, - num_codes=32, - use_se_loss=True, - add_lateral=False, - dropout_ratio=0.1, - num_classes=150, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), - loss_se_decode=dict( - type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.2)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) diff --git a/configs/fastfcn/fastfcn_r50-d32_jpu_enc_512x512_80k_ade20k.py b/configs/fastfcn/fastfcn_r50-d32_jpu_enc_512x512_80k_ade20k.py deleted file mode 100644 index d3e2e9c80b..0000000000 --- a/configs/fastfcn/fastfcn_r50-d32_jpu_enc_512x512_80k_ade20k.py +++ /dev/null @@ -1,24 +0,0 @@ -# model settings -_base_ = './fastfcn_r50-d32_jpu_psp_512x512_80k_ade20k.py' -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - decode_head=dict( - _delete_=True, - type='EncHead', - in_channels=[512, 1024, 2048], - in_index=(0, 1, 2), - channels=512, - num_codes=32, - use_se_loss=True, - add_lateral=False, - dropout_ratio=0.1, - num_classes=150, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), - loss_se_decode=dict( - type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.2)), - # model training and testing settings - train_cfg=dict(), - test_cfg=dict(mode='whole')) diff --git a/configs/fastfcn/fastfcn_r50-d32_jpu_psp_512x1024_80k_cityscapes.py b/configs/fastfcn/fastfcn_r50-d32_jpu_psp_4xb2-80k_cityscapes-512x1024.py similarity index 100% rename from configs/fastfcn/fastfcn_r50-d32_jpu_psp_512x1024_80k_cityscapes.py rename to configs/fastfcn/fastfcn_r50-d32_jpu_psp_4xb2-80k_cityscapes-512x1024.py diff --git a/configs/fastfcn/fastfcn_r50-d32_jpu_psp_512x512_160k_ade20k.py b/configs/fastfcn/fastfcn_r50-d32_jpu_psp_4xb4-160k_ade20k-512x512.py similarity index 100% rename from configs/fastfcn/fastfcn_r50-d32_jpu_psp_512x512_160k_ade20k.py rename to configs/fastfcn/fastfcn_r50-d32_jpu_psp_4xb4-160k_ade20k-512x512.py diff --git a/configs/fastfcn/fastfcn_r50-d32_jpu_psp_512x512_80k_ade20k.py b/configs/fastfcn/fastfcn_r50-d32_jpu_psp_4xb4-80k_ade20k-512x512.py similarity index 100% rename from configs/fastfcn/fastfcn_r50-d32_jpu_psp_512x512_80k_ade20k.py rename to configs/fastfcn/fastfcn_r50-d32_jpu_psp_4xb4-80k_ade20k-512x512.py diff --git a/configs/fastfcn/fastfcn_r50-d32_jpu_psp_4x4_512x1024_80k_cityscapes.py b/configs/fastfcn/fastfcn_r50-d32_jpu_psp_4xb4-80k_cityscapes-512x1024.py similarity index 100% rename from configs/fastfcn/fastfcn_r50-d32_jpu_psp_4x4_512x1024_80k_cityscapes.py rename to configs/fastfcn/fastfcn_r50-d32_jpu_psp_4xb4-80k_cityscapes-512x1024.py diff --git a/configs/fastfcn/metafile.yaml b/configs/fastfcn/metafile.yaml new file mode 100644 index 0000000000..f5fe03ca45 --- /dev/null +++ 
b/configs/fastfcn/metafile.yaml @@ -0,0 +1,311 @@ +Collections: +- Name: FastFCN + License: Apache License 2.0 + Metadata: + Training Data: + - Cityscapes + - ADE20K + Paper: + Title: 'FastFCN: Rethinking Dilated Convolution in the Backbone for Semantic Segmentation' + URL: https://arxiv.org/abs/1903.11816 + README: configs/fastfcn/README.md + Frameworks: + - PyTorch +Models: +- Name: fastfcn_r50-d32_jpu_aspp_4xb2-80k_cityscapes-512x1024 + In Collection: FastFCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.12 + mIoU(ms+flip): 80.58 + Config: configs/fastfcn/fastfcn_r50-d32_jpu_aspp_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D32 + - FastFCN + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 5.67 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_aspp_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_aspp_512x1024_80k_cityscapes_20210928_053722-5d1a2648.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_aspp_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_aspp_512x1024_80k_cityscapes_20210928_053722.log.json + Paper: + Title: 'FastFCN: Rethinking Dilated Convolution in the Backbone for Semantic Segmentation' + URL: https://arxiv.org/abs/1903.11816 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/necks/jpu.py#L12 + Framework: PyTorch +- Name: fastfcn_r50-d32_jpu_aspp_4xb4-80k_cityscapes-512x1024 + In Collection: FastFCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.52 + mIoU(ms+flip): 80.91 + Config: configs/fastfcn/fastfcn_r50-d32_jpu_aspp_4xb4-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 16 + Architecture: + - R-50-D32 + - FastFCN + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 9.79 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_aspp_4x4_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_aspp_4x4_512x1024_80k_cityscapes_20210924_214357-72220849.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_aspp_4x4_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_aspp_4x4_512x1024_80k_cityscapes_20210924_214357.log.json + Paper: + Title: 'FastFCN: Rethinking Dilated Convolution in the Backbone for Semantic Segmentation' + URL: https://arxiv.org/abs/1903.11816 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/necks/jpu.py#L12 + Framework: PyTorch +- Name: fastfcn_r50-d32_jpu_psp_4xb2-80k_cityscapes-512x1024 + In Collection: FastFCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.26 + mIoU(ms+flip): 80.86 + Config: configs/fastfcn/fastfcn_r50-d32_jpu_psp_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D32 + - FastFCN + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 5.67 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_psp_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_psp_512x1024_80k_cityscapes_20210928_053722-57749bed.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_psp_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_psp_512x1024_80k_cityscapes_20210928_053722.log.json + Paper: + Title: 'FastFCN: Rethinking Dilated Convolution in the Backbone for Semantic Segmentation' + URL:
https://arxiv.org/abs/1903.11816 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/necks/jpu.py#L12 + Framework: PyTorch +- Name: fastfcn_r50-d32_jpu_psp_4xb4-80k_cityscapes-512x1024 + In Collection: FastFCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.76 + mIoU(ms+flip): 80.03 + Config: configs/fastfcn/fastfcn_r50-d32_jpu_psp_4xb4-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 16 + Architecture: + - R-50-D32 + - FastFCN + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 9.94 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_psp_4x4_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_psp_4x4_512x1024_80k_cityscapes_20210925_061841-77e87b0a.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_psp_4x4_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_psp_4x4_512x1024_80k_cityscapes_20210925_061841.log.json + Paper: + Title: 'FastFCN: Rethinking Dilated Convolution in the Backbone for Semantic Segmentation' + URL: https://arxiv.org/abs/1903.11816 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/necks/jpu.py#L12 + Framework: PyTorch +- Name: fastfcn_r50-d32_jpu_enc_4xb2-80k_cityscapes-512x1024 + In Collection: FastFCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 77.97 + mIoU(ms+flip): 79.92 + Config: configs/fastfcn/fastfcn_r50-d32_jpu_enc_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D32 + - FastFCN + - EncNet + Training Resources: 4x V100 GPUS + Memory (GB): 8.15 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_enc_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_enc_512x1024_80k_cityscapes_20210928_030036-78da5046.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_enc_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_enc_512x1024_80k_cityscapes_20210928_030036.log.json + Paper: + Title: 'FastFCN: Rethinking Dilated Convolution in the Backbone for Semantic Segmentation' + URL: https://arxiv.org/abs/1903.11816 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/necks/jpu.py#L12 + Framework: PyTorch +- Name: fastfcn_r50-d32_jpu_enc_4xb4-80k_cityscapes-512x1024 + In Collection: FastFCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.6 + mIoU(ms+flip): 80.25 + Config: configs/fastfcn/fastfcn_r50-d32_jpu_enc_4xb4-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 16 + Architecture: + - R-50-D32 + - FastFCN + - EncNet + Training Resources: 4x V100 GPUS + Memory (GB): 15.45 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_enc_4x4_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_enc_4x4_512x1024_80k_cityscapes_20210926_093217-e1eb6dbb.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_enc_4x4_512x1024_80k_cityscapes/fastfcn_r50-d32_jpu_enc_4x4_512x1024_80k_cityscapes_20210926_093217.log.json + Paper: + Title: 'FastFCN: Rethinking Dilated Convolution in the Backbone for Semantic Segmentation' + URL: https://arxiv.org/abs/1903.11816 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/necks/jpu.py#L12 + Framework: PyTorch +- Name: fastfcn_r50-d32_jpu_aspp_4xb4-80k_ade20k-512x512 + In Collection: FastFCN + Results:
+ Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 41.88 + mIoU(ms+flip): 42.91 + Config: configs/fastfcn/fastfcn_r50-d32_jpu_aspp_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D32 + - FastFCN + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 8.46 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_aspp_512x512_80k_ade20k/fastfcn_r50-d32_jpu_aspp_512x512_80k_ade20k_20211013_190619-3aa40f2d.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_aspp_512x512_80k_ade20k/fastfcn_r50-d32_jpu_aspp_512x512_80k_ade20k_20211013_190619.log.json + Paper: + Title: 'FastFCN: Rethinking Dilated Convolution in the Backbone for Semantic Segmentation' + URL: https://arxiv.org/abs/1903.11816 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/necks/jpu.py#L12 + Framework: PyTorch +- Name: fastfcn_r50-d32_jpu_aspp_4xb4-160k_ade20k-512x512 + In Collection: FastFCN + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 43.58 + mIoU(ms+flip): 44.92 + Config: configs/fastfcn/fastfcn_r50-d32_jpu_aspp_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D32 + - FastFCN + - DeepLabV3 + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_aspp_512x512_160k_ade20k/fastfcn_r50-d32_jpu_aspp_512x512_160k_ade20k_20211008_152246-27036aee.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_aspp_512x512_160k_ade20k/fastfcn_r50-d32_jpu_aspp_512x512_160k_ade20k_20211008_152246.log.json + Paper: + Title: 'FastFCN: Rethinking Dilated Convolution in the Backbone for Semantic Segmentation' + URL: https://arxiv.org/abs/1903.11816 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/necks/jpu.py#L12 + Framework: PyTorch +- Name: fastfcn_r50-d32_jpu_psp_4xb4-80k_ade20k-512x512 + In Collection: FastFCN + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 41.4 + mIoU(ms+flip): 42.12 + Config: configs/fastfcn/fastfcn_r50-d32_jpu_psp_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D32 + - FastFCN + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 8.02 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_psp_512x512_80k_ade20k/fastfcn_r50-d32_jpu_psp_512x512_80k_ade20k_20210930_225137-993d07c8.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_psp_512x512_80k_ade20k/fastfcn_r50-d32_jpu_psp_512x512_80k_ade20k_20210930_225137.log.json + Paper: + Title: 'FastFCN: Rethinking Dilated Convolution in the Backbone for Semantic Segmentation' + URL: https://arxiv.org/abs/1903.11816 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/necks/jpu.py#L12 + Framework: PyTorch +- Name: fastfcn_r50-d32_jpu_psp_4xb4-160k_ade20k-512x512 + In Collection: FastFCN + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 42.63 + mIoU(ms+flip): 43.71 + Config: configs/fastfcn/fastfcn_r50-d32_jpu_psp_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D32 + - FastFCN + - PSPNet + Training Resources: 4x V100 GPUS + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_psp_512x512_160k_ade20k/fastfcn_r50-d32_jpu_psp_512x512_160k_ade20k_20211008_105455-e8f5a2fd.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_psp_512x512_160k_ade20k/fastfcn_r50-d32_jpu_psp_512x512_160k_ade20k_20211008_105455.log.json + Paper: + Title: 'FastFCN: Rethinking Dilated Convolution in the Backbone for Semantic Segmentation' + URL: https://arxiv.org/abs/1903.11816 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/necks/jpu.py#L12 + Framework: PyTorch +- Name: fastfcn_r50-d32_jpu_enc_4xb4-80k_ade20k-512x512 + In Collection: FastFCN + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 40.88 + mIoU(ms+flip): 42.36 + Config: configs/fastfcn/fastfcn_r50-d32_jpu_enc_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D32 + - FastFCN + - EncNet + Training Resources: 4x V100 GPUS + Memory (GB): 9.67 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_enc_512x512_80k_ade20k/fastfcn_r50-d32_jpu_enc_512x512_80k_ade20k_20210930_225214-65aef6dd.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_enc_512x512_80k_ade20k/fastfcn_r50-d32_jpu_enc_512x512_80k_ade20k_20210930_225214.log.json + Paper: + Title: 'FastFCN: Rethinking Dilated Convolution in the Backbone for Semantic Segmentation' + URL: https://arxiv.org/abs/1903.11816 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/necks/jpu.py#L12 + Framework: PyTorch +- Name: fastfcn_r50-d32_jpu_enc_4xb4-160k_ade20k-512x512 + In Collection: FastFCN + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 42.5 + mIoU(ms+flip): 44.21 + Config: configs/fastfcn/fastfcn_r50-d32_jpu_enc_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D32 + - FastFCN + - EncNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_enc_512x512_160k_ade20k/fastfcn_r50-d32_jpu_enc_512x512_160k_ade20k_20211008_105456-d875ce3c.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fastfcn/fastfcn_r50-d32_jpu_enc_512x512_160k_ade20k/fastfcn_r50-d32_jpu_enc_512x512_160k_ade20k_20211008_105456.log.json + Paper: + Title: 'FastFCN: Rethinking Dilated Convolution in the Backbone for Semantic Segmentation' + URL: https://arxiv.org/abs/1903.11816 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/necks/jpu.py#L12 + Framework: PyTorch diff --git a/configs/fastscnn/README.md b/configs/fastscnn/README.md index 156562670d..6be981462a 100644 --- a/configs/fastscnn/README.md +++ b/configs/fastscnn/README.md @@ -1,6 +1,6 @@ # Fast-SCNN -[Fast-SCNN for Semantic Segmentation](https://arxiv.org/abs/1902.04502) +> [Fast-SCNN for Semantic Segmentation](https://arxiv.org/abs/1902.04502) ## Introduction @@ -22,6 +22,14 @@ The encoder-decoder framework is state-of-the-art for offline semantic image seg +## Results and models + +### Cityscapes + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| -------- | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------- | 
---------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| FastSCNN | FastSCNN | 512x1024 | 160000 | 3.3 | 56.45 | V100 | 70.96 | 72.65 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fastscnn/fast_scnn_8xb4-160k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fast_scnn/fast_scnn_lr0.12_8x4_160k_cityscapes/fast_scnn_lr0.12_8x4_160k_cityscapes_20210630_164853-0cec9937.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fast_scnn/fast_scnn_lr0.12_8x4_160k_cityscapes/fast_scnn_lr0.12_8x4_160k_cityscapes_20210630_164853.log.json) | + ## Citation ```bibtex @@ -32,11 +40,3 @@ The encoder-decoder framework is state-of-the-art for offline semantic image seg year={2019} } ``` - -## Results and models - -### Cityscapes - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| -------- | -------- | --------- | ------: | -------- | -------------- | ----: | ------------- | --------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| FastSCNN | FastSCNN | 512x1024 | 160000 | 3.3 | 56.45 | 70.96 | 72.65 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fastscnn/fast_scnn_lr0.12_8x4_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fast_scnn/fast_scnn_lr0.12_8x4_160k_cityscapes/fast_scnn_lr0.12_8x4_160k_cityscapes_20210630_164853-0cec9937.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fast_scnn/fast_scnn_lr0.12_8x4_160k_cityscapes/fast_scnn_lr0.12_8x4_160k_cityscapes_20210630_164853.log.json) | diff --git a/configs/fastscnn/fast_scnn_lr0.12_8x4_160k_cityscapes.py b/configs/fastscnn/fast_scnn_8xb4-160k_cityscapes-512x1024.py similarity index 100% rename from configs/fastscnn/fast_scnn_lr0.12_8x4_160k_cityscapes.py rename to configs/fastscnn/fast_scnn_8xb4-160k_cityscapes-512x1024.py diff --git a/configs/fastscnn/fastscnn.yml b/configs/fastscnn/fastscnn.yml deleted file mode 100644 index cad0360744..0000000000 --- a/configs/fastscnn/fastscnn.yml +++ /dev/null @@ -1,35 +0,0 @@ -Collections: -- Name: FastSCNN - Metadata: - Training Data: - - Cityscapes - Paper: - URL: https://arxiv.org/abs/1902.04502 - Title: Fast-SCNN for Semantic Segmentation - README: configs/fastscnn/README.md - Code: - URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/fast_scnn.py#L272 - Version: v0.17.0 -Models: -- Name: fast_scnn_lr0.12_8x4_160k_cityscapes - In Collection: FastSCNN - Metadata: - backbone: FastSCNN - crop size: (512,1024) - lr schd: 160000 - inference time (ms/im): - - value: 17.71 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: 
(512,1024) - Training Memory (GB): 3.3 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 70.96 - mIoU(ms+flip): 72.65 - Config: configs/fastscnn/fast_scnn_lr0.12_8x4_160k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fast_scnn/fast_scnn_lr0.12_8x4_160k_cityscapes/fast_scnn_lr0.12_8x4_160k_cityscapes_20210630_164853-0cec9937.pth diff --git a/configs/fastscnn/metafile.yaml b/configs/fastscnn/metafile.yaml new file mode 100644 index 0000000000..9e33c902db --- /dev/null +++ b/configs/fastscnn/metafile.yaml @@ -0,0 +1,37 @@ +Collections: +- Name: FastSCNN + License: Apache License 2.0 + Metadata: + Training Data: + - Cityscapes + Paper: + Title: Fast-SCNN for Semantic Segmentation + URL: https://arxiv.org/abs/1902.04502 + README: configs/fastscnn/README.md + Frameworks: + - PyTorch +Models: +- Name: fast_scnn_8xb4-160k_cityscapes-512x1024 + In Collection: FastSCNN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 70.96 + mIoU(ms+flip): 72.65 + Config: configs/fastscnn/fast_scnn_8xb4-160k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 32 + Architecture: + - FastSCNN + - FastSCNN + Training Resources: 8x V100 GPUS + Memory (GB): 3.3 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fast_scnn/fast_scnn_lr0.12_8x4_160k_cityscapes/fast_scnn_lr0.12_8x4_160k_cityscapes_20210630_164853-0cec9937.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fast_scnn/fast_scnn_lr0.12_8x4_160k_cityscapes/fast_scnn_lr0.12_8x4_160k_cityscapes_20210630_164853.log.json + Paper: + Title: Fast-SCNN for Semantic Segmentation + URL: https://arxiv.org/abs/1902.04502 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/fast_scnn.py#L272 + Framework: PyTorch diff --git a/configs/fcn/README.md b/configs/fcn/README.md index 09ca1a50dc..cf7379ff3d 100644 --- a/configs/fcn/README.md +++ b/configs/fcn/README.md @@ -1,6 +1,6 @@ # FCN -[Fully Convolutional Networks for Semantic Segmentation](https://arxiv.org/abs/1411.4038) +> [Fully Convolutional Networks for Semantic Segmentation](https://arxiv.org/abs/1411.4038) ## Introduction @@ -22,90 +22,90 @@ Convolutional networks are powerful visual models that yield hierarchies of feat -## Citation - -```bibtex -@article{shelhamer2017fully, - title={Fully convolutional networks for semantic segmentation}, - author={Shelhamer, Evan and Long, Jonathan and Darrell, Trevor}, - journal={IEEE transactions on pattern analysis and machine intelligence}, - volume={39}, - number={4}, - pages={640--651}, - year={2017}, - publisher={IEEE Trans Pattern Anal Mach Intell} -} -``` - ## Results and models ### Cityscapes -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ---------- | ---------- | --------- | ------: | -------- | -------------- | ----: | ------------: | -------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| FCN | R-50-D8 | 512x1024 | 40000 | 5.7 | 4.17 | 72.25 | 73.36 | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_r50-d8_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x1024_40k_cityscapes/fcn_r50-d8_512x1024_40k_cityscapes_20200604_192608-efe53f0d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x1024_40k_cityscapes/fcn_r50-d8_512x1024_40k_cityscapes_20200604_192608.log.json) | -| FCN | R-101-D8 | 512x1024 | 40000 | 9.2 | 2.66 | 75.45 | 76.58 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_r101-d8_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x1024_40k_cityscapes/fcn_r101-d8_512x1024_40k_cityscapes_20200604_181852-a883d3a1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x1024_40k_cityscapes/fcn_r101-d8_512x1024_40k_cityscapes_20200604_181852.log.json) | -| FCN | R-50-D8 | 769x769 | 40000 | 6.5 | 1.80 | 71.47 | 72.54 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_r50-d8_769x769_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_769x769_40k_cityscapes/fcn_r50-d8_769x769_40k_cityscapes_20200606_113104-977b5d02.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_769x769_40k_cityscapes/fcn_r50-d8_769x769_40k_cityscapes_20200606_113104.log.json) | -| FCN | R-101-D8 | 769x769 | 40000 | 10.4 | 1.19 | 73.93 | 75.14 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_r101-d8_769x769_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_769x769_40k_cityscapes/fcn_r101-d8_769x769_40k_cityscapes_20200606_113208-7d4ab69c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_769x769_40k_cityscapes/fcn_r101-d8_769x769_40k_cityscapes_20200606_113208.log.json) | -| FCN | R-18-D8 | 512x1024 | 80000 | 1.7 | 14.65 | 71.11 | 72.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_r18-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r18-d8_512x1024_80k_cityscapes/fcn_r18-d8_512x1024_80k_cityscapes_20201225_021327-6c50f8b4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r18-d8_512x1024_80k_cityscapes/fcn_r18-d8_512x1024_80k_cityscapes-20201225_021327.log.json) | -| FCN | R-50-D8 | 512x1024 | 80000 | - | | 73.61 | 74.24 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_r50-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x1024_80k_cityscapes/fcn_r50-d8_512x1024_80k_cityscapes_20200606_113019-03aa804d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x1024_80k_cityscapes/fcn_r50-d8_512x1024_80k_cityscapes_20200606_113019.log.json) | -| FCN | R-101-D8 | 512x1024 | 80000 | - | - | 75.13 | 75.94 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_r101-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x1024_80k_cityscapes/fcn_r101-d8_512x1024_80k_cityscapes_20200606_113038-3fb937eb.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x1024_80k_cityscapes/fcn_r101-d8_512x1024_80k_cityscapes_20200606_113038.log.json) | -| FCN (FP16) | R-101-D8 | 512x1024 | 80000 | 
5.37 | 8.64 | 76.80 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_r101-d8_fp16_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_fp16_512x1024_80k_cityscapes/fcn_r101-d8_fp16_512x1024_80k_cityscapes_20200717_230921-fb13e883.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_fp16_512x1024_80k_cityscapes/fcn_r101-d8_fp16_512x1024_80k_cityscapes_20200717_230921.log.json) | -| FCN | R-18-D8 | 769x769 | 80000 | 1.9 | 6.40 | 70.80 | 73.16 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_r18-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r18-d8_769x769_80k_cityscapes/fcn_r18-d8_769x769_80k_cityscapes_20201225_021451-9739d1b8.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r18-d8_769x769_80k_cityscapes/fcn_r18-d8_769x769_80k_cityscapes-20201225_021451.log.json) | -| FCN | R-50-D8 | 769x769 | 80000 | - | - | 72.64 | 73.32 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_r50-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_769x769_80k_cityscapes/fcn_r50-d8_769x769_80k_cityscapes_20200606_195749-f5caeabc.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_769x769_80k_cityscapes/fcn_r50-d8_769x769_80k_cityscapes_20200606_195749.log.json) | -| FCN | R-101-D8 | 769x769 | 80000 | - | - | 75.52 | 76.61 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_r101-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_769x769_80k_cityscapes/fcn_r101-d8_769x769_80k_cityscapes_20200606_214354-45cbac68.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_769x769_80k_cityscapes/fcn_r101-d8_769x769_80k_cityscapes_20200606_214354.log.json) | -| FCN | R-18b-D8 | 512x1024 | 80000 | 1.6 | 16.74 | 70.24 | 72.77 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_r18b-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r18b-d8_512x1024_80k_cityscapes/fcn_r18b-d8_512x1024_80k_cityscapes_20201225_230143-92c0f445.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r18b-d8_512x1024_80k_cityscapes/fcn_r18b-d8_512x1024_80k_cityscapes-20201225_230143.log.json) | -| FCN | R-50b-D8 | 512x1024 | 80000 | 5.6 | 4.20 | 75.65 | 77.59 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_r50b-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50b-d8_512x1024_80k_cityscapes/fcn_r50b-d8_512x1024_80k_cityscapes_20201225_094221-82957416.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50b-d8_512x1024_80k_cityscapes/fcn_r50b-d8_512x1024_80k_cityscapes-20201225_094221.log.json) | -| FCN | R-101b-D8 | 512x1024 | 80000 | 9.1 | 2.73 | 77.37 | 78.77 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_r101b-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101b-d8_512x1024_80k_cityscapes/fcn_r101b-d8_512x1024_80k_cityscapes_20201226_160213-4543858f.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101b-d8_512x1024_80k_cityscapes/fcn_r101b-d8_512x1024_80k_cityscapes-20201226_160213.log.json) | -| FCN | R-18b-D8 | 769x769 | 80000 | 1.7 | 6.70 | 69.66 | 72.07 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_r18b-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r18b-d8_769x769_80k_cityscapes/fcn_r18b-d8_769x769_80k_cityscapes_20201226_004430-32d504e5.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r18b-d8_769x769_80k_cityscapes/fcn_r18b-d8_769x769_80k_cityscapes-20201226_004430.log.json) | -| FCN | R-50b-D8 | 769x769 | 80000 | 6.3 | 1.82 | 73.83 | 76.60 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_r50b-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50b-d8_769x769_80k_cityscapes/fcn_r50b-d8_769x769_80k_cityscapes_20201225_094223-94552d38.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50b-d8_769x769_80k_cityscapes/fcn_r50b-d8_769x769_80k_cityscapes-20201225_094223.log.json) | -| FCN | R-101b-D8 | 769x769 | 80000 | 10.3 | 1.15 | 77.02 | 78.67 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_r101b-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101b-d8_769x769_80k_cityscapes/fcn_r101b-d8_769x769_80k_cityscapes_20201226_170012-82be37e2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101b-d8_769x769_80k_cityscapes/fcn_r101b-d8_769x769_80k_cityscapes-20201226_170012.log.json) | -| FCN (D6) | R-50-D16 | 512x1024 | 40000 | 3.4 | 10.22 | 77.06 | 78.85 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_d6_r50-d16_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50-d16_512x1024_40k_cityscapes/fcn_d6_r50-d16_512x1024_40k_cityscapes_20210305_130133-98d5d1bc.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50-d16_512x1024_40k_cityscapes/fcn_d6_r50-d16_512x1024_40k_cityscapes-20210305_130133.log.json) | -| FCN (D6) | R-50-D16 | 512x1024 | 80000 | - | 10.35 | 77.27 | 78.88 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_d6_r50-d16_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50-d16_512x1024_80k_cityscapes/fcn_d6_r50-d16_512x1024_80k_cityscapes_20210306_115604-133c292f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50-d16_512x1024_80k_cityscapes/fcn_d6_r50-d16_512x1024_80k_cityscapes-20210306_115604.log.json) | -| FCN (D6) | R-50-D16 | 769x769 | 40000 | 3.7 | 4.17 | 76.82 | 78.22 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_d6_r50-d16_769x769_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50-d16_769x769_40k_cityscapes/fcn_d6_r50-d16_769x769_40k_cityscapes_20210305_185744-1aab18ed.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50-d16_769x769_40k_cityscapes/fcn_d6_r50-d16_769x769_40k_cityscapes-20210305_185744.log.json) | -| FCN (D6) | R-50-D16 | 769x769 | 80000 | - | 4.15 | 77.04 | 78.40 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_d6_r50-d16_769x769_80k_cityscapes.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50-d16_769x769_80k_cityscapes/fcn_d6_r50-d16_769x769_80k_cityscapes_20210305_200413-109d88eb.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50-d16_769x769_80k_cityscapes/fcn_d6_r50-d16_769x769_80k_cityscapes-20210305_200413.log.json) | -| FCN (D6) | R-101-D16 | 512x1024 | 40000 | 4.5 | 8.04 | 77.36 | 79.18 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_d6_r101-d16_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101-d16_512x1024_40k_cityscapes/fcn_d6_r101-d16_512x1024_40k_cityscapes_20210305_130337-9cf2b450.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101-d16_512x1024_40k_cityscapes/fcn_d6_r101-d16_512x1024_40k_cityscapes-20210305_130337.log.json) | -| FCN (D6) | R-101-D16 | 512x1024 | 80000 | - | 8.26 | 78.46 | 80.42 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_d6_r101-d16_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101-d16_512x1024_80k_cityscapes/fcn_d6_r101-d16_512x1024_80k_cityscapes_20210308_102747-cb336445.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101-d16_512x1024_80k_cityscapes/fcn_d6_r101-d16_512x1024_80k_cityscapes-20210308_102747.log.json) | -| FCN (D6) | R-101-D16 | 769x769 | 40000 | 5.0 | 3.12 | 77.28 | 78.95 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_d6_r101-d16_769x769_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101-d16_769x769_40k_cityscapes/fcn_d6_r101-d16_769x769_40k_cityscapes_20210308_102453-60b114e9.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101-d16_769x769_40k_cityscapes/fcn_d6_r101-d16_769x769_40k_cityscapes-20210308_102453.log.json) | -| FCN (D6) | R-101-D16 | 769x769 | 80000 | - | 3.21 | 78.06 | 79.58 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_d6_r101-d16_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101-d16_769x769_80k_cityscapes/fcn_d6_r101-d16_769x769_80k_cityscapes_20210306_120016-e33adc4f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101-d16_769x769_80k_cityscapes/fcn_d6_r101-d16_769x769_80k_cityscapes-20210306_120016.log.json) | -| FCN (D6) | R-50b-D16 | 512x1024 | 80000 | 3.2 | 10.16 | 76.99 | 79.03 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_d6_r50b-d16_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50b-d16_512x1024_80k_cityscapes/fcn_d6_r50b-d16_512x1024_80k_cityscapes_20210311_125550-6a0b62e9.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50b_d16_512x1024_80k_cityscapes/fcn_d6_r50b_d16_512x1024_80k_cityscapes-20210311_125550.log.json) | -| FCN (D6) | R-50b-D16 | 769x769 | 80000 | 3.6 | 4.17 | 76.86 | 78.52 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_d6_r50b-d16_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50b-d16_769x769_80k_cityscapes/fcn_d6_r50b-d16_769x769_80k_cityscapes_20210311_131012-d665f231.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50b_d16_769x769_80k_cityscapes/fcn_d6_r50b_d16_769x769_80k_cityscapes-20210311_131012.log.json) | -| FCN (D6) | R-101b-D16 | 512x1024 | 80000 | 4.3 | 8.46 | 77.72 | 79.53 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_d6_r101b-d16_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101b-d16_512x1024_80k_cityscapes/fcn_d6_r101b-d16_512x1024_80k_cityscapes_20210311_144305-3f2eb5b4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101b_d16_512x1024_80k_cityscapes/fcn_d6_r101b_d16_512x1024_80k_cityscapes-20210311_144305.log.json) | -| FCN (D6) | R-101b-D16 | 769x769 | 80000 | 4.8 | 3.32 | 77.34 | 78.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_d6_r101b-d16_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101b-d16_769x769_80k_cityscapes/fcn_d6_r101b-d16_769x769_80k_cityscapes_20210311_154527-c4d8bfbc.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101b_d16_769x769_80k_cityscapes/fcn_d6_r101b_d16_769x769_80k_cityscapes-20210311_154527.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ---------- | ---------- | --------- | ------: | -------- | -------------- | -------- | ----: | ------------: | ----------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| FCN | R-50-D8 | 512x1024 | 40000 | 5.7 | 4.17 | V100 | 72.25 | 73.36 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn_r50-d8_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x1024_40k_cityscapes/fcn_r50-d8_512x1024_40k_cityscapes_20200604_192608-efe53f0d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x1024_40k_cityscapes/fcn_r50-d8_512x1024_40k_cityscapes_20200604_192608.log.json) | +| FCN | R-101-D8 | 512x1024 | 40000 | 9.2 | 2.66 | V100 | 75.45 | 76.58 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn_r101-d8_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x1024_40k_cityscapes/fcn_r101-d8_512x1024_40k_cityscapes_20200604_181852-a883d3a1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x1024_40k_cityscapes/fcn_r101-d8_512x1024_40k_cityscapes_20200604_181852.log.json) | +| FCN | R-50-D8 | 769x769 | 40000 | 6.5 | 1.80 | V100 | 71.47 | 72.54 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn_r50-d8_4xb2-40k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_769x769_40k_cityscapes/fcn_r50-d8_769x769_40k_cityscapes_20200606_113104-977b5d02.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_769x769_40k_cityscapes/fcn_r50-d8_769x769_40k_cityscapes_20200606_113104.log.json) | +| FCN | 
R-101-D8 | 769x769 | 40000 | 10.4 | 1.19 | V100 | 73.93 | 75.14 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn_r101-d8_4xb2-40k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_769x769_40k_cityscapes/fcn_r101-d8_769x769_40k_cityscapes_20200606_113208-7d4ab69c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_769x769_40k_cityscapes/fcn_r101-d8_769x769_40k_cityscapes_20200606_113208.log.json) | +| FCN | R-18-D8 | 512x1024 | 80000 | 1.7 | 14.65 | V100 | 71.11 | 72.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn_r18-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r18-d8_512x1024_80k_cityscapes/fcn_r18-d8_512x1024_80k_cityscapes_20201225_021327-6c50f8b4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r18-d8_512x1024_80k_cityscapes/fcn_r18-d8_512x1024_80k_cityscapes-20201225_021327.log.json) | +| FCN | R-50-D8 | 512x1024 | 80000 | - | | V100 | 73.61 | 74.24 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn_r50-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x1024_80k_cityscapes/fcn_r50-d8_512x1024_80k_cityscapes_20200606_113019-03aa804d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x1024_80k_cityscapes/fcn_r50-d8_512x1024_80k_cityscapes_20200606_113019.log.json) | +| FCN | R-101-D8 | 512x1024 | 80000 | - | - | V100 | 75.13 | 75.94 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn_r101-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x1024_80k_cityscapes/fcn_r101-d8_512x1024_80k_cityscapes_20200606_113038-3fb937eb.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x1024_80k_cityscapes/fcn_r101-d8_512x1024_80k_cityscapes_20200606_113038.log.json) | +| FCN (FP16) | R-101-D8 | 512x1024 | 80000 | 5.37 | 8.64 | V100 | 76.80 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn_r101-d8_4xb2-amp-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_fp16_512x1024_80k_cityscapes/fcn_r101-d8_fp16_512x1024_80k_cityscapes_20200717_230921-fb13e883.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_fp16_512x1024_80k_cityscapes/fcn_r101-d8_fp16_512x1024_80k_cityscapes_20200717_230921.log.json) | +| FCN | R-18-D8 | 769x769 | 80000 | 1.9 | 6.40 | V100 | 70.80 | 73.16 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn_r18-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r18-d8_769x769_80k_cityscapes/fcn_r18-d8_769x769_80k_cityscapes_20201225_021451-9739d1b8.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r18-d8_769x769_80k_cityscapes/fcn_r18-d8_769x769_80k_cityscapes-20201225_021451.log.json) | +| FCN | R-50-D8 | 769x769 | 80000 | - | - | V100 | 72.64 | 73.32 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn_r50-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_769x769_80k_cityscapes/fcn_r50-d8_769x769_80k_cityscapes_20200606_195749-f5caeabc.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_769x769_80k_cityscapes/fcn_r50-d8_769x769_80k_cityscapes_20200606_195749.log.json) | +| FCN | R-101-D8 | 769x769 | 80000 | - | - | V100 | 75.52 | 76.61 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn_r101-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_769x769_80k_cityscapes/fcn_r101-d8_769x769_80k_cityscapes_20200606_214354-45cbac68.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_769x769_80k_cityscapes/fcn_r101-d8_769x769_80k_cityscapes_20200606_214354.log.json) | +| FCN | R-18b-D8 | 512x1024 | 80000 | 1.6 | 16.74 | V100 | 70.24 | 72.77 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn_r18b-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r18b-d8_512x1024_80k_cityscapes/fcn_r18b-d8_512x1024_80k_cityscapes_20201225_230143-92c0f445.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r18b-d8_512x1024_80k_cityscapes/fcn_r18b-d8_512x1024_80k_cityscapes-20201225_230143.log.json) | +| FCN | R-50b-D8 | 512x1024 | 80000 | 5.6 | 4.20 | V100 | 75.65 | 77.59 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn_r50b-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50b-d8_512x1024_80k_cityscapes/fcn_r50b-d8_512x1024_80k_cityscapes_20201225_094221-82957416.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50b-d8_512x1024_80k_cityscapes/fcn_r50b-d8_512x1024_80k_cityscapes-20201225_094221.log.json) | +| FCN | R-101b-D8 | 512x1024 | 80000 | 9.1 | 2.73 | V100 | 77.37 | 78.77 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn_r101b-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101b-d8_512x1024_80k_cityscapes/fcn_r101b-d8_512x1024_80k_cityscapes_20201226_160213-4543858f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101b-d8_512x1024_80k_cityscapes/fcn_r101b-d8_512x1024_80k_cityscapes-20201226_160213.log.json) | +| FCN | R-18b-D8 | 769x769 | 80000 | 1.7 | 6.70 | V100 | 69.66 | 72.07 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn_r18b-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r18b-d8_769x769_80k_cityscapes/fcn_r18b-d8_769x769_80k_cityscapes_20201226_004430-32d504e5.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r18b-d8_769x769_80k_cityscapes/fcn_r18b-d8_769x769_80k_cityscapes-20201226_004430.log.json) | +| FCN | R-50b-D8 | 769x769 | 80000 | 6.3 | 1.82 | V100 | 73.83 | 76.60 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn_r50b-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50b-d8_769x769_80k_cityscapes/fcn_r50b-d8_769x769_80k_cityscapes_20201225_094223-94552d38.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50b-d8_769x769_80k_cityscapes/fcn_r50b-d8_769x769_80k_cityscapes-20201225_094223.log.json) | +| FCN | R-101b-D8 | 769x769 | 80000 | 10.3 | 1.15 | V100 | 77.02 | 78.67 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn_r101b-d8_4xb2-80k_cityscapes-769x769.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101b-d8_769x769_80k_cityscapes/fcn_r101b-d8_769x769_80k_cityscapes_20201226_170012-82be37e2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101b-d8_769x769_80k_cityscapes/fcn_r101b-d8_769x769_80k_cityscapes-20201226_170012.log.json) | +| FCN (D6) | R-50-D16 | 512x1024 | 40000 | 3.4 | 10.22 | TITAN Xp | 77.06 | 78.85 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn-d6_r50-d16_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50-d16_512x1024_40k_cityscapes/fcn_d6_r50-d16_512x1024_40k_cityscapes_20210305_130133-98d5d1bc.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50-d16_512x1024_40k_cityscapes/fcn_d6_r50-d16_512x1024_40k_cityscapes-20210305_130133.log.json) | +| FCN (D6) | R-50-D16 | 512x1024 | 80000 | - | 10.35 | TITAN Xp | 77.27 | 78.88 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn-d6_r50-d16_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50-d16_512x1024_80k_cityscapes/fcn_d6_r50-d16_512x1024_80k_cityscapes_20210306_115604-133c292f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50-d16_512x1024_80k_cityscapes/fcn_d6_r50-d16_512x1024_80k_cityscapes-20210306_115604.log.json) | +| FCN (D6) | R-50-D16 | 769x769 | 40000 | 3.7 | 4.17 | TITAN Xp | 76.82 | 78.22 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn-d6_r50-d16_4xb2-40k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50-d16_769x769_40k_cityscapes/fcn_d6_r50-d16_769x769_40k_cityscapes_20210305_185744-1aab18ed.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50-d16_769x769_40k_cityscapes/fcn_d6_r50-d16_769x769_40k_cityscapes-20210305_185744.log.json) | +| FCN (D6) | R-50-D16 | 769x769 | 80000 | - | 4.15 | TITAN Xp | 77.04 | 78.40 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn-d6_r50-d16_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50-d16_769x769_80k_cityscapes/fcn_d6_r50-d16_769x769_80k_cityscapes_20210305_200413-109d88eb.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50-d16_769x769_80k_cityscapes/fcn_d6_r50-d16_769x769_80k_cityscapes-20210305_200413.log.json) | +| FCN (D6) | R-101-D16 | 512x1024 | 40000 | 4.5 | 8.04 | TITAN Xp | 77.36 | 79.18 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn-d6_r101-d16_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101-d16_512x1024_40k_cityscapes/fcn_d6_r101-d16_512x1024_40k_cityscapes_20210305_130337-9cf2b450.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101-d16_512x1024_40k_cityscapes/fcn_d6_r101-d16_512x1024_40k_cityscapes-20210305_130337.log.json) | +| FCN (D6) | R-101-D16 | 512x1024 | 80000 | - | 8.26 | TITAN Xp | 78.46 | 80.42 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn-d6_r101-d16_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101-d16_512x1024_80k_cityscapes/fcn_d6_r101-d16_512x1024_80k_cityscapes_20210308_102747-cb336445.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101-d16_512x1024_80k_cityscapes/fcn_d6_r101-d16_512x1024_80k_cityscapes-20210308_102747.log.json) | +| FCN (D6) | R-101-D16 | 769x769 | 40000 | 5.0 | 3.12 | TITAN Xp | 77.28 | 78.95 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn-d6_r101-d16_4xb2-40k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101-d16_769x769_40k_cityscapes/fcn_d6_r101-d16_769x769_40k_cityscapes_20210308_102453-60b114e9.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101-d16_769x769_40k_cityscapes/fcn_d6_r101-d16_769x769_40k_cityscapes-20210308_102453.log.json) | +| FCN (D6) | R-101-D16 | 769x769 | 80000 | - | 3.21 | TITAN Xp | 78.06 | 79.58 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn-d6_r101-d16_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101-d16_769x769_80k_cityscapes/fcn_d6_r101-d16_769x769_80k_cityscapes_20210306_120016-e33adc4f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101-d16_769x769_80k_cityscapes/fcn_d6_r101-d16_769x769_80k_cityscapes-20210306_120016.log.json) | +| FCN (D6) | R-50b-D16 | 512x1024 | 80000 | 3.2 | 10.16 | TITAN Xp | 76.99 | 79.03 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn-d6_r50b-d16_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50b-d16_512x1024_80k_cityscapes/fcn_d6_r50b-d16_512x1024_80k_cityscapes_20210311_125550-6a0b62e9.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50b_d16_512x1024_80k_cityscapes/fcn_d6_r50b_d16_512x1024_80k_cityscapes-20210311_125550.log.json) | +| FCN (D6) | R-50b-D16 | 769x769 | 80000 | 3.6 | 4.17 | TITAN Xp | 76.86 | 78.52 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn-d6_r50b-d16_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50b-d16_769x769_80k_cityscapes/fcn_d6_r50b-d16_769x769_80k_cityscapes_20210311_131012-d665f231.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50b_d16_769x769_80k_cityscapes/fcn_d6_r50b_d16_769x769_80k_cityscapes-20210311_131012.log.json) | +| FCN (D6) | R-101b-D16 | 512x1024 | 80000 | 4.3 | 8.46 | TITAN Xp | 77.72 | 79.53 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn-d6_r101b-d16_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101b-d16_512x1024_80k_cityscapes/fcn_d6_r101b-d16_512x1024_80k_cityscapes_20210311_144305-3f2eb5b4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101b_d16_512x1024_80k_cityscapes/fcn_d6_r101b_d16_512x1024_80k_cityscapes-20210311_144305.log.json) | +| FCN (D6) | R-101b-D16 | 769x769 | 80000 | 4.8 | 3.32 | TITAN Xp | 77.34 | 78.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn-d6_r101b-d16_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101b-d16_769x769_80k_cityscapes/fcn_d6_r101b-d16_769x769_80k_cityscapes_20210311_154527-c4d8bfbc.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101b_d16_769x769_80k_cityscapes/fcn_d6_r101b_d16_769x769_80k_cityscapes-20210311_154527.log.json) | ### ADE20K -| Method | Backbone | Crop Size | Lr schd | 
Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ----------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| FCN | R-50-D8 | 512x512 | 80000 | 8.5 | 23.49 | 35.94 | 37.94 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_r50-d8_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x512_80k_ade20k/fcn_r50-d8_512x512_80k_ade20k_20200614_144016-f8ac5082.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x512_80k_ade20k/fcn_r50-d8_512x512_80k_ade20k_20200614_144016.log.json) | -| FCN | R-101-D8 | 512x512 | 80000 | 12 | 14.78 | 39.61 | 40.83 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_r101-d8_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x512_80k_ade20k/fcn_r101-d8_512x512_80k_ade20k_20200615_014143-bc1809f7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x512_80k_ade20k/fcn_r101-d8_512x512_80k_ade20k_20200615_014143.log.json) | -| FCN | R-50-D8 | 512x512 | 160000 | - | - | 36.10 | 38.08 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_r50-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x512_160k_ade20k/fcn_r50-d8_512x512_160k_ade20k_20200615_100713-4edbc3b4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x512_160k_ade20k/fcn_r50-d8_512x512_160k_ade20k_20200615_100713.log.json) | -| FCN | R-101-D8 | 512x512 | 160000 | - | - | 39.91 | 41.40 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_r101-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x512_160k_ade20k/fcn_r101-d8_512x512_160k_ade20k_20200615_105816-fd192bd5.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x512_160k_ade20k/fcn_r101-d8_512x512_160k_ade20k_20200615_105816.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | -------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| FCN | R-50-D8 | 512x512 | 80000 | 8.5 | 23.49 | V100 | 35.94 | 37.94 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn_r50-d8_4xb4-80k_ade20k-512x512.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x512_80k_ade20k/fcn_r50-d8_512x512_80k_ade20k_20200614_144016-f8ac5082.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x512_80k_ade20k/fcn_r50-d8_512x512_80k_ade20k_20200614_144016.log.json) | +| FCN | R-101-D8 | 512x512 | 80000 | 12 | 14.78 | V100 | 39.61 | 40.83 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn_r101-d8_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x512_80k_ade20k/fcn_r101-d8_512x512_80k_ade20k_20200615_014143-bc1809f7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x512_80k_ade20k/fcn_r101-d8_512x512_80k_ade20k_20200615_014143.log.json) | +| FCN | R-50-D8 | 512x512 | 160000 | - | - | V100 | 36.10 | 38.08 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn_r50-d8_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x512_160k_ade20k/fcn_r50-d8_512x512_160k_ade20k_20200615_100713-4edbc3b4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x512_160k_ade20k/fcn_r50-d8_512x512_160k_ade20k_20200615_100713.log.json) | +| FCN | R-101-D8 | 512x512 | 160000 | - | - | V100 | 39.91 | 41.40 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn_r101-d8_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x512_160k_ade20k/fcn_r101-d8_512x512_160k_ade20k_20200615_105816-fd192bd5.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x512_160k_ade20k/fcn_r101-d8_512x512_160k_ade20k_20200615_105816.log.json) | ### Pascal VOC 2012 + Aug -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| FCN | R-50-D8 | 512x512 | 20000 | 5.7 | 23.28 | 67.08 | 69.94 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_r50-d8_512x512_20k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x512_20k_voc12aug/fcn_r50-d8_512x512_20k_voc12aug_20200617_010715-52dc5306.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x512_20k_voc12aug/fcn_r50-d8_512x512_20k_voc12aug_20200617_010715.log.json) | -| FCN | R-101-D8 | 512x512 | 20000 | 9.2 | 14.81 | 71.16 | 73.57 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_r101-d8_512x512_20k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x512_20k_voc12aug/fcn_r101-d8_512x512_20k_voc12aug_20200617_010842-0bb4e798.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x512_20k_voc12aug/fcn_r101-d8_512x512_20k_voc12aug_20200617_010842.log.json) | -| FCN | R-50-D8 | 512x512 | 40000 | - | - | 66.97 | 69.04 |
[config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_r50-d8_512x512_40k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x512_40k_voc12aug/fcn_r50-d8_512x512_40k_voc12aug_20200613_161222-5e2dbf40.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x512_40k_voc12aug/fcn_r50-d8_512x512_40k_voc12aug_20200613_161222.log.json) | -| FCN | R-101-D8 | 512x512 | 40000 | - | - | 69.91 | 72.38 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_r101-d8_512x512_40k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x512_40k_voc12aug/fcn_r101-d8_512x512_40k_voc12aug_20200613_161240-4c8bcefd.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x512_40k_voc12aug/fcn_r101-d8_512x512_40k_voc12aug_20200613_161240.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | --------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| FCN | R-50-D8 | 512x512 | 20000 | 5.7 | 23.28 | V100 | 67.08 | 69.94 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn_r50-d8_4xb4-20k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x512_20k_voc12aug/fcn_r50-d8_512x512_20k_voc12aug_20200617_010715-52dc5306.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x512_20k_voc12aug/fcn_r50-d8_512x512_20k_voc12aug_20200617_010715.log.json) | +| FCN | R-101-D8 | 512x512 | 20000 | 9.2 | 14.81 | V100 | 71.16 | 73.57 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn_r101-d8_4xb4-20k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x512_20k_voc12aug/fcn_r101-d8_512x512_20k_voc12aug_20200617_010842-0bb4e798.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x512_20k_voc12aug/fcn_r101-d8_512x512_20k_voc12aug_20200617_010842.log.json) | +| FCN | R-50-D8 | 512x512 | 40000 | - | - | V100 | 66.97 | 69.04 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn_r50-d8_4xb4-40k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x512_40k_voc12aug/fcn_r50-d8_512x512_40k_voc12aug_20200613_161222-5e2dbf40.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x512_40k_voc12aug/fcn_r50-d8_512x512_40k_voc12aug_20200613_161222.log.json) | +| FCN | R-101-D8 | 512x512 | 40000 | - | - | V100 | 69.91 | 72.38 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn_r101-d8_4xb4-40k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x512_40k_voc12aug/fcn_r101-d8_512x512_40k_voc12aug_20200613_161240-4c8bcefd.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x512_40k_voc12aug/fcn_r101-d8_512x512_40k_voc12aug_20200613_161240.log.json) | ### Pascal Context -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| FCN | R-101-D8 | 480x480 | 40000 | - | 9.93 | 44.43 | 45.63 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_r101-d8_480x480_40k_pascal_context.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_480x480_40k_pascal_context/fcn_r101-d8_480x480_40k_pascal_context_20210421_154757-b5e97937.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_480x480_40k_pascal_context/fcn_r101-d8_480x480_40k_pascal_context-20210421_154757.log.json) | -| FCN | R-101-D8 | 480x480 | 80000 | - | - | 44.13 | 45.26 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_r101-d8_480x480_80k_pascal_context.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_480x480_80k_pascal_context/fcn_r101-d8_480x480_80k_pascal_context_20210421_163310-4711813f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_480x480_80k_pascal_context/fcn_r101-d8_480x480_80k_pascal_context-20210421_163310.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | --------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| FCN | R-101-D8 | 480x480 | 40000 | - | 9.93 | V100 | 44.43 | 45.63 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn_r101-d8_4xb4-40k_pascal-context-480x480.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_480x480_40k_pascal_context/fcn_r101-d8_480x480_40k_pascal_context_20210421_154757-b5e97937.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_480x480_40k_pascal_context/fcn_r101-d8_480x480_40k_pascal_context-20210421_154757.log.json) | +| FCN | R-101-D8 | 480x480 | 80000 | - | - | V100 | 44.13 | 45.26 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn_r101-d8_4xb4-80k_pascal-context-480x480.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_480x480_80k_pascal_context/fcn_r101-d8_480x480_80k_pascal_context_20210421_163310-4711813f.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_480x480_80k_pascal_context/fcn_r101-d8_480x480_80k_pascal_context-20210421_163310.log.json) | ### Pascal Context 59 -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | --------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| FCN | R-101-D8 | 480x480 | 40000 | - | - | 48.42 | 50.4 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_r101-d8_480x480_40k_pascal_context_59.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_480x480_40k_pascal_context_59/fcn_r101-d8_480x480_40k_pascal_context_59_20210415_230724-8cf83682.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_480x480_40k_pascal_context_59/fcn_r101-d8_480x480_40k_pascal_context_59-20210415_230724.log.json) | -| FCN | R-101-D8 | 480x480 | 80000 | - | - | 49.35 | 51.38 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn/fcn_r101-d8_480x480_80k_pascal_context_59.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_480x480_80k_pascal_context_59/fcn_r101-d8_480x480_80k_pascal_context_59_20210416_110804-9a6f2c94.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_480x480_80k_pascal_context_59/fcn_r101-d8_480x480_80k_pascal_context_59-20210416_110804.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| FCN | R-101-D8 | 480x480 | 40000 | - | - | V100 | 48.42 | 50.4 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn_r101-d8_4xb4-40k_pascal-context-59-480x480.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_480x480_40k_pascal_context_59/fcn_r101-d8_480x480_40k_pascal_context_59_20210415_230724-8cf83682.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_480x480_40k_pascal_context_59/fcn_r101-d8_480x480_40k_pascal_context_59-20210415_230724.log.json) | +| FCN | R-101-D8 | 480x480 | 80000 | - | - | V100 | 49.35 | 51.38 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn_r101-d8_4xb4-80k_pascal-context-59-480x480.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_480x480_80k_pascal_context_59/fcn_r101-d8_480x480_80k_pascal_context_59_20210416_110804-9a6f2c94.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_480x480_80k_pascal_context_59/fcn_r101-d8_480x480_80k_pascal_context_59-20210416_110804.log.json) | Note: - `FP16` means mixed-precision (FP16) training is adopted. - `FCN D6` means the dilation rate of the convolution operator in FCN is 6. + +## Citation + +```bibtex +@article{shelhamer2017fully, + title={Fully convolutional networks for semantic segmentation}, + author={Shelhamer, Evan and Long, Jonathan and Darrell, Trevor}, + journal={IEEE transactions on pattern analysis and machine intelligence}, + volume={39}, + number={4}, + pages={640--651}, + year={2017}, + publisher={IEEE} +} +``` diff --git a/configs/fcn/fcn-d6_r101-d16_4xb2-40k_cityscapes-512x1024.py b/configs/fcn/fcn-d6_r101-d16_4xb2-40k_cityscapes-512x1024.py new file mode 100644 index 0000000000..8f2cd02b00 --- /dev/null +++ b/configs/fcn/fcn-d6_r101-d16_4xb2-40k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './fcn-d6_r50-d16_4xb2-40k_cityscapes-512x1024.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/fcn/fcn-d6_r101-d16_4xb2-40k_cityscapes-769x769.py b/configs/fcn/fcn-d6_r101-d16_4xb2-40k_cityscapes-769x769.py new file mode 100644 index 0000000000..4782b30377 --- /dev/null +++ b/configs/fcn/fcn-d6_r101-d16_4xb2-40k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './fcn-d6_r50-d16_4xb2-40k_cityscapes-769x769.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/fcn/fcn-d6_r101-d16_4xb2-80k_cityscapes-512x1024.py b/configs/fcn/fcn-d6_r101-d16_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..5f654b4bbd --- /dev/null +++ b/configs/fcn/fcn-d6_r101-d16_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './fcn-d6_r50-d16_4xb2-80k_cityscapes-512x1024.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/fcn/fcn-d6_r101-d16_4xb2-80k_cityscapes-769x769.py b/configs/fcn/fcn-d6_r101-d16_4xb2-80k_cityscapes-769x769.py new file mode 100644 index 0000000000..91eca1c52e --- /dev/null +++ b/configs/fcn/fcn-d6_r101-d16_4xb2-80k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './fcn-d6_r50-d16_4xb2-80k_cityscapes-769x769.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/fcn/fcn-d6_r101b-d16_4xb2-80k_cityscapes-512x1024.py b/configs/fcn/fcn-d6_r101b-d16_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..62e6127799 --- /dev/null +++ b/configs/fcn/fcn-d6_r101b-d16_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,4 @@ +_base_ = './fcn-d6_r50-d16_4xb2-80k_cityscapes-512x1024.py' +model = dict( + pretrained='torchvision://resnet101', + backbone=dict(type='ResNet', depth=101)) diff --git a/configs/fcn/fcn-d6_r101b-d16_4xb2-80k_cityscapes-769x769.py b/configs/fcn/fcn-d6_r101b-d16_4xb2-80k_cityscapes-769x769.py new file mode 100644 index 0000000000..1b8d24799e --- /dev/null +++ b/configs/fcn/fcn-d6_r101b-d16_4xb2-80k_cityscapes-769x769.py @@ -0,0 +1,4 @@ +_base_ = './fcn-d6_r50b-d16_4xb2-80k_cityscapes-769x769.py' +model = dict( + pretrained='torchvision://resnet101', + backbone=dict(type='ResNet', depth=101)) diff --git a/configs/fcn/fcn_d6_r50-d16_512x1024_40k_cityscapes.py
b/configs/fcn/fcn-d6_r50-d16_4xb2-40k_cityscapes-512x1024.py similarity index 100% rename from configs/fcn/fcn_d6_r50-d16_512x1024_40k_cityscapes.py rename to configs/fcn/fcn-d6_r50-d16_4xb2-40k_cityscapes-512x1024.py diff --git a/configs/fcn/fcn_d6_r50-d16_769x769_40k_cityscapes.py b/configs/fcn/fcn-d6_r50-d16_4xb2-40k_cityscapes-769x769.py similarity index 100% rename from configs/fcn/fcn_d6_r50-d16_769x769_40k_cityscapes.py rename to configs/fcn/fcn-d6_r50-d16_4xb2-40k_cityscapes-769x769.py diff --git a/configs/fcn/fcn_d6_r50-d16_512x1024_80k_cityscapes.py b/configs/fcn/fcn-d6_r50-d16_4xb2-80k_cityscapes-512x1024.py similarity index 100% rename from configs/fcn/fcn_d6_r50-d16_512x1024_80k_cityscapes.py rename to configs/fcn/fcn-d6_r50-d16_4xb2-80k_cityscapes-512x1024.py diff --git a/configs/fcn/fcn_d6_r50-d16_769x769_80k_cityscapes.py b/configs/fcn/fcn-d6_r50-d16_4xb2-80k_cityscapes-769x769.py similarity index 100% rename from configs/fcn/fcn_d6_r50-d16_769x769_80k_cityscapes.py rename to configs/fcn/fcn-d6_r50-d16_4xb2-80k_cityscapes-769x769.py diff --git a/configs/fcn/fcn-d6_r50b-d16_4xb2-80k_cityscapes-512x1024.py b/configs/fcn/fcn-d6_r50b-d16_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..7d470a50be --- /dev/null +++ b/configs/fcn/fcn-d6_r50b-d16_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './fcn-d6_r50-d16_4xb2-80k_cityscapes-512x1024.py' +model = dict(pretrained='torchvision://resnet50', backbone=dict(type='ResNet')) diff --git a/configs/fcn/fcn-d6_r50b-d16_4xb2-80k_cityscapes-769x769.py b/configs/fcn/fcn-d6_r50b-d16_4xb2-80k_cityscapes-769x769.py new file mode 100644 index 0000000000..e9093ea2dc --- /dev/null +++ b/configs/fcn/fcn-d6_r50b-d16_4xb2-80k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './fcn-d6_r50-d16_4xb2-80k_cityscapes-769x769.py' +model = dict(pretrained='torchvision://resnet50', backbone=dict(type='ResNet')) diff --git a/configs/fcn/fcn.yml b/configs/fcn/fcn.yml deleted file mode 100644 index 563391c93f..0000000000 --- a/configs/fcn/fcn.yml +++ /dev/null @@ -1,827 +0,0 @@ -Collections: -- Name: FCN - Metadata: - Training Data: - - Cityscapes - - ADE20K - - Pascal VOC 2012 + Aug - - Pascal Context - - Pascal Context 59 - Paper: - URL: https://arxiv.org/abs/1411.4038 - Title: Fully Convolutional Networks for Semantic Segmentation - README: configs/fcn/README.md - Code: - URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 - Version: v0.17.0 - Converted From: - Code: https://github.com/BVLC/caffe/wiki/Model-Zoo#fcn -Models: -- Name: fcn_r50-d8_512x1024_40k_cityscapes - In Collection: FCN - Metadata: - backbone: R-50-D8 - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 239.81 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 5.7 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 72.25 - mIoU(ms+flip): 73.36 - Config: configs/fcn/fcn_r50-d8_512x1024_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x1024_40k_cityscapes/fcn_r50-d8_512x1024_40k_cityscapes_20200604_192608-efe53f0d.pth -- Name: fcn_r101-d8_512x1024_40k_cityscapes - In Collection: FCN - Metadata: - backbone: R-101-D8 - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 375.94 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 9.2 - Results: - - Task: Semantic 
Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 75.45 - mIoU(ms+flip): 76.58 - Config: configs/fcn/fcn_r101-d8_512x1024_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x1024_40k_cityscapes/fcn_r101-d8_512x1024_40k_cityscapes_20200604_181852-a883d3a1.pth -- Name: fcn_r50-d8_769x769_40k_cityscapes - In Collection: FCN - Metadata: - backbone: R-50-D8 - crop size: (769,769) - lr schd: 40000 - inference time (ms/im): - - value: 555.56 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 6.5 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 71.47 - mIoU(ms+flip): 72.54 - Config: configs/fcn/fcn_r50-d8_769x769_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_769x769_40k_cityscapes/fcn_r50-d8_769x769_40k_cityscapes_20200606_113104-977b5d02.pth -- Name: fcn_r101-d8_769x769_40k_cityscapes - In Collection: FCN - Metadata: - backbone: R-101-D8 - crop size: (769,769) - lr schd: 40000 - inference time (ms/im): - - value: 840.34 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 10.4 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 73.93 - mIoU(ms+flip): 75.14 - Config: configs/fcn/fcn_r101-d8_769x769_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_769x769_40k_cityscapes/fcn_r101-d8_769x769_40k_cityscapes_20200606_113208-7d4ab69c.pth -- Name: fcn_r18-d8_512x1024_80k_cityscapes - In Collection: FCN - Metadata: - backbone: R-18-D8 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 68.26 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 1.7 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 71.11 - mIoU(ms+flip): 72.91 - Config: configs/fcn/fcn_r18-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r18-d8_512x1024_80k_cityscapes/fcn_r18-d8_512x1024_80k_cityscapes_20201225_021327-6c50f8b4.pth -- Name: fcn_r50-d8_512x1024_80k_cityscapes - In Collection: FCN - Metadata: - backbone: R-50-D8 - crop size: (512,1024) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 73.61 - mIoU(ms+flip): 74.24 - Config: configs/fcn/fcn_r50-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x1024_80k_cityscapes/fcn_r50-d8_512x1024_80k_cityscapes_20200606_113019-03aa804d.pth -- Name: fcn_r101-d8_512x1024_80k_cityscapes - In Collection: FCN - Metadata: - backbone: R-101-D8 - crop size: (512,1024) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 75.13 - mIoU(ms+flip): 75.94 - Config: configs/fcn/fcn_r101-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x1024_80k_cityscapes/fcn_r101-d8_512x1024_80k_cityscapes_20200606_113038-3fb937eb.pth -- Name: fcn_r101-d8_fp16_512x1024_80k_cityscapes - In Collection: FCN - Metadata: - backbone: R-101-D8 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 115.74 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP16 - resolution: (512,1024) - Training Memory (GB): 5.37 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - 
Metrics: - mIoU: 76.8 - Config: configs/fcn/fcn_r101-d8_fp16_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_fp16_512x1024_80k_cityscapes/fcn_r101-d8_fp16_512x1024_80k_cityscapes_20200717_230921-fb13e883.pth -- Name: fcn_r18-d8_769x769_80k_cityscapes - In Collection: FCN - Metadata: - backbone: R-18-D8 - crop size: (769,769) - lr schd: 80000 - inference time (ms/im): - - value: 156.25 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 1.9 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 70.8 - mIoU(ms+flip): 73.16 - Config: configs/fcn/fcn_r18-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r18-d8_769x769_80k_cityscapes/fcn_r18-d8_769x769_80k_cityscapes_20201225_021451-9739d1b8.pth -- Name: fcn_r50-d8_769x769_80k_cityscapes - In Collection: FCN - Metadata: - backbone: R-50-D8 - crop size: (769,769) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 72.64 - mIoU(ms+flip): 73.32 - Config: configs/fcn/fcn_r50-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_769x769_80k_cityscapes/fcn_r50-d8_769x769_80k_cityscapes_20200606_195749-f5caeabc.pth -- Name: fcn_r101-d8_769x769_80k_cityscapes - In Collection: FCN - Metadata: - backbone: R-101-D8 - crop size: (769,769) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 75.52 - mIoU(ms+flip): 76.61 - Config: configs/fcn/fcn_r101-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_769x769_80k_cityscapes/fcn_r101-d8_769x769_80k_cityscapes_20200606_214354-45cbac68.pth -- Name: fcn_r18b-d8_512x1024_80k_cityscapes - In Collection: FCN - Metadata: - backbone: R-18b-D8 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 59.74 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 1.6 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 70.24 - mIoU(ms+flip): 72.77 - Config: configs/fcn/fcn_r18b-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r18b-d8_512x1024_80k_cityscapes/fcn_r18b-d8_512x1024_80k_cityscapes_20201225_230143-92c0f445.pth -- Name: fcn_r50b-d8_512x1024_80k_cityscapes - In Collection: FCN - Metadata: - backbone: R-50b-D8 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 238.1 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 5.6 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 75.65 - mIoU(ms+flip): 77.59 - Config: configs/fcn/fcn_r50b-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50b-d8_512x1024_80k_cityscapes/fcn_r50b-d8_512x1024_80k_cityscapes_20201225_094221-82957416.pth -- Name: fcn_r101b-d8_512x1024_80k_cityscapes - In Collection: FCN - Metadata: - backbone: R-101b-D8 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 366.3 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 9.1 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 77.37 - mIoU(ms+flip): 78.77 - Config: 
configs/fcn/fcn_r101b-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101b-d8_512x1024_80k_cityscapes/fcn_r101b-d8_512x1024_80k_cityscapes_20201226_160213-4543858f.pth -- Name: fcn_r18b-d8_769x769_80k_cityscapes - In Collection: FCN - Metadata: - backbone: R-18b-D8 - crop size: (769,769) - lr schd: 80000 - inference time (ms/im): - - value: 149.25 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 1.7 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 69.66 - mIoU(ms+flip): 72.07 - Config: configs/fcn/fcn_r18b-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r18b-d8_769x769_80k_cityscapes/fcn_r18b-d8_769x769_80k_cityscapes_20201226_004430-32d504e5.pth -- Name: fcn_r50b-d8_769x769_80k_cityscapes - In Collection: FCN - Metadata: - backbone: R-50b-D8 - crop size: (769,769) - lr schd: 80000 - inference time (ms/im): - - value: 549.45 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 6.3 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 73.83 - mIoU(ms+flip): 76.6 - Config: configs/fcn/fcn_r50b-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50b-d8_769x769_80k_cityscapes/fcn_r50b-d8_769x769_80k_cityscapes_20201225_094223-94552d38.pth -- Name: fcn_r101b-d8_769x769_80k_cityscapes - In Collection: FCN - Metadata: - backbone: R-101b-D8 - crop size: (769,769) - lr schd: 80000 - inference time (ms/im): - - value: 869.57 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 10.3 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 77.02 - mIoU(ms+flip): 78.67 - Config: configs/fcn/fcn_r101b-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101b-d8_769x769_80k_cityscapes/fcn_r101b-d8_769x769_80k_cityscapes_20201226_170012-82be37e2.pth -- Name: fcn_d6_r50-d16_512x1024_40k_cityscapes - In Collection: FCN - Metadata: - backbone: R-50-D16 - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 97.85 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 3.4 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 77.06 - mIoU(ms+flip): 78.85 - Config: configs/fcn/fcn_d6_r50-d16_512x1024_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50-d16_512x1024_40k_cityscapes/fcn_d6_r50-d16_512x1024_40k_cityscapes_20210305_130133-98d5d1bc.pth -- Name: fcn_d6_r50-d16_512x1024_80k_cityscapes - In Collection: FCN - Metadata: - backbone: R-50-D16 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 96.62 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 77.27 - mIoU(ms+flip): 78.88 - Config: configs/fcn/fcn_d6_r50-d16_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50-d16_512x1024_80k_cityscapes/fcn_d6_r50-d16_512x1024_80k_cityscapes_20210306_115604-133c292f.pth -- Name: fcn_d6_r50-d16_769x769_40k_cityscapes - In Collection: FCN - Metadata: - backbone: R-50-D16 - crop size: (769,769) - lr schd: 
40000 - inference time (ms/im): - - value: 239.81 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 3.7 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 76.82 - mIoU(ms+flip): 78.22 - Config: configs/fcn/fcn_d6_r50-d16_769x769_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50-d16_769x769_40k_cityscapes/fcn_d6_r50-d16_769x769_40k_cityscapes_20210305_185744-1aab18ed.pth -- Name: fcn_d6_r50-d16_769x769_80k_cityscapes - In Collection: FCN - Metadata: - backbone: R-50-D16 - crop size: (769,769) - lr schd: 80000 - inference time (ms/im): - - value: 240.96 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 77.04 - mIoU(ms+flip): 78.4 - Config: configs/fcn/fcn_d6_r50-d16_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50-d16_769x769_80k_cityscapes/fcn_d6_r50-d16_769x769_80k_cityscapes_20210305_200413-109d88eb.pth -- Name: fcn_d6_r101-d16_512x1024_40k_cityscapes - In Collection: FCN - Metadata: - backbone: R-101-D16 - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 124.38 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 4.5 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 77.36 - mIoU(ms+flip): 79.18 - Config: configs/fcn/fcn_d6_r101-d16_512x1024_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101-d16_512x1024_40k_cityscapes/fcn_d6_r101-d16_512x1024_40k_cityscapes_20210305_130337-9cf2b450.pth -- Name: fcn_d6_r101-d16_512x1024_80k_cityscapes - In Collection: FCN - Metadata: - backbone: R-101-D16 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 121.07 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.46 - mIoU(ms+flip): 80.42 - Config: configs/fcn/fcn_d6_r101-d16_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101-d16_512x1024_80k_cityscapes/fcn_d6_r101-d16_512x1024_80k_cityscapes_20210308_102747-cb336445.pth -- Name: fcn_d6_r101-d16_769x769_40k_cityscapes - In Collection: FCN - Metadata: - backbone: R-101-D16 - crop size: (769,769) - lr schd: 40000 - inference time (ms/im): - - value: 320.51 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 5.0 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 77.28 - mIoU(ms+flip): 78.95 - Config: configs/fcn/fcn_d6_r101-d16_769x769_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101-d16_769x769_40k_cityscapes/fcn_d6_r101-d16_769x769_40k_cityscapes_20210308_102453-60b114e9.pth -- Name: fcn_d6_r101-d16_769x769_80k_cityscapes - In Collection: FCN - Metadata: - backbone: R-101-D16 - crop size: (769,769) - lr schd: 80000 - inference time (ms/im): - - value: 311.53 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.06 - mIoU(ms+flip): 79.58 - Config: configs/fcn/fcn_d6_r101-d16_769x769_80k_cityscapes.py - Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101-d16_769x769_80k_cityscapes/fcn_d6_r101-d16_769x769_80k_cityscapes_20210306_120016-e33adc4f.pth -- Name: fcn_d6_r50b-d16_512x1024_80k_cityscapes - In Collection: FCN - Metadata: - backbone: R-50b-D16 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 98.43 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 3.2 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 76.99 - mIoU(ms+flip): 79.03 - Config: configs/fcn/fcn_d6_r50b-d16_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50b-d16_512x1024_80k_cityscapes/fcn_d6_r50b-d16_512x1024_80k_cityscapes_20210311_125550-6a0b62e9.pth -- Name: fcn_d6_r50b-d16_769x769_80k_cityscapes - In Collection: FCN - Metadata: - backbone: R-50b-D16 - crop size: (769,769) - lr schd: 80000 - inference time (ms/im): - - value: 239.81 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 3.6 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 76.86 - mIoU(ms+flip): 78.52 - Config: configs/fcn/fcn_d6_r50b-d16_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50b-d16_769x769_80k_cityscapes/fcn_d6_r50b-d16_769x769_80k_cityscapes_20210311_131012-d665f231.pth -- Name: fcn_d6_r101b-d16_512x1024_80k_cityscapes - In Collection: FCN - Metadata: - backbone: R-101b-D16 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 118.2 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 4.3 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 77.72 - mIoU(ms+flip): 79.53 - Config: configs/fcn/fcn_d6_r101b-d16_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101b-d16_512x1024_80k_cityscapes/fcn_d6_r101b-d16_512x1024_80k_cityscapes_20210311_144305-3f2eb5b4.pth -- Name: fcn_d6_r101b-d16_769x769_80k_cityscapes - In Collection: FCN - Metadata: - backbone: R-101b-D16 - crop size: (769,769) - lr schd: 80000 - inference time (ms/im): - - value: 301.2 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 4.8 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 77.34 - mIoU(ms+flip): 78.91 - Config: configs/fcn/fcn_d6_r101b-d16_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101b-d16_769x769_80k_cityscapes/fcn_d6_r101b-d16_769x769_80k_cityscapes_20210311_154527-c4d8bfbc.pth -- Name: fcn_r50-d8_512x512_80k_ade20k - In Collection: FCN - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 42.57 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 8.5 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 35.94 - mIoU(ms+flip): 37.94 - Config: configs/fcn/fcn_r50-d8_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x512_80k_ade20k/fcn_r50-d8_512x512_80k_ade20k_20200614_144016-f8ac5082.pth -- Name: fcn_r101-d8_512x512_80k_ade20k - In Collection: FCN - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 80000 - inference 
time (ms/im): - - value: 67.66 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 12.0 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 39.61 - mIoU(ms+flip): 40.83 - Config: configs/fcn/fcn_r101-d8_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x512_80k_ade20k/fcn_r101-d8_512x512_80k_ade20k_20200615_014143-bc1809f7.pth -- Name: fcn_r50-d8_512x512_160k_ade20k - In Collection: FCN - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 36.1 - mIoU(ms+flip): 38.08 - Config: configs/fcn/fcn_r50-d8_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x512_160k_ade20k/fcn_r50-d8_512x512_160k_ade20k_20200615_100713-4edbc3b4.pth -- Name: fcn_r101-d8_512x512_160k_ade20k - In Collection: FCN - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 39.91 - mIoU(ms+flip): 41.4 - Config: configs/fcn/fcn_r101-d8_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x512_160k_ade20k/fcn_r101-d8_512x512_160k_ade20k_20200615_105816-fd192bd5.pth -- Name: fcn_r50-d8_512x512_20k_voc12aug - In Collection: FCN - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 20000 - inference time (ms/im): - - value: 42.96 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 5.7 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 67.08 - mIoU(ms+flip): 69.94 - Config: configs/fcn/fcn_r50-d8_512x512_20k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x512_20k_voc12aug/fcn_r50-d8_512x512_20k_voc12aug_20200617_010715-52dc5306.pth -- Name: fcn_r101-d8_512x512_20k_voc12aug - In Collection: FCN - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 20000 - inference time (ms/im): - - value: 67.52 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 9.2 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 71.16 - mIoU(ms+flip): 73.57 - Config: configs/fcn/fcn_r101-d8_512x512_20k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x512_20k_voc12aug/fcn_r101-d8_512x512_20k_voc12aug_20200617_010842-0bb4e798.pth -- Name: fcn_r50-d8_512x512_40k_voc12aug - In Collection: FCN - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 40000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 66.97 - mIoU(ms+flip): 69.04 - Config: configs/fcn/fcn_r50-d8_512x512_40k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x512_40k_voc12aug/fcn_r50-d8_512x512_40k_voc12aug_20200613_161222-5e2dbf40.pth -- Name: fcn_r101-d8_512x512_40k_voc12aug - In Collection: FCN - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 40000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 69.91 - mIoU(ms+flip): 72.38 - Config: configs/fcn/fcn_r101-d8_512x512_40k_voc12aug.py - Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x512_40k_voc12aug/fcn_r101-d8_512x512_40k_voc12aug_20200613_161240-4c8bcefd.pth -- Name: fcn_r101-d8_480x480_40k_pascal_context - In Collection: FCN - Metadata: - backbone: R-101-D8 - crop size: (480,480) - lr schd: 40000 - inference time (ms/im): - - value: 100.7 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (480,480) - Results: - - Task: Semantic Segmentation - Dataset: Pascal Context - Metrics: - mIoU: 44.43 - mIoU(ms+flip): 45.63 - Config: configs/fcn/fcn_r101-d8_480x480_40k_pascal_context.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_480x480_40k_pascal_context/fcn_r101-d8_480x480_40k_pascal_context_20210421_154757-b5e97937.pth -- Name: fcn_r101-d8_480x480_80k_pascal_context - In Collection: FCN - Metadata: - backbone: R-101-D8 - crop size: (480,480) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal Context - Metrics: - mIoU: 44.13 - mIoU(ms+flip): 45.26 - Config: configs/fcn/fcn_r101-d8_480x480_80k_pascal_context.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_480x480_80k_pascal_context/fcn_r101-d8_480x480_80k_pascal_context_20210421_163310-4711813f.pth -- Name: fcn_r101-d8_480x480_40k_pascal_context_59 - In Collection: FCN - Metadata: - backbone: R-101-D8 - crop size: (480,480) - lr schd: 40000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal Context 59 - Metrics: - mIoU: 48.42 - mIoU(ms+flip): 50.4 - Config: configs/fcn/fcn_r101-d8_480x480_40k_pascal_context_59.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_480x480_40k_pascal_context_59/fcn_r101-d8_480x480_40k_pascal_context_59_20210415_230724-8cf83682.pth -- Name: fcn_r101-d8_480x480_80k_pascal_context_59 - In Collection: FCN - Metadata: - backbone: R-101-D8 - crop size: (480,480) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal Context 59 - Metrics: - mIoU: 49.35 - mIoU(ms+flip): 51.38 - Config: configs/fcn/fcn_r101-d8_480x480_80k_pascal_context_59.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_480x480_80k_pascal_context_59/fcn_r101-d8_480x480_80k_pascal_context_59_20210416_110804-9a6f2c94.pth diff --git a/configs/fcn/fcn_d6_r101-d16_512x1024_40k_cityscapes.py b/configs/fcn/fcn_d6_r101-d16_512x1024_40k_cityscapes.py deleted file mode 100644 index aec4254c8f..0000000000 --- a/configs/fcn/fcn_d6_r101-d16_512x1024_40k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './fcn_d6_r50-d16_512x1024_40k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/fcn/fcn_d6_r101-d16_512x1024_80k_cityscapes.py b/configs/fcn/fcn_d6_r101-d16_512x1024_80k_cityscapes.py deleted file mode 100644 index d0bafc52ab..0000000000 --- a/configs/fcn/fcn_d6_r101-d16_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './fcn_d6_r50-d16_512x1024_80k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/fcn/fcn_d6_r101-d16_769x769_40k_cityscapes.py b/configs/fcn/fcn_d6_r101-d16_769x769_40k_cityscapes.py deleted file mode 100644 index 29a9f98a93..0000000000 --- a/configs/fcn/fcn_d6_r101-d16_769x769_40k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './fcn_d6_r50-d16_769x769_40k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git 
a/configs/fcn/fcn_d6_r101-d16_769x769_80k_cityscapes.py b/configs/fcn/fcn_d6_r101-d16_769x769_80k_cityscapes.py deleted file mode 100644 index 1f21c6578b..0000000000 --- a/configs/fcn/fcn_d6_r101-d16_769x769_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './fcn_d6_r50-d16_769x769_80k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/fcn/fcn_d6_r101b-d16_512x1024_80k_cityscapes.py b/configs/fcn/fcn_d6_r101b-d16_512x1024_80k_cityscapes.py deleted file mode 100644 index af3f765b76..0000000000 --- a/configs/fcn/fcn_d6_r101b-d16_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,4 +0,0 @@ -_base_ = './fcn_d6_r50b-d16_512x1024_80k_cityscapes.py' -model = dict( - pretrained='torchvision://resnet101', - backbone=dict(type='ResNet', depth=101)) diff --git a/configs/fcn/fcn_d6_r101b-d16_769x769_80k_cityscapes.py b/configs/fcn/fcn_d6_r101b-d16_769x769_80k_cityscapes.py deleted file mode 100644 index e3d4d884fd..0000000000 --- a/configs/fcn/fcn_d6_r101b-d16_769x769_80k_cityscapes.py +++ /dev/null @@ -1,4 +0,0 @@ -_base_ = './fcn_d6_r50b-d16_769x769_80k_cityscapes.py' -model = dict( - pretrained='torchvision://resnet101', - backbone=dict(type='ResNet', depth=101)) diff --git a/configs/fcn/fcn_d6_r50b-d16_512x1024_80k_cityscapes.py b/configs/fcn/fcn_d6_r50b-d16_512x1024_80k_cityscapes.py deleted file mode 100644 index 0749ff14a3..0000000000 --- a/configs/fcn/fcn_d6_r50b-d16_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './fcn_d6_r50-d16_512x1024_80k_cityscapes.py' -model = dict(pretrained='torchvision://resnet50', backbone=dict(type='ResNet')) diff --git a/configs/fcn/fcn_d6_r50b-d16_769x769_80k_cityscapes.py b/configs/fcn/fcn_d6_r50b-d16_769x769_80k_cityscapes.py deleted file mode 100644 index fba8948a03..0000000000 --- a/configs/fcn/fcn_d6_r50b-d16_769x769_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './fcn_d6_r50-d16_769x769_80k_cityscapes.py' -model = dict(pretrained='torchvision://resnet50', backbone=dict(type='ResNet')) diff --git a/configs/fcn/fcn_r101-d8_480x480_40k_pascal_context.py b/configs/fcn/fcn_r101-d8_480x480_40k_pascal_context.py deleted file mode 100644 index f3a15b4105..0000000000 --- a/configs/fcn/fcn_r101-d8_480x480_40k_pascal_context.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './fcn_r50-d8_480x480_40k_pascal_context.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/fcn/fcn_r101-d8_480x480_40k_pascal_context_59.py b/configs/fcn/fcn_r101-d8_480x480_40k_pascal_context_59.py deleted file mode 100644 index 908f4bff00..0000000000 --- a/configs/fcn/fcn_r101-d8_480x480_40k_pascal_context_59.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './fcn_r50-d8_480x480_40k_pascal_context_59.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/fcn/fcn_r101-d8_480x480_80k_pascal_context.py b/configs/fcn/fcn_r101-d8_480x480_80k_pascal_context.py deleted file mode 100644 index bdccfd99ba..0000000000 --- a/configs/fcn/fcn_r101-d8_480x480_80k_pascal_context.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './fcn_r50-d8_480x480_80k_pascal_context.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/fcn/fcn_r101-d8_480x480_80k_pascal_context_59.py b/configs/fcn/fcn_r101-d8_480x480_80k_pascal_context_59.py deleted file mode 100644 index 09cb612e42..0000000000 --- a/configs/fcn/fcn_r101-d8_480x480_80k_pascal_context_59.py +++ /dev/null @@ -1,2 +0,0 @@ 
-_base_ = './fcn_r50-d8_480x480_80k_pascal_context_59.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/fcn/fcn_r101-d8_4xb2-40k_cityscapes-512x1024.py b/configs/fcn/fcn_r101-d8_4xb2-40k_cityscapes-512x1024.py new file mode 100644 index 0000000000..b3ec0a742c --- /dev/null +++ b/configs/fcn/fcn_r101-d8_4xb2-40k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './fcn_r50-d8_4xb2-40k_cityscapes-512x1024.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/fcn/fcn_r101-d8_4xb2-40k_cityscapes-769x769.py b/configs/fcn/fcn_r101-d8_4xb2-40k_cityscapes-769x769.py new file mode 100644 index 0000000000..1f83fe2078 --- /dev/null +++ b/configs/fcn/fcn_r101-d8_4xb2-40k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './fcn_r50-d8_4xb2-40k_cityscapes-769x769.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/fcn/fcn_r101-d8_4xb2-80k_cityscapes-512x1024.py b/configs/fcn/fcn_r101-d8_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..4527b3b8a0 --- /dev/null +++ b/configs/fcn/fcn_r101-d8_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './fcn_r50-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/fcn/fcn_r101-d8_4xb2-80k_cityscapes-769x769.py b/configs/fcn/fcn_r101-d8_4xb2-80k_cityscapes-769x769.py new file mode 100644 index 0000000000..6ce112484d --- /dev/null +++ b/configs/fcn/fcn_r101-d8_4xb2-80k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './fcn_r50-d8_4xb2-80k_cityscapes-769x769.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/fcn/fcn_r101-d8_4xb2-amp-80k_cityscapes-512x1024.py b/configs/fcn/fcn_r101-d8_4xb2-amp-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..b4d94878c8 --- /dev/null +++ b/configs/fcn/fcn_r101-d8_4xb2-amp-80k_cityscapes-512x1024.py @@ -0,0 +1,6 @@ +_base_ = './fcn_r101-d8_4xb2-80k_cityscapes-512x1024.py' +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005), + loss_scale=512.) 
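The `*-amp-*` config just above uses `_delete_=True` so that the `optim_wrapper` inherited through `_base_` is replaced outright rather than merged field by field; only `AmpOptimWrapper`, the SGD settings, and the static `loss_scale` written in the child survive. A minimal inspection sketch, not part of the diff itself (assuming `mmengine` is installed and the path is relative to the repository root):

```python
# Hypothetical snippet: load the AMP config and confirm that _delete_=True
# left only the optim_wrapper fields written in the child config, while the
# model definition still falls through the _base_ chain.
from mmengine.config import Config

cfg = Config.fromfile('configs/fcn/fcn_r101-d8_4xb2-amp-80k_cityscapes-512x1024.py')
print(cfg.optim_wrapper['type'])        # 'AmpOptimWrapper'
print(cfg.optim_wrapper['loss_scale'])  # 512.0
print(cfg.model['backbone']['depth'])   # 101, inherited from the _base_ config
```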
diff --git a/configs/fcn/fcn_r101-d8_4xb4-160k_ade20k-512x512.py b/configs/fcn/fcn_r101-d8_4xb4-160k_ade20k-512x512.py new file mode 100644 index 0000000000..b1f5c5c785 --- /dev/null +++ b/configs/fcn/fcn_r101-d8_4xb4-160k_ade20k-512x512.py @@ -0,0 +1,2 @@ +_base_ = './fcn_r50-d8_4xb4-160k_ade20k-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/fcn/fcn_r101-d8_4xb4-20k_voc12aug-512x512.py b/configs/fcn/fcn_r101-d8_4xb4-20k_voc12aug-512x512.py new file mode 100644 index 0000000000..61ee96f94e --- /dev/null +++ b/configs/fcn/fcn_r101-d8_4xb4-20k_voc12aug-512x512.py @@ -0,0 +1,2 @@ +_base_ = './fcn_r50-d8_4xb4-20k_voc12aug-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/fcn/fcn_r101-d8_4xb4-40k_pascal-context-480x480.py b/configs/fcn/fcn_r101-d8_4xb4-40k_pascal-context-480x480.py new file mode 100644 index 0000000000..1161193adb --- /dev/null +++ b/configs/fcn/fcn_r101-d8_4xb4-40k_pascal-context-480x480.py @@ -0,0 +1,2 @@ +_base_ = './fcn_r50-d8_4xb4-40k_pascal-context-480x480.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/fcn/fcn_r101-d8_4xb4-40k_pascal-context-59-480x480.py b/configs/fcn/fcn_r101-d8_4xb4-40k_pascal-context-59-480x480.py new file mode 100644 index 0000000000..f3a6dbc9ab --- /dev/null +++ b/configs/fcn/fcn_r101-d8_4xb4-40k_pascal-context-59-480x480.py @@ -0,0 +1,2 @@ +_base_ = './fcn_r50-d8_4xb4-40k_pascal-context-59-480x480.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/fcn/fcn_r101-d8_4xb4-40k_voc12aug-512x512.py b/configs/fcn/fcn_r101-d8_4xb4-40k_voc12aug-512x512.py new file mode 100644 index 0000000000..b68b6e0407 --- /dev/null +++ b/configs/fcn/fcn_r101-d8_4xb4-40k_voc12aug-512x512.py @@ -0,0 +1,2 @@ +_base_ = './fcn_r50-d8_4xb4-40k_voc12aug-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/fcn/fcn_r101-d8_4xb4-80k_ade20k-512x512.py b/configs/fcn/fcn_r101-d8_4xb4-80k_ade20k-512x512.py new file mode 100644 index 0000000000..3facce30dc --- /dev/null +++ b/configs/fcn/fcn_r101-d8_4xb4-80k_ade20k-512x512.py @@ -0,0 +1,2 @@ +_base_ = './fcn_r50-d8_4xb4-80k_ade20k-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/fcn/fcn_r101-d8_4xb4-80k_pascal-context-480x480.py b/configs/fcn/fcn_r101-d8_4xb4-80k_pascal-context-480x480.py new file mode 100644 index 0000000000..1161193adb --- /dev/null +++ b/configs/fcn/fcn_r101-d8_4xb4-80k_pascal-context-480x480.py @@ -0,0 +1,2 @@ +_base_ = './fcn_r50-d8_4xb4-80k_pascal-context-480x480.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/fcn/fcn_r101-d8_4xb4-80k_pascal-context-59-480x480.py b/configs/fcn/fcn_r101-d8_4xb4-80k_pascal-context-59-480x480.py new file mode 100644 index 0000000000..cebe33082a --- /dev/null +++ b/configs/fcn/fcn_r101-d8_4xb4-80k_pascal-context-59-480x480.py @@ -0,0 +1,2 @@ +_base_ = './fcn_r50-d8_4xb4-80k_pascal-context-59-480x480.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/fcn/fcn_r101-d8_512x1024_40k_cityscapes.py b/configs/fcn/fcn_r101-d8_512x1024_40k_cityscapes.py deleted file mode 100644 index 7918dd10d0..0000000000 --- a/configs/fcn/fcn_r101-d8_512x1024_40k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ =
'./fcn_r50-d8_512x1024_40k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/fcn/fcn_r101-d8_512x1024_80k_cityscapes.py b/configs/fcn/fcn_r101-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index 528110dc73..0000000000 --- a/configs/fcn/fcn_r101-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './fcn_r50-d8_512x1024_80k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/fcn/fcn_r101-d8_512x512_160k_ade20k.py b/configs/fcn/fcn_r101-d8_512x512_160k_ade20k.py deleted file mode 100644 index 1bf6780f2c..0000000000 --- a/configs/fcn/fcn_r101-d8_512x512_160k_ade20k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './fcn_r50-d8_512x512_160k_ade20k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/fcn/fcn_r101-d8_512x512_20k_voc12aug.py b/configs/fcn/fcn_r101-d8_512x512_20k_voc12aug.py deleted file mode 100644 index 09a5fe5468..0000000000 --- a/configs/fcn/fcn_r101-d8_512x512_20k_voc12aug.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './fcn_r50-d8_512x512_20k_voc12aug.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/fcn/fcn_r101-d8_512x512_40k_voc12aug.py b/configs/fcn/fcn_r101-d8_512x512_40k_voc12aug.py deleted file mode 100644 index eafefaa675..0000000000 --- a/configs/fcn/fcn_r101-d8_512x512_40k_voc12aug.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './fcn_r50-d8_512x512_40k_voc12aug.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/fcn/fcn_r101-d8_512x512_80k_ade20k.py b/configs/fcn/fcn_r101-d8_512x512_80k_ade20k.py deleted file mode 100644 index 6d0294530f..0000000000 --- a/configs/fcn/fcn_r101-d8_512x512_80k_ade20k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './fcn_r50-d8_512x512_80k_ade20k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/fcn/fcn_r101-d8_769x769_40k_cityscapes.py b/configs/fcn/fcn_r101-d8_769x769_40k_cityscapes.py deleted file mode 100644 index 6b4cc57129..0000000000 --- a/configs/fcn/fcn_r101-d8_769x769_40k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './fcn_r50-d8_769x769_40k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/fcn/fcn_r101-d8_769x769_80k_cityscapes.py b/configs/fcn/fcn_r101-d8_769x769_80k_cityscapes.py deleted file mode 100644 index 3503c76935..0000000000 --- a/configs/fcn/fcn_r101-d8_769x769_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './fcn_r50-d8_769x769_80k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/fcn/fcn_r101-d8_fp16_512x1024_80k_cityscapes.py b/configs/fcn/fcn_r101-d8_fp16_512x1024_80k_cityscapes.py deleted file mode 100644 index da27a90268..0000000000 --- a/configs/fcn/fcn_r101-d8_fp16_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,6 +0,0 @@ -_base_ = './fcn_r101-d8_512x1024_80k_cityscapes.py' -optim_wrapper = dict( - _delete_=True, - type='AmpOptimWrapper', - optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005), - loss_scale=512.) 
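Every rename in this directory follows the new scheme `{model}_{gpus}xb{batch}[-amp]-{schedule}_{dataset}-{crop}.py`, so the GPU count and per-GPU batch size move into the file name (`4xb2` reads as 4 GPUs × 2 images each) and the crop size moves to the end. A hypothetical parser for the scheme, shown only to illustrate the layout and not shipped in this PR:

```python
# Illustration of the new config-name layout; the regex is an
# assumption for this sketch, not code from the repository.
import re

PATTERN = re.compile(
    r'(?P<model>.+)_(?P<gpus>\d+)xb(?P<batch>\d+)-(?:(?P<amp>amp)-)?'
    r'(?P<iters>\d+k)_(?P<dataset>.+)-(?P<crop>\d+x\d+)$')

for name in ('fcn_r50-d8_4xb2-40k_cityscapes-512x1024',
             'fcn_r101-d8_4xb2-amp-80k_cityscapes-512x1024'):
    print(PATTERN.match(name).groupdict())
# {'model': 'fcn_r50-d8', 'gpus': '4', 'batch': '2', 'amp': None,
#  'iters': '40k', 'dataset': 'cityscapes', 'crop': '512x1024'}
# {'model': 'fcn_r101-d8', 'gpus': '4', 'batch': '2', 'amp': 'amp',
#  'iters': '80k', 'dataset': 'cityscapes', 'crop': '512x1024'}
```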
diff --git a/configs/fcn/fcn_r101b-d8_4xb2-80k_cityscapes-512x1024.py b/configs/fcn/fcn_r101b-d8_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..e53751b144 --- /dev/null +++ b/configs/fcn/fcn_r101b-d8_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,4 @@ +_base_ = './fcn_r50-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict( + pretrained='torchvision://resnet101', + backbone=dict(type='ResNet', depth=101)) diff --git a/configs/fcn/fcn_r101b-d8_4xb2-80k_cityscapes-769x769.py b/configs/fcn/fcn_r101b-d8_4xb2-80k_cityscapes-769x769.py new file mode 100644 index 0000000000..daa6502610 --- /dev/null +++ b/configs/fcn/fcn_r101b-d8_4xb2-80k_cityscapes-769x769.py @@ -0,0 +1,4 @@ +_base_ = './fcn_r50-d8_4xb2-80k_cityscapes-769x769.py' +model = dict( + pretrained='torchvision://resnet101', + backbone=dict(type='ResNet', depth=101)) diff --git a/configs/fcn/fcn_r101b-d8_512x1024_80k_cityscapes.py b/configs/fcn/fcn_r101b-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index 1b9bf60fc1..0000000000 --- a/configs/fcn/fcn_r101b-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,4 +0,0 @@ -_base_ = './fcn_r50-d8_512x1024_80k_cityscapes.py' -model = dict( - pretrained='torchvision://resnet101', - backbone=dict(type='ResNet', depth=101)) diff --git a/configs/fcn/fcn_r101b-d8_769x769_80k_cityscapes.py b/configs/fcn/fcn_r101b-d8_769x769_80k_cityscapes.py deleted file mode 100644 index f36eb02e68..0000000000 --- a/configs/fcn/fcn_r101b-d8_769x769_80k_cityscapes.py +++ /dev/null @@ -1,4 +0,0 @@ -_base_ = './fcn_r50-d8_769x769_80k_cityscapes.py' -model = dict( - pretrained='torchvision://resnet101', - backbone=dict(type='ResNet', depth=101)) diff --git a/configs/fcn/fcn_r18-d8_4xb2-80k_cityscapes-512x1024.py b/configs/fcn/fcn_r18-d8_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..4073148122 --- /dev/null +++ b/configs/fcn/fcn_r18-d8_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,9 @@ +_base_ = './fcn_r50-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict( + pretrained='open-mmlab://resnet18_v1c', + backbone=dict(depth=18), + decode_head=dict( + in_channels=512, + channels=128, + ), + auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/fcn/fcn_r18-d8_4xb2-80k_cityscapes-769x769.py b/configs/fcn/fcn_r18-d8_4xb2-80k_cityscapes-769x769.py new file mode 100644 index 0000000000..2c1d2b6df0 --- /dev/null +++ b/configs/fcn/fcn_r18-d8_4xb2-80k_cityscapes-769x769.py @@ -0,0 +1,9 @@ +_base_ = './fcn_r50-d8_4xb2-80k_cityscapes-769x769.py' +model = dict( + pretrained='open-mmlab://resnet18_v1c', + backbone=dict(depth=18), + decode_head=dict( + in_channels=512, + channels=128, + ), + auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/fcn/fcn_r18-d8_512x1024_80k_cityscapes.py b/configs/fcn/fcn_r18-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index 5a1d29e480..0000000000 --- a/configs/fcn/fcn_r18-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './fcn_r50-d8_512x1024_80k_cityscapes.py' -model = dict( - pretrained='open-mmlab://resnet18_v1c', - backbone=dict(depth=18), - decode_head=dict( - in_channels=512, - channels=128, - ), - auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/fcn/fcn_r18-d8_769x769_80k_cityscapes.py b/configs/fcn/fcn_r18-d8_769x769_80k_cityscapes.py deleted file mode 100644 index 6644a58dea..0000000000 --- a/configs/fcn/fcn_r18-d8_769x769_80k_cityscapes.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './fcn_r50-d8_769x769_80k_cityscapes.py' -model = dict( - 
pretrained='open-mmlab://resnet18_v1c', - backbone=dict(depth=18), - decode_head=dict( - in_channels=512, - channels=128, - ), - auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/fcn/fcn_r18b-d8_4xb2-80k_cityscapes-512x1024.py b/configs/fcn/fcn_r18b-d8_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..08ab467573 --- /dev/null +++ b/configs/fcn/fcn_r18b-d8_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,9 @@ +_base_ = './fcn_r50-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict( + pretrained='torchvision://resnet18', + backbone=dict(type='ResNet', depth=18), + decode_head=dict( + in_channels=512, + channels=128, + ), + auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/fcn/fcn_r18b-d8_4xb2-80k_cityscapes-769x769.py b/configs/fcn/fcn_r18b-d8_4xb2-80k_cityscapes-769x769.py new file mode 100644 index 0000000000..c591ebe972 --- /dev/null +++ b/configs/fcn/fcn_r18b-d8_4xb2-80k_cityscapes-769x769.py @@ -0,0 +1,9 @@ +_base_ = './fcn_r50-d8_4xb2-80k_cityscapes-769x769.py' +model = dict( + pretrained='torchvision://resnet18', + backbone=dict(type='ResNet', depth=18), + decode_head=dict( + in_channels=512, + channels=128, + ), + auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/fcn/fcn_r18b-d8_512x1024_80k_cityscapes.py b/configs/fcn/fcn_r18b-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index 92accfc703..0000000000 --- a/configs/fcn/fcn_r18b-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './fcn_r50-d8_512x1024_80k_cityscapes.py' -model = dict( - pretrained='torchvision://resnet18', - backbone=dict(type='ResNet', depth=18), - decode_head=dict( - in_channels=512, - channels=128, - ), - auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/fcn/fcn_r18b-d8_769x769_80k_cityscapes.py b/configs/fcn/fcn_r18b-d8_769x769_80k_cityscapes.py deleted file mode 100644 index 5dd34dd213..0000000000 --- a/configs/fcn/fcn_r18b-d8_769x769_80k_cityscapes.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './fcn_r50-d8_769x769_80k_cityscapes.py' -model = dict( - pretrained='torchvision://resnet18', - backbone=dict(type='ResNet', depth=18), - decode_head=dict( - in_channels=512, - channels=128, - ), - auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/fcn/fcn_r50-d8_512x1024_40k_cityscapes.py b/configs/fcn/fcn_r50-d8_4xb2-40k_cityscapes-512x1024.py similarity index 100% rename from configs/fcn/fcn_r50-d8_512x1024_40k_cityscapes.py rename to configs/fcn/fcn_r50-d8_4xb2-40k_cityscapes-512x1024.py diff --git a/configs/fcn/fcn_r50-d8_769x769_40k_cityscapes.py b/configs/fcn/fcn_r50-d8_4xb2-40k_cityscapes-769x769.py similarity index 100% rename from configs/fcn/fcn_r50-d8_769x769_40k_cityscapes.py rename to configs/fcn/fcn_r50-d8_4xb2-40k_cityscapes-769x769.py diff --git a/configs/fcn/fcn_r50-d8_512x1024_80k_cityscapes.py b/configs/fcn/fcn_r50-d8_4xb2-80k_cityscapes-512x1024.py similarity index 100% rename from configs/fcn/fcn_r50-d8_512x1024_80k_cityscapes.py rename to configs/fcn/fcn_r50-d8_4xb2-80k_cityscapes-512x1024.py diff --git a/configs/fcn/fcn_r50-d8_769x769_80k_cityscapes.py b/configs/fcn/fcn_r50-d8_4xb2-80k_cityscapes-769x769.py similarity index 100% rename from configs/fcn/fcn_r50-d8_769x769_80k_cityscapes.py rename to configs/fcn/fcn_r50-d8_4xb2-80k_cityscapes-769x769.py diff --git a/configs/fcn/fcn_r50-d8_512x512_160k_ade20k.py b/configs/fcn/fcn_r50-d8_4xb4-160k_ade20k-512x512.py similarity index 100% rename from configs/fcn/fcn_r50-d8_512x512_160k_ade20k.py rename to 
configs/fcn/fcn_r50-d8_4xb4-160k_ade20k-512x512.py diff --git a/configs/fcn/fcn_r50-d8_512x512_20k_voc12aug.py b/configs/fcn/fcn_r50-d8_4xb4-20k_voc12aug-512x512.py similarity index 100% rename from configs/fcn/fcn_r50-d8_512x512_20k_voc12aug.py rename to configs/fcn/fcn_r50-d8_4xb4-20k_voc12aug-512x512.py diff --git a/configs/fcn/fcn_r50-d8_480x480_40k_pascal_context.py b/configs/fcn/fcn_r50-d8_4xb4-40k_pascal-context-480x480.py similarity index 100% rename from configs/fcn/fcn_r50-d8_480x480_40k_pascal_context.py rename to configs/fcn/fcn_r50-d8_4xb4-40k_pascal-context-480x480.py diff --git a/configs/fcn/fcn_r50-d8_480x480_40k_pascal_context_59.py b/configs/fcn/fcn_r50-d8_4xb4-40k_pascal-context-59-480x480.py similarity index 100% rename from configs/fcn/fcn_r50-d8_480x480_40k_pascal_context_59.py rename to configs/fcn/fcn_r50-d8_4xb4-40k_pascal-context-59-480x480.py diff --git a/configs/fcn/fcn_r50-d8_512x512_40k_voc12aug.py b/configs/fcn/fcn_r50-d8_4xb4-40k_voc12aug-512x512.py similarity index 100% rename from configs/fcn/fcn_r50-d8_512x512_40k_voc12aug.py rename to configs/fcn/fcn_r50-d8_4xb4-40k_voc12aug-512x512.py diff --git a/configs/fcn/fcn_r50-d8_512x512_80k_ade20k.py b/configs/fcn/fcn_r50-d8_4xb4-80k_ade20k-512x512.py similarity index 100% rename from configs/fcn/fcn_r50-d8_512x512_80k_ade20k.py rename to configs/fcn/fcn_r50-d8_4xb4-80k_ade20k-512x512.py diff --git a/configs/fcn/fcn_r50-d8_480x480_80k_pascal_context.py b/configs/fcn/fcn_r50-d8_4xb4-80k_pascal-context-480x480.py similarity index 100% rename from configs/fcn/fcn_r50-d8_480x480_80k_pascal_context.py rename to configs/fcn/fcn_r50-d8_4xb4-80k_pascal-context-480x480.py diff --git a/configs/fcn/fcn_r50-d8_480x480_80k_pascal_context_59.py b/configs/fcn/fcn_r50-d8_4xb4-80k_pascal-context-59-480x480.py similarity index 100% rename from configs/fcn/fcn_r50-d8_480x480_80k_pascal_context_59.py rename to configs/fcn/fcn_r50-d8_4xb4-80k_pascal-context-59-480x480.py diff --git a/configs/fcn/fcn_r50b-d8_4xb2-80k_cityscapes-512x1024.py b/configs/fcn/fcn_r50b-d8_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..44821fd7d3 --- /dev/null +++ b/configs/fcn/fcn_r50b-d8_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './fcn_r50-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict(pretrained='torchvision://resnet50', backbone=dict(type='ResNet')) diff --git a/configs/fcn/fcn_r50b-d8_4xb2-80k_cityscapes-769x769.py b/configs/fcn/fcn_r50b-d8_4xb2-80k_cityscapes-769x769.py new file mode 100644 index 0000000000..a85b39197e --- /dev/null +++ b/configs/fcn/fcn_r50b-d8_4xb2-80k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './fcn_r50-d8_4xb2-80k_cityscapes-769x769.py' +model = dict(pretrained='torchvision://resnet50', backbone=dict(type='ResNet')) diff --git a/configs/fcn/fcn_r50b-d8_512x1024_80k_cityscapes.py b/configs/fcn/fcn_r50b-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index 28ef13f8d1..0000000000 --- a/configs/fcn/fcn_r50b-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './fcn_r50-d8_512x1024_80k_cityscapes.py' -model = dict(pretrained='torchvision://resnet50', backbone=dict(type='ResNet')) diff --git a/configs/fcn/fcn_r50b-d8_769x769_80k_cityscapes.py b/configs/fcn/fcn_r50b-d8_769x769_80k_cityscapes.py deleted file mode 100644 index 106f7b6a1e..0000000000 --- a/configs/fcn/fcn_r50b-d8_769x769_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './fcn_r50-d8_769x769_80k_cityscapes.py' -model = dict(pretrained='torchvision://resnet50', backbone=dict(type='ResNet')) diff 
--git a/configs/fcn/metafile.yaml b/configs/fcn/metafile.yaml new file mode 100644 index 0000000000..f3d80f652e --- /dev/null +++ b/configs/fcn/metafile.yaml @@ -0,0 +1,997 @@ +Collections: +- Name: FCN + License: Apache License 2.0 + Metadata: + Training Data: + - Cityscapes + - ADE20K + - Pascal VOC 2012 + Aug + - Pascal Context + - Pascal Context 59 + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + README: configs/fcn/README.md + Frameworks: + - PyTorch +Models: +- Name: fcn_r50-d8_4xb2-40k_cityscapes-512x1024 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 72.25 + mIoU(ms+flip): 73.36 + Config: configs/fcn/fcn_r50-d8_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 5.7 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x1024_40k_cityscapes/fcn_r50-d8_512x1024_40k_cityscapes_20200604_192608-efe53f0d.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x1024_40k_cityscapes/fcn_r50-d8_512x1024_40k_cityscapes_20200604_192608.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn_r101-d8_4xb2-40k_cityscapes-512x1024 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 75.45 + mIoU(ms+flip): 76.58 + Config: configs/fcn/fcn_r101-d8_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 9.2 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x1024_40k_cityscapes/fcn_r101-d8_512x1024_40k_cityscapes_20200604_181852-a883d3a1.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x1024_40k_cityscapes/fcn_r101-d8_512x1024_40k_cityscapes_20200604_181852.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn_r50-d8_4xb2-40k_cityscapes-769x769 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 71.47 + mIoU(ms+flip): 72.54 + Config: configs/fcn/fcn_r50-d8_4xb2-40k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 6.5 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_769x769_40k_cityscapes/fcn_r50-d8_769x769_40k_cityscapes_20200606_113104-977b5d02.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_769x769_40k_cityscapes/fcn_r50-d8_769x769_40k_cityscapes_20200606_113104.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn_r101-d8_4xb2-40k_cityscapes-769x769 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: 
Cityscapes + Metrics: + mIoU: 73.93 + mIoU(ms+flip): 75.14 + Config: configs/fcn/fcn_r101-d8_4xb2-40k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 10.4 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_769x769_40k_cityscapes/fcn_r101-d8_769x769_40k_cityscapes_20200606_113208-7d4ab69c.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_769x769_40k_cityscapes/fcn_r101-d8_769x769_40k_cityscapes_20200606_113208.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn_r18-d8_4xb2-80k_cityscapes-512x1024 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 71.11 + mIoU(ms+flip): 72.91 + Config: configs/fcn/fcn_r18-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-18-D8 + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 1.7 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r18-d8_512x1024_80k_cityscapes/fcn_r18-d8_512x1024_80k_cityscapes_20201225_021327-6c50f8b4.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r18-d8_512x1024_80k_cityscapes/fcn_r18-d8_512x1024_80k_cityscapes-20201225_021327.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn_r50-d8_4xb2-80k_cityscapes-512x1024 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 73.61 + mIoU(ms+flip): 74.24 + Config: configs/fcn/fcn_r50-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - FCN + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x1024_80k_cityscapes/fcn_r50-d8_512x1024_80k_cityscapes_20200606_113019-03aa804d.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x1024_80k_cityscapes/fcn_r50-d8_512x1024_80k_cityscapes_20200606_113019.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn_r101-d8_4xb2-80k_cityscapes-512x1024 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 75.13 + mIoU(ms+flip): 75.94 + Config: configs/fcn/fcn_r101-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - FCN + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x1024_80k_cityscapes/fcn_r101-d8_512x1024_80k_cityscapes_20200606_113038-3fb937eb.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x1024_80k_cityscapes/fcn_r101-d8_512x1024_80k_cityscapes_20200606_113038.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: 
https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn_r101-d8_4xb2-amp-80k_cityscapes-512x1024 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 76.8 + Config: configs/fcn/fcn_r101-d8_4xb2-amp-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - FCN + - (FP16) + Training Resources: 4x V100 GPUS + Memory (GB): 5.37 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_fp16_512x1024_80k_cityscapes/fcn_r101-d8_fp16_512x1024_80k_cityscapes_20200717_230921-fb13e883.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_fp16_512x1024_80k_cityscapes/fcn_r101-d8_fp16_512x1024_80k_cityscapes_20200717_230921.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn_r18-d8_4xb2-80k_cityscapes-769x769 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 70.8 + mIoU(ms+flip): 73.16 + Config: configs/fcn/fcn_r18-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-18-D8 + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 1.9 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r18-d8_769x769_80k_cityscapes/fcn_r18-d8_769x769_80k_cityscapes_20201225_021451-9739d1b8.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r18-d8_769x769_80k_cityscapes/fcn_r18-d8_769x769_80k_cityscapes-20201225_021451.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn_r50-d8_4xb2-80k_cityscapes-769x769 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 72.64 + mIoU(ms+flip): 73.32 + Config: configs/fcn/fcn_r50-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - FCN + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_769x769_80k_cityscapes/fcn_r50-d8_769x769_80k_cityscapes_20200606_195749-f5caeabc.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_769x769_80k_cityscapes/fcn_r50-d8_769x769_80k_cityscapes_20200606_195749.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn_r101-d8_4xb2-80k_cityscapes-769x769 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 75.52 + mIoU(ms+flip): 76.61 + Config: configs/fcn/fcn_r101-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - FCN + Training Resources: 4x V100 GPUS + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_769x769_80k_cityscapes/fcn_r101-d8_769x769_80k_cityscapes_20200606_214354-45cbac68.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_769x769_80k_cityscapes/fcn_r101-d8_769x769_80k_cityscapes_20200606_214354.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn_r18b-d8_4xb2-80k_cityscapes-512x1024 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 70.24 + mIoU(ms+flip): 72.77 + Config: configs/fcn/fcn_r18b-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-18b-D8 + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 1.6 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r18b-d8_512x1024_80k_cityscapes/fcn_r18b-d8_512x1024_80k_cityscapes_20201225_230143-92c0f445.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r18b-d8_512x1024_80k_cityscapes/fcn_r18b-d8_512x1024_80k_cityscapes-20201225_230143.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn_r50b-d8_4xb2-80k_cityscapes-512x1024 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 75.65 + mIoU(ms+flip): 77.59 + Config: configs/fcn/fcn_r50b-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50b-D8 + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 5.6 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50b-d8_512x1024_80k_cityscapes/fcn_r50b-d8_512x1024_80k_cityscapes_20201225_094221-82957416.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50b-d8_512x1024_80k_cityscapes/fcn_r50b-d8_512x1024_80k_cityscapes-20201225_094221.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn_r101b-d8_4xb2-80k_cityscapes-512x1024 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 77.37 + mIoU(ms+flip): 78.77 + Config: configs/fcn/fcn_r101b-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101b-D8 + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 9.1 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101b-d8_512x1024_80k_cityscapes/fcn_r101b-d8_512x1024_80k_cityscapes_20201226_160213-4543858f.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101b-d8_512x1024_80k_cityscapes/fcn_r101b-d8_512x1024_80k_cityscapes-20201226_160213.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn_r18b-d8_4xb2-80k_cityscapes-769x769 + In Collection: FCN + 
Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 69.66 + mIoU(ms+flip): 72.07 + Config: configs/fcn/fcn_r18b-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-18b-D8 + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 1.7 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r18b-d8_769x769_80k_cityscapes/fcn_r18b-d8_769x769_80k_cityscapes_20201226_004430-32d504e5.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r18b-d8_769x769_80k_cityscapes/fcn_r18b-d8_769x769_80k_cityscapes-20201226_004430.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn_r50b-d8_4xb2-80k_cityscapes-769x769 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 73.83 + mIoU(ms+flip): 76.6 + Config: configs/fcn/fcn_r50b-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50b-D8 + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 6.3 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50b-d8_769x769_80k_cityscapes/fcn_r50b-d8_769x769_80k_cityscapes_20201225_094223-94552d38.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50b-d8_769x769_80k_cityscapes/fcn_r50b-d8_769x769_80k_cityscapes-20201225_094223.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn_r101b-d8_4xb2-80k_cityscapes-769x769 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 77.02 + mIoU(ms+flip): 78.67 + Config: configs/fcn/fcn_r101b-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101b-D8 + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 10.3 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101b-d8_769x769_80k_cityscapes/fcn_r101b-d8_769x769_80k_cityscapes_20201226_170012-82be37e2.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101b-d8_769x769_80k_cityscapes/fcn_r101b-d8_769x769_80k_cityscapes-20201226_170012.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn-d6_r50-d16_4xb2-40k_cityscapes-512x1024 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 77.06 + mIoU(ms+flip): 78.85 + Config: configs/fcn/fcn-d6_r50-d16_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D16 + - FCN + - (D6) + Training Resources: 4x TITAN Xp GPUS + Memory (GB): 3.4 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50-d16_512x1024_40k_cityscapes/fcn_d6_r50-d16_512x1024_40k_cityscapes_20210305_130133-98d5d1bc.pth + Training log: 
https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50-d16_512x1024_40k_cityscapes/fcn_d6_r50-d16_512x1024_40k_cityscapes-20210305_130133.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn-d6_r50-d16_4xb2-80k_cityscapes-512x1024 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 77.27 + mIoU(ms+flip): 78.88 + Config: configs/fcn/fcn-d6_r50-d16_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D16 + - FCN + - (D6) + Training Resources: 4x TITAN Xp GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50-d16_512x1024_80k_cityscapes/fcn_d6_r50-d16_512x1024_80k_cityscapes_20210306_115604-133c292f.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50-d16_512x1024_80k_cityscapes/fcn_d6_r50-d16_512x1024_80k_cityscapes-20210306_115604.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn-d6_r50-d16_4xb2-40k_cityscapes-769x769 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 76.82 + mIoU(ms+flip): 78.22 + Config: configs/fcn/fcn-d6_r50-d16_4xb2-40k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D16 + - FCN + - (D6) + Training Resources: 4x TITAN Xp GPUS + Memory (GB): 3.7 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50-d16_769x769_40k_cityscapes/fcn_d6_r50-d16_769x769_40k_cityscapes_20210305_185744-1aab18ed.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50-d16_769x769_40k_cityscapes/fcn_d6_r50-d16_769x769_40k_cityscapes-20210305_185744.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn-d6_r50-d16_4xb2-80k_cityscapes-769x769 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 77.04 + mIoU(ms+flip): 78.4 + Config: configs/fcn/fcn-d6_r50-d16_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D16 + - FCN + - (D6) + Training Resources: 4x TITAN Xp GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50-d16_769x769_80k_cityscapes/fcn_d6_r50-d16_769x769_80k_cityscapes_20210305_200413-109d88eb.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50-d16_769x769_80k_cityscapes/fcn_d6_r50-d16_769x769_80k_cityscapes-20210305_200413.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn-d6_r101-d16_4xb2-40k_cityscapes-512x1024 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 77.36 + mIoU(ms+flip): 79.18 + Config: 
configs/fcn/fcn-d6_r101-d16_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D16 + - FCN + - (D6) + Training Resources: 4x TITAN Xp GPUS + Memory (GB): 4.5 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101-d16_512x1024_40k_cityscapes/fcn_d6_r101-d16_512x1024_40k_cityscapes_20210305_130337-9cf2b450.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101-d16_512x1024_40k_cityscapes/fcn_d6_r101-d16_512x1024_40k_cityscapes-20210305_130337.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn-d6_r101-d16_4xb2-80k_cityscapes-512x1024 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.46 + mIoU(ms+flip): 80.42 + Config: configs/fcn/fcn-d6_r101-d16_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D16 + - FCN + - (D6) + Training Resources: 4x TITAN Xp GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101-d16_512x1024_80k_cityscapes/fcn_d6_r101-d16_512x1024_80k_cityscapes_20210308_102747-cb336445.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101-d16_512x1024_80k_cityscapes/fcn_d6_r101-d16_512x1024_80k_cityscapes-20210308_102747.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn-d6_r101-d16_4xb2-40k_cityscapes-769x769 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 77.28 + mIoU(ms+flip): 78.95 + Config: configs/fcn/fcn-d6_r101-d16_4xb2-40k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D16 + - FCN + - (D6) + Training Resources: 4x TITAN Xp GPUS + Memory (GB): 5.0 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101-d16_769x769_40k_cityscapes/fcn_d6_r101-d16_769x769_40k_cityscapes_20210308_102453-60b114e9.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101-d16_769x769_40k_cityscapes/fcn_d6_r101-d16_769x769_40k_cityscapes-20210308_102453.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn-d6_r101-d16_4xb2-80k_cityscapes-769x769 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.06 + mIoU(ms+flip): 79.58 + Config: configs/fcn/fcn-d6_r101-d16_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D16 + - FCN + - (D6) + Training Resources: 4x TITAN Xp GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101-d16_769x769_80k_cityscapes/fcn_d6_r101-d16_769x769_80k_cityscapes_20210306_120016-e33adc4f.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101-d16_769x769_80k_cityscapes/fcn_d6_r101-d16_769x769_80k_cityscapes-20210306_120016.log.json + 
Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn-d6_r50b-d16_4xb2-80k_cityscapes-512x1024 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 76.99 + mIoU(ms+flip): 79.03 + Config: configs/fcn/fcn-d6_r50b-d16_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50b-D16 + - FCN + - (D6) + Training Resources: 4x TITAN Xp GPUS + Memory (GB): 3.2 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50b-d16_512x1024_80k_cityscapes/fcn_d6_r50b-d16_512x1024_80k_cityscapes_20210311_125550-6a0b62e9.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50b_d16_512x1024_80k_cityscapes/fcn_d6_r50b_d16_512x1024_80k_cityscapes-20210311_125550.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn-d6_r50b-d16_4xb2-80k_cityscapes-769x769 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 76.86 + mIoU(ms+flip): 78.52 + Config: configs/fcn/fcn-d6_r50b-d16_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50b-D16 + - FCN + - (D6) + Training Resources: 4x TITAN Xp GPUS + Memory (GB): 3.6 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50b-d16_769x769_80k_cityscapes/fcn_d6_r50b-d16_769x769_80k_cityscapes_20210311_131012-d665f231.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r50b_d16_769x769_80k_cityscapes/fcn_d6_r50b_d16_769x769_80k_cityscapes-20210311_131012.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn-d6_r101b-d16_4xb2-80k_cityscapes-512x1024 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 77.72 + mIoU(ms+flip): 79.53 + Config: configs/fcn/fcn-d6_r101b-d16_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101b-D16 + - FCN + - (D6) + Training Resources: 4x TITAN Xp GPUS + Memory (GB): 4.3 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101b-d16_512x1024_80k_cityscapes/fcn_d6_r101b-d16_512x1024_80k_cityscapes_20210311_144305-3f2eb5b4.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101b_d16_512x1024_80k_cityscapes/fcn_d6_r101b_d16_512x1024_80k_cityscapes-20210311_144305.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn-d6_r101b-d16_4xb2-80k_cityscapes-769x769 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 77.34 + mIoU(ms+flip): 78.91 + Config: configs/fcn/fcn-d6_r101b-d16_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: 
Cityscapes + Batch Size: 8 + Architecture: + - R-101b-D16 + - FCN + - (D6) + Training Resources: 4x TITAN Xp GPUS + Memory (GB): 4.8 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101b-d16_769x769_80k_cityscapes/fcn_d6_r101b-d16_769x769_80k_cityscapes_20210311_154527-c4d8bfbc.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_d6_r101b_d16_769x769_80k_cityscapes/fcn_d6_r101b_d16_769x769_80k_cityscapes-20210311_154527.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn_r50-d8_4xb4-80k_ade20k-512x512 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 35.94 + mIoU(ms+flip): 37.94 + Config: configs/fcn/fcn_r50-d8_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D8 + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 8.5 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x512_80k_ade20k/fcn_r50-d8_512x512_80k_ade20k_20200614_144016-f8ac5082.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x512_80k_ade20k/fcn_r50-d8_512x512_80k_ade20k_20200614_144016.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn_r101-d8_4xb4-80k_ade20k-512x512 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 39.61 + mIoU(ms+flip): 40.83 + Config: configs/fcn/fcn_r101-d8_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-101-D8 + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 12.0 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x512_80k_ade20k/fcn_r101-d8_512x512_80k_ade20k_20200615_014143-bc1809f7.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x512_80k_ade20k/fcn_r101-d8_512x512_80k_ade20k_20200615_014143.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn_r50-d8_4xb4-160k_ade20k-512x512 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 36.1 + mIoU(ms+flip): 38.08 + Config: configs/fcn/fcn_r50-d8_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D8 + - FCN + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x512_160k_ade20k/fcn_r50-d8_512x512_160k_ade20k_20200615_100713-4edbc3b4.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x512_160k_ade20k/fcn_r50-d8_512x512_160k_ade20k_20200615_100713.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn_r101-d8_4xb4-160k_ade20k-512x512 + In 
Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 39.91 + mIoU(ms+flip): 41.4 + Config: configs/fcn/fcn_r101-d8_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-101-D8 + - FCN + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x512_160k_ade20k/fcn_r101-d8_512x512_160k_ade20k_20200615_105816-fd192bd5.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x512_160k_ade20k/fcn_r101-d8_512x512_160k_ade20k_20200615_105816.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn_r50-d8_4xb4-20k_voc12aug-512x512 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 67.08 + mIoU(ms+flip): 69.94 + Config: configs/fcn/fcn_r50-d8_4xb4-20k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-50-D8 + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 5.7 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x512_20k_voc12aug/fcn_r50-d8_512x512_20k_voc12aug_20200617_010715-52dc5306.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x512_20k_voc12aug/fcn_r50-d8_512x512_20k_voc12aug_20200617_010715.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn_r101-d8_4xb4-20k_voc12aug-512x512 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 71.16 + mIoU(ms+flip): 73.57 + Config: configs/fcn/fcn_r101-d8_4xb4-20k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-101-D8 + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 9.2 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x512_20k_voc12aug/fcn_r101-d8_512x512_20k_voc12aug_20200617_010842-0bb4e798.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x512_20k_voc12aug/fcn_r101-d8_512x512_20k_voc12aug_20200617_010842.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn_r50-d8_4xb4-40k_voc12aug-512x512 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 66.97 + mIoU(ms+flip): 69.04 + Config: configs/fcn/fcn_r50-d8_4xb4-40k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-50-D8 + - FCN + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x512_40k_voc12aug/fcn_r50-d8_512x512_40k_voc12aug_20200613_161222-5e2dbf40.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x512_40k_voc12aug/fcn_r50-d8_512x512_40k_voc12aug_20200613_161222.log.json + Paper: + Title: Fully 
Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn_r101-d8_4xb4-40k_voc12aug-512x512 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 69.91 + mIoU(ms+flip): 72.38 + Config: configs/fcn/fcn_r101-d8_4xb4-40k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-101-D8 + - FCN + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x512_40k_voc12aug/fcn_r101-d8_512x512_40k_voc12aug_20200613_161240-4c8bcefd.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_512x512_40k_voc12aug/fcn_r101-d8_512x512_40k_voc12aug_20200613_161240.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn_r101-d8_4xb4-40k_pascal-context-480x480 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Pascal Context + Metrics: + mIoU: 44.43 + mIoU(ms+flip): 45.63 + Config: configs/fcn/fcn_r101-d8_4xb4-40k_pascal-context-480x480.py + Metadata: + Training Data: Pascal Context + Batch Size: 16 + Architecture: + - R-101-D8 + - FCN + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_480x480_40k_pascal_context/fcn_r101-d8_480x480_40k_pascal_context_20210421_154757-b5e97937.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_480x480_40k_pascal_context/fcn_r101-d8_480x480_40k_pascal_context-20210421_154757.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn_r101-d8_4xb4-80k_pascal-context-480x480 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Pascal Context + Metrics: + mIoU: 44.13 + mIoU(ms+flip): 45.26 + Config: configs/fcn/fcn_r101-d8_4xb4-80k_pascal-context-480x480.py + Metadata: + Training Data: Pascal Context + Batch Size: 16 + Architecture: + - R-101-D8 + - FCN + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_480x480_80k_pascal_context/fcn_r101-d8_480x480_80k_pascal_context_20210421_163310-4711813f.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_480x480_80k_pascal_context/fcn_r101-d8_480x480_80k_pascal_context-20210421_163310.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn_r101-d8_4xb4-40k_pascal-context-59-480x480 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Pascal Context 59 + Metrics: + mIoU: 48.42 + mIoU(ms+flip): 50.4 + Config: configs/fcn/fcn_r101-d8_4xb4-40k_pascal-context-59-480x480.py + Metadata: + Training Data: Pascal Context 59 + Batch Size: 16 + Architecture: + - R-101-D8 + - FCN + Training Resources: 4x V100 GPUS + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_480x480_40k_pascal_context_59/fcn_r101-d8_480x480_40k_pascal_context_59_20210415_230724-8cf83682.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_480x480_40k_pascal_context_59/fcn_r101-d8_480x480_40k_pascal_context_59-20210415_230724.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch +- Name: fcn_r101-d8_4xb4-80k_pascal-context-59-480x480 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Pascal Context 59 + Metrics: + mIoU: 49.35 + mIoU(ms+flip): 51.38 + Config: configs/fcn/fcn_r101-d8_4xb4-80k_pascal-context-59-480x480.py + Metadata: + Training Data: Pascal Context 59 + Batch Size: 16 + Architecture: + - R-101-D8 + - FCN + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_480x480_80k_pascal_context_59/fcn_r101-d8_480x480_80k_pascal_context_59_20210416_110804-9a6f2c94.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r101-d8_480x480_80k_pascal_context_59/fcn_r101-d8_480x480_80k_pascal_context_59-20210416_110804.log.json + Paper: + Title: Fully Convolutional Networks for Semantic Segmentation + URL: https://arxiv.org/abs/1411.4038 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fcn_head.py#L11 + Framework: PyTorch diff --git a/configs/gcnet/README.md b/configs/gcnet/README.md index 9a4cf7a606..ba1a21e851 100644 --- a/configs/gcnet/README.md +++ b/configs/gcnet/README.md @@ -1,6 +1,6 @@ # GCNet -[GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond](https://arxiv.org/abs/1904.11492) +> [GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond](https://arxiv.org/abs/1904.11492) ## Introduction @@ -22,6 +22,39 @@ The Non-Local Network (NLNet) presents a pioneering approach for capturing long- +## Results and models + +### Cityscapes + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ----------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| GCNet | R-50-D8 | 512x1024 | 40000 | 5.8 | 3.93 | V100 | 77.69 | 78.56 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/gcnet/gcnet_r50-d8_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x1024_40k_cityscapes/gcnet_r50-d8_512x1024_40k_cityscapes_20200618_074436-4b0fd17b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x1024_40k_cityscapes/gcnet_r50-d8_512x1024_40k_cityscapes_20200618_074436.log.json) | +| GCNet | R-101-D8 | 512x1024 | 40000 | 9.2 | 2.61 | V100 | 78.28 | 79.34 | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/gcnet/gcnet_r101-d8_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x1024_40k_cityscapes/gcnet_r101-d8_512x1024_40k_cityscapes_20200618_074436-5e62567f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x1024_40k_cityscapes/gcnet_r101-d8_512x1024_40k_cityscapes_20200618_074436.log.json) | +| GCNet | R-50-D8 | 769x769 | 40000 | 6.5 | 1.67 | V100 | 78.12 | 80.09 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/gcnet/gcnet_r50-d8_4xb2-40k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_769x769_40k_cityscapes/gcnet_r50-d8_769x769_40k_cityscapes_20200618_182814-a26f4471.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_769x769_40k_cityscapes/gcnet_r50-d8_769x769_40k_cityscapes_20200618_182814.log.json) | +| GCNet | R-101-D8 | 769x769 | 40000 | 10.5 | 1.13 | V100 | 78.95 | 80.71 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/gcnet/gcnet_r101-d8_4xb2-40k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_769x769_40k_cityscapes/gcnet_r101-d8_769x769_40k_cityscapes_20200619_092550-ca4f0a84.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_769x769_40k_cityscapes/gcnet_r101-d8_769x769_40k_cityscapes_20200619_092550.log.json) | +| GCNet | R-50-D8 | 512x1024 | 80000 | - | - | V100 | 78.48 | 80.01 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/gcnet/gcnet_r50-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x1024_80k_cityscapes/gcnet_r50-d8_512x1024_80k_cityscapes_20200618_074450-ef8f069b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x1024_80k_cityscapes/gcnet_r50-d8_512x1024_80k_cityscapes_20200618_074450.log.json) | +| GCNet | R-101-D8 | 512x1024 | 80000 | - | - | V100 | 79.03 | 79.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/gcnet/gcnet_r101-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x1024_80k_cityscapes/gcnet_r101-d8_512x1024_80k_cityscapes_20200618_074450-778ebf69.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x1024_80k_cityscapes/gcnet_r101-d8_512x1024_80k_cityscapes_20200618_074450.log.json) | +| GCNet | R-50-D8 | 769x769 | 80000 | - | - | V100 | 78.68 | 80.66 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/gcnet/gcnet_r50-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_769x769_80k_cityscapes/gcnet_r50-d8_769x769_80k_cityscapes_20200619_092516-4839565b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_769x769_80k_cityscapes/gcnet_r50-d8_769x769_80k_cityscapes_20200619_092516.log.json) | +| GCNet | R-101-D8 | 769x769 | 80000 | - | - | V100 | 79.18 | 80.71 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/gcnet/gcnet_r101-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_769x769_80k_cityscapes/gcnet_r101-d8_769x769_80k_cityscapes_20200619_092628-8e043423.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_769x769_80k_cityscapes/gcnet_r101-d8_769x769_80k_cityscapes_20200619_092628.log.json) | + +### ADE20K + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| GCNet | R-50-D8 | 512x512 | 80000 | 8.5 | 23.38 | V100 | 41.47 | 42.85 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/gcnet/gcnet_r50-d8_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x512_80k_ade20k/gcnet_r50-d8_512x512_80k_ade20k_20200614_185146-91a6da41.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x512_80k_ade20k/gcnet_r50-d8_512x512_80k_ade20k_20200614_185146.log.json) | +| GCNet | R-101-D8 | 512x512 | 80000 | 12 | 15.20 | V100 | 42.82 | 44.54 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/gcnet/gcnet_r101-d8_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x512_80k_ade20k/gcnet_r101-d8_512x512_80k_ade20k_20200615_020811-c3fcb6dd.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x512_80k_ade20k/gcnet_r101-d8_512x512_80k_ade20k_20200615_020811.log.json) | +| GCNet | R-50-D8 | 512x512 | 160000 | - | - | V100 | 42.37 | 43.52 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/gcnet/gcnet_r50-d8_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x512_160k_ade20k/gcnet_r50-d8_512x512_160k_ade20k_20200615_224122-d95f3e1f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x512_160k_ade20k/gcnet_r50-d8_512x512_160k_ade20k_20200615_224122.log.json) | +| GCNet | R-101-D8 | 512x512 | 160000 | - | - | V100 | 43.69 | 45.21 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/gcnet/gcnet_r101-d8_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x512_160k_ade20k/gcnet_r101-d8_512x512_160k_ade20k_20200615_225406-615528d7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x512_160k_ade20k/gcnet_r101-d8_512x512_160k_ade20k_20200615_225406.log.json) | + +### Pascal VOC 2012 + Aug + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------- | 
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| GCNet | R-50-D8 | 512x512 | 20000 | 5.8 | 23.35 | V100 | 76.42 | 77.51 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/gcnet/gcnet_r50-d8_4xb4-20k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x512_20k_voc12aug/gcnet_r50-d8_512x512_20k_voc12aug_20200617_165701-3cbfdab1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x512_20k_voc12aug/gcnet_r50-d8_512x512_20k_voc12aug_20200617_165701.log.json) | +| GCNet | R-101-D8 | 512x512 | 20000 | 9.2 | 14.80 | V100 | 77.41 | 78.56 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/gcnet/gcnet_r101-d8_4xb4-20k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x512_20k_voc12aug/gcnet_r101-d8_512x512_20k_voc12aug_20200617_165713-6c720aa9.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x512_20k_voc12aug/gcnet_r101-d8_512x512_20k_voc12aug_20200617_165713.log.json) | +| GCNet | R-50-D8 | 512x512 | 40000 | - | - | V100 | 76.24 | 77.63 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/gcnet/gcnet_r50-d8_4xb4-40k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x512_40k_voc12aug/gcnet_r50-d8_512x512_40k_voc12aug_20200613_195105-9797336d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x512_40k_voc12aug/gcnet_r50-d8_512x512_40k_voc12aug_20200613_195105.log.json) | +| GCNet | R-101-D8 | 512x512 | 40000 | - | - | V100 | 77.84 | 78.59 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/gcnet/gcnet_r101-d8_4xb4-40k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x512_40k_voc12aug/gcnet_r101-d8_512x512_40k_voc12aug_20200613_185806-1e38208d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x512_40k_voc12aug/gcnet_r101-d8_512x512_40k_voc12aug_20200613_185806.log.json) | + ## Citation ```bibtex @@ -33,36 +66,3 @@ The Non-Local Network (NLNet) presents a pioneering approach for capturing long- year={2019} } ``` - -## Results and models - -### Cityscapes - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| GCNet | R-50-D8 | 512x1024 | 40000 | 5.8 | 3.93 | 77.69 | 78.56 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/gcnet/gcnet_r50-d8_512x1024_40k_cityscapes.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x1024_40k_cityscapes/gcnet_r50-d8_512x1024_40k_cityscapes_20200618_074436-4b0fd17b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x1024_40k_cityscapes/gcnet_r50-d8_512x1024_40k_cityscapes_20200618_074436.log.json) | -| GCNet | R-101-D8 | 512x1024 | 40000 | 9.2 | 2.61 | 78.28 | 79.34 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/gcnet/gcnet_r101-d8_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x1024_40k_cityscapes/gcnet_r101-d8_512x1024_40k_cityscapes_20200618_074436-5e62567f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x1024_40k_cityscapes/gcnet_r101-d8_512x1024_40k_cityscapes_20200618_074436.log.json) | -| GCNet | R-50-D8 | 769x769 | 40000 | 6.5 | 1.67 | 78.12 | 80.09 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/gcnet/gcnet_r50-d8_769x769_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_769x769_40k_cityscapes/gcnet_r50-d8_769x769_40k_cityscapes_20200618_182814-a26f4471.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_769x769_40k_cityscapes/gcnet_r50-d8_769x769_40k_cityscapes_20200618_182814.log.json) | -| GCNet | R-101-D8 | 769x769 | 40000 | 10.5 | 1.13 | 78.95 | 80.71 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/gcnet/gcnet_r101-d8_769x769_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_769x769_40k_cityscapes/gcnet_r101-d8_769x769_40k_cityscapes_20200619_092550-ca4f0a84.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_769x769_40k_cityscapes/gcnet_r101-d8_769x769_40k_cityscapes_20200619_092550.log.json) | -| GCNet | R-50-D8 | 512x1024 | 80000 | - | - | 78.48 | 80.01 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/gcnet/gcnet_r50-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x1024_80k_cityscapes/gcnet_r50-d8_512x1024_80k_cityscapes_20200618_074450-ef8f069b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x1024_80k_cityscapes/gcnet_r50-d8_512x1024_80k_cityscapes_20200618_074450.log.json) | -| GCNet | R-101-D8 | 512x1024 | 80000 | - | - | 79.03 | 79.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/gcnet/gcnet_r101-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x1024_80k_cityscapes/gcnet_r101-d8_512x1024_80k_cityscapes_20200618_074450-778ebf69.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x1024_80k_cityscapes/gcnet_r101-d8_512x1024_80k_cityscapes_20200618_074450.log.json) | -| GCNet | R-50-D8 | 769x769 | 80000 | - | - | 78.68 | 80.66 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/gcnet/gcnet_r50-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_769x769_80k_cityscapes/gcnet_r50-d8_769x769_80k_cityscapes_20200619_092516-4839565b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_769x769_80k_cityscapes/gcnet_r50-d8_769x769_80k_cityscapes_20200619_092516.log.json) | -| GCNet | R-101-D8 | 769x769 | 80000 | - | - | 79.18 | 
80.71 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/gcnet/gcnet_r101-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_769x769_80k_cityscapes/gcnet_r101-d8_769x769_80k_cityscapes_20200619_092628-8e043423.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_769x769_80k_cityscapes/gcnet_r101-d8_769x769_80k_cityscapes_20200619_092628.log.json) | - -### ADE20K - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | --------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| GCNet | R-50-D8 | 512x512 | 80000 | 8.5 | 23.38 | 41.47 | 42.85 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/gcnet/gcnet_r50-d8_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x512_80k_ade20k/gcnet_r50-d8_512x512_80k_ade20k_20200614_185146-91a6da41.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x512_80k_ade20k/gcnet_r50-d8_512x512_80k_ade20k_20200614_185146.log.json) | -| GCNet | R-101-D8 | 512x512 | 80000 | 12 | 15.20 | 42.82 | 44.54 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/gcnet/gcnet_r101-d8_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x512_80k_ade20k/gcnet_r101-d8_512x512_80k_ade20k_20200615_020811-c3fcb6dd.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x512_80k_ade20k/gcnet_r101-d8_512x512_80k_ade20k_20200615_020811.log.json) | -| GCNet | R-50-D8 | 512x512 | 160000 | - | - | 42.37 | 43.52 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/gcnet/gcnet_r50-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x512_160k_ade20k/gcnet_r50-d8_512x512_160k_ade20k_20200615_224122-d95f3e1f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x512_160k_ade20k/gcnet_r50-d8_512x512_160k_ade20k_20200615_224122.log.json) | -| GCNet | R-101-D8 | 512x512 | 160000 | - | - | 43.69 | 45.21 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/gcnet/gcnet_r101-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x512_160k_ade20k/gcnet_r101-d8_512x512_160k_ade20k_20200615_225406-615528d7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x512_160k_ade20k/gcnet_r101-d8_512x512_160k_ade20k_20200615_225406.log.json) | - -### Pascal VOC 2012 + Aug - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ---------------------------------------------------------------------------------------------------------------------- | 
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| GCNet | R-50-D8 | 512x512 | 20000 | 5.8 | 23.35 | 76.42 | 77.51 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/gcnet/gcnet_r50-d8_512x512_20k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x512_20k_voc12aug/gcnet_r50-d8_512x512_20k_voc12aug_20200617_165701-3cbfdab1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x512_20k_voc12aug/gcnet_r50-d8_512x512_20k_voc12aug_20200617_165701.log.json) | -| GCNet | R-101-D8 | 512x512 | 20000 | 9.2 | 14.80 | 77.41 | 78.56 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/gcnet/gcnet_r101-d8_512x512_20k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x512_20k_voc12aug/gcnet_r101-d8_512x512_20k_voc12aug_20200617_165713-6c720aa9.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x512_20k_voc12aug/gcnet_r101-d8_512x512_20k_voc12aug_20200617_165713.log.json) | -| GCNet | R-50-D8 | 512x512 | 40000 | - | - | 76.24 | 77.63 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/gcnet/gcnet_r50-d8_512x512_40k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x512_40k_voc12aug/gcnet_r50-d8_512x512_40k_voc12aug_20200613_195105-9797336d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x512_40k_voc12aug/gcnet_r50-d8_512x512_40k_voc12aug_20200613_195105.log.json) | -| GCNet | R-101-D8 | 512x512 | 40000 | - | - | 77.84 | 78.59 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/gcnet/gcnet_r101-d8_512x512_40k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x512_40k_voc12aug/gcnet_r101-d8_512x512_40k_voc12aug_20200613_185806-1e38208d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x512_40k_voc12aug/gcnet_r101-d8_512x512_40k_voc12aug_20200613_185806.log.json) | diff --git a/configs/gcnet/gcnet.yml b/configs/gcnet/gcnet.yml deleted file mode 100644 index 1d5eecfc55..0000000000 --- a/configs/gcnet/gcnet.yml +++ /dev/null @@ -1,305 +0,0 @@ -Collections: -- Name: GCNet - Metadata: - Training Data: - - Cityscapes - - ADE20K - - Pascal VOC 2012 + Aug - Paper: - URL: https://arxiv.org/abs/1904.11492 - Title: 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond' - README: configs/gcnet/README.md - Code: - URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/gc_head.py#L10 - Version: v0.17.0 - Converted From: - Code: https://github.com/xvjiarui/GCNet -Models: -- Name: gcnet_r50-d8_512x1024_40k_cityscapes - In Collection: GCNet - Metadata: - backbone: R-50-D8 - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 254.45 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 5.8 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 77.69 - mIoU(ms+flip): 78.56 - Config: configs/gcnet/gcnet_r50-d8_512x1024_40k_cityscapes.py - Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x1024_40k_cityscapes/gcnet_r50-d8_512x1024_40k_cityscapes_20200618_074436-4b0fd17b.pth -- Name: gcnet_r101-d8_512x1024_40k_cityscapes - In Collection: GCNet - Metadata: - backbone: R-101-D8 - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 383.14 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 9.2 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.28 - mIoU(ms+flip): 79.34 - Config: configs/gcnet/gcnet_r101-d8_512x1024_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x1024_40k_cityscapes/gcnet_r101-d8_512x1024_40k_cityscapes_20200618_074436-5e62567f.pth -- Name: gcnet_r50-d8_769x769_40k_cityscapes - In Collection: GCNet - Metadata: - backbone: R-50-D8 - crop size: (769,769) - lr schd: 40000 - inference time (ms/im): - - value: 598.8 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 6.5 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.12 - mIoU(ms+flip): 80.09 - Config: configs/gcnet/gcnet_r50-d8_769x769_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_769x769_40k_cityscapes/gcnet_r50-d8_769x769_40k_cityscapes_20200618_182814-a26f4471.pth -- Name: gcnet_r101-d8_769x769_40k_cityscapes - In Collection: GCNet - Metadata: - backbone: R-101-D8 - crop size: (769,769) - lr schd: 40000 - inference time (ms/im): - - value: 884.96 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 10.5 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.95 - mIoU(ms+flip): 80.71 - Config: configs/gcnet/gcnet_r101-d8_769x769_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_769x769_40k_cityscapes/gcnet_r101-d8_769x769_40k_cityscapes_20200619_092550-ca4f0a84.pth -- Name: gcnet_r50-d8_512x1024_80k_cityscapes - In Collection: GCNet - Metadata: - backbone: R-50-D8 - crop size: (512,1024) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.48 - mIoU(ms+flip): 80.01 - Config: configs/gcnet/gcnet_r50-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x1024_80k_cityscapes/gcnet_r50-d8_512x1024_80k_cityscapes_20200618_074450-ef8f069b.pth -- Name: gcnet_r101-d8_512x1024_80k_cityscapes - In Collection: GCNet - Metadata: - backbone: R-101-D8 - crop size: (512,1024) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.03 - mIoU(ms+flip): 79.84 - Config: configs/gcnet/gcnet_r101-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x1024_80k_cityscapes/gcnet_r101-d8_512x1024_80k_cityscapes_20200618_074450-778ebf69.pth -- Name: gcnet_r50-d8_769x769_80k_cityscapes - In Collection: GCNet - Metadata: - backbone: R-50-D8 - crop size: (769,769) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.68 - mIoU(ms+flip): 80.66 - Config: configs/gcnet/gcnet_r50-d8_769x769_80k_cityscapes.py - Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_769x769_80k_cityscapes/gcnet_r50-d8_769x769_80k_cityscapes_20200619_092516-4839565b.pth -- Name: gcnet_r101-d8_769x769_80k_cityscapes - In Collection: GCNet - Metadata: - backbone: R-101-D8 - crop size: (769,769) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.18 - mIoU(ms+flip): 80.71 - Config: configs/gcnet/gcnet_r101-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_769x769_80k_cityscapes/gcnet_r101-d8_769x769_80k_cityscapes_20200619_092628-8e043423.pth -- Name: gcnet_r50-d8_512x512_80k_ade20k - In Collection: GCNet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 42.77 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 8.5 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 41.47 - mIoU(ms+flip): 42.85 - Config: configs/gcnet/gcnet_r50-d8_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x512_80k_ade20k/gcnet_r50-d8_512x512_80k_ade20k_20200614_185146-91a6da41.pth -- Name: gcnet_r101-d8_512x512_80k_ade20k - In Collection: GCNet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 65.79 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 12.0 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 42.82 - mIoU(ms+flip): 44.54 - Config: configs/gcnet/gcnet_r101-d8_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x512_80k_ade20k/gcnet_r101-d8_512x512_80k_ade20k_20200615_020811-c3fcb6dd.pth -- Name: gcnet_r50-d8_512x512_160k_ade20k - In Collection: GCNet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 42.37 - mIoU(ms+flip): 43.52 - Config: configs/gcnet/gcnet_r50-d8_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x512_160k_ade20k/gcnet_r50-d8_512x512_160k_ade20k_20200615_224122-d95f3e1f.pth -- Name: gcnet_r101-d8_512x512_160k_ade20k - In Collection: GCNet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 43.69 - mIoU(ms+flip): 45.21 - Config: configs/gcnet/gcnet_r101-d8_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x512_160k_ade20k/gcnet_r101-d8_512x512_160k_ade20k_20200615_225406-615528d7.pth -- Name: gcnet_r50-d8_512x512_20k_voc12aug - In Collection: GCNet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 20000 - inference time (ms/im): - - value: 42.83 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 5.8 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 76.42 - mIoU(ms+flip): 77.51 - Config: configs/gcnet/gcnet_r50-d8_512x512_20k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x512_20k_voc12aug/gcnet_r50-d8_512x512_20k_voc12aug_20200617_165701-3cbfdab1.pth -- Name: gcnet_r101-d8_512x512_20k_voc12aug - In 
Collection: GCNet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 20000 - inference time (ms/im): - - value: 67.57 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 9.2 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 77.41 - mIoU(ms+flip): 78.56 - Config: configs/gcnet/gcnet_r101-d8_512x512_20k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x512_20k_voc12aug/gcnet_r101-d8_512x512_20k_voc12aug_20200617_165713-6c720aa9.pth -- Name: gcnet_r50-d8_512x512_40k_voc12aug - In Collection: GCNet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 40000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 76.24 - mIoU(ms+flip): 77.63 - Config: configs/gcnet/gcnet_r50-d8_512x512_40k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x512_40k_voc12aug/gcnet_r50-d8_512x512_40k_voc12aug_20200613_195105-9797336d.pth -- Name: gcnet_r101-d8_512x512_40k_voc12aug - In Collection: GCNet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 40000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 77.84 - mIoU(ms+flip): 78.59 - Config: configs/gcnet/gcnet_r101-d8_512x512_40k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x512_40k_voc12aug/gcnet_r101-d8_512x512_40k_voc12aug_20200613_185806-1e38208d.pth diff --git a/configs/gcnet/gcnet_r101-d8_4xb2-40k_cityscapes-512x1024.py b/configs/gcnet/gcnet_r101-d8_4xb2-40k_cityscapes-512x1024.py new file mode 100644 index 0000000000..e8f7c552fb --- /dev/null +++ b/configs/gcnet/gcnet_r101-d8_4xb2-40k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './gcnet_r50-d8_4xb2-40k_cityscapes-512x1024.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/gcnet/gcnet_r101-d8_4xb2-40k_cityscapes-769x769.py b/configs/gcnet/gcnet_r101-d8_4xb2-40k_cityscapes-769x769.py new file mode 100644 index 0000000000..887d17b71d --- /dev/null +++ b/configs/gcnet/gcnet_r101-d8_4xb2-40k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './gcnet_r50-d8_4xb2-40k_cityscapes-769x769.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/gcnet/gcnet_r101-d8_4xb2-80k_cityscapes-512x1024.py b/configs/gcnet/gcnet_r101-d8_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..aa47578d16 --- /dev/null +++ b/configs/gcnet/gcnet_r101-d8_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './gcnet_r50-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/gcnet/gcnet_r101-d8_4xb2-80k_cityscapes-769x769.py b/configs/gcnet/gcnet_r101-d8_4xb2-80k_cityscapes-769x769.py new file mode 100644 index 0000000000..ddf4ad7bbc --- /dev/null +++ b/configs/gcnet/gcnet_r101-d8_4xb2-80k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './gcnet_r50-d8_4xb2-80k_cityscapes-769x769.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/gcnet/gcnet_r101-d8_4xb4-160k_ade20k-512x512.py b/configs/gcnet/gcnet_r101-d8_4xb4-160k_ade20k-512x512.py new file mode 100644 index 0000000000..45285c0183 --- /dev/null +++ b/configs/gcnet/gcnet_r101-d8_4xb4-160k_ade20k-512x512.py @@ -0,0 +1,2 @@ +_base_ = 
'./gcnet_r50-d8_4xb4-160k_ade20k-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/gcnet/gcnet_r101-d8_4xb4-20k_voc12aug-512x512.py b/configs/gcnet/gcnet_r101-d8_4xb4-20k_voc12aug-512x512.py new file mode 100644 index 0000000000..b466c409e8 --- /dev/null +++ b/configs/gcnet/gcnet_r101-d8_4xb4-20k_voc12aug-512x512.py @@ -0,0 +1,2 @@ +_base_ = './gcnet_r50-d8_4xb4-20k_voc12aug-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/gcnet/gcnet_r101-d8_4xb4-40k_voc12aug-512x512.py b/configs/gcnet/gcnet_r101-d8_4xb4-40k_voc12aug-512x512.py new file mode 100644 index 0000000000..9c7f741f05 --- /dev/null +++ b/configs/gcnet/gcnet_r101-d8_4xb4-40k_voc12aug-512x512.py @@ -0,0 +1,2 @@ +_base_ = './gcnet_r50-d8_4xb4-40k_voc12aug-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/gcnet/gcnet_r101-d8_4xb4-80k_ade20k-512x512.py b/configs/gcnet/gcnet_r101-d8_4xb4-80k_ade20k-512x512.py new file mode 100644 index 0000000000..61337dbda2 --- /dev/null +++ b/configs/gcnet/gcnet_r101-d8_4xb4-80k_ade20k-512x512.py @@ -0,0 +1,2 @@ +_base_ = './gcnet_r50-d8_4xb4-80k_ade20k-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/gcnet/gcnet_r101-d8_512x1024_40k_cityscapes.py b/configs/gcnet/gcnet_r101-d8_512x1024_40k_cityscapes.py deleted file mode 100644 index 27bd9422da..0000000000 --- a/configs/gcnet/gcnet_r101-d8_512x1024_40k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './gcnet_r50-d8_512x1024_40k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/gcnet/gcnet_r101-d8_512x1024_80k_cityscapes.py b/configs/gcnet/gcnet_r101-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index 7f0f83fe39..0000000000 --- a/configs/gcnet/gcnet_r101-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './gcnet_r50-d8_512x1024_80k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/gcnet/gcnet_r101-d8_512x512_160k_ade20k.py b/configs/gcnet/gcnet_r101-d8_512x512_160k_ade20k.py deleted file mode 100644 index 9888120f65..0000000000 --- a/configs/gcnet/gcnet_r101-d8_512x512_160k_ade20k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './gcnet_r50-d8_512x512_160k_ade20k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/gcnet/gcnet_r101-d8_512x512_20k_voc12aug.py b/configs/gcnet/gcnet_r101-d8_512x512_20k_voc12aug.py deleted file mode 100644 index 1b70ca8e46..0000000000 --- a/configs/gcnet/gcnet_r101-d8_512x512_20k_voc12aug.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './gcnet_r50-d8_512x512_20k_voc12aug.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/gcnet/gcnet_r101-d8_512x512_40k_voc12aug.py b/configs/gcnet/gcnet_r101-d8_512x512_40k_voc12aug.py deleted file mode 100644 index b17c7a12b5..0000000000 --- a/configs/gcnet/gcnet_r101-d8_512x512_40k_voc12aug.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './gcnet_r50-d8_512x512_40k_voc12aug.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/gcnet/gcnet_r101-d8_512x512_80k_ade20k.py b/configs/gcnet/gcnet_r101-d8_512x512_80k_ade20k.py deleted file mode 100644 index a2183fc2db..0000000000 --- a/configs/gcnet/gcnet_r101-d8_512x512_80k_ade20k.py 
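The two-line R-101 configs added above rely on MMSegmentation's `_base_` inheritance: each file imports the corresponding R-50 config and overrides only the pretrained weights and the backbone depth, while the new `metafile.yaml` records, for every config name, its metrics and released checkpoint. As a rough illustration of how these two pieces are consumed together, the sketch below loads one of the renamed configs and looks up its weights URL in the metafile. It is not part of this patch: it assumes the MMEngine-based 1.x toolchain (`mmengine.Config`) plus PyYAML are installed and that it runs from the repository root, so treat it as a sketch under those assumptions.

```python
# Illustrative sketch, not part of this patch: assumes mmengine and PyYAML
# are installed and the script runs from the mmsegmentation repository root.
import yaml
from mmengine import Config

# The R-101 config is two lines: `_base_` pulls in the full R-50 recipe, and
# the `model` dict overrides only the pretrained weights and backbone depth.
cfg = Config.fromfile('configs/gcnet/gcnet_r101-d8_4xb2-40k_cityscapes-512x1024.py')
print(cfg.model.backbone.depth)  # 101, overriding the 50 inherited from the base

# metafile.yaml maps each config name to its metrics and released checkpoint,
# which is how model-index tooling can resolve a model name to a weights URL.
with open('configs/gcnet/metafile.yaml') as f:
    meta = yaml.safe_load(f)
weights = {m['Name']: m['Weights'] for m in meta['Models']}
print(weights['gcnet_r101-d8_4xb2-40k_cityscapes-512x1024'])

# Note on units: the deleted gcnet.yml stored latency as ms/im, while the
# README tables report fps; the two are reciprocals, e.g. 1000 / 3.93 ≈ 254.45.
```

The same pattern applies to every backbone variant in this patch: only the base R-50 config carries the full recipe, and each derived file stays a two-line delta, which keeps the rename from `512x1024_40k` style names to the `4xb2-40k` scheme mechanical.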
+++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './gcnet_r50-d8_512x512_80k_ade20k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/gcnet/gcnet_r101-d8_769x769_40k_cityscapes.py b/configs/gcnet/gcnet_r101-d8_769x769_40k_cityscapes.py deleted file mode 100644 index 08a6031f20..0000000000 --- a/configs/gcnet/gcnet_r101-d8_769x769_40k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './gcnet_r50-d8_769x769_40k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/gcnet/gcnet_r101-d8_769x769_80k_cityscapes.py b/configs/gcnet/gcnet_r101-d8_769x769_80k_cityscapes.py deleted file mode 100644 index 5efb61339c..0000000000 --- a/configs/gcnet/gcnet_r101-d8_769x769_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './gcnet_r50-d8_769x769_80k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/gcnet/gcnet_r50-d8_512x1024_40k_cityscapes.py b/configs/gcnet/gcnet_r50-d8_4xb2-40k_cityscapes-512x1024.py similarity index 100% rename from configs/gcnet/gcnet_r50-d8_512x1024_40k_cityscapes.py rename to configs/gcnet/gcnet_r50-d8_4xb2-40k_cityscapes-512x1024.py diff --git a/configs/gcnet/gcnet_r50-d8_769x769_40k_cityscapes.py b/configs/gcnet/gcnet_r50-d8_4xb2-40k_cityscapes-769x769.py similarity index 100% rename from configs/gcnet/gcnet_r50-d8_769x769_40k_cityscapes.py rename to configs/gcnet/gcnet_r50-d8_4xb2-40k_cityscapes-769x769.py diff --git a/configs/gcnet/gcnet_r50-d8_512x1024_80k_cityscapes.py b/configs/gcnet/gcnet_r50-d8_4xb2-80k_cityscapes-512x1024.py similarity index 100% rename from configs/gcnet/gcnet_r50-d8_512x1024_80k_cityscapes.py rename to configs/gcnet/gcnet_r50-d8_4xb2-80k_cityscapes-512x1024.py diff --git a/configs/gcnet/gcnet_r50-d8_769x769_80k_cityscapes.py b/configs/gcnet/gcnet_r50-d8_4xb2-80k_cityscapes-769x769.py similarity index 100% rename from configs/gcnet/gcnet_r50-d8_769x769_80k_cityscapes.py rename to configs/gcnet/gcnet_r50-d8_4xb2-80k_cityscapes-769x769.py diff --git a/configs/gcnet/gcnet_r50-d8_512x512_160k_ade20k.py b/configs/gcnet/gcnet_r50-d8_4xb4-160k_ade20k-512x512.py similarity index 100% rename from configs/gcnet/gcnet_r50-d8_512x512_160k_ade20k.py rename to configs/gcnet/gcnet_r50-d8_4xb4-160k_ade20k-512x512.py diff --git a/configs/gcnet/gcnet_r50-d8_512x512_20k_voc12aug.py b/configs/gcnet/gcnet_r50-d8_4xb4-20k_voc12aug-512x512.py similarity index 100% rename from configs/gcnet/gcnet_r50-d8_512x512_20k_voc12aug.py rename to configs/gcnet/gcnet_r50-d8_4xb4-20k_voc12aug-512x512.py diff --git a/configs/gcnet/gcnet_r50-d8_512x512_40k_voc12aug.py b/configs/gcnet/gcnet_r50-d8_4xb4-40k_voc12aug-512x512.py similarity index 100% rename from configs/gcnet/gcnet_r50-d8_512x512_40k_voc12aug.py rename to configs/gcnet/gcnet_r50-d8_4xb4-40k_voc12aug-512x512.py diff --git a/configs/gcnet/gcnet_r50-d8_512x512_80k_ade20k.py b/configs/gcnet/gcnet_r50-d8_4xb4-80k_ade20k-512x512.py similarity index 100% rename from configs/gcnet/gcnet_r50-d8_512x512_80k_ade20k.py rename to configs/gcnet/gcnet_r50-d8_4xb4-80k_ade20k-512x512.py diff --git a/configs/gcnet/metafile.yaml b/configs/gcnet/metafile.yaml new file mode 100644 index 0000000000..1f3c4623a0 --- /dev/null +++ b/configs/gcnet/metafile.yaml @@ -0,0 +1,391 @@ +Collections: +- Name: GCNet + License: Apache License 2.0 + Metadata: + Training Data: + - Cityscapes + - ADE20K + - Pascal VOC 2012 + Aug + Paper: + Title: 'GCNet: Non-local Networks Meet 
Squeeze-Excitation Networks and Beyond' + URL: https://arxiv.org/abs/1904.11492 + README: configs/gcnet/README.md + Frameworks: + - PyTorch +Models: +- Name: gcnet_r50-d8_4xb2-40k_cityscapes-512x1024 + In Collection: GCNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 77.69 + mIoU(ms+flip): 78.56 + Config: configs/gcnet/gcnet_r50-d8_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - GCNet + Training Resources: 4x V100 GPUS + Memory (GB): 5.8 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x1024_40k_cityscapes/gcnet_r50-d8_512x1024_40k_cityscapes_20200618_074436-4b0fd17b.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x1024_40k_cityscapes/gcnet_r50-d8_512x1024_40k_cityscapes_20200618_074436.log.json + Paper: + Title: 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond' + URL: https://arxiv.org/abs/1904.11492 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/gc_head.py#L10 + Framework: PyTorch +- Name: gcnet_r101-d8_4xb2-40k_cityscapes-512x1024 + In Collection: GCNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.28 + mIoU(ms+flip): 79.34 + Config: configs/gcnet/gcnet_r101-d8_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - GCNet + Training Resources: 4x V100 GPUS + Memory (GB): 9.2 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x1024_40k_cityscapes/gcnet_r101-d8_512x1024_40k_cityscapes_20200618_074436-5e62567f.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x1024_40k_cityscapes/gcnet_r101-d8_512x1024_40k_cityscapes_20200618_074436.log.json + Paper: + Title: 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond' + URL: https://arxiv.org/abs/1904.11492 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/gc_head.py#L10 + Framework: PyTorch +- Name: gcnet_r50-d8_4xb2-40k_cityscapes-769x769 + In Collection: GCNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.12 + mIoU(ms+flip): 80.09 + Config: configs/gcnet/gcnet_r50-d8_4xb2-40k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - GCNet + Training Resources: 4x V100 GPUS + Memory (GB): 6.5 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_769x769_40k_cityscapes/gcnet_r50-d8_769x769_40k_cityscapes_20200618_182814-a26f4471.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_769x769_40k_cityscapes/gcnet_r50-d8_769x769_40k_cityscapes_20200618_182814.log.json + Paper: + Title: 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond' + URL: https://arxiv.org/abs/1904.11492 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/gc_head.py#L10 + Framework: PyTorch +- Name: gcnet_r101-d8_4xb2-40k_cityscapes-769x769 + In Collection: GCNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.95 + mIoU(ms+flip): 80.71 + Config: configs/gcnet/gcnet_r101-d8_4xb2-40k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - GCNet + Training Resources: 4x V100 GPUS + Memory 
(GB): 10.5 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_769x769_40k_cityscapes/gcnet_r101-d8_769x769_40k_cityscapes_20200619_092550-ca4f0a84.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_769x769_40k_cityscapes/gcnet_r101-d8_769x769_40k_cityscapes_20200619_092550.log.json + Paper: + Title: 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond' + URL: https://arxiv.org/abs/1904.11492 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/gc_head.py#L10 + Framework: PyTorch +- Name: gcnet_r50-d8_4xb2-80k_cityscapes-512x1024 + In Collection: GCNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.48 + mIoU(ms+flip): 80.01 + Config: configs/gcnet/gcnet_r50-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - GCNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x1024_80k_cityscapes/gcnet_r50-d8_512x1024_80k_cityscapes_20200618_074450-ef8f069b.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x1024_80k_cityscapes/gcnet_r50-d8_512x1024_80k_cityscapes_20200618_074450.log.json + Paper: + Title: 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond' + URL: https://arxiv.org/abs/1904.11492 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/gc_head.py#L10 + Framework: PyTorch +- Name: gcnet_r101-d8_4xb2-80k_cityscapes-512x1024 + In Collection: GCNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.03 + mIoU(ms+flip): 79.84 + Config: configs/gcnet/gcnet_r101-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - GCNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x1024_80k_cityscapes/gcnet_r101-d8_512x1024_80k_cityscapes_20200618_074450-778ebf69.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x1024_80k_cityscapes/gcnet_r101-d8_512x1024_80k_cityscapes_20200618_074450.log.json + Paper: + Title: 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond' + URL: https://arxiv.org/abs/1904.11492 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/gc_head.py#L10 + Framework: PyTorch +- Name: gcnet_r50-d8_4xb2-80k_cityscapes-769x769 + In Collection: GCNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.68 + mIoU(ms+flip): 80.66 + Config: configs/gcnet/gcnet_r50-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - GCNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_769x769_80k_cityscapes/gcnet_r50-d8_769x769_80k_cityscapes_20200619_092516-4839565b.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_769x769_80k_cityscapes/gcnet_r50-d8_769x769_80k_cityscapes_20200619_092516.log.json + Paper: + Title: 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond' + URL: https://arxiv.org/abs/1904.11492 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/gc_head.py#L10 + 
Framework: PyTorch +- Name: gcnet_r101-d8_4xb2-80k_cityscapes-769x769 + In Collection: GCNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.18 + mIoU(ms+flip): 80.71 + Config: configs/gcnet/gcnet_r101-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - GCNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_769x769_80k_cityscapes/gcnet_r101-d8_769x769_80k_cityscapes_20200619_092628-8e043423.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_769x769_80k_cityscapes/gcnet_r101-d8_769x769_80k_cityscapes_20200619_092628.log.json + Paper: + Title: 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond' + URL: https://arxiv.org/abs/1904.11492 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/gc_head.py#L10 + Framework: PyTorch +- Name: gcnet_r50-d8_4xb4-80k_ade20k-512x512 + In Collection: GCNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 41.47 + mIoU(ms+flip): 42.85 + Config: configs/gcnet/gcnet_r50-d8_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D8 + - GCNet + Training Resources: 4x V100 GPUS + Memory (GB): 8.5 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x512_80k_ade20k/gcnet_r50-d8_512x512_80k_ade20k_20200614_185146-91a6da41.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x512_80k_ade20k/gcnet_r50-d8_512x512_80k_ade20k_20200614_185146.log.json + Paper: + Title: 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond' + URL: https://arxiv.org/abs/1904.11492 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/gc_head.py#L10 + Framework: PyTorch +- Name: gcnet_r101-d8_4xb4-80k_ade20k-512x512 + In Collection: GCNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 42.82 + mIoU(ms+flip): 44.54 + Config: configs/gcnet/gcnet_r101-d8_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-101-D8 + - GCNet + Training Resources: 4x V100 GPUS + Memory (GB): 12.0 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x512_80k_ade20k/gcnet_r101-d8_512x512_80k_ade20k_20200615_020811-c3fcb6dd.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x512_80k_ade20k/gcnet_r101-d8_512x512_80k_ade20k_20200615_020811.log.json + Paper: + Title: 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond' + URL: https://arxiv.org/abs/1904.11492 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/gc_head.py#L10 + Framework: PyTorch +- Name: gcnet_r50-d8_4xb4-160k_ade20k-512x512 + In Collection: GCNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 42.37 + mIoU(ms+flip): 43.52 + Config: configs/gcnet/gcnet_r50-d8_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D8 + - GCNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x512_160k_ade20k/gcnet_r50-d8_512x512_160k_ade20k_20200615_224122-d95f3e1f.pth + Training log: 
https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x512_160k_ade20k/gcnet_r50-d8_512x512_160k_ade20k_20200615_224122.log.json + Paper: + Title: 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond' + URL: https://arxiv.org/abs/1904.11492 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/gc_head.py#L10 + Framework: PyTorch +- Name: gcnet_r101-d8_4xb4-160k_ade20k-512x512 + In Collection: GCNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 43.69 + mIoU(ms+flip): 45.21 + Config: configs/gcnet/gcnet_r101-d8_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-101-D8 + - GCNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x512_160k_ade20k/gcnet_r101-d8_512x512_160k_ade20k_20200615_225406-615528d7.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x512_160k_ade20k/gcnet_r101-d8_512x512_160k_ade20k_20200615_225406.log.json + Paper: + Title: 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond' + URL: https://arxiv.org/abs/1904.11492 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/gc_head.py#L10 + Framework: PyTorch +- Name: gcnet_r50-d8_4xb4-20k_voc12aug-512x512 + In Collection: GCNet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 76.42 + mIoU(ms+flip): 77.51 + Config: configs/gcnet/gcnet_r50-d8_4xb4-20k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-50-D8 + - GCNet + Training Resources: 4x V100 GPUS + Memory (GB): 5.8 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x512_20k_voc12aug/gcnet_r50-d8_512x512_20k_voc12aug_20200617_165701-3cbfdab1.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x512_20k_voc12aug/gcnet_r50-d8_512x512_20k_voc12aug_20200617_165701.log.json + Paper: + Title: 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond' + URL: https://arxiv.org/abs/1904.11492 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/gc_head.py#L10 + Framework: PyTorch +- Name: gcnet_r101-d8_4xb4-20k_voc12aug-512x512 + In Collection: GCNet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 77.41 + mIoU(ms+flip): 78.56 + Config: configs/gcnet/gcnet_r101-d8_4xb4-20k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-101-D8 + - GCNet + Training Resources: 4x V100 GPUS + Memory (GB): 9.2 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x512_20k_voc12aug/gcnet_r101-d8_512x512_20k_voc12aug_20200617_165713-6c720aa9.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x512_20k_voc12aug/gcnet_r101-d8_512x512_20k_voc12aug_20200617_165713.log.json + Paper: + Title: 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond' + URL: https://arxiv.org/abs/1904.11492 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/gc_head.py#L10 + Framework: PyTorch +- Name: gcnet_r50-d8_4xb4-40k_voc12aug-512x512 + In Collection: GCNet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 
76.24 + mIoU(ms+flip): 77.63 + Config: configs/gcnet/gcnet_r50-d8_4xb4-40k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-50-D8 + - GCNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x512_40k_voc12aug/gcnet_r50-d8_512x512_40k_voc12aug_20200613_195105-9797336d.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r50-d8_512x512_40k_voc12aug/gcnet_r50-d8_512x512_40k_voc12aug_20200613_195105.log.json + Paper: + Title: 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond' + URL: https://arxiv.org/abs/1904.11492 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/gc_head.py#L10 + Framework: PyTorch +- Name: gcnet_r101-d8_4xb4-40k_voc12aug-512x512 + In Collection: GCNet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 77.84 + mIoU(ms+flip): 78.59 + Config: configs/gcnet/gcnet_r101-d8_4xb4-40k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-101-D8 + - GCNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x512_40k_voc12aug/gcnet_r101-d8_512x512_40k_voc12aug_20200613_185806-1e38208d.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/gcnet/gcnet_r101-d8_512x512_40k_voc12aug/gcnet_r101-d8_512x512_40k_voc12aug_20200613_185806.log.json + Paper: + Title: 'GCNet: Non-local Networks Meet Squeeze-Excitation Networks and Beyond' + URL: https://arxiv.org/abs/1904.11492 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/gc_head.py#L10 + Framework: PyTorch diff --git a/configs/hrnet/README.md b/configs/hrnet/README.md index 9ebbf4d62b..b529fc895e 100644 --- a/configs/hrnet/README.md +++ b/configs/hrnet/README.md @@ -1,6 +1,6 @@ # HRNet -[Deep High-Resolution Representation Learning for Human Pose Estimation](https://arxiv.org/abs/1908.07919) +> [Deep High-Resolution Representation Learning for Human Pose Estimation](https://arxiv.org/abs/1908.07919) ## Introduction @@ -22,101 +22,101 @@ High-resolution representations are essential for position-sensitive vision prob -## Citation - -```bibtext -@inproceedings{SunXLW19, - title={Deep High-Resolution Representation Learning for Human Pose Estimation}, - author={Ke Sun and Bin Xiao and Dong Liu and Jingdong Wang}, - booktitle={CVPR}, - year={2019} -} -``` - ## Results and models ### Cityscapes -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | ------------------ | --------- | ------: | -------- | -------------- | ----: | ------------: | ---------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| FCN | HRNetV2p-W18-Small | 512x1024 | 40000 | 1.7 | 23.74 | 73.86 | 75.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr18s_512x1024_40k_cityscapes.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x1024_40k_cityscapes/fcn_hr18s_512x1024_40k_cityscapes_20200601_014216-93db27d0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x1024_40k_cityscapes/fcn_hr18s_512x1024_40k_cityscapes_20200601_014216.log.json) | -| FCN | HRNetV2p-W18 | 512x1024 | 40000 | 2.9 | 12.97 | 77.19 | 78.92 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr18_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x1024_40k_cityscapes/fcn_hr18_512x1024_40k_cityscapes_20200601_014216-f196fb4e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x1024_40k_cityscapes/fcn_hr18_512x1024_40k_cityscapes_20200601_014216.log.json) | -| FCN | HRNetV2p-W48 | 512x1024 | 40000 | 6.2 | 6.42 | 78.48 | 79.69 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr48_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x1024_40k_cityscapes/fcn_hr48_512x1024_40k_cityscapes_20200601_014240-a989b146.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x1024_40k_cityscapes/fcn_hr48_512x1024_40k_cityscapes_20200601_014240.log.json) | -| FCN | HRNetV2p-W18-Small | 512x1024 | 80000 | - | - | 75.31 | 77.48 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr18s_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x1024_80k_cityscapes/fcn_hr18s_512x1024_80k_cityscapes_20200601_202700-1462b75d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x1024_80k_cityscapes/fcn_hr18s_512x1024_80k_cityscapes_20200601_202700.log.json) | -| FCN | HRNetV2p-W18 | 512x1024 | 80000 | - | - | 78.65 | 80.35 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr18_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x1024_80k_cityscapes/fcn_hr18_512x1024_80k_cityscapes_20200601_223255-4e7b345e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x1024_80k_cityscapes/fcn_hr18_512x1024_80k_cityscapes_20200601_223255.log.json) | -| FCN | HRNetV2p-W48 | 512x1024 | 80000 | - | - | 79.93 | 80.72 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr48_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x1024_80k_cityscapes/fcn_hr48_512x1024_80k_cityscapes_20200601_202606-58ea95d6.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x1024_80k_cityscapes/fcn_hr48_512x1024_80k_cityscapes_20200601_202606.log.json) | -| FCN | HRNetV2p-W18-Small | 512x1024 | 160000 | - | - | 76.31 | 78.31 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr18s_512x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x1024_160k_cityscapes/fcn_hr18s_512x1024_160k_cityscapes_20200602_190901-4a0797ea.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x1024_160k_cityscapes/fcn_hr18s_512x1024_160k_cityscapes_20200602_190901.log.json) | -| FCN | HRNetV2p-W18 | 512x1024 | 160000 | - | - | 78.80 | 80.74 | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr18_512x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x1024_160k_cityscapes/fcn_hr18_512x1024_160k_cityscapes_20200602_190822-221e4a4f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x1024_160k_cityscapes/fcn_hr18_512x1024_160k_cityscapes_20200602_190822.log.json) | -| FCN | HRNetV2p-W48 | 512x1024 | 160000 | - | - | 80.65 | 81.92 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr48_512x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x1024_160k_cityscapes/fcn_hr48_512x1024_160k_cityscapes_20200602_190946-59b7973e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x1024_160k_cityscapes/fcn_hr48_512x1024_160k_cityscapes_20200602_190946.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | ------------------ | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| FCN | HRNetV2p-W18-Small | 512x1024 | 40000 | 1.7 | 23.74 | V100 | 73.86 | 75.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr18s_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x1024_40k_cityscapes/fcn_hr18s_512x1024_40k_cityscapes_20200601_014216-93db27d0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x1024_40k_cityscapes/fcn_hr18s_512x1024_40k_cityscapes_20200601_014216.log.json) | +| FCN | HRNetV2p-W18 | 512x1024 | 40000 | 2.9 | 12.97 | V100 | 77.19 | 78.92 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr18_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x1024_40k_cityscapes/fcn_hr18_512x1024_40k_cityscapes_20200601_014216-f196fb4e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x1024_40k_cityscapes/fcn_hr18_512x1024_40k_cityscapes_20200601_014216.log.json) | +| FCN | HRNetV2p-W48 | 512x1024 | 40000 | 6.2 | 6.42 | V100 | 78.48 | 79.69 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr48_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x1024_40k_cityscapes/fcn_hr48_512x1024_40k_cityscapes_20200601_014240-a989b146.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x1024_40k_cityscapes/fcn_hr48_512x1024_40k_cityscapes_20200601_014240.log.json) | +| FCN | HRNetV2p-W18-Small | 512x1024 | 80000 | - | - | V100 | 75.31 | 77.48 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr18s_4xb2-80k_cityscapes-512x1024.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x1024_80k_cityscapes/fcn_hr18s_512x1024_80k_cityscapes_20200601_202700-1462b75d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x1024_80k_cityscapes/fcn_hr18s_512x1024_80k_cityscapes_20200601_202700.log.json) | +| FCN | HRNetV2p-W18 | 512x1024 | 80000 | - | - | V100 | 78.65 | 80.35 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr18_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x1024_80k_cityscapes/fcn_hr18_512x1024_80k_cityscapes_20200601_223255-4e7b345e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x1024_80k_cityscapes/fcn_hr18_512x1024_80k_cityscapes_20200601_223255.log.json) | +| FCN | HRNetV2p-W48 | 512x1024 | 80000 | - | - | V100 | 79.93 | 80.72 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr48_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x1024_80k_cityscapes/fcn_hr48_512x1024_80k_cityscapes_20200601_202606-58ea95d6.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x1024_80k_cityscapes/fcn_hr48_512x1024_80k_cityscapes_20200601_202606.log.json) | +| FCN | HRNetV2p-W18-Small | 512x1024 | 160000 | - | - | V100 | 76.31 | 78.31 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr18s_4xb2-160k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x1024_160k_cityscapes/fcn_hr18s_512x1024_160k_cityscapes_20200602_190901-4a0797ea.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x1024_160k_cityscapes/fcn_hr18s_512x1024_160k_cityscapes_20200602_190901.log.json) | +| FCN | HRNetV2p-W18 | 512x1024 | 160000 | - | - | V100 | 78.80 | 80.74 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr18_4xb2-160k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x1024_160k_cityscapes/fcn_hr18_512x1024_160k_cityscapes_20200602_190822-221e4a4f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x1024_160k_cityscapes/fcn_hr18_512x1024_160k_cityscapes_20200602_190822.log.json) | +| FCN | HRNetV2p-W48 | 512x1024 | 160000 | - | - | V100 | 80.65 | 81.92 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr48_4xb2-160k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x1024_160k_cityscapes/fcn_hr48_512x1024_160k_cityscapes_20200602_190946-59b7973e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x1024_160k_cityscapes/fcn_hr48_512x1024_160k_cityscapes_20200602_190946.log.json) | ### ADE20K -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | ------------------ | --------- | ------: | -------- | -------------- | ----: | ------------: | ----------------------------------------------------------------------------------------------------------------- | 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| FCN | HRNetV2p-W18-Small | 512x512 | 80000 | 3.8 | 38.66 | 31.38 | 32.45 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr18s_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_80k_ade20k/fcn_hr18s_512x512_80k_ade20k_20200614_144345-77fc814a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_80k_ade20k/fcn_hr18s_512x512_80k_ade20k_20200614_144345.log.json) | -| FCN | HRNetV2p-W18 | 512x512 | 80000 | 4.9 | 22.57 | 36.27 | 37.28 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr18_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_80k_ade20k/fcn_hr18_512x512_80k_ade20k_20210827_114910-6c9382c0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_80k_ade20k/fcn_hr18_512x512_80k_ade20k_20210827_114910.log.json) | -| FCN | HRNetV2p-W48 | 512x512 | 80000 | 8.2 | 21.23 | 41.90 | 43.27 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr48_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_80k_ade20k/fcn_hr48_512x512_80k_ade20k_20200614_193946-7ba5258d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_80k_ade20k/fcn_hr48_512x512_80k_ade20k_20200614_193946.log.json) | -| FCN | HRNetV2p-W18-Small | 512x512 | 160000 | - | - | 33.07 | 34.56 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr18s_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_160k_ade20k/fcn_hr18s_512x512_160k_ade20k_20210829_174739-f1e7c2e7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_160k_ade20k/fcn_hr18s_512x512_160k_ade20k_20210829_174739.log.json) | -| FCN | HRNetV2p-W18 | 512x512 | 160000 | - | - | 36.79 | 38.58 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr18_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_160k_ade20k/fcn_hr18_512x512_160k_ade20k_20200614_214426-ca961836.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_160k_ade20k/fcn_hr18_512x512_160k_ade20k_20200614_214426.log.json) | -| FCN | HRNetV2p-W48 | 512x512 | 160000 | - | - | 42.02 | 43.86 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr48_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_160k_ade20k/fcn_hr48_512x512_160k_ade20k_20200614_214407-a52fc02c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_160k_ade20k/fcn_hr48_512x512_160k_ade20k_20200614_214407.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | ------------------ | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | 
-------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| FCN | HRNetV2p-W18-Small | 512x512 | 80000 | 3.8 | 38.66 | V100 | 31.38 | 32.45 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr18s_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_80k_ade20k/fcn_hr18s_512x512_80k_ade20k_20200614_144345-77fc814a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_80k_ade20k/fcn_hr18s_512x512_80k_ade20k_20200614_144345.log.json) | +| FCN | HRNetV2p-W18 | 512x512 | 80000 | 4.9 | 22.57 | V100 | 36.27 | 37.28 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr18_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_80k_ade20k/fcn_hr18_512x512_80k_ade20k_20210827_114910-6c9382c0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_80k_ade20k/fcn_hr18_512x512_80k_ade20k_20210827_114910.log.json) | +| FCN | HRNetV2p-W48 | 512x512 | 80000 | 8.2 | 21.23 | V100 | 41.90 | 43.27 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr48_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_80k_ade20k/fcn_hr48_512x512_80k_ade20k_20200614_193946-7ba5258d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_80k_ade20k/fcn_hr48_512x512_80k_ade20k_20200614_193946.log.json) | +| FCN | HRNetV2p-W18-Small | 512x512 | 160000 | - | - | V100 | 33.07 | 34.56 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr18s_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_160k_ade20k/fcn_hr18s_512x512_160k_ade20k_20210829_174739-f1e7c2e7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_160k_ade20k/fcn_hr18s_512x512_160k_ade20k_20210829_174739.log.json) | +| FCN | HRNetV2p-W18 | 512x512 | 160000 | - | - | V100 | 36.79 | 38.58 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr18_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_160k_ade20k/fcn_hr18_512x512_160k_ade20k_20200614_214426-ca961836.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_160k_ade20k/fcn_hr18_512x512_160k_ade20k_20200614_214426.log.json) | +| FCN | HRNetV2p-W48 | 512x512 | 160000 | - | - | V100 | 42.02 | 43.86 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr48_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_160k_ade20k/fcn_hr48_512x512_160k_ade20k_20200614_214407-a52fc02c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_160k_ade20k/fcn_hr48_512x512_160k_ade20k_20200614_214407.log.json) | ### Pascal VOC 2012 + Aug -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | 
config | download | -| ------ | ------------------ | --------- | ------: | -------- | -------------- | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| FCN | HRNetV2p-W18-Small | 512x512 | 20000 | 1.8 | 43.36 | 65.5 | 68.89 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr18s_512x512_20k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_20k_voc12aug/fcn_hr18s_512x512_20k_voc12aug_20210829_174910-0aceadb4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_20k_voc12aug/fcn_hr18s_512x512_20k_voc12aug_20210829_174910.log.json) | -| FCN | HRNetV2p-W18 | 512x512 | 20000 | 2.9 | 23.48 | 72.30 | 74.71 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr18_512x512_20k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_20k_voc12aug/fcn_hr18_512x512_20k_voc12aug_20200617_224503-488d45f7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_20k_voc12aug/fcn_hr18_512x512_20k_voc12aug_20200617_224503.log.json) | -| FCN | HRNetV2p-W48 | 512x512 | 20000 | 6.2 | 22.05 | 75.87 | 78.58 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr48_512x512_20k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_20k_voc12aug/fcn_hr48_512x512_20k_voc12aug_20200617_224419-89de05cd.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_20k_voc12aug/fcn_hr48_512x512_20k_voc12aug_20200617_224419.log.json) | -| FCN | HRNetV2p-W18-Small | 512x512 | 40000 | - | - | 66.61 | 70.00 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr18s_512x512_40k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_40k_voc12aug/fcn_hr18s_512x512_40k_voc12aug_20200614_000648-4f8d6e7f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_40k_voc12aug/fcn_hr18s_512x512_40k_voc12aug_20200614_000648.log.json) | -| FCN | HRNetV2p-W18 | 512x512 | 40000 | - | - | 72.90 | 75.59 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr18_512x512_40k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_40k_voc12aug/fcn_hr18_512x512_40k_voc12aug_20200613_224401-1b4b76cd.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_40k_voc12aug/fcn_hr18_512x512_40k_voc12aug_20200613_224401.log.json) | -| FCN | HRNetV2p-W48 | 512x512 | 40000 | - | - | 76.24 | 78.49 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr48_512x512_40k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_40k_voc12aug/fcn_hr48_512x512_40k_voc12aug_20200613_222111-1b0f18bc.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_40k_voc12aug/fcn_hr48_512x512_40k_voc12aug_20200613_222111.log.json) | +| Method | 
Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | ------------------ | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | --------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| FCN | HRNetV2p-W18-Small | 512x512 | 20000 | 1.8 | 43.36 | V100 | 65.5 | 68.89 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr18s_4xb4-20k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_20k_voc12aug/fcn_hr18s_512x512_20k_voc12aug_20210829_174910-0aceadb4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_20k_voc12aug/fcn_hr18s_512x512_20k_voc12aug_20210829_174910.log.json) | +| FCN | HRNetV2p-W18 | 512x512 | 20000 | 2.9 | 23.48 | V100 | 72.30 | 74.71 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr18_4xb4-20k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_20k_voc12aug/fcn_hr18_512x512_20k_voc12aug_20200617_224503-488d45f7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_20k_voc12aug/fcn_hr18_512x512_20k_voc12aug_20200617_224503.log.json) | +| FCN | HRNetV2p-W48 | 512x512 | 20000 | 6.2 | 22.05 | V100 | 75.87 | 78.58 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr48_4xb4-20k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_20k_voc12aug/fcn_hr48_512x512_20k_voc12aug_20200617_224419-89de05cd.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_20k_voc12aug/fcn_hr48_512x512_20k_voc12aug_20200617_224419.log.json) | +| FCN | HRNetV2p-W18-Small | 512x512 | 40000 | - | - | V100 | 66.61 | 70.00 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr18s_4xb4-40k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_40k_voc12aug/fcn_hr18s_512x512_40k_voc12aug_20200614_000648-4f8d6e7f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_40k_voc12aug/fcn_hr18s_512x512_40k_voc12aug_20200614_000648.log.json) | +| FCN | HRNetV2p-W18 | 512x512 | 40000 | - | - | V100 | 72.90 | 75.59 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr18_4xb4-40k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_40k_voc12aug/fcn_hr18_512x512_40k_voc12aug_20200613_224401-1b4b76cd.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_40k_voc12aug/fcn_hr18_512x512_40k_voc12aug_20200613_224401.log.json) | +| FCN | HRNetV2p-W48 | 512x512 | 40000 | - | - | V100 | 76.24 | 78.49 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr48_4xb4-40k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_40k_voc12aug/fcn_hr48_512x512_40k_voc12aug_20200613_222111-1b0f18bc.pth) 
\| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_40k_voc12aug/fcn_hr48_512x512_40k_voc12aug_20200613_222111.log.json) | ### Pascal Context -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | ------------ | --------- | ------: | -------- | -------------- | ----: | ------------: | ----------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| FCN | HRNetV2p-W48 | 480x480 | 40000 | 6.1 | 8.86 | 45.14 | 47.42 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr48_480x480_40k_pascal_context.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_480x480_40k_pascal_context/fcn_hr48_480x480_40k_pascal_context_20200911_164852-667d00b0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_480x480_40k_pascal_context/fcn_hr48_480x480_40k_pascal_context-20200911_164852.log.json) | -| FCN | HRNetV2p-W48 | 480x480 | 80000 | - | - | 45.84 | 47.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr48_480x480_80k_pascal_context.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_480x480_80k_pascal_context/fcn_hr48_480x480_80k_pascal_context_20200911_155322-847a6711.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_480x480_80k_pascal_context/fcn_hr48_480x480_80k_pascal_context-20200911_155322.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | ------------ | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | -------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| FCN | HRNetV2p-W48 | 480x480 | 40000 | 6.1 | 8.86 | V100 | 45.14 | 47.42 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr48_4xb4-40k_pascal-context-480x480.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_480x480_40k_pascal_context/fcn_hr48_480x480_40k_pascal_context_20200911_164852-667d00b0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_480x480_40k_pascal_context/fcn_hr48_480x480_40k_pascal_context-20200911_164852.log.json) | +| FCN | HRNetV2p-W48 | 480x480 | 80000 | - | - | V100 | 45.84 | 47.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr48_4xb4-80k_pascal-context-480x480.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_480x480_80k_pascal_context/fcn_hr48_480x480_80k_pascal_context_20200911_155322-847a6711.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_480x480_80k_pascal_context/fcn_hr48_480x480_80k_pascal_context-20200911_155322.log.json) | ### Pascal Context 59 -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | ------------ | --------- | ------: | -------- | -------------- | ----: | ------------: | -------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| FCN | HRNetV2p-W48 | 480x480 | 40000 | - | - | 50.33 | 52.83 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr48_480x480_40k_pascal_context_59.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_480x480_40k_pascal_context_59/fcn_hr48_480x480_40k_pascal_context_59_20210410_122738-b808b8b2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_480x480_40k_pascal_context_59/fcn_hr48_480x480_40k_pascal_context_59-20210410_122738.log.json) | -| FCN | HRNetV2p-W48 | 480x480 | 80000 | - | - | 51.12 | 53.56 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr48_480x480_80k_pascal_context_59.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_480x480_80k_pascal_context_59/fcn_hr48_480x480_80k_pascal_context_59_20210411_003240-3ae7081e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_480x480_80k_pascal_context_59/fcn_hr48_480x480_80k_pascal_context_59-20210411_003240.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | ------------ | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ----------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| FCN | HRNetV2p-W48 | 480x480 | 40000 | - | - | V100 | 50.33 | 52.83 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr48_4xb4-40k_pascal-context-59-480x480.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_480x480_40k_pascal_context_59/fcn_hr48_480x480_40k_pascal_context_59_20210410_122738-b808b8b2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_480x480_40k_pascal_context_59/fcn_hr48_480x480_40k_pascal_context_59-20210410_122738.log.json) | +| FCN | HRNetV2p-W48 | 480x480 | 80000 | - | - | V100 | 51.12 | 53.56 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr48_4xb4-80k_pascal-context-59-480x480.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_480x480_80k_pascal_context_59/fcn_hr48_480x480_80k_pascal_context_59_20210411_003240-3ae7081e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_480x480_80k_pascal_context_59/fcn_hr48_480x480_80k_pascal_context_59-20210411_003240.log.json) | ### LoveDA -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | ------------------ | --------- | ------: | -------- | -------------- | ----: | ------------: | ---------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| FCN | HRNetV2p-W18-Small | 512x512 | 80000 | 1.59 | 24.87 | 49.28 | 49.42 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr18s_512x512_80k_loveda.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_80k_loveda/fcn_hr18s_512x512_80k_loveda_20211210_203228-60a86a7a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_80k_loveda/fcn_hr18s_512x512_80k_loveda_20211210_203228.log.json) | -| FCN | HRNetV2p-W18 | 512x512 | 80000 | 2.76 | 12.92 | 50.81 | 50.95 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr18_512x512_80k_loveda.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_80k_loveda/fcn_hr18_512x512_80k_loveda_20211210_203952-93d9c3b3.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_80k_loveda/fcn_hr18_512x512_80k_loveda_20211210_203952.log.json) | -| FCN | HRNetV2p-W48 | 512x512 | 80000 | 6.20 | 9.61 | 51.42 | 51.64 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr48_512x512_80k_loveda.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_80k_loveda/fcn_hr48_512x512_80k_loveda_20211211_044756-67072f55.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_80k_loveda/fcn_hr48_512x512_80k_loveda_20211211_044756.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | ------------------ | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| FCN | HRNetV2p-W18-Small | 512x512 | 80000 | 1.59 | 24.87 | V100 | 49.28 | 49.42 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr18s_4xb4-80k_loveda-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_80k_loveda/fcn_hr18s_512x512_80k_loveda_20211210_203228-60a86a7a.pth) \|
[log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_80k_loveda/fcn_hr18s_512x512_80k_loveda_20211210_203228.log.json) | +| FCN | HRNetV2p-W18 | 512x512 | 80000 | 2.76 | 12.92 | V100 | 50.81 | 50.95 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr18_4xb4-80k_loveda-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_80k_loveda/fcn_hr18_512x512_80k_loveda_20211210_203952-93d9c3b3.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_80k_loveda/fcn_hr18_512x512_80k_loveda_20211210_203952.log.json) | +| FCN | HRNetV2p-W48 | 512x512 | 80000 | 6.20 | 9.61 | V100 | 51.42 | 51.64 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr48_4xb4-80k_loveda-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_80k_loveda/fcn_hr48_512x512_80k_loveda_20211211_044756-67072f55.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_80k_loveda/fcn_hr48_512x512_80k_loveda_20211211_044756.log.json) | ### Potsdam -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | ------------------ | --------- | ------: | -------- | -------------- | ----: | ------------: | ----------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| FCN | HRNetV2p-W18-Small | 512x512 | 80000 | 1.58 | 36.00 | 77.64 | 78.8 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr18s_512x512_80k_potsdam.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_80k_potsdam/fcn_hr18s_512x512_80k_potsdam_20211218_205517-ba32af63.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_80k_potsdam/fcn_hr18s_512x512_80k_potsdam_20211218_205517.log.json) | -| FCN | HRNetV2p-W18 | 512x512 | 80000 | 2.76 | 19.25 | 78.26 | 79.24 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr18_512x512_80k_potsdam.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_80k_potsdam/fcn_hr18_512x512_80k_potsdam_20211218_205517-5d0387ad.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_80k_potsdam/fcn_hr18_512x512_80k_potsdam_20211218_205517.log.json) | -| FCN | HRNetV2p-W48 | 512x512 | 80000 | 6.20 | 16.42 | 78.39 | 79.34 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr48_512x512_80k_potsdam.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_80k_potsdam/fcn_hr48_512x512_80k_potsdam_20211219_020601-97434c78.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_80k_potsdam/fcn_hr48_512x512_80k_potsdam_20211219_020601.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | ------------------ | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | 
-------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| FCN | HRNetV2p-W18-Small | 512x512 | 80000 | 1.58 | 36.00 | V100 | 77.64 | 78.8 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr18s_4xb4-80k_potsdam-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_80k_potsdam/fcn_hr18s_512x512_80k_potsdam_20211218_205517-ba32af63.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_80k_potsdam/fcn_hr18s_512x512_80k_potsdam_20211218_205517.log.json) | +| FCN | HRNetV2p-W18 | 512x512 | 80000 | 2.76 | 19.25 | V100 | 78.26 | 79.24 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr18_4xb4-80k_potsdam-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_80k_potsdam/fcn_hr18_512x512_80k_potsdam_20211218_205517-5d0387ad.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_80k_potsdam/fcn_hr18_512x512_80k_potsdam_20211218_205517.log.json) | +| FCN | HRNetV2p-W48 | 512x512 | 80000 | 6.20 | 16.42 | V100 | 78.39 | 79.34 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr48_4xb4-80k_potsdam-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_80k_potsdam/fcn_hr48_512x512_80k_potsdam_20211219_020601-97434c78.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_80k_potsdam/fcn_hr48_512x512_80k_potsdam_20211219_020601.log.json) | ### Vaihingen -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | ------------------ | --------- | ------: | -------- | -------------- | ----: | ------------: | ----------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| FCN | HRNetV2p-W18-Small | 512x512 | 80000 | 1.58 | 38.11 | 71.81 | 73.1 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr18s_4x4_512x512_80k_vaihingen.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_4x4_512x512_80k_vaihingen/fcn_hr18s_4x4_512x512_80k_vaihingen_20211231_230909-b23aae02.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_4x4_512x512_80k_vaihingen/fcn_hr18s_4x4_512x512_80k_vaihingen_20211231_230909.log.json) | -| FCN | HRNetV2p-W18 | 512x512 | 80000 | 2.76 | 19.55 | 72.57 | 74.09 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr18_4x4_512x512_80k_vaihingen.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_4x4_512x512_80k_vaihingen/fcn_hr18_4x4_512x512_80k_vaihingen_20211231_231216-2ec3ae8a.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_4x4_512x512_80k_vaihingen/fcn_hr18_4x4_512x512_80k_vaihingen_20211231_231216.log.json) | -| FCN | HRNetV2p-W48 | 512x512 | 80000 | 6.20 | 17.25 | 72.50 | 73.52 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr48_4x4_512x512_80k_vaihingen.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_4x4_512x512_80k_vaihingen/fcn_hr48_4x4_512x512_80k_vaihingen_20211231_231244-7133cb22.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_4x4_512x512_80k_vaihingen/fcn_hr48_4x4_512x512_80k_vaihingen_20211231_231244.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | ------------------ | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ---------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| FCN | HRNetV2p-W18-Small | 512x512 | 80000 | 1.58 | 38.11 | V100 | 71.81 | 73.1 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr18s_4xb4-80k_vaihingen-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_4x4_512x512_80k_vaihingen/fcn_hr18s_4x4_512x512_80k_vaihingen_20211231_230909-b23aae02.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_4x4_512x512_80k_vaihingen/fcn_hr18s_4x4_512x512_80k_vaihingen_20211231_230909.log.json) | +| FCN | HRNetV2p-W18 | 512x512 | 80000 | 2.76 | 19.55 | V100 | 72.57 | 74.09 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr18_4xb4-80k_vaihingen-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_4x4_512x512_80k_vaihingen/fcn_hr18_4x4_512x512_80k_vaihingen_20211231_231216-2ec3ae8a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_4x4_512x512_80k_vaihingen/fcn_hr18_4x4_512x512_80k_vaihingen_20211231_231216.log.json) | +| FCN | HRNetV2p-W48 | 512x512 | 80000 | 6.20 | 17.25 | V100 | 72.50 | 73.52 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr48_4xb4-80k_vaihingen-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_4x4_512x512_80k_vaihingen/fcn_hr48_4x4_512x512_80k_vaihingen_20211231_231244-7133cb22.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_4x4_512x512_80k_vaihingen/fcn_hr48_4x4_512x512_80k_vaihingen_20211231_231244.log.json) | ### iSAID -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | ------------------ | --------- | ------: | -------- | -------------- | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------- | 
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| FCN | HRNetV2p-W18-Small | 896x896 | 80000 | 4.95 | 13.84 | 62.30 | 62.97 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr18s_4x4_896x896_80k_isaid.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_4x4_896x896_80k_isaid/fcn_hr18s_4x4_896x896_80k_isaid_20220118_001603-3cc0769b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_4x4_896x896_80k_isaid/fcn_hr18s_4x4_896x896_80k_isaid_20220118_001603.log.json) | -| FCN | HRNetV2p-W18 | 896x896 | 80000 | 8.30 | 7.71 | 65.06 | 65.60 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr18_4x4_896x896_80k_isaid.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_4x4_896x896_80k_isaid/fcn_hr18_4x4_896x896_80k_isaid_20220110_182230-49bf752e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_4x4_896x896_80k_isaid/fcn_hr18_4x4_896x896_80k_isaid_20220110_182230.log.json) | -| FCN | HRNetV2p-W48 | 896x896 | 80000 | 16.89 | 7.34 | 67.80 | 68.53 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet/fcn_hr48_4x4_896x896_80k_isaid.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_4x4_896x896_80k_isaid/fcn_hr48_4x4_896x896_80k_isaid_20220114_174643-547fc420.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_4x4_896x896_80k_isaid/fcn_hr48_4x4_896x896_80k_isaid_20220114_174643.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | ------------------ | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| FCN | HRNetV2p-W18-Small | 896x896 | 80000 | 4.95 | 13.84 | V100 | 62.30 | 62.97 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr18s_4xb4-80k_isaid-896x896.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_4x4_896x896_80k_isaid/fcn_hr18s_4x4_896x896_80k_isaid_20220118_001603-3cc0769b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_4x4_896x896_80k_isaid/fcn_hr18s_4x4_896x896_80k_isaid_20220118_001603.log.json) | +| FCN | HRNetV2p-W18 | 896x896 | 80000 | 8.30 | 7.71 | V100 | 65.06 | 65.60 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr18_4xb4-80k_isaid-896x896.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_4x4_896x896_80k_isaid/fcn_hr18_4x4_896x896_80k_isaid_20220110_182230-49bf752e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_4x4_896x896_80k_isaid/fcn_hr18_4x4_896x896_80k_isaid_20220110_182230.log.json) | +| FCN | 
HRNetV2p-W48 | 896x896 | 80000 | 16.89 | 7.34 | V100 | 67.80 | 68.53 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet/fcn_hr48_4xb4-80k_isaid-896x896.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_4x4_896x896_80k_isaid/fcn_hr48_4x4_896x896_80k_isaid_20220114_174643-547fc420.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_4x4_896x896_80k_isaid/fcn_hr48_4x4_896x896_80k_isaid_20220114_174643.log.json) | Note: - `896x896` is the Crop Size of the iSAID dataset, following the implementation of [PointFlow: Flowing Semantics Through Points for Aerial Image Segmentation](https://arxiv.org/pdf/2103.06564.pdf) + +## Citation + +```bibtex +@inproceedings{SunXLW19, + title={Deep High-Resolution Representation Learning for Human Pose Estimation}, + author={Ke Sun and Bin Xiao and Dong Liu and Jingdong Wang}, + booktitle={CVPR}, + year={2019} +} +```
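Since the README tables above pair every config with a released checkpoint, a quick way to sanity-check one row is to load it through the Python API. The sketch below is illustrative only: it assumes the MMSegmentation 1.x inference helpers (`init_model`/`inference_model` in `mmseg.apis`) that accompany the renamed configs, and the input image path is a placeholder.

```python
# Minimal sketch (assumes mmseg >= 1.x): load the first Cityscapes row of
# the FCN + HRNet table and segment a single image. The image path below
# is a placeholder, not a file shipped with this change.
from mmseg.apis import inference_model, init_model

config = 'configs/hrnet/fcn_hr18s_4xb2-40k_cityscapes-512x1024.py'
checkpoint = ('https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/'
              'fcn_hr18s_512x1024_40k_cityscapes/'
              'fcn_hr18s_512x1024_40k_cityscapes_20200601_014216-93db27d0.pth')

model = init_model(config, checkpoint, device='cpu')  # or 'cuda:0'
result = inference_model(model, 'path/to/image.png')
# result.pred_sem_seg holds the predicted per-pixel class indices
print(result.pred_sem_seg.data.shape)
```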
diff --git a/configs/hrnet/fcn_hr18_512x1024_160k_cityscapes.py b/configs/hrnet/fcn_hr18_4xb2-160k_cityscapes-512x1024.py similarity index 100% rename from configs/hrnet/fcn_hr18_512x1024_160k_cityscapes.py rename to configs/hrnet/fcn_hr18_4xb2-160k_cityscapes-512x1024.py diff --git a/configs/hrnet/fcn_hr18_512x1024_40k_cityscapes.py b/configs/hrnet/fcn_hr18_4xb2-40k_cityscapes-512x1024.py similarity index 100% rename from configs/hrnet/fcn_hr18_512x1024_40k_cityscapes.py rename to configs/hrnet/fcn_hr18_4xb2-40k_cityscapes-512x1024.py diff --git a/configs/hrnet/fcn_hr18_512x1024_80k_cityscapes.py b/configs/hrnet/fcn_hr18_4xb2-80k_cityscapes-512x1024.py similarity index 100% rename from configs/hrnet/fcn_hr18_512x1024_80k_cityscapes.py rename to configs/hrnet/fcn_hr18_4xb2-80k_cityscapes-512x1024.py diff --git a/configs/hrnet/fcn_hr18_512x512_160k_ade20k.py b/configs/hrnet/fcn_hr18_4xb4-160k_ade20k-512x512.py similarity index 100% rename from configs/hrnet/fcn_hr18_512x512_160k_ade20k.py rename to configs/hrnet/fcn_hr18_4xb4-160k_ade20k-512x512.py diff --git a/configs/hrnet/fcn_hr18_512x512_20k_voc12aug.py b/configs/hrnet/fcn_hr18_4xb4-20k_voc12aug-512x512.py similarity index 100% rename from configs/hrnet/fcn_hr18_512x512_20k_voc12aug.py rename to configs/hrnet/fcn_hr18_4xb4-20k_voc12aug-512x512.py diff --git a/configs/hrnet/fcn_hr18_480x480_40k_pascal_context.py b/configs/hrnet/fcn_hr18_4xb4-40k_pascal-context-480x480.py similarity index 100% rename from configs/hrnet/fcn_hr18_480x480_40k_pascal_context.py rename to configs/hrnet/fcn_hr18_4xb4-40k_pascal-context-480x480.py diff --git a/configs/hrnet/fcn_hr18_480x480_40k_pascal_context_59.py b/configs/hrnet/fcn_hr18_4xb4-40k_pascal-context-59-480x480.py similarity index 100% rename from configs/hrnet/fcn_hr18_480x480_40k_pascal_context_59.py rename to configs/hrnet/fcn_hr18_4xb4-40k_pascal-context-59-480x480.py diff --git a/configs/hrnet/fcn_hr18_512x512_40k_voc12aug.py b/configs/hrnet/fcn_hr18_4xb4-40k_voc12aug-512x512.py similarity index 100% rename from configs/hrnet/fcn_hr18_512x512_40k_voc12aug.py rename to configs/hrnet/fcn_hr18_4xb4-40k_voc12aug-512x512.py diff --git a/configs/hrnet/fcn_hr18_512x512_80k_ade20k.py b/configs/hrnet/fcn_hr18_4xb4-80k_ade20k-512x512.py similarity index 100% rename from configs/hrnet/fcn_hr18_512x512_80k_ade20k.py rename to configs/hrnet/fcn_hr18_4xb4-80k_ade20k-512x512.py diff --git a/configs/hrnet/fcn_hr18_4x4_896x896_80k_isaid.py b/configs/hrnet/fcn_hr18_4xb4-80k_isaid-896x896.py similarity index 100% rename from configs/hrnet/fcn_hr18_4x4_896x896_80k_isaid.py rename to configs/hrnet/fcn_hr18_4xb4-80k_isaid-896x896.py diff --git a/configs/hrnet/fcn_hr18_512x512_80k_loveda.py b/configs/hrnet/fcn_hr18_4xb4-80k_loveda-512x512.py similarity index 100% rename from configs/hrnet/fcn_hr18_512x512_80k_loveda.py rename to configs/hrnet/fcn_hr18_4xb4-80k_loveda-512x512.py diff --git a/configs/hrnet/fcn_hr18_480x480_80k_pascal_context.py b/configs/hrnet/fcn_hr18_4xb4-80k_pascal-context-480x480.py similarity index 100% rename from configs/hrnet/fcn_hr18_480x480_80k_pascal_context.py rename to configs/hrnet/fcn_hr18_4xb4-80k_pascal-context-480x480.py diff --git a/configs/hrnet/fcn_hr18_480x480_80k_pascal_context_59.py b/configs/hrnet/fcn_hr18_4xb4-80k_pascal-context-59-480x480.py similarity index 100% rename from configs/hrnet/fcn_hr18_480x480_80k_pascal_context_59.py rename to configs/hrnet/fcn_hr18_4xb4-80k_pascal-context-59-480x480.py diff --git a/configs/hrnet/fcn_hr18_512x512_80k_potsdam.py b/configs/hrnet/fcn_hr18_4xb4-80k_potsdam-512x512.py similarity index 100% rename from configs/hrnet/fcn_hr18_512x512_80k_potsdam.py rename to configs/hrnet/fcn_hr18_4xb4-80k_potsdam-512x512.py diff --git a/configs/hrnet/fcn_hr18_4x4_512x512_80k_vaihingen.py b/configs/hrnet/fcn_hr18_4xb4-80k_vaihingen-512x512.py similarity index 100% rename from configs/hrnet/fcn_hr18_4x4_512x512_80k_vaihingen.py rename to configs/hrnet/fcn_hr18_4xb4-80k_vaihingen-512x512.py
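The `fcn_hr18s_*` hunks that follow all repeat one pattern: each config inherits the matching `fcn_hr18` config through `_base_` and overrides only the backbone stages to the HRNetV2-W18-Small layout. As a hedged sketch (assuming the `mmengine` config loader used by MMSegmentation 1.x), the merge can be inspected like this:

```python
# Illustrative only: Config.fromfile() loads the _base_ file first, then
# merges the child dict over it, which is why each hr18s config below
# needs nothing beyond the small-backbone overrides.
from mmengine.config import Config

cfg = Config.fromfile('configs/hrnet/fcn_hr18s_4xb2-160k_cityscapes-512x1024.py')
print(cfg.model['pretrained'])  # 'open-mmlab://msra/hrnetv2_w18_small'
# stage4 comes from the override; everything else from the base fcn_hr18 config
print(cfg.model['backbone']['extra']['stage4']['num_blocks'])  # (2, 2, 2, 2)
```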
diff --git a/configs/hrnet/fcn_hr18s_480x480_40k_pascal_context.py b/configs/hrnet/fcn_hr18s_480x480_40k_pascal_context.py deleted file mode 100644 index d09931048f..0000000000 --- a/configs/hrnet/fcn_hr18s_480x480_40k_pascal_context.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './fcn_hr18_480x480_40k_pascal_context.py' -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w18_small', - backbone=dict( - extra=dict( - stage1=dict(num_blocks=(2, )), - stage2=dict(num_blocks=(2, 2)), - stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), - stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/hrnet/fcn_hr18s_480x480_40k_pascal_context_59.py b/configs/hrnet/fcn_hr18s_480x480_40k_pascal_context_59.py deleted file mode 100644 index 0412c64f31..0000000000 --- a/configs/hrnet/fcn_hr18s_480x480_40k_pascal_context_59.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './fcn_hr18_480x480_40k_pascal_context_59.py' -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w18_small', - backbone=dict( - extra=dict( - stage1=dict(num_blocks=(2, )), - stage2=dict(num_blocks=(2, 2)), - stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), - stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/hrnet/fcn_hr18s_480x480_80k_pascal_context.py b/configs/hrnet/fcn_hr18s_480x480_80k_pascal_context.py deleted file mode 100644 index 584b7135fd..0000000000 --- a/configs/hrnet/fcn_hr18s_480x480_80k_pascal_context.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './fcn_hr18_480x480_80k_pascal_context.py' -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w18_small', - backbone=dict( - extra=dict( - stage1=dict(num_blocks=(2, )), - stage2=dict(num_blocks=(2, 2)), - stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), - stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/hrnet/fcn_hr18s_480x480_80k_pascal_context_59.py b/configs/hrnet/fcn_hr18s_480x480_80k_pascal_context_59.py deleted file mode 100644 index babd88db4e..0000000000 --- a/configs/hrnet/fcn_hr18s_480x480_80k_pascal_context_59.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './fcn_hr18_480x480_80k_pascal_context_59.py' -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w18_small', - backbone=dict( - extra=dict( - stage1=dict(num_blocks=(2, )), - stage2=dict(num_blocks=(2, 2)), - stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), - stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/hrnet/fcn_hr18s_4x4_512x512_80k_vaihingen.py b/configs/hrnet/fcn_hr18s_4x4_512x512_80k_vaihingen.py deleted file mode 100644 index 5828fe1af2..0000000000 --- a/configs/hrnet/fcn_hr18s_4x4_512x512_80k_vaihingen.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './fcn_hr18_4x4_512x512_80k_vaihingen.py' -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w18_small', - backbone=dict( - extra=dict( - stage1=dict(num_blocks=(2, )), - stage2=dict(num_blocks=(2, 2)), - stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), - stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/hrnet/fcn_hr18s_4x4_896x896_80k_isaid.py b/configs/hrnet/fcn_hr18s_4x4_896x896_80k_isaid.py deleted file mode 100644 index d6f6c657a5..0000000000 --- a/configs/hrnet/fcn_hr18s_4x4_896x896_80k_isaid.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './fcn_hr18_4x4_896x896_80k_isaid.py' -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w18_small', - backbone=dict( - extra=dict( - stage1=dict(num_blocks=(2, )), - stage2=dict(num_blocks=(2, 2)), - stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), - stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/hrnet/fcn_hr18s_4xb2-160k_cityscapes-512x1024.py b/configs/hrnet/fcn_hr18s_4xb2-160k_cityscapes-512x1024.py new file mode 100644 index 0000000000..6ca631cbee --- /dev/null +++ b/configs/hrnet/fcn_hr18s_4xb2-160k_cityscapes-512x1024.py @@ -0,0 +1,9 @@ +_base_ = './fcn_hr18_4xb2-160k_cityscapes-512x1024.py' +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w18_small', + backbone=dict( + extra=dict( + stage1=dict(num_blocks=(2, )), + stage2=dict(num_blocks=(2, 2)), + stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), + stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/hrnet/fcn_hr18s_4xb2-40k_cityscapes-512x1024.py b/configs/hrnet/fcn_hr18s_4xb2-40k_cityscapes-512x1024.py new file mode 100644 index 0000000000..ba7e9c696e --- /dev/null +++ b/configs/hrnet/fcn_hr18s_4xb2-40k_cityscapes-512x1024.py @@ -0,0 +1,9 @@ +_base_ = './fcn_hr18_4xb2-40k_cityscapes-512x1024.py' +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w18_small', + backbone=dict( + extra=dict( + stage1=dict(num_blocks=(2, )), + stage2=dict(num_blocks=(2, 2)), + stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), + stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/hrnet/fcn_hr18s_4xb2-80k_cityscapes-512x1024.py b/configs/hrnet/fcn_hr18s_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..26ab6210dd --- /dev/null +++ b/configs/hrnet/fcn_hr18s_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,9 @@ +_base_ = './fcn_hr18_4xb2-80k_cityscapes-512x1024.py' +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w18_small', + backbone=dict( + extra=dict( + stage1=dict(num_blocks=(2, )), + stage2=dict(num_blocks=(2, 2)), + stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), + stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/hrnet/fcn_hr18s_4xb4-160k_ade20k-512x512.py b/configs/hrnet/fcn_hr18s_4xb4-160k_ade20k-512x512.py new file mode 100644 index 0000000000..29cbd10cbf --- /dev/null +++ b/configs/hrnet/fcn_hr18s_4xb4-160k_ade20k-512x512.py @@ -0,0 +1,9 @@ +_base_ = './fcn_hr18_4xb4-160k_ade20k-512x512.py' +model = dict( +
pretrained='open-mmlab://msra/hrnetv2_w18_small', + backbone=dict( + extra=dict( + stage1=dict(num_blocks=(2, )), + stage2=dict(num_blocks=(2, 2)), + stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), + stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/hrnet/fcn_hr18s_4xb4-20k_voc12aug-512x512.py b/configs/hrnet/fcn_hr18s_4xb4-20k_voc12aug-512x512.py new file mode 100644 index 0000000000..9dd1933349 --- /dev/null +++ b/configs/hrnet/fcn_hr18s_4xb4-20k_voc12aug-512x512.py @@ -0,0 +1,9 @@ +_base_ = './fcn_hr18_4xb4-20k_voc12aug-512x512.py' +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w18_small', + backbone=dict( + extra=dict( + stage1=dict(num_blocks=(2, )), + stage2=dict(num_blocks=(2, 2)), + stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), + stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/hrnet/fcn_hr18s_4xb4-40k_pascal-context-480x480.py b/configs/hrnet/fcn_hr18s_4xb4-40k_pascal-context-480x480.py new file mode 100644 index 0000000000..5f88f532a3 --- /dev/null +++ b/configs/hrnet/fcn_hr18s_4xb4-40k_pascal-context-480x480.py @@ -0,0 +1,9 @@ +_base_ = './fcn_hr18_4xb4-40k_pascal-context-480x480.py' +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w18_small', + backbone=dict( + extra=dict( + stage1=dict(num_blocks=(2, )), + stage2=dict(num_blocks=(2, 2)), + stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), + stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/hrnet/fcn_hr18s_4xb4-40k_pascal-context-59-480x480.py b/configs/hrnet/fcn_hr18s_4xb4-40k_pascal-context-59-480x480.py new file mode 100644 index 0000000000..b616fad8c2 --- /dev/null +++ b/configs/hrnet/fcn_hr18s_4xb4-40k_pascal-context-59-480x480.py @@ -0,0 +1,9 @@ +_base_ = './fcn_hr18_4xb4-40k_pascal-context-59-480x480.py' +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w18_small', + backbone=dict( + extra=dict( + stage1=dict(num_blocks=(2, )), + stage2=dict(num_blocks=(2, 2)), + stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), + stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/hrnet/fcn_hr18s_4xb4-40k_voc12aug-512x512.py b/configs/hrnet/fcn_hr18s_4xb4-40k_voc12aug-512x512.py new file mode 100644 index 0000000000..b10b282dd8 --- /dev/null +++ b/configs/hrnet/fcn_hr18s_4xb4-40k_voc12aug-512x512.py @@ -0,0 +1,9 @@ +_base_ = './fcn_hr18_4xb4-40k_voc12aug-512x512.py' +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w18_small', + backbone=dict( + extra=dict( + stage1=dict(num_blocks=(2, )), + stage2=dict(num_blocks=(2, 2)), + stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), + stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/hrnet/fcn_hr18s_4xb4-80k_ade20k-512x512.py b/configs/hrnet/fcn_hr18s_4xb4-80k_ade20k-512x512.py new file mode 100644 index 0000000000..f9f49360bf --- /dev/null +++ b/configs/hrnet/fcn_hr18s_4xb4-80k_ade20k-512x512.py @@ -0,0 +1,9 @@ +_base_ = './fcn_hr18_4xb4-80k_ade20k-512x512.py' +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w18_small', + backbone=dict( + extra=dict( + stage1=dict(num_blocks=(2, )), + stage2=dict(num_blocks=(2, 2)), + stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), + stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/hrnet/fcn_hr18s_4xb4-80k_isaid-896x896.py b/configs/hrnet/fcn_hr18s_4xb4-80k_isaid-896x896.py new file mode 100644 index 0000000000..ab2d2414dd --- /dev/null +++ b/configs/hrnet/fcn_hr18s_4xb4-80k_isaid-896x896.py @@ -0,0 +1,9 @@ +_base_ = './fcn_hr18_4xb4-80k_isaid-896x896.py' +model = dict( 
+ pretrained='open-mmlab://msra/hrnetv2_w18_small', + backbone=dict( + extra=dict( + stage1=dict(num_blocks=(2, )), + stage2=dict(num_blocks=(2, 2)), + stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), + stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/hrnet/fcn_hr18s_4xb4-80k_loveda-512x512.py b/configs/hrnet/fcn_hr18s_4xb4-80k_loveda-512x512.py new file mode 100644 index 0000000000..dd17076c3f --- /dev/null +++ b/configs/hrnet/fcn_hr18s_4xb4-80k_loveda-512x512.py @@ -0,0 +1,9 @@ +_base_ = './fcn_hr18_4xb4-80k_loveda-512x512.py' +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w18_small', + backbone=dict( + extra=dict( + stage1=dict(num_blocks=(2, )), + stage2=dict(num_blocks=(2, 2)), + stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), + stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/hrnet/fcn_hr18s_4xb4-80k_pascal-context-480x480.py b/configs/hrnet/fcn_hr18s_4xb4-80k_pascal-context-480x480.py new file mode 100644 index 0000000000..b7b52331c7 --- /dev/null +++ b/configs/hrnet/fcn_hr18s_4xb4-80k_pascal-context-480x480.py @@ -0,0 +1,9 @@ +_base_ = './fcn_hr18_4xb4-80k_pascal-context-480x480.py' +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w18_small', + backbone=dict( + extra=dict( + stage1=dict(num_blocks=(2, )), + stage2=dict(num_blocks=(2, 2)), + stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), + stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/hrnet/fcn_hr18s_4xb4-80k_pascal-context-59-480x480.py b/configs/hrnet/fcn_hr18s_4xb4-80k_pascal-context-59-480x480.py new file mode 100644 index 0000000000..ccf1040d13 --- /dev/null +++ b/configs/hrnet/fcn_hr18s_4xb4-80k_pascal-context-59-480x480.py @@ -0,0 +1,9 @@ +_base_ = './fcn_hr18_4xb4-80k_pascal-context-59-480x480.py' +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w18_small', + backbone=dict( + extra=dict( + stage1=dict(num_blocks=(2, )), + stage2=dict(num_blocks=(2, 2)), + stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), + stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/hrnet/fcn_hr18s_4xb4-80k_potsdam-512x512.py b/configs/hrnet/fcn_hr18s_4xb4-80k_potsdam-512x512.py new file mode 100644 index 0000000000..3a5726f5d1 --- /dev/null +++ b/configs/hrnet/fcn_hr18s_4xb4-80k_potsdam-512x512.py @@ -0,0 +1,9 @@ +_base_ = './fcn_hr18_4xb4-80k_potsdam-512x512.py' +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w18_small', + backbone=dict( + extra=dict( + stage1=dict(num_blocks=(2, )), + stage2=dict(num_blocks=(2, 2)), + stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), + stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/hrnet/fcn_hr18s_4xb4-80k_vaihingen-512x512.py b/configs/hrnet/fcn_hr18s_4xb4-80k_vaihingen-512x512.py new file mode 100644 index 0000000000..720c1732b0 --- /dev/null +++ b/configs/hrnet/fcn_hr18s_4xb4-80k_vaihingen-512x512.py @@ -0,0 +1,9 @@ +_base_ = './fcn_hr18_4xb4-80k_vaihingen-512x512.py' +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w18_small', + backbone=dict( + extra=dict( + stage1=dict(num_blocks=(2, )), + stage2=dict(num_blocks=(2, 2)), + stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), + stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/hrnet/fcn_hr18s_512x1024_160k_cityscapes.py b/configs/hrnet/fcn_hr18s_512x1024_160k_cityscapes.py deleted file mode 100644 index ddbe3801f9..0000000000 --- a/configs/hrnet/fcn_hr18s_512x1024_160k_cityscapes.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './fcn_hr18_512x1024_160k_cityscapes.py' 
-model = dict( - pretrained='open-mmlab://msra/hrnetv2_w18_small', - backbone=dict( - extra=dict( - stage1=dict(num_blocks=(2, )), - stage2=dict(num_blocks=(2, 2)), - stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), - stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/hrnet/fcn_hr18s_512x1024_40k_cityscapes.py b/configs/hrnet/fcn_hr18s_512x1024_40k_cityscapes.py deleted file mode 100644 index 4e31d26e09..0000000000 --- a/configs/hrnet/fcn_hr18s_512x1024_40k_cityscapes.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './fcn_hr18_512x1024_40k_cityscapes.py' -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w18_small', - backbone=dict( - extra=dict( - stage1=dict(num_blocks=(2, )), - stage2=dict(num_blocks=(2, 2)), - stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), - stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/hrnet/fcn_hr18s_512x1024_80k_cityscapes.py b/configs/hrnet/fcn_hr18s_512x1024_80k_cityscapes.py deleted file mode 100644 index ee2831d99d..0000000000 --- a/configs/hrnet/fcn_hr18s_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './fcn_hr18_512x1024_80k_cityscapes.py' -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w18_small', - backbone=dict( - extra=dict( - stage1=dict(num_blocks=(2, )), - stage2=dict(num_blocks=(2, 2)), - stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), - stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/hrnet/fcn_hr18s_512x512_160k_ade20k.py b/configs/hrnet/fcn_hr18s_512x512_160k_ade20k.py deleted file mode 100644 index 22a3ce0b38..0000000000 --- a/configs/hrnet/fcn_hr18s_512x512_160k_ade20k.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './fcn_hr18_512x512_160k_ade20k.py' -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w18_small', - backbone=dict( - extra=dict( - stage1=dict(num_blocks=(2, )), - stage2=dict(num_blocks=(2, 2)), - stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), - stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/hrnet/fcn_hr18s_512x512_20k_voc12aug.py b/configs/hrnet/fcn_hr18s_512x512_20k_voc12aug.py deleted file mode 100644 index d0de5df752..0000000000 --- a/configs/hrnet/fcn_hr18s_512x512_20k_voc12aug.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './fcn_hr18_512x512_20k_voc12aug.py' -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w18_small', - backbone=dict( - extra=dict( - stage1=dict(num_blocks=(2, )), - stage2=dict(num_blocks=(2, 2)), - stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), - stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/hrnet/fcn_hr18s_512x512_40k_voc12aug.py b/configs/hrnet/fcn_hr18s_512x512_40k_voc12aug.py deleted file mode 100644 index 409db3c628..0000000000 --- a/configs/hrnet/fcn_hr18s_512x512_40k_voc12aug.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './fcn_hr18_512x512_40k_voc12aug.py' -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w18_small', - backbone=dict( - extra=dict( - stage1=dict(num_blocks=(2, )), - stage2=dict(num_blocks=(2, 2)), - stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), - stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/hrnet/fcn_hr18s_512x512_80k_ade20k.py b/configs/hrnet/fcn_hr18s_512x512_80k_ade20k.py deleted file mode 100644 index a8400979b1..0000000000 --- a/configs/hrnet/fcn_hr18s_512x512_80k_ade20k.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './fcn_hr18_512x512_80k_ade20k.py' -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w18_small', - backbone=dict( - extra=dict( - 
stage1=dict(num_blocks=(2, )), - stage2=dict(num_blocks=(2, 2)), - stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), - stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/hrnet/fcn_hr18s_512x512_80k_loveda.py b/configs/hrnet/fcn_hr18s_512x512_80k_loveda.py deleted file mode 100644 index b39769ffc2..0000000000 --- a/configs/hrnet/fcn_hr18s_512x512_80k_loveda.py +++ /dev/null @@ -1,11 +0,0 @@ -_base_ = './fcn_hr18_512x512_80k_loveda.py' -model = dict( - backbone=dict( - init_cfg=dict( - type='Pretrained', - checkpoint='open-mmlab://msra/hrnetv2_w18_small'), - extra=dict( - stage1=dict(num_blocks=(2, )), - stage2=dict(num_blocks=(2, 2)), - stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), - stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/hrnet/fcn_hr18s_512x512_80k_potsdam.py b/configs/hrnet/fcn_hr18s_512x512_80k_potsdam.py deleted file mode 100644 index 05551271a3..0000000000 --- a/configs/hrnet/fcn_hr18s_512x512_80k_potsdam.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './fcn_hr18_512x512_80k_potsdam.py' -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w18_small', - backbone=dict( - extra=dict( - stage1=dict(num_blocks=(2, )), - stage2=dict(num_blocks=(2, 2)), - stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), - stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/hrnet/fcn_hr48_480x480_40k_pascal_context.py b/configs/hrnet/fcn_hr48_480x480_40k_pascal_context.py deleted file mode 100644 index 0e2d96cb6c..0000000000 --- a/configs/hrnet/fcn_hr48_480x480_40k_pascal_context.py +++ /dev/null @@ -1,10 +0,0 @@ -_base_ = './fcn_hr18_480x480_40k_pascal_context.py' -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w48', - backbone=dict( - extra=dict( - stage2=dict(num_channels=(48, 96)), - stage3=dict(num_channels=(48, 96, 192)), - stage4=dict(num_channels=(48, 96, 192, 384)))), - decode_head=dict( - in_channels=[48, 96, 192, 384], channels=sum([48, 96, 192, 384]))) diff --git a/configs/hrnet/fcn_hr48_480x480_40k_pascal_context_59.py b/configs/hrnet/fcn_hr48_480x480_40k_pascal_context_59.py deleted file mode 100644 index 655b460467..0000000000 --- a/configs/hrnet/fcn_hr48_480x480_40k_pascal_context_59.py +++ /dev/null @@ -1,10 +0,0 @@ -_base_ = './fcn_hr18_480x480_40k_pascal_context_59.py' -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w48', - backbone=dict( - extra=dict( - stage2=dict(num_channels=(48, 96)), - stage3=dict(num_channels=(48, 96, 192)), - stage4=dict(num_channels=(48, 96, 192, 384)))), - decode_head=dict( - in_channels=[48, 96, 192, 384], channels=sum([48, 96, 192, 384]))) diff --git a/configs/hrnet/fcn_hr48_480x480_80k_pascal_context.py b/configs/hrnet/fcn_hr48_480x480_80k_pascal_context.py deleted file mode 100644 index e28164e3dc..0000000000 --- a/configs/hrnet/fcn_hr48_480x480_80k_pascal_context.py +++ /dev/null @@ -1,10 +0,0 @@ -_base_ = './fcn_hr18_480x480_80k_pascal_context.py' -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w48', - backbone=dict( - extra=dict( - stage2=dict(num_channels=(48, 96)), - stage3=dict(num_channels=(48, 96, 192)), - stage4=dict(num_channels=(48, 96, 192, 384)))), - decode_head=dict( - in_channels=[48, 96, 192, 384], channels=sum([48, 96, 192, 384]))) diff --git a/configs/hrnet/fcn_hr48_480x480_80k_pascal_context_59.py b/configs/hrnet/fcn_hr48_480x480_80k_pascal_context_59.py deleted file mode 100644 index 012ad0a7d6..0000000000 --- a/configs/hrnet/fcn_hr48_480x480_80k_pascal_context_59.py +++ /dev/null @@ -1,10 +0,0 @@ -_base_ = 
'./fcn_hr18_480x480_80k_pascal_context_59.py' -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w48', - backbone=dict( - extra=dict( - stage2=dict(num_channels=(48, 96)), - stage3=dict(num_channels=(48, 96, 192)), - stage4=dict(num_channels=(48, 96, 192, 384)))), - decode_head=dict( - in_channels=[48, 96, 192, 384], channels=sum([48, 96, 192, 384]))) diff --git a/configs/hrnet/fcn_hr48_4x4_512x512_80k_vaihingen.py b/configs/hrnet/fcn_hr48_4x4_512x512_80k_vaihingen.py deleted file mode 100644 index 7cb22d80f0..0000000000 --- a/configs/hrnet/fcn_hr48_4x4_512x512_80k_vaihingen.py +++ /dev/null @@ -1,10 +0,0 @@ -_base_ = './fcn_hr18_4x4_512x512_80k_vaihingen.py' -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w48', - backbone=dict( - extra=dict( - stage2=dict(num_channels=(48, 96)), - stage3=dict(num_channels=(48, 96, 192)), - stage4=dict(num_channels=(48, 96, 192, 384)))), - decode_head=dict( - in_channels=[48, 96, 192, 384], channels=sum([48, 96, 192, 384]))) diff --git a/configs/hrnet/fcn_hr48_4x4_896x896_80k_isaid.py b/configs/hrnet/fcn_hr48_4x4_896x896_80k_isaid.py deleted file mode 100644 index 55cf1b55bd..0000000000 --- a/configs/hrnet/fcn_hr48_4x4_896x896_80k_isaid.py +++ /dev/null @@ -1,10 +0,0 @@ -_base_ = './fcn_hr18_4x4_896x896_80k_isaid.py' -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w48', - backbone=dict( - extra=dict( - stage2=dict(num_channels=(48, 96)), - stage3=dict(num_channels=(48, 96, 192)), - stage4=dict(num_channels=(48, 96, 192, 384)))), - decode_head=dict( - in_channels=[48, 96, 192, 384], channels=sum([48, 96, 192, 384]))) diff --git a/configs/hrnet/fcn_hr48_4xb2-160k_cityscapes-512x1024.py b/configs/hrnet/fcn_hr48_4xb2-160k_cityscapes-512x1024.py new file mode 100644 index 0000000000..4aa5d94d1e --- /dev/null +++ b/configs/hrnet/fcn_hr48_4xb2-160k_cityscapes-512x1024.py @@ -0,0 +1,10 @@ +_base_ = './fcn_hr18_4xb2-160k_cityscapes-512x1024.py' +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w48', + backbone=dict( + extra=dict( + stage2=dict(num_channels=(48, 96)), + stage3=dict(num_channels=(48, 96, 192)), + stage4=dict(num_channels=(48, 96, 192, 384)))), + decode_head=dict( + in_channels=[48, 96, 192, 384], channels=sum([48, 96, 192, 384]))) diff --git a/configs/hrnet/fcn_hr48_4xb2-40k_cityscapes-512x1024.py b/configs/hrnet/fcn_hr48_4xb2-40k_cityscapes-512x1024.py new file mode 100644 index 0000000000..7cb795250d --- /dev/null +++ b/configs/hrnet/fcn_hr48_4xb2-40k_cityscapes-512x1024.py @@ -0,0 +1,10 @@ +_base_ = './fcn_hr18_4xb2-40k_cityscapes-512x1024.py' +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w48', + backbone=dict( + extra=dict( + stage2=dict(num_channels=(48, 96)), + stage3=dict(num_channels=(48, 96, 192)), + stage4=dict(num_channels=(48, 96, 192, 384)))), + decode_head=dict( + in_channels=[48, 96, 192, 384], channels=sum([48, 96, 192, 384]))) diff --git a/configs/hrnet/fcn_hr48_4xb2-80k_cityscapes-512x1024.py b/configs/hrnet/fcn_hr48_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..3e2ce034b2 --- /dev/null +++ b/configs/hrnet/fcn_hr48_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,10 @@ +_base_ = './fcn_hr18_4xb2-80k_cityscapes-512x1024.py' +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w48', + backbone=dict( + extra=dict( + stage2=dict(num_channels=(48, 96)), + stage3=dict(num_channels=(48, 96, 192)), + stage4=dict(num_channels=(48, 96, 192, 384)))), + decode_head=dict( + in_channels=[48, 96, 192, 384], channels=sum([48, 96, 192, 384]))) diff --git 
a/configs/hrnet/fcn_hr48_4xb4-160k_ade20k-512x512.py b/configs/hrnet/fcn_hr48_4xb4-160k_ade20k-512x512.py new file mode 100644 index 0000000000..89b1f04651 --- /dev/null +++ b/configs/hrnet/fcn_hr48_4xb4-160k_ade20k-512x512.py @@ -0,0 +1,10 @@ +_base_ = './fcn_hr18_4xb4-160k_ade20k-512x512.py' +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w48', + backbone=dict( + extra=dict( + stage2=dict(num_channels=(48, 96)), + stage3=dict(num_channels=(48, 96, 192)), + stage4=dict(num_channels=(48, 96, 192, 384)))), + decode_head=dict( + in_channels=[48, 96, 192, 384], channels=sum([48, 96, 192, 384]))) diff --git a/configs/hrnet/fcn_hr48_4xb4-20k_voc12aug-512x512.py b/configs/hrnet/fcn_hr48_4xb4-20k_voc12aug-512x512.py new file mode 100644 index 0000000000..7ca38a9a79 --- /dev/null +++ b/configs/hrnet/fcn_hr48_4xb4-20k_voc12aug-512x512.py @@ -0,0 +1,10 @@ +_base_ = './fcn_hr18_4xb4-20k_voc12aug-512x512.py' +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w48', + backbone=dict( + extra=dict( + stage2=dict(num_channels=(48, 96)), + stage3=dict(num_channels=(48, 96, 192)), + stage4=dict(num_channels=(48, 96, 192, 384)))), + decode_head=dict( + in_channels=[48, 96, 192, 384], channels=sum([48, 96, 192, 384]))) diff --git a/configs/hrnet/fcn_hr48_4xb4-40k_pascal-context-480x480.py b/configs/hrnet/fcn_hr48_4xb4-40k_pascal-context-480x480.py new file mode 100644 index 0000000000..379be1d67e --- /dev/null +++ b/configs/hrnet/fcn_hr48_4xb4-40k_pascal-context-480x480.py @@ -0,0 +1,10 @@ +_base_ = './fcn_hr18_4xb4-40k_pascal-context-480x480.py' +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w48', + backbone=dict( + extra=dict( + stage2=dict(num_channels=(48, 96)), + stage3=dict(num_channels=(48, 96, 192)), + stage4=dict(num_channels=(48, 96, 192, 384)))), + decode_head=dict( + in_channels=[48, 96, 192, 384], channels=sum([48, 96, 192, 384]))) diff --git a/configs/hrnet/fcn_hr48_4xb4-40k_pascal-context-59-480x480.py b/configs/hrnet/fcn_hr48_4xb4-40k_pascal-context-59-480x480.py new file mode 100644 index 0000000000..12730dd533 --- /dev/null +++ b/configs/hrnet/fcn_hr48_4xb4-40k_pascal-context-59-480x480.py @@ -0,0 +1,10 @@ +_base_ = './fcn_hr18_4xb4-40k_pascal-context-59-480x480.py' +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w48', + backbone=dict( + extra=dict( + stage2=dict(num_channels=(48, 96)), + stage3=dict(num_channels=(48, 96, 192)), + stage4=dict(num_channels=(48, 96, 192, 384)))), + decode_head=dict( + in_channels=[48, 96, 192, 384], channels=sum([48, 96, 192, 384]))) diff --git a/configs/hrnet/fcn_hr48_4xb4-40k_voc12aug-512x512.py b/configs/hrnet/fcn_hr48_4xb4-40k_voc12aug-512x512.py new file mode 100644 index 0000000000..3e1b920c59 --- /dev/null +++ b/configs/hrnet/fcn_hr48_4xb4-40k_voc12aug-512x512.py @@ -0,0 +1,10 @@ +_base_ = './fcn_hr18_4xb4-40k_voc12aug-512x512.py' +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w48', + backbone=dict( + extra=dict( + stage2=dict(num_channels=(48, 96)), + stage3=dict(num_channels=(48, 96, 192)), + stage4=dict(num_channels=(48, 96, 192, 384)))), + decode_head=dict( + in_channels=[48, 96, 192, 384], channels=sum([48, 96, 192, 384]))) diff --git a/configs/hrnet/fcn_hr48_4xb4-80k_ade20k-512x512.py b/configs/hrnet/fcn_hr48_4xb4-80k_ade20k-512x512.py new file mode 100644 index 0000000000..14fd663e87 --- /dev/null +++ b/configs/hrnet/fcn_hr48_4xb4-80k_ade20k-512x512.py @@ -0,0 +1,10 @@ +_base_ = './fcn_hr18_4xb4-80k_ade20k-512x512.py' +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w48', + backbone=dict( + extra=dict( + 
stage2=dict(num_channels=(48, 96)), + stage3=dict(num_channels=(48, 96, 192)), + stage4=dict(num_channels=(48, 96, 192, 384)))), + decode_head=dict( + in_channels=[48, 96, 192, 384], channels=sum([48, 96, 192, 384]))) diff --git a/configs/hrnet/fcn_hr48_4xb4-80k_isaid-896x896.py b/configs/hrnet/fcn_hr48_4xb4-80k_isaid-896x896.py new file mode 100644 index 0000000000..81815efa8d --- /dev/null +++ b/configs/hrnet/fcn_hr48_4xb4-80k_isaid-896x896.py @@ -0,0 +1,10 @@ +_base_ = './fcn_hr18_4xb4-80k_isaid-896x896.py' +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w48', + backbone=dict( + extra=dict( + stage2=dict(num_channels=(48, 96)), + stage3=dict(num_channels=(48, 96, 192)), + stage4=dict(num_channels=(48, 96, 192, 384)))), + decode_head=dict( + in_channels=[48, 96, 192, 384], channels=sum([48, 96, 192, 384]))) diff --git a/configs/hrnet/fcn_hr48_4xb4-80k_loveda-512x512.py b/configs/hrnet/fcn_hr48_4xb4-80k_loveda-512x512.py new file mode 100644 index 0000000000..34d23af163 --- /dev/null +++ b/configs/hrnet/fcn_hr48_4xb4-80k_loveda-512x512.py @@ -0,0 +1,10 @@ +_base_ = './fcn_hr18_4xb4-80k_loveda-512x512.py' +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w48', + backbone=dict( + extra=dict( + stage2=dict(num_channels=(48, 96)), + stage3=dict(num_channels=(48, 96, 192)), + stage4=dict(num_channels=(48, 96, 192, 384)))), + decode_head=dict( + in_channels=[48, 96, 192, 384], channels=sum([48, 96, 192, 384]))) diff --git a/configs/hrnet/fcn_hr48_4xb4-80k_pascal-context-480x480.py b/configs/hrnet/fcn_hr48_4xb4-80k_pascal-context-480x480.py new file mode 100644 index 0000000000..4d193d9042 --- /dev/null +++ b/configs/hrnet/fcn_hr48_4xb4-80k_pascal-context-480x480.py @@ -0,0 +1,10 @@ +_base_ = './fcn_hr18_4xb4-80k_pascal-context-480x480.py' +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w48', + backbone=dict( + extra=dict( + stage2=dict(num_channels=(48, 96)), + stage3=dict(num_channels=(48, 96, 192)), + stage4=dict(num_channels=(48, 96, 192, 384)))), + decode_head=dict( + in_channels=[48, 96, 192, 384], channels=sum([48, 96, 192, 384]))) diff --git a/configs/hrnet/fcn_hr48_4xb4-80k_pascal-context-59-480x480.py b/configs/hrnet/fcn_hr48_4xb4-80k_pascal-context-59-480x480.py new file mode 100644 index 0000000000..d8b4c4aa8e --- /dev/null +++ b/configs/hrnet/fcn_hr48_4xb4-80k_pascal-context-59-480x480.py @@ -0,0 +1,10 @@ +_base_ = './fcn_hr18_4xb4-80k_pascal-context-59-480x480.py' +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w48', + backbone=dict( + extra=dict( + stage2=dict(num_channels=(48, 96)), + stage3=dict(num_channels=(48, 96, 192)), + stage4=dict(num_channels=(48, 96, 192, 384)))), + decode_head=dict( + in_channels=[48, 96, 192, 384], channels=sum([48, 96, 192, 384]))) diff --git a/configs/hrnet/fcn_hr48_4xb4-80k_potsdam-512x512.py b/configs/hrnet/fcn_hr48_4xb4-80k_potsdam-512x512.py new file mode 100644 index 0000000000..58a650004d --- /dev/null +++ b/configs/hrnet/fcn_hr48_4xb4-80k_potsdam-512x512.py @@ -0,0 +1,10 @@ +_base_ = './fcn_hr18_4xb4-80k_potsdam-512x512.py' +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w48', + backbone=dict( + extra=dict( + stage2=dict(num_channels=(48, 96)), + stage3=dict(num_channels=(48, 96, 192)), + stage4=dict(num_channels=(48, 96, 192, 384)))), + decode_head=dict( + in_channels=[48, 96, 192, 384], channels=sum([48, 96, 192, 384]))) diff --git a/configs/hrnet/fcn_hr48_4xb4-80k_vaihingen-512x512.py b/configs/hrnet/fcn_hr48_4xb4-80k_vaihingen-512x512.py new file mode 100644 index 0000000000..db91ed83ef --- /dev/null +++ 
b/configs/hrnet/fcn_hr48_4xb4-80k_vaihingen-512x512.py @@ -0,0 +1,10 @@ +_base_ = './fcn_hr18_4xb4-80k_vaihingen-512x512.py' +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w48', + backbone=dict( + extra=dict( + stage2=dict(num_channels=(48, 96)), + stage3=dict(num_channels=(48, 96, 192)), + stage4=dict(num_channels=(48, 96, 192, 384)))), + decode_head=dict( + in_channels=[48, 96, 192, 384], channels=sum([48, 96, 192, 384]))) diff --git a/configs/hrnet/fcn_hr48_512x1024_160k_cityscapes.py b/configs/hrnet/fcn_hr48_512x1024_160k_cityscapes.py deleted file mode 100644 index 394a61c99f..0000000000 --- a/configs/hrnet/fcn_hr48_512x1024_160k_cityscapes.py +++ /dev/null @@ -1,10 +0,0 @@ -_base_ = './fcn_hr18_512x1024_160k_cityscapes.py' -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w48', - backbone=dict( - extra=dict( - stage2=dict(num_channels=(48, 96)), - stage3=dict(num_channels=(48, 96, 192)), - stage4=dict(num_channels=(48, 96, 192, 384)))), - decode_head=dict( - in_channels=[48, 96, 192, 384], channels=sum([48, 96, 192, 384]))) diff --git a/configs/hrnet/fcn_hr48_512x1024_40k_cityscapes.py b/configs/hrnet/fcn_hr48_512x1024_40k_cityscapes.py deleted file mode 100644 index d37ab1d09e..0000000000 --- a/configs/hrnet/fcn_hr48_512x1024_40k_cityscapes.py +++ /dev/null @@ -1,10 +0,0 @@ -_base_ = './fcn_hr18_512x1024_40k_cityscapes.py' -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w48', - backbone=dict( - extra=dict( - stage2=dict(num_channels=(48, 96)), - stage3=dict(num_channels=(48, 96, 192)), - stage4=dict(num_channels=(48, 96, 192, 384)))), - decode_head=dict( - in_channels=[48, 96, 192, 384], channels=sum([48, 96, 192, 384]))) diff --git a/configs/hrnet/fcn_hr48_512x1024_80k_cityscapes.py b/configs/hrnet/fcn_hr48_512x1024_80k_cityscapes.py deleted file mode 100644 index a9bab32b52..0000000000 --- a/configs/hrnet/fcn_hr48_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,10 +0,0 @@ -_base_ = './fcn_hr18_512x1024_80k_cityscapes.py' -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w48', - backbone=dict( - extra=dict( - stage2=dict(num_channels=(48, 96)), - stage3=dict(num_channels=(48, 96, 192)), - stage4=dict(num_channels=(48, 96, 192, 384)))), - decode_head=dict( - in_channels=[48, 96, 192, 384], channels=sum([48, 96, 192, 384]))) diff --git a/configs/hrnet/fcn_hr48_512x512_160k_ade20k.py b/configs/hrnet/fcn_hr48_512x512_160k_ade20k.py deleted file mode 100644 index dff4fea85c..0000000000 --- a/configs/hrnet/fcn_hr48_512x512_160k_ade20k.py +++ /dev/null @@ -1,10 +0,0 @@ -_base_ = './fcn_hr18_512x512_160k_ade20k.py' -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w48', - backbone=dict( - extra=dict( - stage2=dict(num_channels=(48, 96)), - stage3=dict(num_channels=(48, 96, 192)), - stage4=dict(num_channels=(48, 96, 192, 384)))), - decode_head=dict( - in_channels=[48, 96, 192, 384], channels=sum([48, 96, 192, 384]))) diff --git a/configs/hrnet/fcn_hr48_512x512_20k_voc12aug.py b/configs/hrnet/fcn_hr48_512x512_20k_voc12aug.py deleted file mode 100644 index a8d1deb986..0000000000 --- a/configs/hrnet/fcn_hr48_512x512_20k_voc12aug.py +++ /dev/null @@ -1,10 +0,0 @@ -_base_ = './fcn_hr18_512x512_20k_voc12aug.py' -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w48', - backbone=dict( - extra=dict( - stage2=dict(num_channels=(48, 96)), - stage3=dict(num_channels=(48, 96, 192)), - stage4=dict(num_channels=(48, 96, 192, 384)))), - decode_head=dict( - in_channels=[48, 96, 192, 384], channels=sum([48, 96, 192, 384]))) diff --git 
a/configs/hrnet/fcn_hr48_512x512_40k_voc12aug.py b/configs/hrnet/fcn_hr48_512x512_40k_voc12aug.py deleted file mode 100644 index 1084a57e97..0000000000 --- a/configs/hrnet/fcn_hr48_512x512_40k_voc12aug.py +++ /dev/null @@ -1,10 +0,0 @@ -_base_ = './fcn_hr18_512x512_40k_voc12aug.py' -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w48', - backbone=dict( - extra=dict( - stage2=dict(num_channels=(48, 96)), - stage3=dict(num_channels=(48, 96, 192)), - stage4=dict(num_channels=(48, 96, 192, 384)))), - decode_head=dict( - in_channels=[48, 96, 192, 384], channels=sum([48, 96, 192, 384]))) diff --git a/configs/hrnet/fcn_hr48_512x512_80k_ade20k.py b/configs/hrnet/fcn_hr48_512x512_80k_ade20k.py deleted file mode 100644 index 7eca7fa4b8..0000000000 --- a/configs/hrnet/fcn_hr48_512x512_80k_ade20k.py +++ /dev/null @@ -1,10 +0,0 @@ -_base_ = './fcn_hr18_512x512_80k_ade20k.py' -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w48', - backbone=dict( - extra=dict( - stage2=dict(num_channels=(48, 96)), - stage3=dict(num_channels=(48, 96, 192)), - stage4=dict(num_channels=(48, 96, 192, 384)))), - decode_head=dict( - in_channels=[48, 96, 192, 384], channels=sum([48, 96, 192, 384]))) diff --git a/configs/hrnet/fcn_hr48_512x512_80k_loveda.py b/configs/hrnet/fcn_hr48_512x512_80k_loveda.py deleted file mode 100644 index 269dbf662d..0000000000 --- a/configs/hrnet/fcn_hr48_512x512_80k_loveda.py +++ /dev/null @@ -1,11 +0,0 @@ -_base_ = './fcn_hr18_512x512_80k_loveda.py' -model = dict( - backbone=dict( - init_cfg=dict( - type='Pretrained', checkpoint='open-mmlab://msra/hrnetv2_w48'), - extra=dict( - stage2=dict(num_channels=(48, 96)), - stage3=dict(num_channels=(48, 96, 192)), - stage4=dict(num_channels=(48, 96, 192, 384)))), - decode_head=dict( - in_channels=[48, 96, 192, 384], channels=sum([48, 96, 192, 384]))) diff --git a/configs/hrnet/fcn_hr48_512x512_80k_potsdam.py b/configs/hrnet/fcn_hr48_512x512_80k_potsdam.py deleted file mode 100644 index 608fee387b..0000000000 --- a/configs/hrnet/fcn_hr48_512x512_80k_potsdam.py +++ /dev/null @@ -1,10 +0,0 @@ -_base_ = './fcn_hr18_512x512_80k_potsdam.py' -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w48', - backbone=dict( - extra=dict( - stage2=dict(num_channels=(48, 96)), - stage3=dict(num_channels=(48, 96, 192)), - stage4=dict(num_channels=(48, 96, 192, 384)))), - decode_head=dict( - in_channels=[48, 96, 192, 384], channels=sum([48, 96, 192, 384]))) diff --git a/configs/hrnet/hrnet.yml b/configs/hrnet/hrnet.yml deleted file mode 100644 index 960a93708b..0000000000 --- a/configs/hrnet/hrnet.yml +++ /dev/null @@ -1,695 +0,0 @@ -Models: -- Name: fcn_hr18s_512x1024_40k_cityscapes - In Collection: FCN - Metadata: - backbone: HRNetV2p-W18-Small - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 42.12 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 1.7 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 73.86 - mIoU(ms+flip): 75.91 - Config: configs/hrnet/fcn_hr18s_512x1024_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x1024_40k_cityscapes/fcn_hr18s_512x1024_40k_cityscapes_20200601_014216-93db27d0.pth -- Name: fcn_hr18_512x1024_40k_cityscapes - In Collection: FCN - Metadata: - backbone: HRNetV2p-W18 - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 77.1 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training 
Memory (GB): 2.9 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 77.19 - mIoU(ms+flip): 78.92 - Config: configs/hrnet/fcn_hr18_512x1024_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x1024_40k_cityscapes/fcn_hr18_512x1024_40k_cityscapes_20200601_014216-f196fb4e.pth -- Name: fcn_hr48_512x1024_40k_cityscapes - In Collection: FCN - Metadata: - backbone: HRNetV2p-W48 - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 155.76 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 6.2 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.48 - mIoU(ms+flip): 79.69 - Config: configs/hrnet/fcn_hr48_512x1024_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x1024_40k_cityscapes/fcn_hr48_512x1024_40k_cityscapes_20200601_014240-a989b146.pth -- Name: fcn_hr18s_512x1024_80k_cityscapes - In Collection: FCN - Metadata: - backbone: HRNetV2p-W18-Small - crop size: (512,1024) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 75.31 - mIoU(ms+flip): 77.48 - Config: configs/hrnet/fcn_hr18s_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x1024_80k_cityscapes/fcn_hr18s_512x1024_80k_cityscapes_20200601_202700-1462b75d.pth -- Name: fcn_hr18_512x1024_80k_cityscapes - In Collection: FCN - Metadata: - backbone: HRNetV2p-W18 - crop size: (512,1024) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.65 - mIoU(ms+flip): 80.35 - Config: configs/hrnet/fcn_hr18_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x1024_80k_cityscapes/fcn_hr18_512x1024_80k_cityscapes_20200601_223255-4e7b345e.pth -- Name: fcn_hr48_512x1024_80k_cityscapes - In Collection: FCN - Metadata: - backbone: HRNetV2p-W48 - crop size: (512,1024) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.93 - mIoU(ms+flip): 80.72 - Config: configs/hrnet/fcn_hr48_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x1024_80k_cityscapes/fcn_hr48_512x1024_80k_cityscapes_20200601_202606-58ea95d6.pth -- Name: fcn_hr18s_512x1024_160k_cityscapes - In Collection: FCN - Metadata: - backbone: HRNetV2p-W18-Small - crop size: (512,1024) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 76.31 - mIoU(ms+flip): 78.31 - Config: configs/hrnet/fcn_hr18s_512x1024_160k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x1024_160k_cityscapes/fcn_hr18s_512x1024_160k_cityscapes_20200602_190901-4a0797ea.pth -- Name: fcn_hr18_512x1024_160k_cityscapes - In Collection: FCN - Metadata: - backbone: HRNetV2p-W18 - crop size: (512,1024) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.8 - mIoU(ms+flip): 80.74 - Config: configs/hrnet/fcn_hr18_512x1024_160k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x1024_160k_cityscapes/fcn_hr18_512x1024_160k_cityscapes_20200602_190822-221e4a4f.pth -- Name: fcn_hr48_512x1024_160k_cityscapes - In Collection: FCN - Metadata: - backbone: HRNetV2p-W48 - crop size: (512,1024) - lr schd: 160000 - 
Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 80.65 - mIoU(ms+flip): 81.92 - Config: configs/hrnet/fcn_hr48_512x1024_160k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x1024_160k_cityscapes/fcn_hr48_512x1024_160k_cityscapes_20200602_190946-59b7973e.pth -- Name: fcn_hr18s_512x512_80k_ade20k - In Collection: FCN - Metadata: - backbone: HRNetV2p-W18-Small - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 25.87 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 3.8 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 31.38 - mIoU(ms+flip): 32.45 - Config: configs/hrnet/fcn_hr18s_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_80k_ade20k/fcn_hr18s_512x512_80k_ade20k_20200614_144345-77fc814a.pth -- Name: fcn_hr18_512x512_80k_ade20k - In Collection: FCN - Metadata: - backbone: HRNetV2p-W18 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 44.31 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 4.9 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 36.27 - mIoU(ms+flip): 37.28 - Config: configs/hrnet/fcn_hr18_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_80k_ade20k/fcn_hr18_512x512_80k_ade20k_20210827_114910-6c9382c0.pth -- Name: fcn_hr48_512x512_80k_ade20k - In Collection: FCN - Metadata: - backbone: HRNetV2p-W48 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 47.1 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 8.2 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 41.9 - mIoU(ms+flip): 43.27 - Config: configs/hrnet/fcn_hr48_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_80k_ade20k/fcn_hr48_512x512_80k_ade20k_20200614_193946-7ba5258d.pth -- Name: fcn_hr18s_512x512_160k_ade20k - In Collection: FCN - Metadata: - backbone: HRNetV2p-W18-Small - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 33.07 - mIoU(ms+flip): 34.56 - Config: configs/hrnet/fcn_hr18s_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_160k_ade20k/fcn_hr18s_512x512_160k_ade20k_20210829_174739-f1e7c2e7.pth -- Name: fcn_hr18_512x512_160k_ade20k - In Collection: FCN - Metadata: - backbone: HRNetV2p-W18 - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 36.79 - mIoU(ms+flip): 38.58 - Config: configs/hrnet/fcn_hr18_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_160k_ade20k/fcn_hr18_512x512_160k_ade20k_20200614_214426-ca961836.pth -- Name: fcn_hr48_512x512_160k_ade20k - In Collection: FCN - Metadata: - backbone: HRNetV2p-W48 - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 42.02 - mIoU(ms+flip): 43.86 - Config: configs/hrnet/fcn_hr48_512x512_160k_ade20k.py - Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_160k_ade20k/fcn_hr48_512x512_160k_ade20k_20200614_214407-a52fc02c.pth -- Name: fcn_hr18s_512x512_20k_voc12aug - In Collection: FCN - Metadata: - backbone: HRNetV2p-W18-Small - crop size: (512,512) - lr schd: 20000 - inference time (ms/im): - - value: 23.06 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 1.8 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 65.5 - mIoU(ms+flip): 68.89 - Config: configs/hrnet/fcn_hr18s_512x512_20k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_20k_voc12aug/fcn_hr18s_512x512_20k_voc12aug_20210829_174910-0aceadb4.pth -- Name: fcn_hr18_512x512_20k_voc12aug - In Collection: FCN - Metadata: - backbone: HRNetV2p-W18 - crop size: (512,512) - lr schd: 20000 - inference time (ms/im): - - value: 42.59 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 2.9 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 72.3 - mIoU(ms+flip): 74.71 - Config: configs/hrnet/fcn_hr18_512x512_20k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_20k_voc12aug/fcn_hr18_512x512_20k_voc12aug_20200617_224503-488d45f7.pth -- Name: fcn_hr48_512x512_20k_voc12aug - In Collection: FCN - Metadata: - backbone: HRNetV2p-W48 - crop size: (512,512) - lr schd: 20000 - inference time (ms/im): - - value: 45.35 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 6.2 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 75.87 - mIoU(ms+flip): 78.58 - Config: configs/hrnet/fcn_hr48_512x512_20k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_20k_voc12aug/fcn_hr48_512x512_20k_voc12aug_20200617_224419-89de05cd.pth -- Name: fcn_hr18s_512x512_40k_voc12aug - In Collection: FCN - Metadata: - backbone: HRNetV2p-W18-Small - crop size: (512,512) - lr schd: 40000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 66.61 - mIoU(ms+flip): 70.0 - Config: configs/hrnet/fcn_hr18s_512x512_40k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_40k_voc12aug/fcn_hr18s_512x512_40k_voc12aug_20200614_000648-4f8d6e7f.pth -- Name: fcn_hr18_512x512_40k_voc12aug - In Collection: FCN - Metadata: - backbone: HRNetV2p-W18 - crop size: (512,512) - lr schd: 40000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 72.9 - mIoU(ms+flip): 75.59 - Config: configs/hrnet/fcn_hr18_512x512_40k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_40k_voc12aug/fcn_hr18_512x512_40k_voc12aug_20200613_224401-1b4b76cd.pth -- Name: fcn_hr48_512x512_40k_voc12aug - In Collection: FCN - Metadata: - backbone: HRNetV2p-W48 - crop size: (512,512) - lr schd: 40000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 76.24 - mIoU(ms+flip): 78.49 - Config: configs/hrnet/fcn_hr48_512x512_40k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_40k_voc12aug/fcn_hr48_512x512_40k_voc12aug_20200613_222111-1b0f18bc.pth -- Name: fcn_hr48_480x480_40k_pascal_context - In Collection: 
FCN - Metadata: - backbone: HRNetV2p-W48 - crop size: (480,480) - lr schd: 40000 - inference time (ms/im): - - value: 112.87 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (480,480) - Training Memory (GB): 6.1 - Results: - - Task: Semantic Segmentation - Dataset: Pascal Context - Metrics: - mIoU: 45.14 - mIoU(ms+flip): 47.42 - Config: configs/hrnet/fcn_hr48_480x480_40k_pascal_context.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_480x480_40k_pascal_context/fcn_hr48_480x480_40k_pascal_context_20200911_164852-667d00b0.pth -- Name: fcn_hr48_480x480_80k_pascal_context - In Collection: FCN - Metadata: - backbone: HRNetV2p-W48 - crop size: (480,480) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal Context - Metrics: - mIoU: 45.84 - mIoU(ms+flip): 47.84 - Config: configs/hrnet/fcn_hr48_480x480_80k_pascal_context.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_480x480_80k_pascal_context/fcn_hr48_480x480_80k_pascal_context_20200911_155322-847a6711.pth -- Name: fcn_hr48_480x480_40k_pascal_context_59 - In Collection: FCN - Metadata: - backbone: HRNetV2p-W48 - crop size: (480,480) - lr schd: 40000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal Context 59 - Metrics: - mIoU: 50.33 - mIoU(ms+flip): 52.83 - Config: configs/hrnet/fcn_hr48_480x480_40k_pascal_context_59.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_480x480_40k_pascal_context_59/fcn_hr48_480x480_40k_pascal_context_59_20210410_122738-b808b8b2.pth -- Name: fcn_hr48_480x480_80k_pascal_context_59 - In Collection: FCN - Metadata: - backbone: HRNetV2p-W48 - crop size: (480,480) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal Context 59 - Metrics: - mIoU: 51.12 - mIoU(ms+flip): 53.56 - Config: configs/hrnet/fcn_hr48_480x480_80k_pascal_context_59.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_480x480_80k_pascal_context_59/fcn_hr48_480x480_80k_pascal_context_59_20210411_003240-3ae7081e.pth -- Name: fcn_hr18s_512x512_80k_loveda - In Collection: FCN - Metadata: - backbone: HRNetV2p-W18-Small - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 40.21 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 1.59 - Results: - - Task: Semantic Segmentation - Dataset: LoveDA - Metrics: - mIoU: 49.28 - mIoU(ms+flip): 49.42 - Config: configs/hrnet/fcn_hr18s_512x512_80k_loveda.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_80k_loveda/fcn_hr18s_512x512_80k_loveda_20211210_203228-60a86a7a.pth -- Name: fcn_hr18_512x512_80k_loveda - In Collection: FCN - Metadata: - backbone: HRNetV2p-W18 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 77.4 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 2.76 - Results: - - Task: Semantic Segmentation - Dataset: LoveDA - Metrics: - mIoU: 50.81 - mIoU(ms+flip): 50.95 - Config: configs/hrnet/fcn_hr18_512x512_80k_loveda.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_80k_loveda/fcn_hr18_512x512_80k_loveda_20211210_203952-93d9c3b3.pth -- Name: fcn_hr48_512x512_80k_loveda - In Collection: FCN - Metadata: - backbone: HRNetV2p-W48 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 104.06 - hardware: V100 - backend: PyTorch - batch 
size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 6.2 - Results: - - Task: Semantic Segmentation - Dataset: LoveDA - Metrics: - mIoU: 51.42 - mIoU(ms+flip): 51.64 - Config: configs/hrnet/fcn_hr48_512x512_80k_loveda.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_80k_loveda/fcn_hr48_512x512_80k_loveda_20211211_044756-67072f55.pth -- Name: fcn_hr18s_512x512_80k_potsdam - In Collection: FCN - Metadata: - backbone: HRNetV2p-W18-Small - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 27.78 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 1.58 - Results: - - Task: Semantic Segmentation - Dataset: Potsdam - Metrics: - mIoU: 77.64 - mIoU(ms+flip): 78.8 - Config: configs/hrnet/fcn_hr18s_512x512_80k_potsdam.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_80k_potsdam/fcn_hr18s_512x512_80k_potsdam_20211218_205517-ba32af63.pth -- Name: fcn_hr18_512x512_80k_potsdam - In Collection: FCN - Metadata: - backbone: HRNetV2p-W18 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 51.95 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 2.76 - Results: - - Task: Semantic Segmentation - Dataset: Potsdam - Metrics: - mIoU: 78.26 - mIoU(ms+flip): 79.24 - Config: configs/hrnet/fcn_hr18_512x512_80k_potsdam.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_80k_potsdam/fcn_hr18_512x512_80k_potsdam_20211218_205517-5d0387ad.pth -- Name: fcn_hr48_512x512_80k_potsdam - In Collection: FCN - Metadata: - backbone: HRNetV2p-W48 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 60.9 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 6.2 - Results: - - Task: Semantic Segmentation - Dataset: Potsdam - Metrics: - mIoU: 78.39 - mIoU(ms+flip): 79.34 - Config: configs/hrnet/fcn_hr48_512x512_80k_potsdam.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_80k_potsdam/fcn_hr48_512x512_80k_potsdam_20211219_020601-97434c78.pth -- Name: fcn_hr18s_4x4_512x512_80k_vaihingen - In Collection: FCN - Metadata: - backbone: HRNetV2p-W18-Small - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 26.24 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 1.58 - Results: - - Task: Semantic Segmentation - Dataset: Vaihingen - Metrics: - mIoU: 71.81 - mIoU(ms+flip): 73.1 - Config: configs/hrnet/fcn_hr18s_4x4_512x512_80k_vaihingen.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_4x4_512x512_80k_vaihingen/fcn_hr18s_4x4_512x512_80k_vaihingen_20211231_230909-b23aae02.pth -- Name: fcn_hr18_4x4_512x512_80k_vaihingen - In Collection: FCN - Metadata: - backbone: HRNetV2p-W18 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 51.15 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 2.76 - Results: - - Task: Semantic Segmentation - Dataset: Vaihingen - Metrics: - mIoU: 72.57 - mIoU(ms+flip): 74.09 - Config: configs/hrnet/fcn_hr18_4x4_512x512_80k_vaihingen.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_4x4_512x512_80k_vaihingen/fcn_hr18_4x4_512x512_80k_vaihingen_20211231_231216-2ec3ae8a.pth -- 
Name: fcn_hr48_4x4_512x512_80k_vaihingen - In Collection: FCN - Metadata: - backbone: HRNetV2p-W48 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 57.97 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 6.2 - Results: - - Task: Semantic Segmentation - Dataset: Vaihingen - Metrics: - mIoU: 72.5 - mIoU(ms+flip): 73.52 - Config: configs/hrnet/fcn_hr48_4x4_512x512_80k_vaihingen.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_4x4_512x512_80k_vaihingen/fcn_hr48_4x4_512x512_80k_vaihingen_20211231_231244-7133cb22.pth -- Name: fcn_hr18s_4x4_896x896_80k_isaid - In Collection: FCN - Metadata: - backbone: HRNetV2p-W18-Small - crop size: (896,896) - lr schd: 80000 - inference time (ms/im): - - value: 72.25 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (896,896) - Training Memory (GB): 4.95 - Results: - - Task: Semantic Segmentation - Dataset: iSAID - Metrics: - mIoU: 62.3 - mIoU(ms+flip): 62.97 - Config: configs/hrnet/fcn_hr18s_4x4_896x896_80k_isaid.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_4x4_896x896_80k_isaid/fcn_hr18s_4x4_896x896_80k_isaid_20220118_001603-3cc0769b.pth -- Name: fcn_hr18_4x4_896x896_80k_isaid - In Collection: FCN - Metadata: - backbone: HRNetV2p-W18 - crop size: (896,896) - lr schd: 80000 - inference time (ms/im): - - value: 129.7 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (896,896) - Training Memory (GB): 8.3 - Results: - - Task: Semantic Segmentation - Dataset: iSAID - Metrics: - mIoU: 65.06 - mIoU(ms+flip): 65.6 - Config: configs/hrnet/fcn_hr18_4x4_896x896_80k_isaid.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_4x4_896x896_80k_isaid/fcn_hr18_4x4_896x896_80k_isaid_20220110_182230-49bf752e.pth -- Name: fcn_hr48_4x4_896x896_80k_isaid - In Collection: FCN - Metadata: - backbone: HRNetV2p-W48 - crop size: (896,896) - lr schd: 80000 - inference time (ms/im): - - value: 136.24 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (896,896) - Training Memory (GB): 16.89 - Results: - - Task: Semantic Segmentation - Dataset: iSAID - Metrics: - mIoU: 67.8 - mIoU(ms+flip): 68.53 - Config: configs/hrnet/fcn_hr48_4x4_896x896_80k_isaid.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_4x4_896x896_80k_isaid/fcn_hr48_4x4_896x896_80k_isaid_20220114_174643-547fc420.pth diff --git a/configs/hrnet/metafile.yaml b/configs/hrnet/metafile.yaml new file mode 100644 index 0000000000..11c30165a5 --- /dev/null +++ b/configs/hrnet/metafile.yaml @@ -0,0 +1,874 @@ +Models: +- Name: fcn_hr18s_4xb2-40k_cityscapes-512x1024 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 73.86 + mIoU(ms+flip): 75.91 + Config: configs/hrnet/fcn_hr18s_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - HRNetV2p-W18-Small + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 1.7 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x1024_40k_cityscapes/fcn_hr18s_512x1024_40k_cityscapes_20200601_014216-93db27d0.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x1024_40k_cityscapes/fcn_hr18s_512x1024_40k_cityscapes_20200601_014216.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: 
https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr18_4xb2-40k_cityscapes-512x1024 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 77.19 + mIoU(ms+flip): 78.92 + Config: configs/hrnet/fcn_hr18_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - HRNetV2p-W18 + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 2.9 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x1024_40k_cityscapes/fcn_hr18_512x1024_40k_cityscapes_20200601_014216-f196fb4e.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x1024_40k_cityscapes/fcn_hr18_512x1024_40k_cityscapes_20200601_014216.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr48_4xb2-40k_cityscapes-512x1024 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.48 + mIoU(ms+flip): 79.69 + Config: configs/hrnet/fcn_hr48_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - HRNetV2p-W48 + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 6.2 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x1024_40k_cityscapes/fcn_hr48_512x1024_40k_cityscapes_20200601_014240-a989b146.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x1024_40k_cityscapes/fcn_hr48_512x1024_40k_cityscapes_20200601_014240.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr18s_4xb2-80k_cityscapes-512x1024 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 75.31 + mIoU(ms+flip): 77.48 + Config: configs/hrnet/fcn_hr18s_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - HRNetV2p-W18-Small + - FCN + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x1024_80k_cityscapes/fcn_hr18s_512x1024_80k_cityscapes_20200601_202700-1462b75d.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x1024_80k_cityscapes/fcn_hr18s_512x1024_80k_cityscapes_20200601_202700.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr18_4xb2-80k_cityscapes-512x1024 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.65 + mIoU(ms+flip): 80.35 + Config: configs/hrnet/fcn_hr18_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - HRNetV2p-W18 + - FCN + Training Resources: 4x V100 GPUS + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x1024_80k_cityscapes/fcn_hr18_512x1024_80k_cityscapes_20200601_223255-4e7b345e.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x1024_80k_cityscapes/fcn_hr18_512x1024_80k_cityscapes_20200601_223255.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr48_4xb2-80k_cityscapes-512x1024 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.93 + mIoU(ms+flip): 80.72 + Config: configs/hrnet/fcn_hr48_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - HRNetV2p-W48 + - FCN + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x1024_80k_cityscapes/fcn_hr48_512x1024_80k_cityscapes_20200601_202606-58ea95d6.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x1024_80k_cityscapes/fcn_hr48_512x1024_80k_cityscapes_20200601_202606.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr18s_4xb2-160k_cityscapes-512x1024 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 76.31 + mIoU(ms+flip): 78.31 + Config: configs/hrnet/fcn_hr18s_4xb2-160k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - HRNetV2p-W18-Small + - FCN + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x1024_160k_cityscapes/fcn_hr18s_512x1024_160k_cityscapes_20200602_190901-4a0797ea.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x1024_160k_cityscapes/fcn_hr18s_512x1024_160k_cityscapes_20200602_190901.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr18_4xb2-160k_cityscapes-512x1024 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.8 + mIoU(ms+flip): 80.74 + Config: configs/hrnet/fcn_hr18_4xb2-160k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - HRNetV2p-W18 + - FCN + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x1024_160k_cityscapes/fcn_hr18_512x1024_160k_cityscapes_20200602_190822-221e4a4f.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x1024_160k_cityscapes/fcn_hr18_512x1024_160k_cityscapes_20200602_190822.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr48_4xb2-160k_cityscapes-512x1024 + In Collection: FCN + Results: + Task: 
Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 80.65 + mIoU(ms+flip): 81.92 + Config: configs/hrnet/fcn_hr48_4xb2-160k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - HRNetV2p-W48 + - FCN + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x1024_160k_cityscapes/fcn_hr48_512x1024_160k_cityscapes_20200602_190946-59b7973e.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x1024_160k_cityscapes/fcn_hr48_512x1024_160k_cityscapes_20200602_190946.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr18s_4xb4-80k_ade20k-512x512 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 31.38 + mIoU(ms+flip): 32.45 + Config: configs/hrnet/fcn_hr18s_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - HRNetV2p-W18-Small + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 3.8 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_80k_ade20k/fcn_hr18s_512x512_80k_ade20k_20200614_144345-77fc814a.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_80k_ade20k/fcn_hr18s_512x512_80k_ade20k_20200614_144345.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr18_4xb4-80k_ade20k-512x512 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 36.27 + mIoU(ms+flip): 37.28 + Config: configs/hrnet/fcn_hr18_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - HRNetV2p-W18 + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 4.9 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_80k_ade20k/fcn_hr18_512x512_80k_ade20k_20210827_114910-6c9382c0.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_80k_ade20k/fcn_hr18_512x512_80k_ade20k_20210827_114910.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr48_4xb4-80k_ade20k-512x512 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 41.9 + mIoU(ms+flip): 43.27 + Config: configs/hrnet/fcn_hr48_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - HRNetV2p-W48 + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 8.2 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_80k_ade20k/fcn_hr48_512x512_80k_ade20k_20200614_193946-7ba5258d.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_80k_ade20k/fcn_hr48_512x512_80k_ade20k_20200614_193946.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: 
https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr18s_4xb4-160k_ade20k-512x512 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 33.07 + mIoU(ms+flip): 34.56 + Config: configs/hrnet/fcn_hr18s_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - HRNetV2p-W18-Small + - FCN + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_160k_ade20k/fcn_hr18s_512x512_160k_ade20k_20210829_174739-f1e7c2e7.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_160k_ade20k/fcn_hr18s_512x512_160k_ade20k_20210829_174739.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr18_4xb4-160k_ade20k-512x512 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 36.79 + mIoU(ms+flip): 38.58 + Config: configs/hrnet/fcn_hr18_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - HRNetV2p-W18 + - FCN + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_160k_ade20k/fcn_hr18_512x512_160k_ade20k_20200614_214426-ca961836.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_160k_ade20k/fcn_hr18_512x512_160k_ade20k_20200614_214426.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr48_4xb4-160k_ade20k-512x512 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 42.02 + mIoU(ms+flip): 43.86 + Config: configs/hrnet/fcn_hr48_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - HRNetV2p-W48 + - FCN + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_160k_ade20k/fcn_hr48_512x512_160k_ade20k_20200614_214407-a52fc02c.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_160k_ade20k/fcn_hr48_512x512_160k_ade20k_20200614_214407.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr18s_4xb4-20k_voc12aug-512x512 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 65.5 + mIoU(ms+flip): 68.89 + Config: configs/hrnet/fcn_hr18s_4xb4-20k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - HRNetV2p-W18-Small + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 1.8 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_20k_voc12aug/fcn_hr18s_512x512_20k_voc12aug_20210829_174910-0aceadb4.pth + Training log: 
https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_20k_voc12aug/fcn_hr18s_512x512_20k_voc12aug_20210829_174910.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr18_4xb4-20k_voc12aug-512x512 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 72.3 + mIoU(ms+flip): 74.71 + Config: configs/hrnet/fcn_hr18_4xb4-20k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - HRNetV2p-W18 + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 2.9 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_20k_voc12aug/fcn_hr18_512x512_20k_voc12aug_20200617_224503-488d45f7.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_20k_voc12aug/fcn_hr18_512x512_20k_voc12aug_20200617_224503.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr48_4xb4-20k_voc12aug-512x512 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 75.87 + mIoU(ms+flip): 78.58 + Config: configs/hrnet/fcn_hr48_4xb4-20k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - HRNetV2p-W48 + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 6.2 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_20k_voc12aug/fcn_hr48_512x512_20k_voc12aug_20200617_224419-89de05cd.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_20k_voc12aug/fcn_hr48_512x512_20k_voc12aug_20200617_224419.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr18s_4xb4-40k_voc12aug-512x512 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 66.61 + mIoU(ms+flip): 70.0 + Config: configs/hrnet/fcn_hr18s_4xb4-40k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - HRNetV2p-W18-Small + - FCN + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_40k_voc12aug/fcn_hr18s_512x512_40k_voc12aug_20200614_000648-4f8d6e7f.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_40k_voc12aug/fcn_hr18s_512x512_40k_voc12aug_20200614_000648.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr18_4xb4-40k_voc12aug-512x512 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 72.9 + mIoU(ms+flip): 75.59 + Config: 
configs/hrnet/fcn_hr18_4xb4-40k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - HRNetV2p-W18 + - FCN + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_40k_voc12aug/fcn_hr18_512x512_40k_voc12aug_20200613_224401-1b4b76cd.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_40k_voc12aug/fcn_hr18_512x512_40k_voc12aug_20200613_224401.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr48_4xb4-40k_voc12aug-512x512 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 76.24 + mIoU(ms+flip): 78.49 + Config: configs/hrnet/fcn_hr48_4xb4-40k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - HRNetV2p-W48 + - FCN + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_40k_voc12aug/fcn_hr48_512x512_40k_voc12aug_20200613_222111-1b0f18bc.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_40k_voc12aug/fcn_hr48_512x512_40k_voc12aug_20200613_222111.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr48_4xb4-40k_pascal-context-480x480 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Pascal Context + Metrics: + mIoU: 45.14 + mIoU(ms+flip): 47.42 + Config: configs/hrnet/fcn_hr48_4xb4-40k_pascal-context-480x480.py + Metadata: + Training Data: Pascal Context + Batch Size: 16 + Architecture: + - HRNetV2p-W48 + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 6.1 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_480x480_40k_pascal_context/fcn_hr48_480x480_40k_pascal_context_20200911_164852-667d00b0.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_480x480_40k_pascal_context/fcn_hr48_480x480_40k_pascal_context-20200911_164852.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr48_4xb4-80k_pascal-context-480x480 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Pascal Context + Metrics: + mIoU: 45.84 + mIoU(ms+flip): 47.84 + Config: configs/hrnet/fcn_hr48_4xb4-80k_pascal-context-480x480.py + Metadata: + Training Data: Pascal Context + Batch Size: 16 + Architecture: + - HRNetV2p-W48 + - FCN + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_480x480_80k_pascal_context/fcn_hr48_480x480_80k_pascal_context_20200911_155322-847a6711.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_480x480_80k_pascal_context/fcn_hr48_480x480_80k_pascal_context-20200911_155322.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: 
https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr48_4xb4-40k_pascal-context-59-480x480 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Pascal Context 59 + Metrics: + mIoU: 50.33 + mIoU(ms+flip): 52.83 + Config: configs/hrnet/fcn_hr48_4xb4-40k_pascal-context-59-480x480.py + Metadata: + Training Data: Pascal Context 59 + Batch Size: 16 + Architecture: + - HRNetV2p-W48 + - FCN + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_480x480_40k_pascal_context_59/fcn_hr48_480x480_40k_pascal_context_59_20210410_122738-b808b8b2.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_480x480_40k_pascal_context_59/fcn_hr48_480x480_40k_pascal_context_59-20210410_122738.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr48_4xb4-80k_pascal-context-59-480x480 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Pascal Context 59 + Metrics: + mIoU: 51.12 + mIoU(ms+flip): 53.56 + Config: configs/hrnet/fcn_hr48_4xb4-80k_pascal-context-59-480x480.py + Metadata: + Training Data: Pascal Context 59 + Batch Size: 16 + Architecture: + - HRNetV2p-W48 + - FCN + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_480x480_80k_pascal_context_59/fcn_hr48_480x480_80k_pascal_context_59_20210411_003240-3ae7081e.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_480x480_80k_pascal_context_59/fcn_hr48_480x480_80k_pascal_context_59-20210411_003240.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr18s_4xb4-80k_loveda-512x512 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: LoveDA + Metrics: + mIoU: 49.28 + mIoU(ms+flip): 49.42 + Config: configs/hrnet/fcn_hr18s_4xb4-80k_loveda-512x512.py + Metadata: + Training Data: LoveDA + Batch Size: 16 + Architecture: + - HRNetV2p-W18-Small + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 1.59 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_80k_loveda/fcn_hr18s_512x512_80k_loveda_20211210_203228-60a86a7a.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_80k_loveda/fcn_hr18s_512x512_80k_loveda_20211210_203228.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr18_4xb4-80k_loveda-512x512 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: LoveDA + Metrics: + mIoU: 50.81 + mIoU(ms+flip): 50.95 + Config: configs/hrnet/fcn_hr18_4xb4-80k_loveda-512x512.py + Metadata: + Training Data: LoveDA + Batch Size: 16 + Architecture: + - HRNetV2p-W18 + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 2.76 + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_80k_loveda/fcn_hr18_512x512_80k_loveda_20211210_203952-93d9c3b3.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_80k_loveda/fcn_hr18_512x512_80k_loveda_20211210_203952.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr48_4xb4-80k_loveda-512x512 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: LoveDA + Metrics: + mIoU: 51.42 + mIoU(ms+flip): 51.64 + Config: configs/hrnet/fcn_hr48_4xb4-80k_loveda-512x512.py + Metadata: + Training Data: LoveDA + Batch Size: 16 + Architecture: + - HRNetV2p-W48 + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 6.2 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_80k_loveda/fcn_hr48_512x512_80k_loveda_20211211_044756-67072f55.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_80k_loveda/fcn_hr48_512x512_80k_loveda_20211211_044756.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr18s_4xb4-80k_potsdam-512x512 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Potsdam + Metrics: + mIoU: 77.64 + mIoU(ms+flip): 78.8 + Config: configs/hrnet/fcn_hr18s_4xb4-80k_potsdam-512x512.py + Metadata: + Training Data: Potsdam + Batch Size: 16 + Architecture: + - HRNetV2p-W18-Small + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 1.58 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_80k_potsdam/fcn_hr18s_512x512_80k_potsdam_20211218_205517-ba32af63.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_512x512_80k_potsdam/fcn_hr18s_512x512_80k_potsdam_20211218_205517.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr18_4xb4-80k_potsdam-512x512 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Potsdam + Metrics: + mIoU: 78.26 + mIoU(ms+flip): 79.24 + Config: configs/hrnet/fcn_hr18_4xb4-80k_potsdam-512x512.py + Metadata: + Training Data: Potsdam + Batch Size: 16 + Architecture: + - HRNetV2p-W18 + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 2.76 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_80k_potsdam/fcn_hr18_512x512_80k_potsdam_20211218_205517-5d0387ad.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_512x512_80k_potsdam/fcn_hr18_512x512_80k_potsdam_20211218_205517.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr48_4xb4-80k_potsdam-512x512 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Potsdam + Metrics: + mIoU: 78.39 + 
mIoU(ms+flip): 79.34 + Config: configs/hrnet/fcn_hr48_4xb4-80k_potsdam-512x512.py + Metadata: + Training Data: Potsdam + Batch Size: 16 + Architecture: + - HRNetV2p-W48 + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 6.2 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_80k_potsdam/fcn_hr48_512x512_80k_potsdam_20211219_020601-97434c78.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_512x512_80k_potsdam/fcn_hr48_512x512_80k_potsdam_20211219_020601.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr18s_4xb4-80k_vaihingen-512x512 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Vaihingen + Metrics: + mIoU: 71.81 + mIoU(ms+flip): 73.1 + Config: configs/hrnet/fcn_hr18s_4xb4-80k_vaihingen-512x512.py + Metadata: + Training Data: Vaihingen + Batch Size: 16 + Architecture: + - HRNetV2p-W18-Small + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 1.58 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_4x4_512x512_80k_vaihingen/fcn_hr18s_4x4_512x512_80k_vaihingen_20211231_230909-b23aae02.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_4x4_512x512_80k_vaihingen/fcn_hr18s_4x4_512x512_80k_vaihingen_20211231_230909.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr18_4xb4-80k_vaihingen-512x512 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Vaihingen + Metrics: + mIoU: 72.57 + mIoU(ms+flip): 74.09 + Config: configs/hrnet/fcn_hr18_4xb4-80k_vaihingen-512x512.py + Metadata: + Training Data: Vaihingen + Batch Size: 16 + Architecture: + - HRNetV2p-W18 + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 2.76 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_4x4_512x512_80k_vaihingen/fcn_hr18_4x4_512x512_80k_vaihingen_20211231_231216-2ec3ae8a.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_4x4_512x512_80k_vaihingen/fcn_hr18_4x4_512x512_80k_vaihingen_20211231_231216.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr48_4xb4-80k_vaihingen-512x512 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Vaihingen + Metrics: + mIoU: 72.5 + mIoU(ms+flip): 73.52 + Config: configs/hrnet/fcn_hr48_4xb4-80k_vaihingen-512x512.py + Metadata: + Training Data: Vaihingen + Batch Size: 16 + Architecture: + - HRNetV2p-W48 + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 6.2 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_4x4_512x512_80k_vaihingen/fcn_hr48_4x4_512x512_80k_vaihingen_20211231_231244-7133cb22.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_4x4_512x512_80k_vaihingen/fcn_hr48_4x4_512x512_80k_vaihingen_20211231_231244.log.json + Paper: + Title: Deep High-Resolution Representation Learning for 
Human Pose Estimation + URL: https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr18s_4xb4-80k_isaid-896x896 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: iSAID + Metrics: + mIoU: 62.3 + mIoU(ms+flip): 62.97 + Config: configs/hrnet/fcn_hr18s_4xb4-80k_isaid-896x896.py + Metadata: + Training Data: iSAID + Batch Size: 16 + Architecture: + - HRNetV2p-W18-Small + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 4.95 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_4x4_896x896_80k_isaid/fcn_hr18s_4x4_896x896_80k_isaid_20220118_001603-3cc0769b.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18s_4x4_896x896_80k_isaid/fcn_hr18s_4x4_896x896_80k_isaid_20220118_001603.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr18_4xb4-80k_isaid-896x896 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: iSAID + Metrics: + mIoU: 65.06 + mIoU(ms+flip): 65.6 + Config: configs/hrnet/fcn_hr18_4xb4-80k_isaid-896x896.py + Metadata: + Training Data: iSAID + Batch Size: 16 + Architecture: + - HRNetV2p-W18 + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 8.3 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_4x4_896x896_80k_isaid/fcn_hr18_4x4_896x896_80k_isaid_20220110_182230-49bf752e.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr18_4x4_896x896_80k_isaid/fcn_hr18_4x4_896x896_80k_isaid_20220110_182230.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch +- Name: fcn_hr48_4xb4-80k_isaid-896x896 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: iSAID + Metrics: + mIoU: 67.8 + mIoU(ms+flip): 68.53 + Config: configs/hrnet/fcn_hr48_4xb4-80k_isaid-896x896.py + Metadata: + Training Data: iSAID + Batch Size: 16 + Architecture: + - HRNetV2p-W48 + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 16.89 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_4x4_896x896_80k_isaid/fcn_hr48_4x4_896x896_80k_isaid_20220114_174643-547fc420.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/hrnet/fcn_hr48_4x4_896x896_80k_isaid/fcn_hr48_4x4_896x896_80k_isaid_20220114_174643.log.json + Paper: + Title: Deep High-Resolution Representation Learning for Human Pose Estimation + URL: https://arxiv.org/abs/1908.07919 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/hrnet.py#L218 + Framework: PyTorch diff --git a/configs/icnet/README.md b/configs/icnet/README.md index c011af5b09..fa2327fc39 100644 --- a/configs/icnet/README.md +++ b/configs/icnet/README.md @@ -1,6 +1,6 @@ # ICNet -[ICNet for Real-time Semantic Segmentation on High-resolution Images](https://arxiv.org/abs/1704.08545) +> [ICNet for Real-time Semantic Segmentation on High-resolution Images](https://arxiv.org/abs/1704.08545) ## Introduction @@ -22,6 +22,27 @@ We focus on the challenging task of real-time semantic segmentation in this pape +## 
Results and models + +### Cityscapes + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ---------------- | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| ICNet | R-18-D8 | 832x832 | 80000 | 1.70 | 27.12 | V100 | 68.14 | 70.16 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/icnet/icnet_r18-d8_4xb2-80k_cityscapes-832x832.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_832x832_80k_cityscapes/icnet_r18-d8_832x832_80k_cityscapes_20210925_225521-2e36638d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_832x832_80k_cityscapes/icnet_r18-d8_832x832_80k_cityscapes_20210925_225521.log.json) | +| ICNet | R-18-D8 | 832x832 | 160000 | - | - | V100 | 71.64 | 74.18 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/icnet/icnet_r18-d8_4xb2-160k_cityscapes-832x832.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_832x832_160k_cityscapes/icnet_r18-d8_832x832_160k_cityscapes_20210925_230153-2c6eb6e0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_832x832_160k_cityscapes/icnet_r18-d8_832x832_160k_cityscapes_20210925_230153.log.json) | +| ICNet (in1k-pre) | R-18-D8 | 832x832 | 80000 | - | - | V100 | 72.51 | 74.78 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/icnet/icnet_r18-d8-in1k-pre_4xb2-80k_cityscapes-832x832.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_in1k-pre_832x832_80k_cityscapes/icnet_r18-d8_in1k-pre_832x832_80k_cityscapes_20210925_230354-1cbe3022.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_in1k-pre_832x832_80k_cityscapes/icnet_r18-d8_in1k-pre_832x832_80k_cityscapes_20210925_230354.log.json) | +| ICNet (in1k-pre) | R-18-D8 | 832x832 | 160000 | - | - | V100 | 74.43 | 76.72 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/icnet/icnet_r18-d8-in1k-pre_4xb2-160k_cityscapes-832x832.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_in1k-pre_832x832_160k_cityscapes/icnet_r18-d8_in1k-pre_832x832_160k_cityscapes_20210926_052702-619c8ae1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_in1k-pre_832x832_160k_cityscapes/icnet_r18-d8_in1k-pre_832x832_160k_cityscapes_20210926_052702.log.json) | +| ICNet | R-50-D8 | 832x832 | 80000 | 2.53 | 20.08 | V100 | 68.91 | 69.72 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/icnet/icnet_r50-d8_4xb2-80k_cityscapes-832x832.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_832x832_80k_cityscapes/icnet_r50-d8_832x832_80k_cityscapes_20210926_044625-c6407341.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_832x832_80k_cityscapes/icnet_r50-d8_832x832_80k_cityscapes_20210926_044625.log.json) | +| ICNet | R-50-D8 | 832x832 | 160000 | - | - | V100 | 73.82 | 75.67 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/icnet/icnet_r50-d8_4xb2-160k_cityscapes-832x832.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_832x832_160k_cityscapes/icnet_r50-d8_832x832_160k_cityscapes_20210925_232612-a95f0d4e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_832x832_160k_cityscapes/icnet_r50-d8_832x832_160k_cityscapes_20210925_232612.log.json) | +| ICNet (in1k-pre) | R-50-D8 | 832x832 | 80000 | - | - | V100 | 74.58 | 76.41 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/icnet/icnet_r50-d8-in1k-pre_4xb2-80k_cityscapes-832x832.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_in1k-pre_832x832_80k_cityscapes/icnet_r50-d8_in1k-pre_832x832_80k_cityscapes_20210926_032943-1743dc7b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_in1k-pre_832x832_80k_cityscapes/icnet_r50-d8_in1k-pre_832x832_80k_cityscapes_20210926_032943.log.json) | +| ICNet (in1k-pre) | R-50-D8 | 832x832 | 160000 | - | - | V100 | 76.29 | 78.09 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/icnet/icnet_r50-d8-in1k-pre_4xb2-160k_cityscapes-832x832.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_in1k-pre_832x832_160k_cityscapes/icnet_r50-d8_in1k-pre_832x832_160k_cityscapes_20210926_042715-ce310aea.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_in1k-pre_832x832_160k_cityscapes/icnet_r50-d8_in1k-pre_832x832_160k_cityscapes_20210926_042715.log.json) | +| ICNet | R-101-D8 | 832x832 | 80000 | 3.08 | 16.95 | V100 | 70.28 | 71.95 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/icnet/icnet_r101-d8_4xb2-80k_cityscapes-832x832.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_832x832_80k_cityscapes/icnet_r101-d8_832x832_80k_cityscapes_20210926_072447-b52f936e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_832x832_80k_cityscapes/icnet_r101-d8_832x832_80k_cityscapes_20210926_072447.log.json) | +| ICNet | R-101-D8 | 832x832 | 160000 | - | - | V100 | 73.80 | 76.10 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/icnet/icnet_r101-d8_4xb2-160k_cityscapes-832x832.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_832x832_160k_cityscapes/icnet_r101-d8_832x832_160k_cityscapes_20210926_092350-3a1ebf1a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_832x832_160k_cityscapes/icnet_r101-d8_832x832_160k_cityscapes_20210926_092350.log.json) | +| ICNet (in1k-pre) | R-101-D8 | 832x832 | 80000 | - | - | V100 | 75.57 | 77.86 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/icnet/icnet_r101-d8-in1k-pre_4xb2-80k_cityscapes-832x832.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_in1k-pre_832x832_80k_cityscapes/icnet_r101-d8_in1k-pre_832x832_80k_cityscapes_20210926_020414-7ceb12c5.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_in1k-pre_832x832_80k_cityscapes/icnet_r101-d8_in1k-pre_832x832_80k_cityscapes_20210926_020414.log.json) | +| ICNet 
(in1k-pre) | R-101-D8 | 832x832 | 160000 | - | - | V100 | 76.15 | 77.98 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/icnet/icnet_r101-d8-in1k-pre_4xb2-160k_cityscapes-832x832.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_in1k-pre_832x832_160k_cityscapes/icnet_r101-d8_in1k-pre_832x832_160k_cityscapes_20210925_232612-9484ae8a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_in1k-pre_832x832_160k_cityscapes/icnet_r101-d8_in1k-pre_832x832_160k_cityscapes_20210925_232612.log.json) | + +Note: `in1k-pre` indicates that the backbone is initialized with ImageNet-1K pretrained weights. + ## Citation ```bibtex @@ -33,24 +54,3 @@ We focus on the challenging task of real-time semantic segmentation in this pape year={2018} } ``` - -## Results and models - -### Cityscapes - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ---------------- | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ---------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| ICNet | R-18-D8 | 832x832 | 80000 | 1.70 | 27.12 | 68.14 | 70.16 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/icnet/icnet_r18-d8_832x832_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_832x832_80k_cityscapes/icnet_r18-d8_832x832_80k_cityscapes_20210925_225521-2e36638d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_832x832_80k_cityscapes/icnet_r18-d8_832x832_80k_cityscapes_20210925_225521.log.json) | -| ICNet | R-18-D8 | 832x832 | 160000 | - | - | 71.64 | 74.18 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/icnet/icnet_r18-d8_832x832_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_832x832_160k_cityscapes/icnet_r18-d8_832x832_160k_cityscapes_20210925_230153-2c6eb6e0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_832x832_160k_cityscapes/icnet_r18-d8_832x832_160k_cityscapes_20210925_230153.log.json) | -| ICNet (in1k-pre) | R-18-D8 | 832x832 | 80000 | - | - | 72.51 | 74.78 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/icnet/icnet_r18-d8_in1k-pre_832x832_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_in1k-pre_832x832_80k_cityscapes/icnet_r18-d8_in1k-pre_832x832_80k_cityscapes_20210925_230354-1cbe3022.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_in1k-pre_832x832_80k_cityscapes/icnet_r18-d8_in1k-pre_832x832_80k_cityscapes_20210925_230354.log.json) | -| ICNet (in1k-pre) | R-18-D8 | 832x832 | 160000 | - | - | 74.43 | 76.72 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/icnet/icnet_r18-d8_in1k-pre_832x832_160k_cityscapes.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_in1k-pre_832x832_160k_cityscapes/icnet_r18-d8_in1k-pre_832x832_160k_cityscapes_20210926_052702-619c8ae1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_in1k-pre_832x832_160k_cityscapes/icnet_r18-d8_in1k-pre_832x832_160k_cityscapes_20210926_052702.log.json) | -| ICNet | R-50-D8 | 832x832 | 80000 | 2.53 | 20.08 | 68.91 | 69.72 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/icnet/icnet_r50-d8_832x832_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_832x832_80k_cityscapes/icnet_r50-d8_832x832_80k_cityscapes_20210926_044625-c6407341.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_832x832_80k_cityscapes/icnet_r50-d8_832x832_80k_cityscapes_20210926_044625.log.json) | -| ICNet | R-50-D8 | 832x832 | 160000 | - | - | 73.82 | 75.67 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/icnet/icnet_r50-d8_832x832_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_832x832_160k_cityscapes/icnet_r50-d8_832x832_160k_cityscapes_20210925_232612-a95f0d4e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_832x832_160k_cityscapes/icnet_r50-d8_832x832_160k_cityscapes_20210925_232612.log.json) | -| ICNet (in1k-pre) | R-50-D8 | 832x832 | 80000 | - | - | 74.58 | 76.41 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/icnet/icnet_r50-d8_in1k-pre_832x832_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_in1k-pre_832x832_80k_cityscapes/icnet_r50-d8_in1k-pre_832x832_80k_cityscapes_20210926_032943-1743dc7b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_in1k-pre_832x832_80k_cityscapes/icnet_r50-d8_in1k-pre_832x832_80k_cityscapes_20210926_032943.log.json) | -| ICNet (in1k-pre) | R-50-D8 | 832x832 | 160000 | - | - | 76.29 | 78.09 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/icnet/icnet_r50-d8_in1k-pre_832x832_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_in1k-pre_832x832_160k_cityscapes/icnet_r50-d8_in1k-pre_832x832_160k_cityscapes_20210926_042715-ce310aea.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_in1k-pre_832x832_160k_cityscapes/icnet_r50-d8_in1k-pre_832x832_160k_cityscapes_20210926_042715.log.json) | -| ICNet | R-101-D8 | 832x832 | 80000 | 3.08 | 16.95 | 70.28 | 71.95 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/icnet/icnet_r101-d8_832x832_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_832x832_80k_cityscapes/icnet_r101-d8_832x832_80k_cityscapes_20210926_072447-b52f936e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_832x832_80k_cityscapes/icnet_r101-d8_832x832_80k_cityscapes_20210926_072447.log.json) | -| ICNet | R-101-D8 | 832x832 | 160000 | - | - | 73.80 | 76.10 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/icnet/icnet_r101-d8_832x832_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_832x832_160k_cityscapes/icnet_r101-d8_832x832_160k_cityscapes_20210926_092350-3a1ebf1a.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_832x832_160k_cityscapes/icnet_r101-d8_832x832_160k_cityscapes_20210926_092350.log.json) | -| ICNet (in1k-pre) | R-101-D8 | 832x832 | 80000 | - | - | 75.57 | 77.86 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/icnet/icnet_r101-d8_in1k-pre_832x832_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_in1k-pre_832x832_80k_cityscapes/icnet_r101-d8_in1k-pre_832x832_80k_cityscapes_20210926_020414-7ceb12c5.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_in1k-pre_832x832_80k_cityscapes/icnet_r101-d8_in1k-pre_832x832_80k_cityscapes_20210926_020414.log.json) | -| ICNet (in1k-pre) | R-101-D8 | 832x832 | 160000 | - | - | 76.15 | 77.98 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/icnet/icnet_r101-d8_in1k-pre_832x832_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_in1k-pre_832x832_160k_cityscapes/icnet_r101-d8_in1k-pre_832x832_160k_cityscapes_20210925_232612-9484ae8a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_in1k-pre_832x832_160k_cityscapes/icnet_r101-d8_in1k-pre_832x832_160k_cityscapes_20210925_232612.log.json) | - -Note: `in1k-pre` means pretrained model is used. diff --git a/configs/icnet/icnet.yml b/configs/icnet/icnet.yml deleted file mode 100644 index ebaf9340b0..0000000000 --- a/configs/icnet/icnet.yml +++ /dev/null @@ -1,207 +0,0 @@ -Collections: -- Name: ICNet - Metadata: - Training Data: - - Cityscapes - Paper: - URL: https://arxiv.org/abs/1704.08545 - Title: ICNet for Real-time Semantic Segmentation on High-resolution Images - README: configs/icnet/README.md - Code: - URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/necks/ic_neck.py#L77 - Version: v0.18.0 - Converted From: - Code: https://github.com/hszhao/ICNet -Models: -- Name: icnet_r18-d8_832x832_80k_cityscapes - In Collection: ICNet - Metadata: - backbone: R-18-D8 - crop size: (832,832) - lr schd: 80000 - inference time (ms/im): - - value: 36.87 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (832,832) - Training Memory (GB): 1.7 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 68.14 - mIoU(ms+flip): 70.16 - Config: configs/icnet/icnet_r18-d8_832x832_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_832x832_80k_cityscapes/icnet_r18-d8_832x832_80k_cityscapes_20210925_225521-2e36638d.pth -- Name: icnet_r18-d8_832x832_160k_cityscapes - In Collection: ICNet - Metadata: - backbone: R-18-D8 - crop size: (832,832) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 71.64 - mIoU(ms+flip): 74.18 - Config: configs/icnet/icnet_r18-d8_832x832_160k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_832x832_160k_cityscapes/icnet_r18-d8_832x832_160k_cityscapes_20210925_230153-2c6eb6e0.pth -- Name: icnet_r18-d8_in1k-pre_832x832_80k_cityscapes - In Collection: ICNet - Metadata: - backbone: R-18-D8 - crop size: (832,832) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 72.51 - mIoU(ms+flip): 74.78 - Config: configs/icnet/icnet_r18-d8_in1k-pre_832x832_80k_cityscapes.py - Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_in1k-pre_832x832_80k_cityscapes/icnet_r18-d8_in1k-pre_832x832_80k_cityscapes_20210925_230354-1cbe3022.pth -- Name: icnet_r18-d8_in1k-pre_832x832_160k_cityscapes - In Collection: ICNet - Metadata: - backbone: R-18-D8 - crop size: (832,832) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 74.43 - mIoU(ms+flip): 76.72 - Config: configs/icnet/icnet_r18-d8_in1k-pre_832x832_160k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_in1k-pre_832x832_160k_cityscapes/icnet_r18-d8_in1k-pre_832x832_160k_cityscapes_20210926_052702-619c8ae1.pth -- Name: icnet_r50-d8_832x832_80k_cityscapes - In Collection: ICNet - Metadata: - backbone: R-50-D8 - crop size: (832,832) - lr schd: 80000 - inference time (ms/im): - - value: 49.8 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (832,832) - Training Memory (GB): 2.53 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 68.91 - mIoU(ms+flip): 69.72 - Config: configs/icnet/icnet_r50-d8_832x832_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_832x832_80k_cityscapes/icnet_r50-d8_832x832_80k_cityscapes_20210926_044625-c6407341.pth -- Name: icnet_r50-d8_832x832_160k_cityscapes - In Collection: ICNet - Metadata: - backbone: R-50-D8 - crop size: (832,832) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 73.82 - mIoU(ms+flip): 75.67 - Config: configs/icnet/icnet_r50-d8_832x832_160k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_832x832_160k_cityscapes/icnet_r50-d8_832x832_160k_cityscapes_20210925_232612-a95f0d4e.pth -- Name: icnet_r50-d8_in1k-pre_832x832_80k_cityscapes - In Collection: ICNet - Metadata: - backbone: R-50-D8 - crop size: (832,832) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 74.58 - mIoU(ms+flip): 76.41 - Config: configs/icnet/icnet_r50-d8_in1k-pre_832x832_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_in1k-pre_832x832_80k_cityscapes/icnet_r50-d8_in1k-pre_832x832_80k_cityscapes_20210926_032943-1743dc7b.pth -- Name: icnet_r50-d8_in1k-pre_832x832_160k_cityscapes - In Collection: ICNet - Metadata: - backbone: R-50-D8 - crop size: (832,832) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 76.29 - mIoU(ms+flip): 78.09 - Config: configs/icnet/icnet_r50-d8_in1k-pre_832x832_160k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_in1k-pre_832x832_160k_cityscapes/icnet_r50-d8_in1k-pre_832x832_160k_cityscapes_20210926_042715-ce310aea.pth -- Name: icnet_r101-d8_832x832_80k_cityscapes - In Collection: ICNet - Metadata: - backbone: R-101-D8 - crop size: (832,832) - lr schd: 80000 - inference time (ms/im): - - value: 59.0 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (832,832) - Training Memory (GB): 3.08 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 70.28 - mIoU(ms+flip): 71.95 - Config: configs/icnet/icnet_r101-d8_832x832_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_832x832_80k_cityscapes/icnet_r101-d8_832x832_80k_cityscapes_20210926_072447-b52f936e.pth -- Name: 
icnet_r101-d8_832x832_160k_cityscapes - In Collection: ICNet - Metadata: - backbone: R-101-D8 - crop size: (832,832) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 73.8 - mIoU(ms+flip): 76.1 - Config: configs/icnet/icnet_r101-d8_832x832_160k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_832x832_160k_cityscapes/icnet_r101-d8_832x832_160k_cityscapes_20210926_092350-3a1ebf1a.pth -- Name: icnet_r101-d8_in1k-pre_832x832_80k_cityscapes - In Collection: ICNet - Metadata: - backbone: R-101-D8 - crop size: (832,832) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 75.57 - mIoU(ms+flip): 77.86 - Config: configs/icnet/icnet_r101-d8_in1k-pre_832x832_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_in1k-pre_832x832_80k_cityscapes/icnet_r101-d8_in1k-pre_832x832_80k_cityscapes_20210926_020414-7ceb12c5.pth -- Name: icnet_r101-d8_in1k-pre_832x832_160k_cityscapes - In Collection: ICNet - Metadata: - backbone: R-101-D8 - crop size: (832,832) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 76.15 - mIoU(ms+flip): 77.98 - Config: configs/icnet/icnet_r101-d8_in1k-pre_832x832_160k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_in1k-pre_832x832_160k_cityscapes/icnet_r101-d8_in1k-pre_832x832_160k_cityscapes_20210925_232612-9484ae8a.pth diff --git a/configs/icnet/icnet_r101-d8-in1k-pre_4xb2-160k_cityscapes-832x832.py b/configs/icnet/icnet_r101-d8-in1k-pre_4xb2-160k_cityscapes-832x832.py new file mode 100644 index 0000000000..a6840a1155 --- /dev/null +++ b/configs/icnet/icnet_r101-d8-in1k-pre_4xb2-160k_cityscapes-832x832.py @@ -0,0 +1,7 @@ +_base_ = './icnet_r50-d8_4xb2-160k_cityscapes-832x832.py' +model = dict( + backbone=dict( + backbone_cfg=dict( + depth=101, + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnet101_v1c')))) diff --git a/configs/icnet/icnet_r101-d8-in1k-pre_4xb2-80k_cityscapes-832x832.py b/configs/icnet/icnet_r101-d8-in1k-pre_4xb2-80k_cityscapes-832x832.py new file mode 100644 index 0000000000..ca81df8c7b --- /dev/null +++ b/configs/icnet/icnet_r101-d8-in1k-pre_4xb2-80k_cityscapes-832x832.py @@ -0,0 +1,7 @@ +_base_ = './icnet_r50-d8_4xb2-80k_cityscapes-832x832.py' +model = dict( + backbone=dict( + backbone_cfg=dict( + depth=101, + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnet101_v1c')))) diff --git a/configs/icnet/icnet_r101-d8_4xb2-160k_cityscapes-832x832.py b/configs/icnet/icnet_r101-d8_4xb2-160k_cityscapes-832x832.py new file mode 100644 index 0000000000..ef60446bc5 --- /dev/null +++ b/configs/icnet/icnet_r101-d8_4xb2-160k_cityscapes-832x832.py @@ -0,0 +1,2 @@ +_base_ = './icnet_r50-d8_4xb2-160k_cityscapes-832x832.py' +model = dict(backbone=dict(backbone_cfg=dict(depth=101))) diff --git a/configs/icnet/icnet_r101-d8_4xb2-80k_cityscapes-832x832.py b/configs/icnet/icnet_r101-d8_4xb2-80k_cityscapes-832x832.py new file mode 100644 index 0000000000..5173d2d6f8 --- /dev/null +++ b/configs/icnet/icnet_r101-d8_4xb2-80k_cityscapes-832x832.py @@ -0,0 +1,2 @@ +_base_ = './icnet_r50-d8_4xb2-80k_cityscapes-832x832.py' +model = dict(backbone=dict(backbone_cfg=dict(depth=101))) diff --git a/configs/icnet/icnet_r101-d8_832x832_160k_cityscapes.py b/configs/icnet/icnet_r101-d8_832x832_160k_cityscapes.py deleted file mode 100644 index 24cbf537d4..0000000000 --- 
a/configs/icnet/icnet_r101-d8_832x832_160k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './icnet_r50-d8_832x832_160k_cityscapes.py' -model = dict(backbone=dict(backbone_cfg=dict(depth=101))) diff --git a/configs/icnet/icnet_r101-d8_832x832_80k_cityscapes.py b/configs/icnet/icnet_r101-d8_832x832_80k_cityscapes.py deleted file mode 100644 index f3338b5944..0000000000 --- a/configs/icnet/icnet_r101-d8_832x832_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './icnet_r50-d8_832x832_80k_cityscapes.py' -model = dict(backbone=dict(backbone_cfg=dict(depth=101))) diff --git a/configs/icnet/icnet_r101-d8_in1k-pre_832x832_160k_cityscapes.py b/configs/icnet/icnet_r101-d8_in1k-pre_832x832_160k_cityscapes.py deleted file mode 100644 index 74ac355088..0000000000 --- a/configs/icnet/icnet_r101-d8_in1k-pre_832x832_160k_cityscapes.py +++ /dev/null @@ -1,7 +0,0 @@ -_base_ = './icnet_r50-d8_832x832_160k_cityscapes.py' -model = dict( - backbone=dict( - backbone_cfg=dict( - depth=101, - init_cfg=dict( - type='Pretrained', checkpoint='open-mmlab://resnet101_v1c')))) diff --git a/configs/icnet/icnet_r101-d8_in1k-pre_832x832_80k_cityscapes.py b/configs/icnet/icnet_r101-d8_in1k-pre_832x832_80k_cityscapes.py deleted file mode 100644 index b4ba6d640d..0000000000 --- a/configs/icnet/icnet_r101-d8_in1k-pre_832x832_80k_cityscapes.py +++ /dev/null @@ -1,7 +0,0 @@ -_base_ = './icnet_r50-d8_832x832_80k_cityscapes.py' -model = dict( - backbone=dict( - backbone_cfg=dict( - depth=101, - init_cfg=dict( - type='Pretrained', checkpoint='open-mmlab://resnet101_v1c')))) diff --git a/configs/icnet/icnet_r18-d8-in1k-pre_4xb2-160k_cityscapes-832x832.py b/configs/icnet/icnet_r18-d8-in1k-pre_4xb2-160k_cityscapes-832x832.py new file mode 100644 index 0000000000..5f72daab65 --- /dev/null +++ b/configs/icnet/icnet_r18-d8-in1k-pre_4xb2-160k_cityscapes-832x832.py @@ -0,0 +1,8 @@ +_base_ = './icnet_r50-d8_4xb2-160k_cityscapes-832x832.py' +model = dict( + backbone=dict( + layer_channels=(128, 512), + backbone_cfg=dict( + depth=18, + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnet18_v1c')))) diff --git a/configs/icnet/icnet_r18-d8-in1k-pre_4xb2-80k_cityscapes-832x832.py b/configs/icnet/icnet_r18-d8-in1k-pre_4xb2-80k_cityscapes-832x832.py new file mode 100644 index 0000000000..2fc79ab197 --- /dev/null +++ b/configs/icnet/icnet_r18-d8-in1k-pre_4xb2-80k_cityscapes-832x832.py @@ -0,0 +1,8 @@ +_base_ = './icnet_r50-d8_4xb2-80k_cityscapes-832x832.py' +model = dict( + backbone=dict( + layer_channels=(128, 512), + backbone_cfg=dict( + depth=18, + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnet18_v1c')))) diff --git a/configs/icnet/icnet_r18-d8_4xb2-160k_cityscapes-832x832.py b/configs/icnet/icnet_r18-d8_4xb2-160k_cityscapes-832x832.py new file mode 100644 index 0000000000..2c70e94810 --- /dev/null +++ b/configs/icnet/icnet_r18-d8_4xb2-160k_cityscapes-832x832.py @@ -0,0 +1,3 @@ +_base_ = './icnet_r50-d8_4xb2-160k_cityscapes-832x832.py' +model = dict( + backbone=dict(layer_channels=(128, 512), backbone_cfg=dict(depth=18))) diff --git a/configs/icnet/icnet_r18-d8_4xb2-80k_cityscapes-832x832.py b/configs/icnet/icnet_r18-d8_4xb2-80k_cityscapes-832x832.py new file mode 100644 index 0000000000..23c7ac2990 --- /dev/null +++ b/configs/icnet/icnet_r18-d8_4xb2-80k_cityscapes-832x832.py @@ -0,0 +1,3 @@ +_base_ = './icnet_r50-d8_4xb2-80k_cityscapes-832x832.py' +model = dict( + backbone=dict(layer_channels=(128, 512), backbone_cfg=dict(depth=18))) diff --git 
a/configs/icnet/icnet_r18-d8_832x832_160k_cityscapes.py b/configs/icnet/icnet_r18-d8_832x832_160k_cityscapes.py deleted file mode 100644 index 877b775afc..0000000000 --- a/configs/icnet/icnet_r18-d8_832x832_160k_cityscapes.py +++ /dev/null @@ -1,3 +0,0 @@ -_base_ = './icnet_r50-d8_832x832_160k_cityscapes.py' -model = dict( - backbone=dict(layer_channels=(128, 512), backbone_cfg=dict(depth=18))) diff --git a/configs/icnet/icnet_r18-d8_832x832_80k_cityscapes.py b/configs/icnet/icnet_r18-d8_832x832_80k_cityscapes.py deleted file mode 100644 index 786c7cc92a..0000000000 --- a/configs/icnet/icnet_r18-d8_832x832_80k_cityscapes.py +++ /dev/null @@ -1,3 +0,0 @@ -_base_ = './icnet_r50-d8_832x832_80k_cityscapes.py' -model = dict( - backbone=dict(layer_channels=(128, 512), backbone_cfg=dict(depth=18))) diff --git a/configs/icnet/icnet_r18-d8_in1k-pre_832x832_160k_cityscapes.py b/configs/icnet/icnet_r18-d8_in1k-pre_832x832_160k_cityscapes.py deleted file mode 100644 index cc47951f3d..0000000000 --- a/configs/icnet/icnet_r18-d8_in1k-pre_832x832_160k_cityscapes.py +++ /dev/null @@ -1,8 +0,0 @@ -_base_ = './icnet_r50-d8_832x832_160k_cityscapes.py' -model = dict( - backbone=dict( - layer_channels=(128, 512), - backbone_cfg=dict( - depth=18, - init_cfg=dict( - type='Pretrained', checkpoint='open-mmlab://resnet18_v1c')))) diff --git a/configs/icnet/icnet_r18-d8_in1k-pre_832x832_80k_cityscapes.py b/configs/icnet/icnet_r18-d8_in1k-pre_832x832_80k_cityscapes.py deleted file mode 100644 index 00b0fe0522..0000000000 --- a/configs/icnet/icnet_r18-d8_in1k-pre_832x832_80k_cityscapes.py +++ /dev/null @@ -1,8 +0,0 @@ -_base_ = './icnet_r50-d8_832x832_80k_cityscapes.py' -model = dict( - backbone=dict( - layer_channels=(128, 512), - backbone_cfg=dict( - depth=18, - init_cfg=dict( - type='Pretrained', checkpoint='open-mmlab://resnet18_v1c')))) diff --git a/configs/icnet/icnet_r50-d8-in1k-pre_4xb2-160k_cityscapes-832x832.py b/configs/icnet/icnet_r50-d8-in1k-pre_4xb2-160k_cityscapes-832x832.py new file mode 100644 index 0000000000..f9ab863402 --- /dev/null +++ b/configs/icnet/icnet_r50-d8-in1k-pre_4xb2-160k_cityscapes-832x832.py @@ -0,0 +1,6 @@ +_base_ = './icnet_r50-d8_4xb2-160k_cityscapes-832x832.py' +model = dict( + backbone=dict( + backbone_cfg=dict( + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnet50_v1c')))) diff --git a/configs/icnet/icnet_r50-d8-in1k-pre_4xb2-80k_cityscapes-832x832.py b/configs/icnet/icnet_r50-d8-in1k-pre_4xb2-80k_cityscapes-832x832.py new file mode 100644 index 0000000000..9a085d4f61 --- /dev/null +++ b/configs/icnet/icnet_r50-d8-in1k-pre_4xb2-80k_cityscapes-832x832.py @@ -0,0 +1,6 @@ +_base_ = './icnet_r50-d8_4xb2-80k_cityscapes-832x832.py' +model = dict( + backbone=dict( + backbone_cfg=dict( + init_cfg=dict( + type='Pretrained', checkpoint='open-mmlab://resnet50_v1c')))) diff --git a/configs/icnet/icnet_r50-d8_832x832_160k_cityscapes.py b/configs/icnet/icnet_r50-d8_4xb2-160k_cityscapes-832x832.py similarity index 100% rename from configs/icnet/icnet_r50-d8_832x832_160k_cityscapes.py rename to configs/icnet/icnet_r50-d8_4xb2-160k_cityscapes-832x832.py diff --git a/configs/icnet/icnet_r50-d8_832x832_80k_cityscapes.py b/configs/icnet/icnet_r50-d8_4xb2-80k_cityscapes-832x832.py similarity index 100% rename from configs/icnet/icnet_r50-d8_832x832_80k_cityscapes.py rename to configs/icnet/icnet_r50-d8_4xb2-80k_cityscapes-832x832.py diff --git a/configs/icnet/icnet_r50-d8_in1k-pre_832x832_160k_cityscapes.py b/configs/icnet/icnet_r50-d8_in1k-pre_832x832_160k_cityscapes.py deleted 
file mode 100644 index 6f7a0a1a36..0000000000 --- a/configs/icnet/icnet_r50-d8_in1k-pre_832x832_160k_cityscapes.py +++ /dev/null @@ -1,6 +0,0 @@ -_base_ = './icnet_r50-d8_832x832_160k_cityscapes.py' -model = dict( - backbone=dict( - backbone_cfg=dict( - init_cfg=dict( - type='Pretrained', checkpoint='open-mmlab://resnet50_v1c')))) diff --git a/configs/icnet/icnet_r50-d8_in1k-pre_832x832_80k_cityscapes.py b/configs/icnet/icnet_r50-d8_in1k-pre_832x832_80k_cityscapes.py deleted file mode 100644 index 57546cd291..0000000000 --- a/configs/icnet/icnet_r50-d8_in1k-pre_832x832_80k_cityscapes.py +++ /dev/null @@ -1,6 +0,0 @@ -_base_ = './icnet_r50-d8_832x832_80k_cityscapes.py' -model = dict( - backbone=dict( - backbone_cfg=dict( - init_cfg=dict( - type='Pretrained', checkpoint='open-mmlab://resnet50_v1c')))) diff --git a/configs/icnet/metafile.yaml b/configs/icnet/metafile.yaml new file mode 100644 index 0000000000..1d843ee4b6 --- /dev/null +++ b/configs/icnet/metafile.yaml @@ -0,0 +1,298 @@ +Collections: +- Name: ICNet + License: Apache License 2.0 + Metadata: + Training Data: + - Cityscapes + Paper: + Title: ICNet for Real-time Semantic Segmentation on High-resolution Images + URL: https://arxiv.org/abs/1704.08545 + README: configs/icnet/README.md + Frameworks: + - PyTorch +Models: +- Name: icnet_r18-d8_4xb2-80k_cityscapes-832x832 + In Collection: ICNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 68.14 + mIoU(ms+flip): 70.16 + Config: configs/icnet/icnet_r18-d8_4xb2-80k_cityscapes-832x832.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-18-D8 + - ICNet + Training Resources: 4x V100 GPUS + Memory (GB): 1.7 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_832x832_80k_cityscapes/icnet_r18-d8_832x832_80k_cityscapes_20210925_225521-2e36638d.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_832x832_80k_cityscapes/icnet_r18-d8_832x832_80k_cityscapes_20210925_225521.log.json + Paper: + Title: ICNet for Real-time Semantic Segmentation on High-resolution Images + URL: https://arxiv.org/abs/1704.08545 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/necks/ic_neck.py#L77 + Framework: PyTorch +- Name: icnet_r18-d8_4xb2-160k_cityscapes-832x832 + In Collection: ICNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 71.64 + mIoU(ms+flip): 74.18 + Config: configs/icnet/icnet_r18-d8_4xb2-160k_cityscapes-832x832.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-18-D8 + - ICNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_832x832_160k_cityscapes/icnet_r18-d8_832x832_160k_cityscapes_20210925_230153-2c6eb6e0.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_832x832_160k_cityscapes/icnet_r18-d8_832x832_160k_cityscapes_20210925_230153.log.json + Paper: + Title: ICNet for Real-time Semantic Segmentation on High-resolution Images + URL: https://arxiv.org/abs/1704.08545 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/necks/ic_neck.py#L77 + Framework: PyTorch +- Name: icnet_r18-d8-in1k-pre_4xb2-80k_cityscapes-832x832 + In Collection: ICNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 72.51 + mIoU(ms+flip): 74.78 + Config: configs/icnet/icnet_r18-d8-in1k-pre_4xb2-80k_cityscapes-832x832.py + Metadata: + Training Data: 
Cityscapes + Batch Size: 8 + Architecture: + - R-18-D8 + - ICNet + - (in1k-pre) + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_in1k-pre_832x832_80k_cityscapes/icnet_r18-d8_in1k-pre_832x832_80k_cityscapes_20210925_230354-1cbe3022.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_in1k-pre_832x832_80k_cityscapes/icnet_r18-d8_in1k-pre_832x832_80k_cityscapes_20210925_230354.log.json + Paper: + Title: ICNet for Real-time Semantic Segmentation on High-resolution Images + URL: https://arxiv.org/abs/1704.08545 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/necks/ic_neck.py#L77 + Framework: PyTorch +- Name: icnet_r18-d8-in1k-pre_4xb2-160k_cityscapes-832x832 + In Collection: ICNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 74.43 + mIoU(ms+flip): 76.72 + Config: configs/icnet/icnet_r18-d8-in1k-pre_4xb2-160k_cityscapes-832x832.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-18-D8 + - ICNet + - (in1k-pre) + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_in1k-pre_832x832_160k_cityscapes/icnet_r18-d8_in1k-pre_832x832_160k_cityscapes_20210926_052702-619c8ae1.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r18-d8_in1k-pre_832x832_160k_cityscapes/icnet_r18-d8_in1k-pre_832x832_160k_cityscapes_20210926_052702.log.json + Paper: + Title: ICNet for Real-time Semantic Segmentation on High-resolution Images + URL: https://arxiv.org/abs/1704.08545 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/necks/ic_neck.py#L77 + Framework: PyTorch +- Name: icnet_r50-d8_4xb2-80k_cityscapes-832x832 + In Collection: ICNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 68.91 + mIoU(ms+flip): 69.72 + Config: configs/icnet/icnet_r50-d8_4xb2-80k_cityscapes-832x832.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - ICNet + Training Resources: 4x V100 GPUS + Memory (GB): 2.53 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_832x832_80k_cityscapes/icnet_r50-d8_832x832_80k_cityscapes_20210926_044625-c6407341.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_832x832_80k_cityscapes/icnet_r50-d8_832x832_80k_cityscapes_20210926_044625.log.json + Paper: + Title: ICNet for Real-time Semantic Segmentation on High-resolution Images + URL: https://arxiv.org/abs/1704.08545 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/necks/ic_neck.py#L77 + Framework: PyTorch +- Name: icnet_r50-d8_4xb2-160k_cityscapes-832x832 + In Collection: ICNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 73.82 + mIoU(ms+flip): 75.67 + Config: configs/icnet/icnet_r50-d8_4xb2-160k_cityscapes-832x832.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - ICNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_832x832_160k_cityscapes/icnet_r50-d8_832x832_160k_cityscapes_20210925_232612-a95f0d4e.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_832x832_160k_cityscapes/icnet_r50-d8_832x832_160k_cityscapes_20210925_232612.log.json + Paper: + Title: ICNet for Real-time Semantic Segmentation on 
High-resolution Images + URL: https://arxiv.org/abs/1704.08545 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/necks/ic_neck.py#L77 + Framework: PyTorch +- Name: icnet_r50-d8-in1k-pre_4xb2-80k_cityscapes-832x832 + In Collection: ICNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 74.58 + mIoU(ms+flip): 76.41 + Config: configs/icnet/icnet_r50-d8-in1k-pre_4xb2-80k_cityscapes-832x832.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - ICNet + - (in1k-pre) + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_in1k-pre_832x832_80k_cityscapes/icnet_r50-d8_in1k-pre_832x832_80k_cityscapes_20210926_032943-1743dc7b.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_in1k-pre_832x832_80k_cityscapes/icnet_r50-d8_in1k-pre_832x832_80k_cityscapes_20210926_032943.log.json + Paper: + Title: ICNet for Real-time Semantic Segmentation on High-resolution Images + URL: https://arxiv.org/abs/1704.08545 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/necks/ic_neck.py#L77 + Framework: PyTorch +- Name: icnet_r50-d8-in1k-pre_4xb2-160k_cityscapes-832x832 + In Collection: ICNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 76.29 + mIoU(ms+flip): 78.09 + Config: configs/icnet/icnet_r50-d8-in1k-pre_4xb2-160k_cityscapes-832x832.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - ICNet + - (in1k-pre) + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_in1k-pre_832x832_160k_cityscapes/icnet_r50-d8_in1k-pre_832x832_160k_cityscapes_20210926_042715-ce310aea.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r50-d8_in1k-pre_832x832_160k_cityscapes/icnet_r50-d8_in1k-pre_832x832_160k_cityscapes_20210926_042715.log.json + Paper: + Title: ICNet for Real-time Semantic Segmentation on High-resolution Images + URL: https://arxiv.org/abs/1704.08545 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/necks/ic_neck.py#L77 + Framework: PyTorch +- Name: icnet_r101-d8_4xb2-80k_cityscapes-832x832 + In Collection: ICNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 70.28 + mIoU(ms+flip): 71.95 + Config: configs/icnet/icnet_r101-d8_4xb2-80k_cityscapes-832x832.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - ICNet + Training Resources: 4x V100 GPUS + Memory (GB): 3.08 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_832x832_80k_cityscapes/icnet_r101-d8_832x832_80k_cityscapes_20210926_072447-b52f936e.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_832x832_80k_cityscapes/icnet_r101-d8_832x832_80k_cityscapes_20210926_072447.log.json + Paper: + Title: ICNet for Real-time Semantic Segmentation on High-resolution Images + URL: https://arxiv.org/abs/1704.08545 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/necks/ic_neck.py#L77 + Framework: PyTorch +- Name: icnet_r101-d8_4xb2-160k_cityscapes-832x832 + In Collection: ICNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 73.8 + mIoU(ms+flip): 76.1 + Config: configs/icnet/icnet_r101-d8_4xb2-160k_cityscapes-832x832.py + Metadata: + Training Data: Cityscapes + 
Batch Size: 8 + Architecture: + - R-101-D8 + - ICNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_832x832_160k_cityscapes/icnet_r101-d8_832x832_160k_cityscapes_20210926_092350-3a1ebf1a.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_832x832_160k_cityscapes/icnet_r101-d8_832x832_160k_cityscapes_20210926_092350.log.json + Paper: + Title: ICNet for Real-time Semantic Segmentation on High-resolution Images + URL: https://arxiv.org/abs/1704.08545 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/necks/ic_neck.py#L77 + Framework: PyTorch +- Name: icnet_r101-d8-in1k-pre_4xb2-80k_cityscapes-832x832 + In Collection: ICNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 75.57 + mIoU(ms+flip): 77.86 + Config: configs/icnet/icnet_r101-d8-in1k-pre_4xb2-80k_cityscapes-832x832.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - ICNet + - (in1k-pre) + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_in1k-pre_832x832_80k_cityscapes/icnet_r101-d8_in1k-pre_832x832_80k_cityscapes_20210926_020414-7ceb12c5.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_in1k-pre_832x832_80k_cityscapes/icnet_r101-d8_in1k-pre_832x832_80k_cityscapes_20210926_020414.log.json + Paper: + Title: ICNet for Real-time Semantic Segmentation on High-resolution Images + URL: https://arxiv.org/abs/1704.08545 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/necks/ic_neck.py#L77 + Framework: PyTorch +- Name: icnet_r101-d8-in1k-pre_4xb2-160k_cityscapes-832x832 + In Collection: ICNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 76.15 + mIoU(ms+flip): 77.98 + Config: configs/icnet/icnet_r101-d8-in1k-pre_4xb2-160k_cityscapes-832x832.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - ICNet + - (in1k-pre) + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_in1k-pre_832x832_160k_cityscapes/icnet_r101-d8_in1k-pre_832x832_160k_cityscapes_20210925_232612-9484ae8a.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/icnet/icnet_r101-d8_in1k-pre_832x832_160k_cityscapes/icnet_r101-d8_in1k-pre_832x832_160k_cityscapes_20210925_232612.log.json + Paper: + Title: ICNet for Real-time Semantic Segmentation on High-resolution Images + URL: https://arxiv.org/abs/1704.08545 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/necks/ic_neck.py#L77 + Framework: PyTorch diff --git a/configs/isanet/README.md b/configs/isanet/README.md index d1c268dae2..c11744ffef 100644 --- a/configs/isanet/README.md +++ b/configs/isanet/README.md @@ -1,6 +1,6 @@ # ISANet -[Interlaced Sparse Self-Attention for Semantic Segmentation](https://arxiv.org/abs/1907.12273) +> [Interlaced Sparse Self-Attention for Semantic Segmentation](https://arxiv.org/abs/1907.12273) ## Introduction @@ -22,6 +22,39 @@ In this paper, we present a so-called interlaced sparse self-attention approach +## Results and models + +### Cityscapes + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------- | -------: | -------------- | ------ | ----- | ------------: | 
-----------------------------------------------------------------------------------------------------------------------------: | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| ISANet | R-50-D8 | 512x1024 | 40000 | 5.869 | 2.91 | V100 | 78.49 | 79.44 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/isanet/isanet_r50-d8_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x1024_40k_cityscapes/isanet_r50-d8_512x1024_40k_cityscapes_20210901_054739-981bd763.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x1024_40k_cityscapes/isanet_r50-d8_512x1024_40k_cityscapes_20210901_054739.log.json) | +| ISANet | R-50-D8 | 512x1024 | 80000 | 5.869 | 2.91 | V100 | 78.68 | 80.25 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/isanet/isanet_r50-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x1024_80k_cityscapes/isanet_r50-d8_512x1024_80k_cityscapes_20210901_074202-89384497.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x1024_80k_cityscapes/isanet_r50-d8_512x1024_80k_cityscapes_20210901_074202.log.json) | +| ISANet | R-50-D8 | 769x769 | 40000 | 6.759 | 1.54 | V100 | 78.70 | 80.28 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/isanet/isanet_r50-d8_4xb2-40k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_769x769_40k_cityscapes/isanet_r50-d8_769x769_40k_cityscapes_20210903_050200-4ae7e65b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_769x769_40k_cityscapes/isanet_r50-d8_769x769_40k_cityscapes_20210903_050200.log.json) | +| ISANet | R-50-D8 | 769x769 | 80000 | 6.759 | 1.54 | V100 | 79.29 | 80.53 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/isanet/isanet_r50-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_769x769_80k_cityscapes/isanet_r50-d8_769x769_80k_cityscapes_20210903_101126-99b54519.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_769x769_80k_cityscapes/isanet_r50-d8_769x769_80k_cityscapes_20210903_101126.log.json) | +| ISANet | R-101-D8 | 512x1024 | 40000 | 9.425 | 2.35 | V100 | 79.58 | 81.05 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/isanet/isanet_r101-d8_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x1024_40k_cityscapes/isanet_r101-d8_512x1024_40k_cityscapes_20210901_145553-293e6bd6.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x1024_40k_cityscapes/isanet_r101-d8_512x1024_40k_cityscapes_20210901_145553.log.json) | +| ISANet | R-101-D8 | 512x1024 | 80000 | 9.425 | 2.35 | V100 | 80.32 | 81.58 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/isanet/isanet_r101-d8_4xb2-80k_cityscapes-512x1024.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x1024_80k_cityscapes/isanet_r101-d8_512x1024_80k_cityscapes_20210901_145243-5b99c9b2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x1024_80k_cityscapes/isanet_r101-d8_512x1024_80k_cityscapes_20210901_145243.log.json) | +| ISANet | R-101-D8 | 769x769 | 40000 | 10.815 | 0.92 | V100 | 79.68 | 80.95 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/isanet/isanet_r101-d8_4xb2-40k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_769x769_40k_cityscapes/isanet_r101-d8_769x769_40k_cityscapes_20210903_111320-509e7224.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_769x769_40k_cityscapes/isanet_r101-d8_769x769_40k_cityscapes_20210903_111320.log.json) | +| ISANet | R-101-D8 | 769x769 | 80000 | 10.815 | 0.92 | V100 | 80.61 | 81.59 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/isanet/isanet_r101-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_769x769_80k_cityscapes/isanet_r101-d8_769x769_80k_cityscapes_20210903_111319-24f71dfa.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_769x769_80k_cityscapes/isanet_r101-d8_769x769_80k_cityscapes_20210903_111319.log.json) | + +### ADE20K + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------- | -------: | -------------- | ------ | ----- | ------------: | -------------------------------------------------------------------------------------------------------------------------: | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| ISANet | R-50-D8 | 512x512 | 80000 | 9.0 | 22.55 | V100 | 41.12 | 42.35 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/isanet/isanet_r50-d8_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x512_80k_ade20k/isanet_r50-d8_512x512_80k_ade20k_20210903_124557-6ed83a0c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x512_80k_ade20k/isanet_r50-d8_512x512_80k_ade20k_20210903_124557.log.json) | +| ISANet | R-50-D8 | 512x512 | 160000 | 9.0 | 22.55 | V100 | 42.59 | 43.07 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/isanet/isanet_r50-d8_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x512_160k_ade20k/isanet_r50-d8_512x512_160k_ade20k_20210903_104850-f752d0a3.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x512_160k_ade20k/isanet_r50-d8_512x512_160k_ade20k_20210903_104850.log.json) | +| ISANet | R-101-D8 | 512x512 | 80000 | 12.562 | 10.56 | V100 | 43.51 | 44.38 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/isanet/isanet_r101-d8_4xb4-80k_ade20k-512x512.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x512_80k_ade20k/isanet_r101-d8_512x512_80k_ade20k_20210903_162056-68b235c2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x512_80k_ade20k/isanet_r101-d8_512x512_80k_ade20k_20210903_162056.log.json) | +| ISANet | R-101-D8 | 512x512 | 160000 | 12.562 | 10.56 | V100 | 43.80 | 45.4 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/isanet/isanet_r101-d8_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x512_160k_ade20k/isanet_r101-d8_512x512_160k_ade20k_20210903_211431-a7879dcd.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x512_160k_ade20k/isanet_r101-d8_512x512_160k_ade20k_20210903_211431.log.json) | + +### Pascal VOC 2012 + Aug + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------- | -------: | -------------- | ------ | ----- | ------------: | --------------------------------------------------------------------------------------------------------------------------: | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| ISANet | R-50-D8 | 512x512 | 20000 | 5.9 | 23.08 | V100 | 76.78 | 77.79 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/isanet/isanet_r50-d8_4xb4-20k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x512_20k_voc12aug/isanet_r50-d8_512x512_20k_voc12aug_20210901_164838-79d59b80.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x512_20k_voc12aug/isanet_r50-d8_512x512_20k_voc12aug_20210901_164838.log.json) | +| ISANet | R-50-D8 | 512x512 | 40000 | 5.9 | 23.08 | V100 | 76.20 | 77.22 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/isanet/isanet_r50-d8_4xb4-40k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x512_40k_voc12aug/isanet_r50-d8_512x512_40k_voc12aug_20210901_151349-7d08a54e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x512_40k_voc12aug/isanet_r50-d8_512x512_40k_voc12aug_20210901_151349.log.json) | +| ISANet | R-101-D8 | 512x512 | 20000 | 9.465 | 7.42 | V100 | 78.46 | 79.16 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/isanet/isanet_r101-d8_4xb4-20k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x512_20k_voc12aug/isanet_r101-d8_512x512_20k_voc12aug_20210901_115805-3ccbf355.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x512_20k_voc12aug/isanet_r101-d8_512x512_20k_voc12aug_20210901_115805.log.json) | +| ISANet | R-101-D8 | 512x512 | 40000 | 9.465 | 7.42 | V100 | 78.12 | 79.04 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/isanet/isanet_r101-d8_4xb4-40k_voc12aug-512x512.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x512_40k_voc12aug/isanet_r101-d8_512x512_40k_voc12aug_20210901_145814-bc71233b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x512_40k_voc12aug/isanet_r101-d8_512x512_40k_voc12aug_20210901_145814.log.json) | + ## Citation ```bibtex @@ -45,36 +78,3 @@ The technical report above is also presented at: publisher={Springer} } ``` - -## Results and models - -### Cityscapes - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------- | -------: | -------------- | ----- | ------------: | --------------------------------------------------------------------------------------------------------------------------: | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| ISANet | R-50-D8 | 512x1024 | 40000 | 5.869 | 2.91 | 78.49 | 79.44 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/isanet/isanet_r50-d8_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x1024_40k_cityscapes/isanet_r50-d8_512x1024_40k_cityscapes_20210901_054739-981bd763.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x1024_40k_cityscapes/isanet_r50-d8_512x1024_40k_cityscapes_20210901_054739.log.json) | -| ISANet | R-50-D8 | 512x1024 | 80000 | 5.869 | 2.91 | 78.68 | 80.25 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/isanet/isanet_r50-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x1024_80k_cityscapes/isanet_r50-d8_512x1024_80k_cityscapes_20210901_074202-89384497.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x1024_80k_cityscapes/isanet_r50-d8_512x1024_80k_cityscapes_20210901_074202.log.json) | -| ISANet | R-50-D8 | 769x769 | 40000 | 6.759 | 1.54 | 78.70 | 80.28 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/isanet/isanet_r50-d8_769x769_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_769x769_40k_cityscapes/isanet_r50-d8_769x769_40k_cityscapes_20210903_050200-4ae7e65b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_769x769_40k_cityscapes/isanet_r50-d8_769x769_40k_cityscapes_20210903_050200.log.json) | -| ISANet | R-50-D8 | 769x769 | 80000 | 6.759 | 1.54 | 79.29 | 80.53 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/isanet/isanet_r50-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_769x769_80k_cityscapes/isanet_r50-d8_769x769_80k_cityscapes_20210903_101126-99b54519.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_769x769_80k_cityscapes/isanet_r50-d8_769x769_80k_cityscapes_20210903_101126.log.json) | -| ISANet | R-101-D8 | 512x1024 | 40000 | 9.425 | 2.35 | 79.58 | 81.05 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/isanet/isanet_r101-d8_512x1024_40k_cityscapes.py) |
[model](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x1024_40k_cityscapes/isanet_r101-d8_512x1024_40k_cityscapes_20210901_145553-293e6bd6.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x1024_40k_cityscapes/isanet_r101-d8_512x1024_40k_cityscapes_20210901_145553.log.json) | -| ISANet | R-101-D8 | 512x1024 | 80000 | 9.425 | 2.35 | 80.32 | 81.58 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/isanet/isanet_r101-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x1024_80k_cityscapes/isanet_r101-d8_512x1024_80k_cityscapes_20210901_145243-5b99c9b2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x1024_80k_cityscapes/isanet_r101-d8_512x1024_80k_cityscapes_20210901_145243.log.json) | -| ISANet | R-101-D8 | 769x769 | 40000 | 10.815 | 0.92 | 79.68 | 80.95 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/isanet/isanet_r101-d8_769x769_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_769x769_40k_cityscapes/isanet_r101-d8_769x769_40k_cityscapes_20210903_111320-509e7224.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_769x769_40k_cityscapes/isanet_r101-d8_769x769_40k_cityscapes_20210903_111320.log.json) | -| ISANet | R-101-D8 | 769x769 | 80000 | 10.815 | 0.92 | 80.61 | 81.59 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/isanet/isanet_r101-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_769x769_80k_cityscapes/isanet_r101-d8_769x769_80k_cityscapes_20210903_111319-24f71dfa.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_769x769_80k_cityscapes/isanet_r101-d8_769x769_80k_cityscapes_20210903_111319.log.json) | - -### ADE20K - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------- | -------: | -------------- | ----- | ------------: | ----------------------------------------------------------------------------------------------------------------------: | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| ISANet | R-50-D8 | 512x512 | 80000 | 9.0 | 22.55 | 41.12 | 42.35 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/isanet/isanet_r50-d8_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x512_80k_ade20k/isanet_r50-d8_512x512_80k_ade20k_20210903_124557-6ed83a0c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x512_80k_ade20k/isanet_r50-d8_512x512_80k_ade20k_20210903_124557.log.json) | -| ISANet | R-50-D8 | 512x512 | 160000 | 9.0 | 22.55 | 42.59 | 43.07 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/isanet/isanet_r50-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x512_160k_ade20k/isanet_r50-d8_512x512_160k_ade20k_20210903_104850-f752d0a3.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x512_160k_ade20k/isanet_r50-d8_512x512_160k_ade20k_20210903_104850.log.json) | -| ISANet | R-101-D8 | 512x512 | 80000 | 12.562 | 10.56 | 43.51 | 44.38 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/isanet/isanet_r101-d8_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x512_80k_ade20k/isanet_r101-d8_512x512_80k_ade20k_20210903_162056-68b235c2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x512_80k_ade20k/isanet_r101-d8_512x512_80k_ade20k_20210903_162056.log.json) | -| ISANet | R-101-D8 | 512x512 | 160000 | 12.562 | 10.56 | 43.80 | 45.4 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/isanet/isanet_r101-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x512_160k_ade20k/isanet_r101-d8_512x512_160k_ade20k_20210903_211431-a7879dcd.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x512_160k_ade20k/isanet_r101-d8_512x512_160k_ade20k_20210903_211431.log.json) | - -### Pascal VOC 2012 + Aug - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------- | -------: | -------------- | ----- | ------------: | -----------------------------------------------------------------------------------------------------------------------: | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| ISANet | R-50-D8 | 512x512 | 20000 | 5.9 | 23.08 | 76.78 | 77.79 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/isanet/isanet_r50-d8_512x512_20k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x512_20k_voc12aug/isanet_r50-d8_512x512_20k_voc12aug_20210901_164838-79d59b80.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x512_20k_voc12aug/isanet_r50-d8_512x512_20k_voc12aug_20210901_164838.log.json) | -| ISANet | R-50-D8 | 512x512 | 40000 | 5.9 | 23.08 | 76.20 | 77.22 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/isanet/isanet_r50-d8_512x512_40k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x512_40k_voc12aug/isanet_r50-d8_512x512_40k_voc12aug_20210901_151349-7d08a54e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x512_40k_voc12aug/isanet_r50-d8_512x512_40k_voc12aug_20210901_151349.log.json) | -| ISANet | R-101-D8 | 512x512 | 20000 | 9.465 | 7.42 | 78.46 | 79.16 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/isanet/isanet_r101-d8_512x512_20k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x512_20k_voc12aug/isanet_r101-d8_512x512_20k_voc12aug_20210901_115805-3ccbf355.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x512_20k_voc12aug/isanet_r101-d8_512x512_20k_voc12aug_20210901_115805.log.json) | -| ISANet | R-101-D8 | 512x512 | 40000 | 9.465 | 7.42 | 78.12 | 
79.04 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/isanet/isanet_r101-d8_512x512_40k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x512_40k_voc12aug/isanet_r101-d8_512x512_40k_voc12aug_20210901_145814-bc71233b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x512_40k_voc12aug/isanet_r101-d8_512x512_40k_voc12aug_20210901_145814.log.json) | diff --git a/configs/isanet/isanet.yml b/configs/isanet/isanet.yml deleted file mode 100644 index 8c65bcfb05..0000000000 --- a/configs/isanet/isanet.yml +++ /dev/null @@ -1,369 +0,0 @@ -Collections: -- Name: ISANet - Metadata: - Training Data: - - Cityscapes - - ADE20K - - Pascal VOC 2012 + Aug - Paper: - URL: https://arxiv.org/abs/1907.12273 - Title: Interlaced Sparse Self-Attention for Semantic Segmentation - README: configs/isanet/README.md - Code: - URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/decode_heads/isa_head.py#L58 - Version: v0.18.0 - Converted From: - Code: https://github.com/openseg-group/openseg.pytorch -Models: -- Name: isanet_r50-d8_512x1024_40k_cityscapes - In Collection: ISANet - Metadata: - backbone: R-50-D8 - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 343.64 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 5.869 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.49 - mIoU(ms+flip): 79.44 - Config: configs/isanet/isanet_r50-d8_512x1024_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x1024_40k_cityscapes/isanet_r50-d8_512x1024_40k_cityscapes_20210901_054739-981bd763.pth -- Name: isanet_r50-d8_512x1024_80k_cityscapes - In Collection: ISANet - Metadata: - backbone: R-50-D8 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 343.64 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 5.869 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.68 - mIoU(ms+flip): 80.25 - Config: configs/isanet/isanet_r50-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x1024_80k_cityscapes/isanet_r50-d8_512x1024_80k_cityscapes_20210901_074202-89384497.pth -- Name: isanet_r50-d8_769x769_40k_cityscapes - In Collection: ISANet - Metadata: - backbone: R-50-D8 - crop size: (769,769) - lr schd: 40000 - inference time (ms/im): - - value: 649.35 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 6.759 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.7 - mIoU(ms+flip): 80.28 - Config: configs/isanet/isanet_r50-d8_769x769_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_769x769_40k_cityscapes/isanet_r50-d8_769x769_40k_cityscapes_20210903_050200-4ae7e65b.pth -- Name: isanet_r50-d8_769x769_80k_cityscapes - In Collection: ISANet - Metadata: - backbone: R-50-D8 - crop size: (769,769) - lr schd: 80000 - inference time (ms/im): - - value: 649.35 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 6.759 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.29 - mIoU(ms+flip): 80.53 - Config: 
configs/isanet/isanet_r50-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_769x769_80k_cityscapes/isanet_r50-d8_769x769_80k_cityscapes_20210903_101126-99b54519.pth -- Name: isanet_r101-d8_512x1024_40k_cityscapes - In Collection: ISANet - Metadata: - backbone: R-101-D8 - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 425.53 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 9.425 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.58 - mIoU(ms+flip): 81.05 - Config: configs/isanet/isanet_r101-d8_512x1024_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x1024_40k_cityscapes/isanet_r101-d8_512x1024_40k_cityscapes_20210901_145553-293e6bd6.pth -- Name: isanet_r101-d8_512x1024_80k_cityscapes - In Collection: ISANet - Metadata: - backbone: R-101-D8 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 425.53 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 9.425 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 80.32 - mIoU(ms+flip): 81.58 - Config: configs/isanet/isanet_r101-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x1024_80k_cityscapes/isanet_r101-d8_512x1024_80k_cityscapes_20210901_145243-5b99c9b2.pth -- Name: isanet_r101-d8_769x769_40k_cityscapes - In Collection: ISANet - Metadata: - backbone: R-101-D8 - crop size: (769,769) - lr schd: 40000 - inference time (ms/im): - - value: 1086.96 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 10.815 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.68 - mIoU(ms+flip): 80.95 - Config: configs/isanet/isanet_r101-d8_769x769_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_769x769_40k_cityscapes/isanet_r101-d8_769x769_40k_cityscapes_20210903_111320-509e7224.pth -- Name: isanet_r101-d8_769x769_80k_cityscapes - In Collection: ISANet - Metadata: - backbone: R-101-D8 - crop size: (769,769) - lr schd: 80000 - inference time (ms/im): - - value: 1086.96 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 10.815 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 80.61 - mIoU(ms+flip): 81.59 - Config: configs/isanet/isanet_r101-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_769x769_80k_cityscapes/isanet_r101-d8_769x769_80k_cityscapes_20210903_111319-24f71dfa.pth -- Name: isanet_r50-d8_512x512_80k_ade20k - In Collection: ISANet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 44.35 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 9.0 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 41.12 - mIoU(ms+flip): 42.35 - Config: configs/isanet/isanet_r50-d8_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x512_80k_ade20k/isanet_r50-d8_512x512_80k_ade20k_20210903_124557-6ed83a0c.pth -- Name: 
isanet_r50-d8_512x512_160k_ade20k - In Collection: ISANet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 44.35 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 9.0 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 42.59 - mIoU(ms+flip): 43.07 - Config: configs/isanet/isanet_r50-d8_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x512_160k_ade20k/isanet_r50-d8_512x512_160k_ade20k_20210903_104850-f752d0a3.pth -- Name: isanet_r101-d8_512x512_80k_ade20k - In Collection: ISANet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 94.7 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 12.562 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 43.51 - mIoU(ms+flip): 44.38 - Config: configs/isanet/isanet_r101-d8_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x512_80k_ade20k/isanet_r101-d8_512x512_80k_ade20k_20210903_162056-68b235c2.pth -- Name: isanet_r101-d8_512x512_160k_ade20k - In Collection: ISANet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 94.7 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 12.562 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 43.8 - mIoU(ms+flip): 45.4 - Config: configs/isanet/isanet_r101-d8_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x512_160k_ade20k/isanet_r101-d8_512x512_160k_ade20k_20210903_211431-a7879dcd.pth -- Name: isanet_r50-d8_512x512_20k_voc12aug - In Collection: ISANet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 20000 - inference time (ms/im): - - value: 43.33 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 5.9 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 76.78 - mIoU(ms+flip): 77.79 - Config: configs/isanet/isanet_r50-d8_512x512_20k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x512_20k_voc12aug/isanet_r50-d8_512x512_20k_voc12aug_20210901_164838-79d59b80.pth -- Name: isanet_r50-d8_512x512_40k_voc12aug - In Collection: ISANet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 40000 - inference time (ms/im): - - value: 43.33 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 5.9 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 76.2 - mIoU(ms+flip): 77.22 - Config: configs/isanet/isanet_r50-d8_512x512_40k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x512_40k_voc12aug/isanet_r50-d8_512x512_40k_voc12aug_20210901_151349-7d08a54e.pth -- Name: isanet_r101-d8_512x512_20k_voc12aug - In Collection: ISANet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 20000 - inference time (ms/im): - - value: 134.77 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 9.465 - Results: - - Task: Semantic 
Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 78.46 - mIoU(ms+flip): 79.16 - Config: configs/isanet/isanet_r101-d8_512x512_20k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x512_20k_voc12aug/isanet_r101-d8_512x512_20k_voc12aug_20210901_115805-3ccbf355.pth -- Name: isanet_r101-d8_512x512_40k_voc12aug - In Collection: ISANet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 40000 - inference time (ms/im): - - value: 134.77 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 9.465 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 78.12 - mIoU(ms+flip): 79.04 - Config: configs/isanet/isanet_r101-d8_512x512_40k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x512_40k_voc12aug/isanet_r101-d8_512x512_40k_voc12aug_20210901_145814-bc71233b.pth diff --git a/configs/isanet/isanet_r101-d8_4xb2-40k_cityscapes-512x1024.py b/configs/isanet/isanet_r101-d8_4xb2-40k_cityscapes-512x1024.py new file mode 100644 index 0000000000..6093aeb4f7 --- /dev/null +++ b/configs/isanet/isanet_r101-d8_4xb2-40k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './isanet_r50-d8_4xb2-40k_cityscapes-512x1024.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/isanet/isanet_r101-d8_4xb2-40k_cityscapes-769x769.py b/configs/isanet/isanet_r101-d8_4xb2-40k_cityscapes-769x769.py new file mode 100644 index 0000000000..dc14c76dfb --- /dev/null +++ b/configs/isanet/isanet_r101-d8_4xb2-40k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './isanet_r50-d8_4xb2-40k_cityscapes-769x769.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/isanet/isanet_r101-d8_4xb2-80k_cityscapes-512x1024.py b/configs/isanet/isanet_r101-d8_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..1735f89d41 --- /dev/null +++ b/configs/isanet/isanet_r101-d8_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './isanet_r50-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/isanet/isanet_r101-d8_4xb2-80k_cityscapes-769x769.py b/configs/isanet/isanet_r101-d8_4xb2-80k_cityscapes-769x769.py new file mode 100644 index 0000000000..b1a6371b76 --- /dev/null +++ b/configs/isanet/isanet_r101-d8_4xb2-80k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './isanet_r50-d8_4xb2-80k_cityscapes-769x769.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/isanet/isanet_r101-d8_4xb4-160k_ade20k-512x512.py b/configs/isanet/isanet_r101-d8_4xb4-160k_ade20k-512x512.py new file mode 100644 index 0000000000..c2fb09e374 --- /dev/null +++ b/configs/isanet/isanet_r101-d8_4xb4-160k_ade20k-512x512.py @@ -0,0 +1,2 @@ +_base_ = './isanet_r50-d8_4xb4-160k_ade20k-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/isanet/isanet_r101-d8_4xb4-20k_voc12aug-512x512.py b/configs/isanet/isanet_r101-d8_4xb4-20k_voc12aug-512x512.py new file mode 100644 index 0000000000..7c225cfe3a --- /dev/null +++ b/configs/isanet/isanet_r101-d8_4xb4-20k_voc12aug-512x512.py @@ -0,0 +1,2 @@ +_base_ = './isanet_r50-d8_4xb4-20k_voc12aug-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git 
a/configs/isanet/isanet_r101-d8_4xb4-40k_voc12aug-512x512.py b/configs/isanet/isanet_r101-d8_4xb4-40k_voc12aug-512x512.py new file mode 100644 index 0000000000..5e86ee584f --- /dev/null +++ b/configs/isanet/isanet_r101-d8_4xb4-40k_voc12aug-512x512.py @@ -0,0 +1,2 @@ +_base_ = './isanet_r50-d8_4xb4-40k_voc12aug-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/isanet/isanet_r101-d8_4xb4-80k_ade20k-512x512.py b/configs/isanet/isanet_r101-d8_4xb4-80k_ade20k-512x512.py new file mode 100644 index 0000000000..090e86f243 --- /dev/null +++ b/configs/isanet/isanet_r101-d8_4xb4-80k_ade20k-512x512.py @@ -0,0 +1,2 @@ +_base_ = './isanet_r50-d8_4xb4-80k_ade20k-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/isanet/isanet_r101-d8_512x1024_40k_cityscapes.py b/configs/isanet/isanet_r101-d8_512x1024_40k_cityscapes.py deleted file mode 100644 index f5cd8cbb7c..0000000000 --- a/configs/isanet/isanet_r101-d8_512x1024_40k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './isanet_r50-d8_512x1024_40k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/isanet/isanet_r101-d8_512x1024_80k_cityscapes.py b/configs/isanet/isanet_r101-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index ebc15cbfec..0000000000 --- a/configs/isanet/isanet_r101-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './isanet_r50-d8_512x1024_80k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/isanet/isanet_r101-d8_512x512_160k_ade20k.py b/configs/isanet/isanet_r101-d8_512x512_160k_ade20k.py deleted file mode 100644 index 33290100d5..0000000000 --- a/configs/isanet/isanet_r101-d8_512x512_160k_ade20k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './isanet_r50-d8_512x512_160k_ade20k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/isanet/isanet_r101-d8_512x512_20k_voc12aug.py b/configs/isanet/isanet_r101-d8_512x512_20k_voc12aug.py deleted file mode 100644 index 46fee9155d..0000000000 --- a/configs/isanet/isanet_r101-d8_512x512_20k_voc12aug.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './isanet_r50-d8_512x512_20k_voc12aug.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/isanet/isanet_r101-d8_512x512_40k_voc12aug.py b/configs/isanet/isanet_r101-d8_512x512_40k_voc12aug.py deleted file mode 100644 index 64bd8c1044..0000000000 --- a/configs/isanet/isanet_r101-d8_512x512_40k_voc12aug.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './isanet_r50-d8_512x512_40k_voc12aug.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/isanet/isanet_r101-d8_512x512_80k_ade20k.py b/configs/isanet/isanet_r101-d8_512x512_80k_ade20k.py deleted file mode 100644 index 6e13e20ca5..0000000000 --- a/configs/isanet/isanet_r101-d8_512x512_80k_ade20k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './isanet_r50-d8_512x512_80k_ade20k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/isanet/isanet_r101-d8_769x769_40k_cityscapes.py b/configs/isanet/isanet_r101-d8_769x769_40k_cityscapes.py deleted file mode 100644 index cf362aaacb..0000000000 --- a/configs/isanet/isanet_r101-d8_769x769_40k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './isanet_r50-d8_769x769_40k_cityscapes.py' 
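Aside: the r101 variants added and removed in these hunks are deliberately two-line configs that inherit everything from the corresponding r50 file via `_base_` and override only the backbone. A minimal sketch of how the merge resolves, assuming mmengine's `Config` API (`mmcv.Config` on the 0.x branch is assumed to behave the same for `_base_` inheritance); the asserted values come straight from the config bodies above:

```python
# Sketch of `_base_` inheritance for the two-line r101 configs in this PR.
from mmengine.config import Config

# Loading the derived config first pulls in every setting from the r50 base
# file, then overlays the keys set here: resnet101_v1c weights and depth=101.
cfg = Config.fromfile(
    'configs/isanet/isanet_r101-d8_4xb2-40k_cityscapes-512x1024.py')

assert cfg.model.backbone.depth == 101  # overridden by the r101 config
assert cfg.model.pretrained == 'open-mmlab://resnet101_v1c'
# Everything else (dataset pipeline, schedule, ISA head) comes from the base.
```

Keeping the delta this small means a change to the shared r50 base config propagates to every derived r101 config automatically, which is why the renames below only have to touch the r50 files that the `_base_` paths point at.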
-model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/isanet/isanet_r101-d8_769x769_80k_cityscapes.py b/configs/isanet/isanet_r101-d8_769x769_80k_cityscapes.py deleted file mode 100644 index 3c2283bdba..0000000000 --- a/configs/isanet/isanet_r101-d8_769x769_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './isanet_r50-d8_769x769_80k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/isanet/isanet_r50-d8_512x1024_40k_cityscapes.py b/configs/isanet/isanet_r50-d8_4xb2-40k_cityscapes-512x1024.py similarity index 100% rename from configs/isanet/isanet_r50-d8_512x1024_40k_cityscapes.py rename to configs/isanet/isanet_r50-d8_4xb2-40k_cityscapes-512x1024.py diff --git a/configs/isanet/isanet_r50-d8_769x769_40k_cityscapes.py b/configs/isanet/isanet_r50-d8_4xb2-40k_cityscapes-769x769.py similarity index 100% rename from configs/isanet/isanet_r50-d8_769x769_40k_cityscapes.py rename to configs/isanet/isanet_r50-d8_4xb2-40k_cityscapes-769x769.py diff --git a/configs/isanet/isanet_r50-d8_512x1024_80k_cityscapes.py b/configs/isanet/isanet_r50-d8_4xb2-80k_cityscapes-512x1024.py similarity index 100% rename from configs/isanet/isanet_r50-d8_512x1024_80k_cityscapes.py rename to configs/isanet/isanet_r50-d8_4xb2-80k_cityscapes-512x1024.py diff --git a/configs/isanet/isanet_r50-d8_769x769_80k_cityscapes.py b/configs/isanet/isanet_r50-d8_4xb2-80k_cityscapes-769x769.py similarity index 100% rename from configs/isanet/isanet_r50-d8_769x769_80k_cityscapes.py rename to configs/isanet/isanet_r50-d8_4xb2-80k_cityscapes-769x769.py diff --git a/configs/isanet/isanet_r50-d8_512x512_160k_ade20k.py b/configs/isanet/isanet_r50-d8_4xb4-160k_ade20k-512x512.py similarity index 100% rename from configs/isanet/isanet_r50-d8_512x512_160k_ade20k.py rename to configs/isanet/isanet_r50-d8_4xb4-160k_ade20k-512x512.py diff --git a/configs/isanet/isanet_r50-d8_512x512_20k_voc12aug.py b/configs/isanet/isanet_r50-d8_4xb4-20k_voc12aug-512x512.py similarity index 100% rename from configs/isanet/isanet_r50-d8_512x512_20k_voc12aug.py rename to configs/isanet/isanet_r50-d8_4xb4-20k_voc12aug-512x512.py diff --git a/configs/isanet/isanet_r50-d8_512x512_40k_voc12aug.py b/configs/isanet/isanet_r50-d8_4xb4-40k_voc12aug-512x512.py similarity index 100% rename from configs/isanet/isanet_r50-d8_512x512_40k_voc12aug.py rename to configs/isanet/isanet_r50-d8_4xb4-40k_voc12aug-512x512.py diff --git a/configs/isanet/isanet_r50-d8_512x512_80k_ade20k.py b/configs/isanet/isanet_r50-d8_4xb4-80k_ade20k-512x512.py similarity index 100% rename from configs/isanet/isanet_r50-d8_512x512_80k_ade20k.py rename to configs/isanet/isanet_r50-d8_4xb4-80k_ade20k-512x512.py diff --git a/configs/isanet/metafile.yaml b/configs/isanet/metafile.yaml new file mode 100644 index 0000000000..ad394eabb2 --- /dev/null +++ b/configs/isanet/metafile.yaml @@ -0,0 +1,399 @@ +Collections: +- Name: ISANet + License: Apache License 2.0 + Metadata: + Training Data: + - Cityscapes + - ADE20K + - Pascal VOC 2012 + Aug + Paper: + Title: Interlaced Sparse Self-Attention for Semantic Segmentation + URL: https://arxiv.org/abs/1907.12273 + README: configs/isanet/README.md + Frameworks: + - PyTorch +Models: +- Name: isanet_r50-d8_4xb2-40k_cityscapes-512x1024 + In Collection: ISANet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.49 + mIoU(ms+flip): 79.44 + Config: configs/isanet/isanet_r50-d8_4xb2-40k_cityscapes-512x1024.py + Metadata: + 
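The renames above follow the updated file-naming convention used throughout this PR, roughly `{algorithm}_{backbone}_{gpus}xb{batch_per_gpu}-{schedule}_{dataset}-{crop}`. A small, hypothetical parser for names of this shape; the regex and field names are illustrative, not an MMSegmentation API:

```python
# Hypothetical sketch: decode the new-style config name into its parts.
import re

NAME_RE = re.compile(
    r'(?P<algo>[a-z0-9]+)'
    r'_(?P<backbone>[a-z0-9-]+)'      # e.g. r50-d8, r18-d8-in1k-pre
    r'_(?P<gpus>\d+)xb(?P<batch>\d+)'  # GPUs x samples-per-GPU
    r'-(?P<iters>\d+k)'                # schedule, e.g. 40k
    r'_(?P<dataset>[a-z0-9]+)'
    r'-(?P<crop>\d+x\d+)$')            # crop size

m = NAME_RE.match('isanet_r50-d8_4xb2-40k_cityscapes-512x1024')
assert m and m.group('gpus') == '4' and m.group('batch') == '2'
# 4 GPUs x 2 images per GPU = effective batch size 8, which matches the
# "Batch Size: 8" recorded in the metafiles in this PR.
```

So a name like `isanet_r50-d8_4xb2-40k_cityscapes-512x1024` carries the training setup that the old `isanet_r50-d8_512x1024_40k_cityscapes` name left implicit.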
Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - ISANet + Training Resources: 4x V100 GPUS + Memory (GB): 5.869 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x1024_40k_cityscapes/isanet_r50-d8_512x1024_40k_cityscapes_20210901_054739-981bd763.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x1024_40k_cityscapes/isanet_r50-d8_512x1024_40k_cityscapes_20210901_054739.log.json + Paper: + Title: Interlaced Sparse Self-Attention for Semantic Segmentation + URL: https://arxiv.org/abs/1907.12273 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/decode_heads/isa_head.py#L58 + Framework: PyTorch +- Name: isanet_r50-d8_4xb2-80k_cityscapes-512x1024 + In Collection: ISANet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.68 + mIoU(ms+flip): 80.25 + Config: configs/isanet/isanet_r50-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - ISANet + Training Resources: 4x V100 GPUS + Memory (GB): 5.869 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x1024_80k_cityscapes/isanet_r50-d8_512x1024_80k_cityscapes_20210901_074202-89384497.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x1024_80k_cityscapes/isanet_r50-d8_512x1024_80k_cityscapes_20210901_074202.log.json + Paper: + Title: Interlaced Sparse Self-Attention for Semantic Segmentation + URL: https://arxiv.org/abs/1907.12273 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/decode_heads/isa_head.py#L58 + Framework: PyTorch +- Name: isanet_r50-d8_4xb2-40k_cityscapes-769x769 + In Collection: ISANet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.7 + mIoU(ms+flip): 80.28 + Config: configs/isanet/isanet_r50-d8_4xb2-40k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - ISANet + Training Resources: 4x V100 GPUS + Memory (GB): 6.759 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_769x769_40k_cityscapes/isanet_r50-d8_769x769_40k_cityscapes_20210903_050200-4ae7e65b.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_769x769_40k_cityscapes/isanet_r50-d8_769x769_40k_cityscapes_20210903_050200.log.json + Paper: + Title: Interlaced Sparse Self-Attention for Semantic Segmentation + URL: https://arxiv.org/abs/1907.12273 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/decode_heads/isa_head.py#L58 + Framework: PyTorch +- Name: isanet_r50-d8_4xb2-80k_cityscapes-769x769 + In Collection: ISANet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.29 + mIoU(ms+flip): 80.53 + Config: configs/isanet/isanet_r50-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - ISANet + Training Resources: 4x V100 GPUS + Memory (GB): 6.759 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_769x769_80k_cityscapes/isanet_r50-d8_769x769_80k_cityscapes_20210903_101126-99b54519.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_769x769_80k_cityscapes/isanet_r50-d8_769x769_80k_cityscapes_20210903_101126.log.json + Paper: + Title: Interlaced Sparse Self-Attention for Semantic 
Segmentation + URL: https://arxiv.org/abs/1907.12273 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/decode_heads/isa_head.py#L58 + Framework: PyTorch +- Name: isanet_r101-d8_4xb2-40k_cityscapes-512x1024 + In Collection: ISANet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.58 + mIoU(ms+flip): 81.05 + Config: configs/isanet/isanet_r101-d8_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - ISANet + Training Resources: 4x V100 GPUS + Memory (GB): 9.425 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x1024_40k_cityscapes/isanet_r101-d8_512x1024_40k_cityscapes_20210901_145553-293e6bd6.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x1024_40k_cityscapes/isanet_r101-d8_512x1024_40k_cityscapes_20210901_145553.log.json + Paper: + Title: Interlaced Sparse Self-Attention for Semantic Segmentation + URL: https://arxiv.org/abs/1907.12273 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/decode_heads/isa_head.py#L58 + Framework: PyTorch +- Name: isanet_r101-d8_4xb2-80k_cityscapes-512x1024 + In Collection: ISANet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 80.32 + mIoU(ms+flip): 81.58 + Config: configs/isanet/isanet_r101-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - ISANet + Training Resources: 4x V100 GPUS + Memory (GB): 9.425 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x1024_80k_cityscapes/isanet_r101-d8_512x1024_80k_cityscapes_20210901_145243-5b99c9b2.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x1024_80k_cityscapes/isanet_r101-d8_512x1024_80k_cityscapes_20210901_145243.log.json + Paper: + Title: Interlaced Sparse Self-Attention for Semantic Segmentation + URL: https://arxiv.org/abs/1907.12273 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/decode_heads/isa_head.py#L58 + Framework: PyTorch +- Name: isanet_r101-d8_4xb2-40k_cityscapes-769x769 + In Collection: ISANet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.68 + mIoU(ms+flip): 80.95 + Config: configs/isanet/isanet_r101-d8_4xb2-40k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - ISANet + Training Resources: 4x V100 GPUS + Memory (GB): 10.815 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_769x769_40k_cityscapes/isanet_r101-d8_769x769_40k_cityscapes_20210903_111320-509e7224.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_769x769_40k_cityscapes/isanet_r101-d8_769x769_40k_cityscapes_20210903_111320.log.json + Paper: + Title: Interlaced Sparse Self-Attention for Semantic Segmentation + URL: https://arxiv.org/abs/1907.12273 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/decode_heads/isa_head.py#L58 + Framework: PyTorch +- Name: isanet_r101-d8_4xb2-80k_cityscapes-769x769 + In Collection: ISANet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 80.61 + mIoU(ms+flip): 81.59 + Config: configs/isanet/isanet_r101-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + 
- ISANet + Training Resources: 4x V100 GPUS + Memory (GB): 10.815 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_769x769_80k_cityscapes/isanet_r101-d8_769x769_80k_cityscapes_20210903_111319-24f71dfa.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_769x769_80k_cityscapes/isanet_r101-d8_769x769_80k_cityscapes_20210903_111319.log.json + Paper: + Title: Interlaced Sparse Self-Attention for Semantic Segmentation + URL: https://arxiv.org/abs/1907.12273 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/decode_heads/isa_head.py#L58 + Framework: PyTorch +- Name: isanet_r50-d8_4xb4-80k_ade20k-512x512 + In Collection: ISANet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 41.12 + mIoU(ms+flip): 42.35 + Config: configs/isanet/isanet_r50-d8_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D8 + - ISANet + Training Resources: 4x V100 GPUS + Memory (GB): 9.0 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x512_80k_ade20k/isanet_r50-d8_512x512_80k_ade20k_20210903_124557-6ed83a0c.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x512_80k_ade20k/isanet_r50-d8_512x512_80k_ade20k_20210903_124557.log.json + Paper: + Title: Interlaced Sparse Self-Attention for Semantic Segmentation + URL: https://arxiv.org/abs/1907.12273 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/decode_heads/isa_head.py#L58 + Framework: PyTorch +- Name: isanet_r50-d8_4xb4-160k_ade20k-512x512 + In Collection: ISANet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 42.59 + mIoU(ms+flip): 43.07 + Config: configs/isanet/isanet_r50-d8_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D8 + - ISANet + Training Resources: 4x V100 GPUS + Memory (GB): 9.0 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x512_160k_ade20k/isanet_r50-d8_512x512_160k_ade20k_20210903_104850-f752d0a3.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x512_160k_ade20k/isanet_r50-d8_512x512_160k_ade20k_20210903_104850.log.json + Paper: + Title: Interlaced Sparse Self-Attention for Semantic Segmentation + URL: https://arxiv.org/abs/1907.12273 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/decode_heads/isa_head.py#L58 + Framework: PyTorch +- Name: isanet_r101-d8_4xb4-80k_ade20k-512x512 + In Collection: ISANet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 43.51 + mIoU(ms+flip): 44.38 + Config: configs/isanet/isanet_r101-d8_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-101-D8 + - ISANet + Training Resources: 4x V100 GPUS + Memory (GB): 12.562 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x512_80k_ade20k/isanet_r101-d8_512x512_80k_ade20k_20210903_162056-68b235c2.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x512_80k_ade20k/isanet_r101-d8_512x512_80k_ade20k_20210903_162056.log.json + Paper: + Title: Interlaced Sparse Self-Attention for Semantic Segmentation + URL: https://arxiv.org/abs/1907.12273 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/decode_heads/isa_head.py#L58 + 
Framework: PyTorch +- Name: isanet_r101-d8_4xb4-160k_ade20k-512x512 + In Collection: ISANet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 43.8 + mIoU(ms+flip): 45.4 + Config: configs/isanet/isanet_r101-d8_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-101-D8 + - ISANet + Training Resources: 4x V100 GPUS + Memory (GB): 12.562 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x512_160k_ade20k/isanet_r101-d8_512x512_160k_ade20k_20210903_211431-a7879dcd.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x512_160k_ade20k/isanet_r101-d8_512x512_160k_ade20k_20210903_211431.log.json + Paper: + Title: Interlaced Sparse Self-Attention for Semantic Segmentation + URL: https://arxiv.org/abs/1907.12273 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/decode_heads/isa_head.py#L58 + Framework: PyTorch +- Name: isanet_r50-d8_4xb4-20k_voc12aug-512x512 + In Collection: ISANet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 76.78 + mIoU(ms+flip): 77.79 + Config: configs/isanet/isanet_r50-d8_4xb4-20k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-50-D8 + - ISANet + Training Resources: 4x V100 GPUS + Memory (GB): 5.9 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x512_20k_voc12aug/isanet_r50-d8_512x512_20k_voc12aug_20210901_164838-79d59b80.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x512_20k_voc12aug/isanet_r50-d8_512x512_20k_voc12aug_20210901_164838.log.json + Paper: + Title: Interlaced Sparse Self-Attention for Semantic Segmentation + URL: https://arxiv.org/abs/1907.12273 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/decode_heads/isa_head.py#L58 + Framework: PyTorch +- Name: isanet_r50-d8_4xb4-40k_voc12aug-512x512 + In Collection: ISANet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 76.2 + mIoU(ms+flip): 77.22 + Config: configs/isanet/isanet_r50-d8_4xb4-40k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-50-D8 + - ISANet + Training Resources: 4x V100 GPUS + Memory (GB): 5.9 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x512_40k_voc12aug/isanet_r50-d8_512x512_40k_voc12aug_20210901_151349-7d08a54e.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r50-d8_512x512_40k_voc12aug/isanet_r50-d8_512x512_40k_voc12aug_20210901_151349.log.json + Paper: + Title: Interlaced Sparse Self-Attention for Semantic Segmentation + URL: https://arxiv.org/abs/1907.12273 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/decode_heads/isa_head.py#L58 + Framework: PyTorch +- Name: isanet_r101-d8_4xb4-20k_voc12aug-512x512 + In Collection: ISANet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 78.46 + mIoU(ms+flip): 79.16 + Config: configs/isanet/isanet_r101-d8_4xb4-20k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-101-D8 + - ISANet + Training Resources: 4x V100 GPUS + Memory (GB): 9.465 + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x512_20k_voc12aug/isanet_r101-d8_512x512_20k_voc12aug_20210901_115805-3ccbf355.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x512_20k_voc12aug/isanet_r101-d8_512x512_20k_voc12aug_20210901_115805.log.json + Paper: + Title: Interlaced Sparse Self-Attention for Semantic Segmentation + URL: https://arxiv.org/abs/1907.12273 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/decode_heads/isa_head.py#L58 + Framework: PyTorch +- Name: isanet_r101-d8_4xb4-40k_voc12aug-512x512 + In Collection: ISANet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 78.12 + mIoU(ms+flip): 79.04 + Config: configs/isanet/isanet_r101-d8_4xb4-40k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-101-D8 + - ISANet + Training Resources: 4x V100 GPUS + Memory (GB): 9.465 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x512_40k_voc12aug/isanet_r101-d8_512x512_40k_voc12aug_20210901_145814-bc71233b.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/isanet/isanet_r101-d8_512x512_40k_voc12aug/isanet_r101-d8_512x512_40k_voc12aug_20210901_145814.log.json + Paper: + Title: Interlaced Sparse Self-Attention for Semantic Segmentation + URL: https://arxiv.org/abs/1907.12273 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.18.0/mmseg/models/decode_heads/isa_head.py#L58 + Framework: PyTorch diff --git a/configs/knet/README.md b/configs/knet/README.md index cad14a6ea7..1f3f2ae268 100644 --- a/configs/knet/README.md +++ b/configs/knet/README.md @@ -1,6 +1,6 @@ # K-Net -[K-Net: Towards Unified Image Segmentation](https://arxiv.org/abs/2106.14855) +> [K-Net: Towards Unified Image Segmentation](https://arxiv.org/abs/2106.14855) ## Introduction @@ -22,29 +22,31 @@ Semantic, instance, and panoptic segmentations have been addressed using differe -```bibtex -@inproceedings{zhang2021knet, - title={{K-Net: Towards} Unified Image Segmentation}, - author={Wenwei Zhang and Jiangmiao Pang and Kai Chen and Chen Change Loy}, - year={2021}, - booktitle={NeurIPS}, -} -``` - ## Results and models ### ADE20K -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ---------------- | -------- | --------- | ------- | -------- | -------------- | ----- | ------------- | ---------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| KNet + FCN | R-50-D8 | 512x512 | 80000 | 7.01 | 19.24 | 43.60 | 45.12 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/knet/knet_s3_fcn_r50-d8_8x2_512x512_adamw_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_fcn_r50-d8_8x2_512x512_adamw_80k_ade20k/knet_s3_fcn_r50-d8_8x2_512x512_adamw_80k_ade20k_20220228_043751-abcab920.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_fcn_r50-d8_8x2_512x512_adamw_80k_ade20k/knet_s3_fcn_r50-d8_8x2_512x512_adamw_80k_ade20k_20220228_043751.log.json) | -| KNet + PSPNet | R-50-D8 | 512x512 | 80000 | 6.98 | 20.04 | 44.18 | 45.58 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/knet/knet_s3_pspnet_r50-d8_8x2_512x512_adamw_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_pspnet_r50-d8_8x2_512x512_adamw_80k_ade20k/knet_s3_pspnet_r50-d8_8x2_512x512_adamw_80k_ade20k_20220228_054634-d2c72240.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_pspnet_r50-d8_8x2_512x512_adamw_80k_ade20k/knet_s3_pspnet_r50-d8_8x2_512x512_adamw_80k_ade20k_20220228_054634.log.json) | -| KNet + DeepLabV3 | R-50-D8 | 512x512 | 80000 | 7.42 | 12.10 | 45.06 | 46.11 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/knet/knet_s3_deeplabv3_r50-d8_8x2_512x512_adamw_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_deeplabv3_r50-d8_8x2_512x512_adamw_80k_ade20k/knet_s3_deeplabv3_r50-d8_8x2_512x512_adamw_80k_ade20k_20220228_041642-00c8fbeb.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_deeplabv3_r50-d8_8x2_512x512_adamw_80k_ade20k/knet_s3_deeplabv3_r50-d8_8x2_512x512_adamw_80k_ade20k_20220228_041642.log.json) | -| KNet + UperNet | R-50-D8 | 512x512 | 80000 | 7.34 | 17.11 | 43.45 | 44.07 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/knet/knet_s3_upernet_r50-d8_8x2_512x512_adamw_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_upernet_r50-d8_8x2_512x512_adamw_80k_ade20k/knet_s3_upernet_r50-d8_8x2_512x512_adamw_80k_ade20k_20220304_125657-215753b0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_upernet_r50-d8_8x2_512x512_adamw_80k_ade20k/knet_s3_upernet_r50-d8_8x2_512x512_adamw_80k_ade20k_20220304_125657.log.json) | -| KNet + UperNet | Swin-T | 512x512 | 80000 | 7.57 | 15.56 | 45.84 | 46.27 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/knet/knet_s3_upernet_swin-t_8x2_512x512_adamw_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_upernet_swin-t_8x2_512x512_adamw_80k_ade20k/knet_s3_upernet_swin-t_8x2_512x512_adamw_80k_ade20k_20220303_133059-7545e1dc.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_upernet_swin-t_8x2_512x512_adamw_80k_ade20k/knet_s3_upernet_swin-t_8x2_512x512_adamw_80k_ade20k_20220303_133059.log.json) | -| KNet + UperNet | Swin-L | 512x512 | 80000 | 13.5 | 8.29 | 52.05 | 53.24 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/knet/knet_s3_upernet_swin-l_8x2_512x512_adamw_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_upernet_swin-l_8x2_512x512_adamw_80k_ade20k/knet_s3_upernet_swin-l_8x2_512x512_adamw_80k_ade20k_20220303_154559-d8da9a90.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_upernet_swin-l_8x2_512x512_adamw_80k_ade20k/knet_s3_upernet_swin-l_8x2_512x512_adamw_80k_ade20k_20220303_154559.log.json) | -| KNet + UperNet | Swin-L | 640x640 | 80000 | 13.54 | 8.29 | 52.21 | 53.34 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/knet/knet_s3_upernet_swin-l_8x2_640x640_adamw_80k_ade20k.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_upernet_swin-l_8x2_640x640_adamw_80k_ade20k/knet_s3_upernet_swin-l_8x2_640x640_adamw_80k_ade20k_20220301_220747-8787fc71.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_upernet_swin-l_8x2_640x640_adamw_80k_ade20k/knet_s3_upernet_swin-l_8x2_640x640_adamw_80k_ade20k_20220301_220747.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ---------------- | -------- | --------- | ------- | -------- | -------------- | ------ | ----- | ------------- | --------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| KNet + FCN | R-50-D8 | 512x512 | 80000 | 7.01 | 19.24 | V100 | 43.60 | 45.12 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/knet/knet-s3_r50-d8_fcn_8xb2-adamw-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_fcn_r50-d8_8x2_512x512_adamw_80k_ade20k/knet_s3_fcn_r50-d8_8x2_512x512_adamw_80k_ade20k_20220228_043751-abcab920.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_fcn_r50-d8_8x2_512x512_adamw_80k_ade20k/knet_s3_fcn_r50-d8_8x2_512x512_adamw_80k_ade20k_20220228_043751.log.json) | +| KNet + PSPNet | R-50-D8 | 512x512 | 80000 | 6.98 | 20.04 | V100 | 44.18 | 45.58 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/knet/knet-s3_r50-d8_pspnet_8xb2-adamw-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_pspnet_r50-d8_8x2_512x512_adamw_80k_ade20k/knet_s3_pspnet_r50-d8_8x2_512x512_adamw_80k_ade20k_20220228_054634-d2c72240.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_pspnet_r50-d8_8x2_512x512_adamw_80k_ade20k/knet_s3_pspnet_r50-d8_8x2_512x512_adamw_80k_ade20k_20220228_054634.log.json) | +| KNet + DeepLabV3 | R-50-D8 | 512x512 | 80000 | 7.42 | 12.10 | V100 | 45.06 | 46.11 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/knet/knet-s3_r50-d8_deeplabv3_8xb2-adamw-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_deeplabv3_r50-d8_8x2_512x512_adamw_80k_ade20k/knet_s3_deeplabv3_r50-d8_8x2_512x512_adamw_80k_ade20k_20220228_041642-00c8fbeb.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_deeplabv3_r50-d8_8x2_512x512_adamw_80k_ade20k/knet_s3_deeplabv3_r50-d8_8x2_512x512_adamw_80k_ade20k_20220228_041642.log.json) | +| KNet + UperNet | R-50-D8 | 512x512 | 80000 | 7.34 | 17.11 | V100 | 43.45 | 44.07 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/knet/knet-s3_r50-d8_upernet_8xb2-adamw-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_upernet_r50-d8_8x2_512x512_adamw_80k_ade20k/knet_s3_upernet_r50-d8_8x2_512x512_adamw_80k_ade20k_20220304_125657-215753b0.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_upernet_r50-d8_8x2_512x512_adamw_80k_ade20k/knet_s3_upernet_r50-d8_8x2_512x512_adamw_80k_ade20k_20220304_125657.log.json) | +| KNet + UperNet | Swin-T | 512x512 | 80000 | 7.57 | 15.56 | V100 | 45.84 | 46.27 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/knet/knet-s3_swin-t_upernet_8xb2-adamw-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_upernet_swin-t_8x2_512x512_adamw_80k_ade20k/knet_s3_upernet_swin-t_8x2_512x512_adamw_80k_ade20k_20220303_133059-7545e1dc.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_upernet_swin-t_8x2_512x512_adamw_80k_ade20k/knet_s3_upernet_swin-t_8x2_512x512_adamw_80k_ade20k_20220303_133059.log.json) | +| KNet + UperNet | Swin-L | 512x512 | 80000 | 13.5 | 8.29 | V100 | 52.05 | 53.24 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/knet/knet-s3_swin-l_upernet_8xb2-adamw-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_upernet_swin-l_8x2_512x512_adamw_80k_ade20k/knet_s3_upernet_swin-l_8x2_512x512_adamw_80k_ade20k_20220303_154559-d8da9a90.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_upernet_swin-l_8x2_512x512_adamw_80k_ade20k/knet_s3_upernet_swin-l_8x2_512x512_adamw_80k_ade20k_20220303_154559.log.json) | +| KNet + UperNet | Swin-L | 640x640 | 80000 | 13.54 | 8.29 | V100 | 52.21 | 53.34 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/knet/knet-s3_swin-l_upernet_8xb2-adamw-80k_ade20k-640x640.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_upernet_swin-l_8x2_640x640_adamw_80k_ade20k/knet_s3_upernet_swin-l_8x2_640x640_adamw_80k_ade20k_20220301_220747-8787fc71.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_upernet_swin-l_8x2_640x640_adamw_80k_ade20k/knet_s3_upernet_swin-l_8x2_640x640_adamw_80k_ade20k_20220301_220747.log.json) | Note: - All experiments of K-Net are implemented with 8 V100 (32G) GPUs with 2 samples per GPU.
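The `8xb2` token in the renamed config filenames encodes exactly this setup (8 GPUs x 2 samples per GPU, i.e. an effective batch size of 16). As a minimal sketch of where this lives in the new-style configs, the dataloader settings below mirror the K-Net configs added in this diff:

```python
# Per-GPU batch size in the new-style K-Net configs (as added in this diff);
# with 8 GPUs the effective training batch size is 8 * 2 = 16.
train_dataloader = dict(batch_size=2, num_workers=2)
val_dataloader = dict(batch_size=1, num_workers=4)
test_dataloader = val_dataloader
```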
+ +## Citation + +```bibtex +@inproceedings{zhang2021knet, + title={{K-Net: Towards} Unified Image Segmentation}, + author={Wenwei Zhang and Jiangmiao Pang and Kai Chen and Chen Change Loy}, + year={2021}, + booktitle={NeurIPS}, +} +``` diff --git a/configs/knet/knet_s3_deeplabv3_r50-d8_8x2_512x512_adamw_80k_ade20k.py b/configs/knet/knet-s3_r50-d8_deeplabv3_8xb2-adamw-80k_ade20k-512x512.py similarity index 100% rename from configs/knet/knet_s3_deeplabv3_r50-d8_8x2_512x512_adamw_80k_ade20k.py rename to configs/knet/knet-s3_r50-d8_deeplabv3_8xb2-adamw-80k_ade20k-512x512.py diff --git a/configs/knet/knet_s3_fcn_r50-d8_8x2_512x512_adamw_80k_ade20k.py b/configs/knet/knet-s3_r50-d8_fcn_8xb2-adamw-80k_ade20k-512x512.py similarity index 100% rename from configs/knet/knet_s3_fcn_r50-d8_8x2_512x512_adamw_80k_ade20k.py rename to configs/knet/knet-s3_r50-d8_fcn_8xb2-adamw-80k_ade20k-512x512.py diff --git a/configs/knet/knet_s3_pspnet_r50-d8_8x2_512x512_adamw_80k_ade20k.py b/configs/knet/knet-s3_r50-d8_pspnet_8xb2-adamw-80k_ade20k-512x512.py similarity index 100% rename from configs/knet/knet_s3_pspnet_r50-d8_8x2_512x512_adamw_80k_ade20k.py rename to configs/knet/knet-s3_r50-d8_pspnet_8xb2-adamw-80k_ade20k-512x512.py diff --git a/configs/knet/knet_s3_upernet_r50-d8_8x2_512x512_adamw_80k_ade20k.py b/configs/knet/knet-s3_r50-d8_upernet_8xb2-adamw-80k_ade20k-512x512.py similarity index 100% rename from configs/knet/knet_s3_upernet_r50-d8_8x2_512x512_adamw_80k_ade20k.py rename to configs/knet/knet-s3_r50-d8_upernet_8xb2-adamw-80k_ade20k-512x512.py diff --git a/configs/knet/knet-s3_swin-l_upernet_8xb2-adamw-80k_ade20k-512x512.py b/configs/knet/knet-s3_swin-l_upernet_8xb2-adamw-80k_ade20k-512x512.py new file mode 100644 index 0000000000..c6f4eb6ae2 --- /dev/null +++ b/configs/knet/knet-s3_swin-l_upernet_8xb2-adamw-80k_ade20k-512x512.py @@ -0,0 +1,21 @@ +_base_ = 'knet-s3_swin-t_upernet_8xb2-adamw-80k_ade20k-512x512.py' + +checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_large_patch4_window7_224_22k_20220308-d5bdebaf.pth' # noqa +# model settings +model = dict( + pretrained=checkpoint_file, + backbone=dict( + embed_dims=192, + depths=[2, 2, 18, 2], + num_heads=[6, 12, 24, 48], + window_size=7, + use_abs_pos_embed=False, + drop_path_rate=0.3, + patch_norm=True), + decode_head=dict( + kernel_generate_head=dict(in_channels=[192, 384, 768, 1536])), + auxiliary_head=dict(in_channels=768)) +# In K-Net implementation we use batch size 2 per GPU as default +train_dataloader = dict(batch_size=2, num_workers=2) +val_dataloader = dict(batch_size=1, num_workers=4) +test_dataloader = val_dataloader diff --git a/configs/knet/knet-s3_swin-l_upernet_8xb2-adamw-80k_ade20k-640x640.py b/configs/knet/knet-s3_swin-l_upernet_8xb2-adamw-80k_ade20k-640x640.py new file mode 100644 index 0000000000..84c3d8cc6a --- /dev/null +++ b/configs/knet/knet-s3_swin-l_upernet_8xb2-adamw-80k_ade20k-640x640.py @@ -0,0 +1,55 @@ +_base_ = 'knet-s3_swin-t_upernet_8xb2-adamw-80k_ade20k-512x512.py' + +checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_large_patch4_window7_224_22k_20220308-d5bdebaf.pth' # noqa +# model settings +crop_size = (640, 640) +data_preprocessor = dict( + type='SegDataPreProcessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_val=0, + size=crop_size, + seg_pad_val=255) +model = dict( + data_preprocessor=data_preprocessor, + pretrained=checkpoint_file, + backbone=dict( + embed_dims=192, + depths=[2, 2, 18,
2], + num_heads=[6, 12, 24, 48], + window_size=7, + use_abs_pos_embed=False, + drop_path_rate=0.4, + patch_norm=True), + decode_head=dict( + kernel_generate_head=dict(in_channels=[192, 384, 768, 1536])), + auxiliary_head=dict(in_channels=768)) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', reduce_zero_label=True), + dict( + type='RandomResize', + scale=(2048, 640), + ratio_range=(0.5, 2.0), + keep_ratio=True), + dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(2048, 640), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type='LoadAnnotations', reduce_zero_label=True), + dict(type='PackSegInputs') +] +# In K-Net implementation we use batch size 2 per GPU as default +train_dataloader = dict( + batch_size=2, num_workers=2, dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, num_workers=4, dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader diff --git a/configs/knet/knet-s3_swin-t_upernet_8xb2-adamw-80k_ade20k-512x512.py b/configs/knet/knet-s3_swin-t_upernet_8xb2-adamw-80k_ade20k-512x512.py new file mode 100644 index 0000000000..a7acec4996 --- /dev/null +++ b/configs/knet/knet-s3_swin-t_upernet_8xb2-adamw-80k_ade20k-512x512.py @@ -0,0 +1,64 @@ +_base_ = 'knet-s3_r50-d8_upernet_8xb2-adamw-80k_ade20k-512x512.py' + +checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_tiny_patch4_window7_224_20220308-f41b89d3.pth' # noqa + +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +num_stages = 3 +conv_kernel_size = 1 + +model = dict( + type='EncoderDecoder', + pretrained=checkpoint_file, + backbone=dict( + _delete_=True, + type='SwinTransformer', + embed_dims=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + use_abs_pos_embed=False, + patch_norm=True, + out_indices=(0, 1, 2, 3)), + decode_head=dict( + kernel_generate_head=dict(in_channels=[96, 192, 384, 768])), + auxiliary_head=dict(in_channels=384)) + +optim_wrapper = dict( + _delete_=True, + type='OptimWrapper', + # modify learning rate following the official implementation of Swin Transformer # noqa + optimizer=dict( + type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.0005), + paramwise_cfg=dict( + custom_keys={ + 'absolute_pos_embed': dict(decay_mult=0.), + 'relative_position_bias_table': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.)
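+            # decay_mult=0. disables weight decay for these groups (Swin recipe)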
+ }), + clip_grad=dict(max_norm=1, norm_type=2)) + +# learning policy +param_scheduler = [ + dict( + type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, + end=1000), + dict( + type='MultiStepLR', + begin=1000, + end=80000, + milestones=[60000, 72000], + by_epoch=False, + ) +] +# In K-Net implementation we use batch size 2 per GPU as default +train_dataloader = dict(batch_size=2, num_workers=2) +val_dataloader = dict(batch_size=1, num_workers=4) +test_dataloader = val_dataloader diff --git a/configs/knet/knet.yml b/configs/knet/knet.yml deleted file mode 100644 index 5e2e529557..0000000000 --- a/configs/knet/knet.yml +++ /dev/null @@ -1,169 +0,0 @@ -Collections: -- Name: KNet - Metadata: - Training Data: - - ADE20K - Paper: - URL: https://arxiv.org/abs/2106.14855 - Title: 'K-Net: Towards Unified Image Segmentation' - README: configs/knet/README.md - Code: - URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.23.0/mmseg/models/decode_heads/knet_head.py#L392 - Version: v0.23.0 - Converted From: - Code: https://github.com/ZwwWayne/K-Net/ -Models: -- Name: knet_s3_fcn_r50-d8_8x2_512x512_adamw_80k_ade20k - In Collection: KNet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 51.98 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 7.01 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 43.6 - mIoU(ms+flip): 45.12 - Config: configs/knet/knet_s3_fcn_r50-d8_8x2_512x512_adamw_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_fcn_r50-d8_8x2_512x512_adamw_80k_ade20k/knet_s3_fcn_r50-d8_8x2_512x512_adamw_80k_ade20k_20220228_043751-abcab920.pth -- Name: knet_s3_pspnet_r50-d8_8x2_512x512_adamw_80k_ade20k - In Collection: KNet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 49.9 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 6.98 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 44.18 - mIoU(ms+flip): 45.58 - Config: configs/knet/knet_s3_pspnet_r50-d8_8x2_512x512_adamw_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_pspnet_r50-d8_8x2_512x512_adamw_80k_ade20k/knet_s3_pspnet_r50-d8_8x2_512x512_adamw_80k_ade20k_20220228_054634-d2c72240.pth -- Name: knet_s3_deeplabv3_r50-d8_8x2_512x512_adamw_80k_ade20k - In Collection: KNet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 82.64 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 7.42 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 45.06 - mIoU(ms+flip): 46.11 - Config: configs/knet/knet_s3_deeplabv3_r50-d8_8x2_512x512_adamw_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_deeplabv3_r50-d8_8x2_512x512_adamw_80k_ade20k/knet_s3_deeplabv3_r50-d8_8x2_512x512_adamw_80k_ade20k_20220228_041642-00c8fbeb.pth -- Name: knet_s3_upernet_r50-d8_8x2_512x512_adamw_80k_ade20k - In Collection: KNet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 58.45 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 7.34 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - 
Metrics: - mIoU: 43.45 - mIoU(ms+flip): 44.07 - Config: configs/knet/knet_s3_upernet_r50-d8_8x2_512x512_adamw_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_upernet_r50-d8_8x2_512x512_adamw_80k_ade20k/knet_s3_upernet_r50-d8_8x2_512x512_adamw_80k_ade20k_20220304_125657-215753b0.pth -- Name: knet_s3_upernet_swin-t_8x2_512x512_adamw_80k_ade20k - In Collection: KNet - Metadata: - backbone: Swin-T - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 64.27 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 7.57 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 45.84 - mIoU(ms+flip): 46.27 - Config: configs/knet/knet_s3_upernet_swin-t_8x2_512x512_adamw_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_upernet_swin-t_8x2_512x512_adamw_80k_ade20k/knet_s3_upernet_swin-t_8x2_512x512_adamw_80k_ade20k_20220303_133059-7545e1dc.pth -- Name: knet_s3_upernet_swin-l_8x2_512x512_adamw_80k_ade20k - In Collection: KNet - Metadata: - backbone: Swin-L - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 120.63 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 13.5 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 52.05 - mIoU(ms+flip): 53.24 - Config: configs/knet/knet_s3_upernet_swin-l_8x2_512x512_adamw_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_upernet_swin-l_8x2_512x512_adamw_80k_ade20k/knet_s3_upernet_swin-l_8x2_512x512_adamw_80k_ade20k_20220303_154559-d8da9a90.pth -- Name: knet_s3_upernet_swin-l_8x2_640x640_adamw_80k_ade20k - In Collection: KNet - Metadata: - backbone: Swin-L - crop size: (640,640) - lr schd: 80000 - inference time (ms/im): - - value: 120.63 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (640,640) - Training Memory (GB): 13.54 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 52.21 - mIoU(ms+flip): 53.34 - Config: configs/knet/knet_s3_upernet_swin-l_8x2_640x640_adamw_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_upernet_swin-l_8x2_640x640_adamw_80k_ade20k/knet_s3_upernet_swin-l_8x2_640x640_adamw_80k_ade20k_20220301_220747-8787fc71.pth diff --git a/configs/knet/knet_s3_upernet_swin-l_8x2_512x512_adamw_80k_ade20k.py b/configs/knet/knet_s3_upernet_swin-l_8x2_512x512_adamw_80k_ade20k.py deleted file mode 100644 index c27f56b741..0000000000 --- a/configs/knet/knet_s3_upernet_swin-l_8x2_512x512_adamw_80k_ade20k.py +++ /dev/null @@ -1,21 +0,0 @@ -_base_ = 'knet_s3_upernet_swin-t_8x2_512x512_adamw_80k_ade20k.py' - -checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_large_patch4_window7_224_22k_20220308-d5bdebaf.pth' # noqa -# model settings -model = dict( - pretrained=checkpoint_file, - backbone=dict( - embed_dims=192, - depths=[2, 2, 18, 2], - num_heads=[6, 12, 24, 48], - window_size=7, - use_abs_pos_embed=False, - drop_path_rate=0.3, - patch_norm=True), - decode_head=dict( - kernel_generate_head=dict(in_channels=[192, 384, 768, 1536])), - auxiliary_head=dict(in_channels=768)) -# In K-Net implementation we use batch size 2 per GPU as default -train_dataloader = dict(batch_size=2, num_workers=2) -val_dataloader = dict(batch_size=1, num_workers=4) -test_dataloader = val_dataloader diff --git 
a/configs/knet/knet_s3_upernet_swin-l_8x2_640x640_adamw_80k_ade20k.py b/configs/knet/knet_s3_upernet_swin-l_8x2_640x640_adamw_80k_ade20k.py deleted file mode 100644 index 1dcb1d4860..0000000000 --- a/configs/knet/knet_s3_upernet_swin-l_8x2_640x640_adamw_80k_ade20k.py +++ /dev/null @@ -1,57 +0,0 @@ -_base_ = 'knet_s3_upernet_swin-t_8x2_512x512_adamw_80k_ade20k.py' - -checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_large_patch4_window7_224_22k_20220308-d5bdebaf.pth' # noqa -# model settings -crop_size = (640, 640) -data_preprocessor = dict( - type='SegDataPreProcessor', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - bgr_to_rgb=True, - pad_val=0, - size=crop_size, - seg_pad_val=255) -model = dict( - data_preprocessor=data_preprocessor, - pretrained=checkpoint_file, - backbone=dict( - embed_dims=192, - depths=[2, 2, 18, 2], - num_heads=[6, 12, 24, 48], - window_size=7, - use_abs_pos_embed=False, - drop_path_rate=0.4, - patch_norm=True), - decode_head=dict( - kernel_generate_head=dict(in_channels=[192, 384, 768, 1536])), - auxiliary_head=dict(in_channels=768)) - -crop_size = (640, 640) -train_pipeline = [ - dict(type='LoadImageFromFile'), - dict(type='LoadAnnotations', reduce_zero_label=True), - dict( - type='RandomResize', - scale=(2048, 640), - ratio_range=(0.5, 2.0), - keep_ratio=True), - dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), - dict(type='RandomFlip', prob=0.5), - dict(type='PhotoMetricDistortion'), - dict(type='PackSegInputs') -] -test_pipeline = [ - dict(type='LoadImageFromFile'), - dict(type='Resize', scale=(2048, 640), keep_ratio=True), - # add loading annotation after ``Resize`` because ground truth - # does not need to do resize data transform - dict(type='LoadAnnotations', reduce_zero_label=True), - dict(type='PackSegInputs') -] -train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) -val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) -test_dataloader = val_dataloader -# In K-Net implementation we use batch size 2 per GPU as default -train_dataloader = dict(batch_size=2, num_workers=2) -val_dataloader = dict(batch_size=1, num_workers=4) -test_dataloader = val_dataloader diff --git a/configs/knet/knet_s3_upernet_swin-t_8x2_512x512_adamw_80k_ade20k.py b/configs/knet/knet_s3_upernet_swin-t_8x2_512x512_adamw_80k_ade20k.py deleted file mode 100644 index 78642804b0..0000000000 --- a/configs/knet/knet_s3_upernet_swin-t_8x2_512x512_adamw_80k_ade20k.py +++ /dev/null @@ -1,63 +0,0 @@ -_base_ = 'knet_s3_upernet_r50-d8_8x2_512x512_adamw_80k_ade20k.py' - -checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_tiny_patch4_window7_224_20220308-f41b89d3.pth' # noqa - -# model settings -norm_cfg = dict(type='SyncBN', requires_grad=True) -num_stages = 3 -conv_kernel_size = 1 - -model = dict( - type='EncoderDecoder', - pretrained=checkpoint_file, - backbone=dict( - _delete_=True, - type='SwinTransformer', - embed_dims=96, - depths=[2, 2, 6, 2], - num_heads=[3, 6, 12, 24], - window_size=7, - mlp_ratio=4, - qkv_bias=True, - qk_scale=None, - drop_rate=0., - attn_drop_rate=0., - drop_path_rate=0.3, - use_abs_pos_embed=False, - patch_norm=True, - out_indices=(0, 1, 2, 3)), - decode_head=dict( - kernel_generate_head=dict(in_channels=[96, 192, 384, 768])), - auxiliary_head=dict(in_channels=384)) - -optim_wrapper = dict( - _delete_=True, - type='OptimWrapper', - # modify learning rate following the official implementation of Swin Transformer # noqa - optimizer=dict( - 
type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.0005), - paramwise_cfg=dict( - custom_keys={ - 'absolute_pos_embed': dict(decay_mult=0.), - 'relative_position_bias_table': dict(decay_mult=0.), - 'norm': dict(decay_mult=0.) - }), - clip_grad=dict(max_norm=1, norm_type=2)) - -# learning policy -param_scheduler = [ - dict( - type='LinearLR', start_factor=0.001, by_epoch=False, begin=0, - end=1000), - dict( - type='MultiStepLR', - begin=1000, - end=80000, - milestones=[60000, 72000], - by_epoch=False, - ) -] -# In K-Net implementation we use batch size 2 per GPU as default -train_dataloader = dict(batch_size=2, num_workers=2) -val_dataloader = dict(batch_size=1, num_workers=4) -test_dataloader = val_dataloader diff --git a/configs/knet/metafile.yaml b/configs/knet/metafile.yaml new file mode 100644 index 0000000000..0f4ab79609 --- /dev/null +++ b/configs/knet/metafile.yaml @@ -0,0 +1,188 @@ +Collections: +- Name: KNet + License: Apache License 2.0 + Metadata: + Training Data: + - ADE20K + Paper: + Title: 'K-Net: Towards Unified Image Segmentation' + URL: https://arxiv.org/abs/2106.14855 + README: configs/knet/README.md + Frameworks: + - PyTorch +Models: +- Name: knet-s3_r50-d8_fcn_8xb2-adamw-80k_ade20k-512x512 + In Collection: KNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 43.6 + mIoU(ms+flip): 45.12 + Config: configs/knet/knet-s3_r50-d8_fcn_8xb2-adamw-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D8 + - KNet + - FCN + Training Resources: 8x V100 GPUS + Memory (GB): 7.01 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_fcn_r50-d8_8x2_512x512_adamw_80k_ade20k/knet_s3_fcn_r50-d8_8x2_512x512_adamw_80k_ade20k_20220228_043751-abcab920.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_fcn_r50-d8_8x2_512x512_adamw_80k_ade20k/knet_s3_fcn_r50-d8_8x2_512x512_adamw_80k_ade20k_20220228_043751.log.json + Paper: + Title: 'K-Net: Towards Unified Image Segmentation' + URL: https://arxiv.org/abs/2106.14855 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.23.0/mmseg/models/decode_heads/knet_head.py#L392 + Framework: PyTorch +- Name: knet-s3_r50-d8_pspnet_8xb2-adamw-80k_ade20k-512x512 + In Collection: KNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 44.18 + mIoU(ms+flip): 45.58 + Config: configs/knet/knet-s3_r50-d8_pspnet_8xb2-adamw-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D8 + - KNet + - PSPNet + Training Resources: 8x V100 GPUS + Memory (GB): 6.98 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_pspnet_r50-d8_8x2_512x512_adamw_80k_ade20k/knet_s3_pspnet_r50-d8_8x2_512x512_adamw_80k_ade20k_20220228_054634-d2c72240.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_pspnet_r50-d8_8x2_512x512_adamw_80k_ade20k/knet_s3_pspnet_r50-d8_8x2_512x512_adamw_80k_ade20k_20220228_054634.log.json + Paper: + Title: 'K-Net: Towards Unified Image Segmentation' + URL: https://arxiv.org/abs/2106.14855 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.23.0/mmseg/models/decode_heads/knet_head.py#L392 + Framework: PyTorch +- Name: knet-s3_r50-d8_deeplabv3_8xb2-adamw-80k_ade20k-512x512 + In Collection: KNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 45.06 + mIoU(ms+flip): 46.11 + Config: configs/knet/knet-s3_r50-d8_deeplabv3_8xb2-adamw-80k_ade20k-512x512.py + Metadata: + 
Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D8 + - KNet + - DeepLabV3 + Training Resources: 8x V100 GPUS + Memory (GB): 7.42 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_deeplabv3_r50-d8_8x2_512x512_adamw_80k_ade20k/knet_s3_deeplabv3_r50-d8_8x2_512x512_adamw_80k_ade20k_20220228_041642-00c8fbeb.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_deeplabv3_r50-d8_8x2_512x512_adamw_80k_ade20k/knet_s3_deeplabv3_r50-d8_8x2_512x512_adamw_80k_ade20k_20220228_041642.log.json + Paper: + Title: 'K-Net: Towards Unified Image Segmentation' + URL: https://arxiv.org/abs/2106.14855 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.23.0/mmseg/models/decode_heads/knet_head.py#L392 + Framework: PyTorch +- Name: knet-s3_r50-d8_upernet_8xb2-adamw-80k_ade20k-512x512 + In Collection: KNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 43.45 + mIoU(ms+flip): 44.07 + Config: configs/knet/knet-s3_r50-d8_upernet_8xb2-adamw-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D8 + - KNet + - UperNet + Training Resources: 8x V100 GPUS + Memory (GB): 7.34 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_upernet_r50-d8_8x2_512x512_adamw_80k_ade20k/knet_s3_upernet_r50-d8_8x2_512x512_adamw_80k_ade20k_20220304_125657-215753b0.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_upernet_r50-d8_8x2_512x512_adamw_80k_ade20k/knet_s3_upernet_r50-d8_8x2_512x512_adamw_80k_ade20k_20220304_125657.log.json + Paper: + Title: 'K-Net: Towards Unified Image Segmentation' + URL: https://arxiv.org/abs/2106.14855 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.23.0/mmseg/models/decode_heads/knet_head.py#L392 + Framework: PyTorch +- Name: knet-s3_swin-t_upernet_8xb2-adamw-80k_ade20k-512x512 + In Collection: KNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 45.84 + mIoU(ms+flip): 46.27 + Config: configs/knet/knet-s3_swin-t_upernet_8xb2-adamw-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - Swin-T + - KNet + - UperNet + Training Resources: 8x V100 GPUS + Memory (GB): 7.57 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_upernet_swin-t_8x2_512x512_adamw_80k_ade20k/knet_s3_upernet_swin-t_8x2_512x512_adamw_80k_ade20k_20220303_133059-7545e1dc.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_upernet_swin-t_8x2_512x512_adamw_80k_ade20k/knet_s3_upernet_swin-t_8x2_512x512_adamw_80k_ade20k_20220303_133059.log.json + Paper: + Title: 'K-Net: Towards Unified Image Segmentation' + URL: https://arxiv.org/abs/2106.14855 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.23.0/mmseg/models/decode_heads/knet_head.py#L392 + Framework: PyTorch +- Name: knet-s3_swin-l_upernet_8xb2-adamw-80k_ade20k-512x512 + In Collection: KNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 52.05 + mIoU(ms+flip): 53.24 + Config: configs/knet/knet-s3_swin-l_upernet_8xb2-adamw-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - Swin-L + - KNet + - UperNet + Training Resources: 8x V100 GPUS + Memory (GB): 13.5 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_upernet_swin-l_8x2_512x512_adamw_80k_ade20k/knet_s3_upernet_swin-l_8x2_512x512_adamw_80k_ade20k_20220303_154559-d8da9a90.pth + Training log: 
https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_upernet_swin-l_8x2_512x512_adamw_80k_ade20k/knet_s3_upernet_swin-l_8x2_512x512_adamw_80k_ade20k_20220303_154559.log.json + Paper: + Title: 'K-Net: Towards Unified Image Segmentation' + URL: https://arxiv.org/abs/2106.14855 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.23.0/mmseg/models/decode_heads/knet_head.py#L392 + Framework: PyTorch +- Name: knet-s3_swin-l_upernet_8xb2-adamw-80k_ade20k-640x640 + In Collection: KNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 52.21 + mIoU(ms+flip): 53.34 + Config: configs/knet/knet-s3_swin-l_upernet_8xb2-adamw-80k_ade20k-640x640.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - Swin-L + - KNet + - UperNet + Training Resources: 8x V100 GPUS + Memory (GB): 13.54 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_upernet_swin-l_8x2_640x640_adamw_80k_ade20k/knet_s3_upernet_swin-l_8x2_640x640_adamw_80k_ade20k_20220301_220747-8787fc71.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/knet/knet_s3_upernet_swin-l_8x2_640x640_adamw_80k_ade20k/knet_s3_upernet_swin-l_8x2_640x640_adamw_80k_ade20k_20220301_220747.log.json + Paper: + Title: 'K-Net: Towards Unified Image Segmentation' + URL: https://arxiv.org/abs/2106.14855 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.23.0/mmseg/models/decode_heads/knet_head.py#L392 + Framework: PyTorch diff --git a/configs/mae/README.md b/configs/mae/README.md index 562f6f8bf0..d14e3830be 100644 --- a/configs/mae/README.md +++ b/configs/mae/README.md @@ -1,6 +1,6 @@ # MAE -[Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) +> [Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) ## Introduction @@ -22,17 +22,6 @@ This paper shows that masked autoencoders (MAE) are scalable self-supervised lea -## Citation - -```bibtex -@article{he2021masked, - title={Masked autoencoders are scalable vision learners}, - author={He, Kaiming and Chen, Xinlei and Xie, Saining and Li, Yanghao and Doll{\'a}r, Piotr and Girshick, Ross}, - journal={arXiv preprint arXiv:2111.06377}, - year={2021} -} -``` - ## Usage To use other repositories' pre-trained models, it is necessary to convert keys. 
@@ -77,6 +66,17 @@ upernet_mae-base_fp16_8x2_512x512_160k_ade20k_20220426_174752-f92a2975.pth $GPUS ### ADE20K -| Method | Backbone | Crop Size | pretrain | pretrain img size | Batch Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------- | -------- | --------- | ----------- | ----------------- | ---------- | ------- | -------- | -------------- | ----- | ------------: | ------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| UPerNet | ViT-B | 512x512 | ImageNet-1K | 224x224 | 16 | 160000 | 9.96 | 7.14 | 48.13 | 48.70 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/mae/upernet_mae-base_fp16_8x2_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mae/upernet_mae-base_fp16_8x2_512x512_160k_ade20k/upernet_mae-base_fp16_8x2_512x512_160k_ade20k_20220426_174752-f92a2975.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mae/upernet_mae-base_fp16_8x2_512x512_160k_ade20k/upernet_mae-base_fp16_8x2_512x512_160k_ade20k_20220426_174752.log.json) | +| Method | Backbone | Crop Size | pretrain | pretrain img size | Batch Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------- | -------- | --------- | ----------- | ----------------- | ---------- | ------- | -------- | -------------- | ------ | ----- | ------------: | ----------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| UPerNet | ViT-B | 512x512 | ImageNet-1K | 224x224 | 16 | 160000 | 9.96 | 7.14 | V100 | 48.13 | 48.70 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/mae/mae-base_upernet_8xb2-amp-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mae/upernet_mae-base_fp16_8x2_512x512_160k_ade20k/upernet_mae-base_fp16_8x2_512x512_160k_ade20k_20220426_174752-f92a2975.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mae/upernet_mae-base_fp16_8x2_512x512_160k_ade20k/upernet_mae-base_fp16_8x2_512x512_160k_ade20k_20220426_174752.log.json) | + +## Citation + +```bibtex +@article{he2021masked, + title={Masked autoencoders are scalable vision learners}, + author={He, Kaiming and Chen, Xinlei and Xie, Saining and Li, Yanghao and Doll{\'a}r, Piotr and Girshick, Ross}, + journal={arXiv preprint arXiv:2111.06377}, + year={2021} +} +``` diff --git a/configs/mae/mae-base_upernet_8xb2-amp-160k_ade20k-512x512-ms.py b/configs/mae/mae-base_upernet_8xb2-amp-160k_ade20k-512x512-ms.py new file mode 100644 index 0000000000..ec32fea54b --- /dev/null +++ b/configs/mae/mae-base_upernet_8xb2-amp-160k_ade20k-512x512-ms.py @@ -0,0 +1,16 @@ +_base_ = 
'./mae-base_upernet_8xb2-amp-160k_ade20k-512x512.py' + +test_pipeline = [ + dict(type='LoadImageFromFile'), + # TODO: Refactor 'MultiScaleFlipAug' which supports + # `min_size` feature in `Resize` class + # img_ratios is [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] + # original image scale is (2048, 512) + dict(type='Resize', scale=(2048, 512), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type='LoadAnnotations', reduce_zero_label=True), + dict(type='PackSegInputs') +] +val_dataloader = dict(batch_size=1, dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader diff --git a/configs/mae/upernet_mae-base_fp16_8x2_512x512_160k_ade20k.py b/configs/mae/mae-base_upernet_8xb2-amp-160k_ade20k-512x512.py similarity index 100% rename from configs/mae/upernet_mae-base_fp16_8x2_512x512_160k_ade20k.py rename to configs/mae/mae-base_upernet_8xb2-amp-160k_ade20k-512x512.py diff --git a/configs/mae/mae.yml b/configs/mae/mae.yml deleted file mode 100644 index d78f99c86c..0000000000 --- a/configs/mae/mae.yml +++ /dev/null @@ -1,23 +0,0 @@ -Models: -- Name: upernet_mae-base_fp16_8x2_512x512_160k_ade20k - In Collection: UPerNet - Metadata: - backbone: ViT-B - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 140.06 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP16 - resolution: (512,512) - Training Memory (GB): 9.96 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 48.13 - mIoU(ms+flip): 48.7 - Config: configs/mae/upernet_mae-base_fp16_8x2_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mae/upernet_mae-base_fp16_8x2_512x512_160k_ade20k/upernet_mae-base_fp16_8x2_512x512_160k_ade20k_20220426_174752-f92a2975.pth diff --git a/configs/mae/metafile.yaml b/configs/mae/metafile.yaml new file mode 100644 index 0000000000..567eafe131 --- /dev/null +++ b/configs/mae/metafile.yaml @@ -0,0 +1,25 @@ +Models: +- Name: mae-base_upernet_8xb2-amp-160k_ade20k-512x512 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 48.13 + mIoU(ms+flip): 48.7 + Config: configs/mae/mae-base_upernet_8xb2-amp-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - ViT-B + - UPerNet + Training Resources: 8x V100 GPUS + Memory (GB): 9.96 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mae/upernet_mae-base_fp16_8x2_512x512_160k_ade20k/upernet_mae-base_fp16_8x2_512x512_160k_ade20k_20220426_174752-f92a2975.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/mae/upernet_mae-base_fp16_8x2_512x512_160k_ade20k/upernet_mae-base_fp16_8x2_512x512_160k_ade20k_20220426_174752.log.json + Paper: + Title: Masked Autoencoders Are Scalable Vision Learners + URL: https://arxiv.org/abs/2111.06377 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.24.0/mmseg/models/backbones/mae.py#L46 + Framework: PyTorch diff --git a/configs/mae/upernet_mae-base_fp16_512x512_160k_ade20k_ms.py b/configs/mae/upernet_mae-base_fp16_512x512_160k_ade20k_ms.py deleted file mode 100644 index 81b913f6fd..0000000000 --- a/configs/mae/upernet_mae-base_fp16_512x512_160k_ade20k_ms.py +++ /dev/null @@ -1,16 +0,0 @@ -_base_ = './upernet_mae-base_fp16_8x2_512x512_160k_ade20k.py' - -test_pipeline = [ - dict(type='LoadImageFromFile'), - # TODO: Refactor 'MultiScaleFlipAug' which supports - # `min_size` feature in `Resize` class - # img_ratios is [0.5, 0.75, 1.0, 1.25, 
1.5, 1.75] - # original image scale is (2048, 512) - dict(type='Resize', scale=(2048, 512), keep_ratio=True), - # add loading annotation after ``Resize`` because ground truth - # does not need to do resize data transform - dict(type='LoadAnnotations', reduce_zero_label=True), - dict(type='PackSegInputs') -] -val_dataloader = dict(batch_size=1, dataset=dict(pipeline=test_pipeline)) -test_dataloader = val_dataloader diff --git a/configs/mask2former/README.md b/configs/mask2former/README.md new file mode 100644 index 0000000000..c21ab0d0c6 --- /dev/null +++ b/configs/mask2former/README.md @@ -0,0 +1,74 @@ +# Mask2Former + +> [Masked-attention Mask Transformer for Universal Image Segmentation](https://arxiv.org/abs/2112.01527) + +## Introduction + + + +Official Repo + +Code Snippet + +## Abstract + + + +Image segmentation is about grouping pixels with different semantics, e.g., category or instance membership, where each choice of semantics defines a task. While only the semantics of each task differ, current research focuses on designing specialized architectures for each task. We present Masked-attention Mask Transformer (Mask2Former), a new architecture capable of addressing any image segmentation task (panoptic, instance or semantic). Its key components include masked attention, which extracts localized features by constraining cross-attention within predicted mask regions. In addition to reducing the research effort by at least three times, it outperforms the best specialized architectures by a significant margin on four popular datasets. Most notably, Mask2Former sets a new state-of-the-art for panoptic segmentation (57.8 PQ on COCO), instance segmentation (50.1 AP on COCO) and semantic segmentation (57.7 mIoU on ADE20K). + +## Usage + +- Mask2Former requires [MMDetection](https://github.com/open-mmlab/mmdetection) to be installed first:
+ +```shell +pip install "mmdet>=3.0.0rc4" +``` + +## Results and models + +### Cityscapes + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ----------- | -------------- | --------- | ------- | -------: | -------------- | ------ | ----- | ------------: | ------------------------------------------------------------------------------: | -------------------------------------------------------------------------------- | +| Mask2Former | R-50-D32 | 512x1024 | 90000 | 5.67 | 9.17 | A100 | 80.44 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024/mask2former_r50_8xb2-90k_cityscapes-512x1024_20221202_140802-ffd9d750.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024/mask2former_r50_8xb2-90k_cityscapes-512x1024_20221202_140802.json) | +| Mask2Former | R-101-D32 | 512x1024 | 90000 | 6.81 | 7.11 | A100 | 80.80 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024/mask2former_r101_8xb2-90k_cityscapes-512x1024_20221130_031628-43e68666.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024/mask2former_r101_8xb2-90k_cityscapes-512x1024_20221130_031628.json) | +| Mask2Former | Swin-T | 512x1024 | 90000 | 6.36 | 7.18 | A100 | 81.71 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024/mask2former_swin-t_8xb2-90k_cityscapes-512x1024_20221127_144501-36c59341.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024/mask2former_swin-t_8xb2-90k_cityscapes-512x1024_20221127_144501.json) | +| Mask2Former | Swin-S | 512x1024 | 90000 | 8.09 | 5.57 | A100 | 82.57 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024/mask2former_swin-s_8xb2-90k_cityscapes-512x1024_20221127_143802-9ab177f6.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024/mask2former_swin-s_8xb2-90k_cityscapes-512x1024_20221127_143802.json) | +| Mask2Former | Swin-B (in22k) | 512x1024 | 90000 | 10.89 | 4.32 | A100 | 83.52 | - |
[config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221203_045030-9a86a225.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221203_045030.json)) | +| Mask2Former | Swin-L (in22k) | 512x1024 | 90000 | 15.83 | 2.86 | A100 | 83.65 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221202_141901-28ad20f1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221202_141901.json)) | + +### ADE20K + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ----------- | -------------- | --------- | ------- | -------: | -------------- | ------ | ----- | ------------: | ----------------------------------------------------------------------------------------------------------------------------------------------------: | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Mask2Former | R-50-D32 | 512x512 | 160000 | 3.31 | 26.59 | A100 | 47.87 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512/mask2former_r50_8xb2-160k_ade20k-512x512_20221204_000055-2d1f55f1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512/mask2former_r50_8xb2-160k_ade20k-512x512_20221204_000055.json)) | +| Mask2Former | R-101-D32 | 512x512 | 160000 | 4.09 | 22.97 | A100 | 48.60 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512/mask2former_r101_8xb2-160k_ade20k-512x512_20221203_233905-b7135890.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512/mask2former_r101_8xb2-160k_ade20k-512x512_20221203_233905.json)) | +| Mask2Former | Swin-T | 512x512 | 160000 | 3826 | 23.82 | A100 | 48.66 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512/mask2former_swin-t_8xb2-160k_ade20k-512x512_20221203_234230-7d64e5dd.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512/mask2former_swin-t_8xb2-160k_ade20k-512x512_20221203_234230.json) |
+| Mask2Former | Swin-S | 512x512 | 160000 | 3.74 | 19.69 | A100 | 51.24 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512/mask2former_swin-s_8xb2-160k_ade20k-512x512_20221204_143905-e715144e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512/mask2former_swin-s_8xb2-160k_ade20k-512x512_20221204_143905.json) |
+| Mask2Former | Swin-B | 640x640 | 160000 | 5.66 | 12.48 | A100 | 52.44 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640_20221129_125118-a4a086d2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640_20221129_125118.json) |
+| Mask2Former | Swin-B (in22k) | 640x640 | 160000 | 5.66 | 12.43 | A100 | 53.90 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235230-7ec0f569.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235230.json) |
+| Mask2Former | Swin-L (in22k) | 640x640 | 160000 | 8.86 | 8.81 | A100 | 56.01 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235933-7120c214.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235933.json) |
+
+Note:
+
+- All experiments of Mask2Former are implemented with 8 A100 GPUs with 2 samples per GPU.
+- As mentioned in [the official repo](https://github.com/facebookresearch/Mask2Former/issues/5), the results of Mask2Former are relatively unstable; the Mask2Former (Swin-S) result on the ADE20K dataset in the table is the median of five training runs, following the authors' suggestion.
+- The ResNet backbones utilized in Mask2Former models are standard `ResNet` rather than `ResNetV1c`.
+- Test time augmentation is not supported in MMSegmentation 1.x version yet, we would add "ms+flip" results as soon as possible. + +## Citation + +```bibtex +@inproceedings{cheng2021mask2former, + title={Masked-attention Mask Transformer for Universal Image Segmentation}, + author={Bowen Cheng and Ishan Misra and Alexander G. Schwing and Alexander Kirillov and Rohit Girdhar}, + journal={CVPR}, + year={2022} +} +@inproceedings{cheng2021maskformer, + title={Per-Pixel Classification is Not All You Need for Semantic Segmentation}, + author={Bowen Cheng and Alexander G. Schwing and Alexander Kirillov}, + journal={NeurIPS}, + year={2021} +} +``` diff --git a/configs/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512.py b/configs/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512.py new file mode 100644 index 0000000000..48f6c12d13 --- /dev/null +++ b/configs/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,7 @@ +_base_ = ['./mask2former_r50_8xb2-160k_ade20k-512x512.py'] + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024.py b/configs/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024.py new file mode 100644 index 0000000000..275a7dab52 --- /dev/null +++ b/configs/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024.py @@ -0,0 +1,7 @@ +_base_ = ['./mask2former_r50_8xb2-90k_cityscapes-512x1024.py'] + +model = dict( + backbone=dict( + depth=101, + init_cfg=dict(type='Pretrained', + checkpoint='torchvision://resnet101'))) diff --git a/configs/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512.py b/configs/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512.py new file mode 100644 index 0000000000..78cf60510c --- /dev/null +++ b/configs/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,200 @@ +_base_ = ['../_base_/default_runtime.py', '../_base_/datasets/ade20k.py'] + +custom_imports = dict(imports='mmdet.models', allow_failed_imports=False) + +crop_size = (512, 512) +data_preprocessor = dict( + type='SegDataPreProcessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + size=crop_size, + test_cfg=dict(size_divisor=32)) +num_classes = 150 +model = dict( + type='EncoderDecoder', + data_preprocessor=data_preprocessor, + backbone=dict( + type='ResNet', + depth=50, + deep_stem=False, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type='SyncBN', requires_grad=False), + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + decode_head=dict( + type='Mask2FormerHead', + in_channels=[256, 512, 1024, 2048], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_transformer_feat_level=3, + align_corners=False, + pixel_decoder=dict( + type='mmdet.MSDeformAttnPixelDecoder', + num_outs=3, + norm_cfg=dict(type='GN', num_groups=32), + act_cfg=dict(type='ReLU'), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=True, + norm_cfg=None, + init_cfg=None), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type='ReLU', 
inplace=True))), + init_cfg=None), + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True), + init_cfg=None), + enforce_decoder_input_project=False, + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True), + transformer_decoder=dict( # Mask2FormerTransformerDecoder + return_intermediate=True, + num_layers=9, + layer_cfg=dict( # Mask2FormerTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type='ReLU', inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True)), + init_cfg=None), + loss_cls=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=False, + loss_weight=2.0, + reduction='mean', + class_weight=[1.0] * num_classes + [0.1]), + loss_mask=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=5.0), + loss_dice=dict( + type='mmdet.DiceLoss', + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + eps=1.0, + loss_weight=5.0), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type='mmdet.HungarianAssigner', + match_costs=[ + dict(type='mmdet.ClassificationCost', weight=2.0), + dict( + type='mmdet.CrossEntropyLossCost', + weight=5.0, + use_sigmoid=True), + dict( + type='mmdet.DiceCost', + weight=5.0, + pred_act=True, + eps=1.0) + ]), + sampler=dict(type='mmdet.MaskPseudoSampler'))), + train_cfg=dict(), + test_cfg=dict(mode='whole')) + +# dataset config +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', reduce_zero_label=True), + dict( + type='RandomChoiceResize', + scales=[int(512 * x * 0.1) for x in range(5, 21)], + resize_type='ResizeShortestEdge', + max_size=2048), + dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +train_dataloader = dict(batch_size=2, dataset=dict(pipeline=train_pipeline)) + +# optimizer +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optimizer = dict( + type='AdamW', lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999)) +optim_wrapper = dict( + type='OptimWrapper', + optimizer=optimizer, + clip_grad=dict(max_norm=0.01, norm_type=2), + paramwise_cfg=dict( + custom_keys={ + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi, + }, + norm_decay_mult=0.0)) +# learning policy +param_scheduler = [ + dict( + type='PolyLR', + eta_min=0, + power=0.9, + begin=0, + end=160000, + by_epoch=False) +] + +# training schedule for 160k +train_cfg = dict( + type='IterBasedTrainLoop', max_iters=160000, val_interval=5000) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict( + type='CheckpointHook', by_epoch=False, interval=5000, + save_best='mIoU'), + sampler_seed=dict(type='DistSamplerSeedHook'), + visualization=dict(type='SegVisualizationHook')) + +# Default setting for 
scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). +auto_scale_lr = dict(enable=False, base_batch_size=16) diff --git a/configs/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024.py b/configs/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024.py new file mode 100644 index 0000000000..d2211b66a3 --- /dev/null +++ b/configs/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024.py @@ -0,0 +1,197 @@ +_base_ = ['../_base_/default_runtime.py', '../_base_/datasets/cityscapes.py'] + +crop_size = (512, 1024) +data_preprocessor = dict( + type='SegDataPreProcessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + size=crop_size, + test_cfg=dict(size_divisor=32)) +num_classes = 19 +model = dict( + type='EncoderDecoder', + data_preprocessor=data_preprocessor, + backbone=dict( + type='ResNet', + depth=50, + deep_stem=False, + num_stages=4, + out_indices=(0, 1, 2, 3), + frozen_stages=-1, + norm_cfg=dict(type='SyncBN', requires_grad=False), + style='pytorch', + init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')), + decode_head=dict( + type='Mask2FormerHead', + in_channels=[256, 512, 1024, 2048], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_transformer_feat_level=3, + align_corners=False, + pixel_decoder=dict( + type='mmdet.MSDeformAttnPixelDecoder', + num_outs=3, + norm_cfg=dict(type='GN', num_groups=32), + act_cfg=dict(type='ReLU'), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=True, + norm_cfg=None, + init_cfg=None), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type='ReLU', inplace=True))), + init_cfg=None), + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True), + init_cfg=None), + enforce_decoder_input_project=False, + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True), + transformer_decoder=dict( # Mask2FormerTransformerDecoder + return_intermediate=True, + num_layers=9, + layer_cfg=dict( # Mask2FormerTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type='ReLU', inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True)), + init_cfg=None), + loss_cls=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=False, + loss_weight=2.0, + reduction='mean', + class_weight=[1.0] * num_classes + [0.1]), + loss_mask=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=5.0), + loss_dice=dict( + type='mmdet.DiceLoss', + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + eps=1.0, + loss_weight=5.0), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + 
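# Hungarian matching: each of the 100 queries is matched one-to-one with ground-truth masks by minimizing the weighted sum of the classification, mask cross-entropy and Dice costs listed below. +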
type='mmdet.HungarianAssigner', + match_costs=[ + dict(type='mmdet.ClassificationCost', weight=2.0), + dict( + type='mmdet.CrossEntropyLossCost', + weight=5.0, + use_sigmoid=True), + dict( + type='mmdet.DiceCost', + weight=5.0, + pred_act=True, + eps=1.0) + ]), + sampler=dict(type='mmdet.MaskPseudoSampler'))), + train_cfg=dict(), + test_cfg=dict(mode='whole')) + +# dataset config +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict( + type='RandomChoiceResize', + scales=[int(1024 * x * 0.1) for x in range(5, 21)], + resize_type='ResizeShortestEdge', + max_size=4096), + dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) + +# optimizer +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +optimizer = dict( + type='AdamW', lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999)) +optim_wrapper = dict( + type='OptimWrapper', + optimizer=optimizer, + clip_grad=dict(max_norm=0.01, norm_type=2), + paramwise_cfg=dict( + custom_keys={ + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi, + }, + norm_decay_mult=0.0)) +# learning policy +param_scheduler = [ + dict( + type='PolyLR', + eta_min=0, + power=0.9, + begin=0, + end=90000, + by_epoch=False) +] + +# training schedule for 90k +train_cfg = dict(type='IterBasedTrainLoop', max_iters=90000, val_interval=5000) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict( + type='CheckpointHook', by_epoch=False, interval=5000, + save_best='mIoU'), + sampler_seed=dict(type='DistSamplerSeedHook'), + visualization=dict(type='SegVisualizationHook')) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). 
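+# Set enable=True to scale the learning rate linearly by (actual total batch size) / base_batch_size when training with a different GPU count or batch size.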
+auto_scale_lr = dict(enable=False, base_batch_size=16) diff --git a/configs/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640.py b/configs/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640.py new file mode 100644 index 0000000000..b8b1d6cfff --- /dev/null +++ b/configs/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640.py @@ -0,0 +1,229 @@ +_base_ = [ + '../_base_/default_runtime.py', '../_base_/datasets/ade20k_640x640.py' +] + +pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_base_patch4_window12_384_20220317-55b0104a.pth' # noqa + +crop_size = (640, 640) +data_preprocessor = dict( + type='SegDataPreProcessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + size=crop_size) +num_classes = 150 + +depths = [2, 2, 18, 2] +model = dict( + type='EncoderDecoder', + data_preprocessor=data_preprocessor, + backbone=dict( + type='SwinTransformer', + pretrain_img_size=384, + embed_dims=128, + depths=depths, + num_heads=[4, 8, 16, 32], + window_size=12, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + patch_norm=True, + out_indices=(0, 1, 2, 3), + with_cp=False, + frozen_stages=-1, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + decode_head=dict( + type='Mask2FormerHead', + in_channels=[128, 256, 512, 1024], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_transformer_feat_level=3, + align_corners=False, + pixel_decoder=dict( + type='mmdet.MSDeformAttnPixelDecoder', + num_outs=3, + norm_cfg=dict(type='GN', num_groups=32), + act_cfg=dict(type='ReLU'), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=True, + norm_cfg=None, + init_cfg=None), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type='ReLU', inplace=True))), + init_cfg=None), + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True), + init_cfg=None), + enforce_decoder_input_project=False, + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True), + transformer_decoder=dict( # Mask2FormerTransformerDecoder + return_intermediate=True, + num_layers=9, + layer_cfg=dict( # Mask2FormerTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type='ReLU', inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True)), + init_cfg=None), + loss_cls=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=False, + loss_weight=2.0, + reduction='mean', + class_weight=[1.0] * num_classes + [0.1]), + loss_mask=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=5.0), + loss_dice=dict( + type='mmdet.DiceLoss', + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + 
eps=1.0, + loss_weight=5.0), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type='mmdet.HungarianAssigner', + match_costs=[ + dict(type='mmdet.ClassificationCost', weight=2.0), + dict( + type='mmdet.CrossEntropyLossCost', + weight=5.0, + use_sigmoid=True), + dict( + type='mmdet.DiceCost', + weight=5.0, + pred_act=True, + eps=1.0) + ]), + sampler=dict(type='mmdet.MaskPseudoSampler'))), + train_cfg=dict(), + test_cfg=dict(mode='whole')) + +# dataset config +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', reduce_zero_label=True), + dict( + type='RandomChoiceResize', + scales=[int(x * 0.1 * 640) for x in range(5, 21)], + resize_type='ResizeShortestEdge', + max_size=2560), + dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +train_dataloader = dict(batch_size=2, dataset=dict(pipeline=train_pipeline)) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embeding, +# query_embeding, level_embeding to decay_multi=0.0 +backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'absolute_pos_embed': backbone_embed_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi +} +custom_keys.update({ + f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) +# optimizer +optimizer = dict( + type='AdamW', lr=0.0001, weight_decay=0.05, eps=1e-8, betas=(0.9, 0.999)) +optim_wrapper = dict( + type='OptimWrapper', + optimizer=optimizer, + clip_grad=dict(max_norm=0.01, norm_type=2), + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) + +# learning policy +param_scheduler = [ + dict( + type='PolyLR', + eta_min=0, + power=0.9, + begin=0, + end=160000, + by_epoch=False) +] + +# training schedule for 160k +train_cfg = dict( + type='IterBasedTrainLoop', max_iters=160000, val_interval=5000) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict( + type='CheckpointHook', by_epoch=False, interval=5000, + save_best='mIoU'), + sampler_seed=dict(type='DistSamplerSeedHook'), + visualization=dict(type='SegVisualizationHook')) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (2 samples per GPU). 
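+# e.g. with enable=True, training on 4 GPUs x 2 samples per GPU (total batch size 8) would scale the AdamW LR to 0.0001 * 8 / 16 = 5e-5.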
+auto_scale_lr = dict(enable=False, base_batch_size=16) diff --git a/configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py b/configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py new file mode 100644 index 0000000000..f39a3c5906 --- /dev/null +++ b/configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py @@ -0,0 +1,5 @@ +_base_ = ['./mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640.py'] + +pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_base_patch4_window12_384_22k_20220317-e5c09f74.pth' # noqa +model = dict( + backbone=dict(init_cfg=dict(type='Pretrained', checkpoint=pretrained))) diff --git a/configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py b/configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py new file mode 100644 index 0000000000..0c229c145d --- /dev/null +++ b/configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py @@ -0,0 +1,42 @@ +_base_ = ['./mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py'] +pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_base_patch4_window12_384_22k_20220317-e5c09f74.pth' # noqa + +depths = [2, 2, 18, 2] +model = dict( + backbone=dict( + pretrain_img_size=384, + embed_dims=128, + depths=depths, + num_heads=[4, 8, 16, 32], + window_size=12, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + decode_head=dict(in_channels=[128, 256, 512, 1024])) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embeding, +# query_embeding, level_embeding to decay_multi=0.0 +backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'absolute_pos_embed': backbone_embed_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi +} +custom_keys.update({ + f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) +# optimizer +optim_wrapper = dict( + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git a/configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py b/configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py new file mode 100644 index 0000000000..f2657e8842 --- /dev/null +++ b/configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py @@ -0,0 +1,9 @@ +_base_ = ['./mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640.py'] +pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_large_patch4_window12_384_22k_20220412-6580f57d.pth' # noqa + +model = dict( + backbone=dict( + embed_dims=192, + num_heads=[6, 12, 24, 48], + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + decode_head=dict(num_queries=100, in_channels=[192, 384, 768, 1536])) diff --git 
a/configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py b/configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py new file mode 100644 index 0000000000..01a7b9988f --- /dev/null +++ b/configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py @@ -0,0 +1,42 @@ +_base_ = ['./mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py'] +pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_large_patch4_window12_384_22k_20220412-6580f57d.pth' # noqa + +depths = [2, 2, 18, 2] +model = dict( + backbone=dict( + pretrain_img_size=384, + embed_dims=192, + depths=depths, + num_heads=[6, 12, 24, 48], + window_size=12, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + decode_head=dict(in_channels=[192, 384, 768, 1536])) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embeding, +# query_embeding, level_embeding to decay_multi=0.0 +backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'absolute_pos_embed': backbone_embed_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi +} +custom_keys.update({ + f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) +# optimizer +optim_wrapper = dict( + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git a/configs/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512.py b/configs/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512.py new file mode 100644 index 0000000000..a7796d5693 --- /dev/null +++ b/configs/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,37 @@ +_base_ = ['./mask2former_swin-t_8xb2-160k_ade20k-512x512.py'] +pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_small_patch4_window7_224_20220317-7ba6d6dd.pth' # noqa + +depths = [2, 2, 18, 2] +model = dict( + backbone=dict( + depths=depths, init_cfg=dict(type='Pretrained', + checkpoint=pretrained))) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embeding, +# query_embeding, level_embeding to decay_multi=0.0 +backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'absolute_pos_embed': backbone_embed_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi +} +custom_keys.update({ + f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) 
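+# Note: the final Swin stage has no downsample layer, hence range(len(depths) - 1) above.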
+# optimizer +optim_wrapper = dict( + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git a/configs/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024.py b/configs/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024.py new file mode 100644 index 0000000000..5f75544b1a --- /dev/null +++ b/configs/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024.py @@ -0,0 +1,37 @@ +_base_ = ['./mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py'] +pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_small_patch4_window7_224_20220317-7ba6d6dd.pth' # noqa + +depths = [2, 2, 18, 2] +model = dict( + backbone=dict( + depths=depths, init_cfg=dict(type='Pretrained', + checkpoint=pretrained))) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embeding, +# query_embeding, level_embeding to decay_multi=0.0 +backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'absolute_pos_embed': backbone_embed_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi +} +custom_keys.update({ + f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) +# optimizer +optim_wrapper = dict( + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git a/configs/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512.py b/configs/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512.py new file mode 100644 index 0000000000..9de3d242eb --- /dev/null +++ b/configs/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,52 @@ +_base_ = ['./mask2former_r50_8xb2-160k_ade20k-512x512.py'] +pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_tiny_patch4_window7_224_20220317-1cdeb081.pth' # noqa +depths = [2, 2, 6, 2] +model = dict( + backbone=dict( + _delete_=True, + type='SwinTransformer', + embed_dims=96, + depths=depths, + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + patch_norm=True, + out_indices=(0, 1, 2, 3), + with_cp=False, + frozen_stages=-1, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + decode_head=dict(in_channels=[96, 192, 384, 768])) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embeding, +# query_embeding, level_embeding to decay_multi=0.0 +backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'absolute_pos_embed': backbone_embed_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi +} +custom_keys.update({ + 
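# every norm layer inside the Swin blocks gets the backbone LR multiplier (x0.1) and zero weight decay: +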
f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) +# optimizer +optim_wrapper = dict( + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git a/configs/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py b/configs/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py new file mode 100644 index 0000000000..0abda6430c --- /dev/null +++ b/configs/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py @@ -0,0 +1,52 @@ +_base_ = ['./mask2former_r50_8xb2-90k_cityscapes-512x1024.py'] +pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_tiny_patch4_window7_224_20220317-1cdeb081.pth' # noqa +depths = [2, 2, 6, 2] +model = dict( + backbone=dict( + _delete_=True, + type='SwinTransformer', + embed_dims=96, + depths=depths, + num_heads=[3, 6, 12, 24], + window_size=7, + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0.3, + patch_norm=True, + out_indices=(0, 1, 2, 3), + with_cp=False, + frozen_stages=-1, + init_cfg=dict(type='Pretrained', checkpoint=pretrained)), + decode_head=dict(in_channels=[96, 192, 384, 768])) + +# set all layers in backbone to lr_mult=0.1 +# set all norm layers, position_embeding, +# query_embeding, level_embeding to decay_multi=0.0 +backbone_norm_multi = dict(lr_mult=0.1, decay_mult=0.0) +backbone_embed_multi = dict(lr_mult=0.1, decay_mult=0.0) +embed_multi = dict(lr_mult=1.0, decay_mult=0.0) +custom_keys = { + 'backbone': dict(lr_mult=0.1, decay_mult=1.0), + 'backbone.patch_embed.norm': backbone_norm_multi, + 'backbone.norm': backbone_norm_multi, + 'absolute_pos_embed': backbone_embed_multi, + 'relative_position_bias_table': backbone_embed_multi, + 'query_embed': embed_multi, + 'query_feat': embed_multi, + 'level_embed': embed_multi +} +custom_keys.update({ + f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi + for stage_id, num_blocks in enumerate(depths) + for block_id in range(num_blocks) +}) +custom_keys.update({ + f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi + for stage_id in range(len(depths) - 1) +}) +# optimizer +optim_wrapper = dict( + paramwise_cfg=dict(custom_keys=custom_keys, norm_decay_mult=0.0)) diff --git a/configs/mask2former/metafile.yaml b/configs/mask2former/metafile.yaml new file mode 100644 index 0000000000..090c95e7cf --- /dev/null +++ b/configs/mask2former/metafile.yaml @@ -0,0 +1,314 @@ +Collections: +- Name: Mask2Former + License: Apache License 2.0 + Metadata: + Training Data: + - Usage + - Cityscapes + - ADE20K + Paper: + Title: Masked-attention Mask Transformer for Universal Image Segmentation + URL: https://arxiv.org/abs/2112.01527 + README: configs/mask2former/README.md + Frameworks: + - PyTorch +Models: +- Name: mask2former_r50_8xb2-90k_cityscapes-512x1024 + In Collection: Mask2Former + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 80.44 + Config: configs/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 16 + Architecture: + - R-50-D32 + - Mask2Former + Training Resources: 8x A100 GPUS + Memory (GB): 5.67 + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024/mask2former_r50_8xb2-90k_cityscapes-512x1024_20221202_140802-ffd9d750.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024/mask2former_r50_8xb2-90k_cityscapes-512x1024_20221202_140802.json + Paper: + Title: Masked-attention Mask Transformer for Universal Image Segmentation + URL: https://arxiv.org/abs/2112.01527 + Code: https://github.com/open-mmlab/mmdetection/blob/3.x/mmdet/models/dense_heads/mask2former_head.py + Framework: PyTorch +- Name: mask2former_r101_8xb2-90k_cityscapes-512x1024 + In Collection: Mask2Former + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 80.8 + Config: configs/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 16 + Architecture: + - R-101-D32 + - Mask2Former + Training Resources: 8x A100 GPUS + Memory (GB): 6.81 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024/mask2former_r101_8xb2-90k_cityscapes-512x1024_20221130_031628-43e68666.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024/mask2former_r101_8xb2-90k_cityscapes-512x1024_20221130_031628.json + Paper: + Title: Masked-attention Mask Transformer for Universal Image Segmentation + URL: https://arxiv.org/abs/2112.01527 + Code: https://github.com/open-mmlab/mmdetection/blob/3.x/mmdet/models/dense_heads/mask2former_head.py + Framework: PyTorch +- Name: mask2former_swin-t_8xb2-90k_cityscapes-512x1024 + In Collection: Mask2Former + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 81.71 + Config: configs/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 16 + Architecture: + - Swin-T + - Mask2Former + Training Resources: 8x A100 GPUS + Memory (GB): 6.36 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024/mask2former_swin-t_8xb2-90k_cityscapes-512x1024_20221127_144501-36c59341.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024/mask2former_swin-t_8xb2-90k_cityscapes-512x1024_20221127_144501.json + Paper: + Title: Masked-attention Mask Transformer for Universal Image Segmentation + URL: https://arxiv.org/abs/2112.01527 + Code: https://github.com/open-mmlab/mmdetection/blob/3.x/mmdet/models/dense_heads/mask2former_head.py + Framework: PyTorch +- Name: mask2former_swin-s_8xb2-90k_cityscapes-512x1024 + In Collection: Mask2Former + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 82.57 + Config: configs/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 16 + Architecture: + - Swin-S + - Mask2Former + Training Resources: 8x A100 GPUS + Memory (GB): 8.09 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024/mask2former_swin-s_8xb2-90k_cityscapes-512x1024_20221127_143802-9ab177f6.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024/mask2former_swin-s_8xb2-90k_cityscapes-512x1024_20221127_143802.json + Paper: + Title: 
Masked-attention Mask Transformer for Universal Image Segmentation + URL: https://arxiv.org/abs/2112.01527 + Code: https://github.com/open-mmlab/mmdetection/blob/3.x/mmdet/models/dense_heads/mask2former_head.py + Framework: PyTorch +- Name: mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024 + In Collection: Mask2Former + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 83.52 + Config: configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 16 + Architecture: + - Swin-B + - Mask2Former + Training Resources: 8x A100 GPUS + Memory (GB): 10.89 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221203_045030-9a86a225.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221203_045030.json + Paper: + Title: Masked-attention Mask Transformer for Universal Image Segmentation + URL: https://arxiv.org/abs/2112.01527 + Code: https://github.com/open-mmlab/mmdetection/blob/3.x/mmdet/models/dense_heads/mask2former_head.py + Framework: PyTorch +- Name: mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024 + In Collection: Mask2Former + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 83.65 + Config: configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 16 + Architecture: + - Swin-L + - Mask2Former + Training Resources: 8x A100 GPUS + Memory (GB): 15.83 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221202_141901-28ad20f1.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221202_141901.json + Paper: + Title: Masked-attention Mask Transformer for Universal Image Segmentation + URL: https://arxiv.org/abs/2112.01527 + Code: https://github.com/open-mmlab/mmdetection/blob/3.x/mmdet/models/dense_heads/mask2former_head.py + Framework: PyTorch +- Name: mask2former_r50_8xb2-160k_ade20k-512x512 + In Collection: Mask2Former + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 47.87 + Config: configs/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D32 + - Mask2Former + Training Resources: 8x A100 GPUS + Memory (GB): 3.31 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512/mask2former_r50_8xb2-160k_ade20k-512x512_20221204_000055-2d1f55f1.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512/mask2former_r50_8xb2-160k_ade20k-512x512_20221204_000055.json + Paper: + Title: Masked-attention Mask Transformer for Universal Image Segmentation + URL: https://arxiv.org/abs/2112.01527 + Code: https://github.com/open-mmlab/mmdetection/blob/3.x/mmdet/models/dense_heads/mask2former_head.py + 
Framework: PyTorch +- Name: mask2former_r101_8xb2-160k_ade20k-512x512 + In Collection: Mask2Former + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 48.6 + Config: configs/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-101-D32 + - Mask2Former + Training Resources: 8x A100 GPUS + Memory (GB): 4.09 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512/mask2former_r101_8xb2-160k_ade20k-512x512_20221203_233905-b7135890.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512/mask2former_r101_8xb2-160k_ade20k-512x512_20221203_233905.json + Paper: + Title: Masked-attention Mask Transformer for Universal Image Segmentation + URL: https://arxiv.org/abs/2112.01527 + Code: https://github.com/open-mmlab/mmdetection/blob/3.x/mmdet/models/dense_heads/mask2former_head.py + Framework: PyTorch +- Name: mask2former_swin-t_8xb2-160k_ade20k-512x512 + In Collection: Mask2Former + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 48.66 + Config: configs/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - Swin-T + - Mask2Former + Training Resources: 8x A100 GPUS + Memory (GB): 3826.0 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512/mask2former_swin-t_8xb2-160k_ade20k-512x512_20221203_234230-7d64e5dd.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512/mask2former_swin-t_8xb2-160k_ade20k-512x512_20221203_234230.json + Paper: + Title: Masked-attention Mask Transformer for Universal Image Segmentation + URL: https://arxiv.org/abs/2112.01527 + Code: https://github.com/open-mmlab/mmdetection/blob/3.x/mmdet/models/dense_heads/mask2former_head.py + Framework: PyTorch +- Name: mask2former_swin-s_8xb2-160k_ade20k-512x512 + In Collection: Mask2Former + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 51.24 + Config: configs/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - Swin-S + - Mask2Former + Training Resources: 8x A100 GPUS + Memory (GB): 3.74 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512/mask2former_swin-s_8xb2-160k_ade20k-512x512_20221204_143905-e715144e.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512/mask2former_swin-s_8xb2-160k_ade20k-512x512_20221204_143905.json + Paper: + Title: Masked-attention Mask Transformer for Universal Image Segmentation + URL: https://arxiv.org/abs/2112.01527 + Code: https://github.com/open-mmlab/mmdetection/blob/3.x/mmdet/models/dense_heads/mask2former_head.py + Framework: PyTorch +- Name: mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640 + In Collection: Mask2Former + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 52.44 + Config: configs/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - Swin-B + - Mask2Former + Training Resources: 8x A100 GPUS + Memory (GB): 5.66 + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640_20221129_125118-a4a086d2.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640_20221129_125118.json + Paper: + Title: Masked-attention Mask Transformer for Universal Image Segmentation + URL: https://arxiv.org/abs/2112.01527 + Code: https://github.com/open-mmlab/mmdetection/blob/3.x/mmdet/models/dense_heads/mask2former_head.py + Framework: PyTorch +- Name: mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640 + In Collection: Mask2Former + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 53.9 + Config: configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - Swin-B + - Mask2Former + Training Resources: 8x A100 GPUS + Memory (GB): 5.66 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235230-7ec0f569.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235230.json + Paper: + Title: Masked-attention Mask Transformer for Universal Image Segmentation + URL: https://arxiv.org/abs/2112.01527 + Code: https://github.com/open-mmlab/mmdetection/blob/3.x/mmdet/models/dense_heads/mask2former_head.py + Framework: PyTorch +- Name: mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640 + In Collection: Mask2Former + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 56.01 + Config: configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - Swin-L + - Mask2Former + Training Resources: 8x A100 GPUS + Memory (GB): 8.86 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235933-7120c214.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235933.json + Paper: + Title: Masked-attention Mask Transformer for Universal Image Segmentation + URL: https://arxiv.org/abs/2112.01527 + Code: https://github.com/open-mmlab/mmdetection/blob/3.x/mmdet/models/dense_heads/mask2former_head.py + Framework: PyTorch diff --git a/configs/maskformer/README.md b/configs/maskformer/README.md new file mode 100644 index 0000000000..a899bac090 --- /dev/null +++ b/configs/maskformer/README.md @@ -0,0 +1,62 @@ +# MaskFormer + +> [MaskFormer: Per-Pixel Classification is Not All You Need for Semantic Segmentation](https://arxiv.org/abs/2107.06278) + +## Introduction + + + +Official Repo + +Code Snippet + +## Abstract + + + +Modern approaches typically formulate semantic segmentation as a per-pixel classification task, while instance-level segmentation is handled with an alternative mask classification. 
Our key insight: mask classification is sufficiently general to solve both semantic- and instance-level segmentation tasks in a unified manner using the exact same model, loss, and training procedure. Following this observation, we propose MaskFormer, a simple mask classification model which predicts a set of binary masks, each associated with a single global class label prediction. Overall, the proposed mask classification-based method simplifies the landscape of effective approaches to semantic and panoptic segmentation tasks and shows excellent empirical results. In particular, we observe that MaskFormer outperforms per-pixel classification baselines when the number of classes is large. Our mask classification-based method outperforms both current state-of-the-art semantic (55.6 mIoU on ADE20K) and panoptic segmentation (52.7 PQ on COCO) models. + + + +
+ +
+
+### Usage
+
+- The MaskFormer model requires [MMDetection](https://github.com/open-mmlab/mmdetection) to be installed first:
+
+```shell
+pip install "mmdet>=3.0.0rc4"
+```
+
+## Results and models
+
+### ADE20K
+
+| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download |
+| ---------- | --------- | --------- | ------- | -------- | -------------- | ------ | ----- | ------------- | ------ | -------- |
+| MaskFormer | R-50-D32 | 512x512 | 160000 | 3.29 | 42.20 | A100 | 44.29 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512/maskformer_r50-d32_8xb2-160k_ade20k-512x512_20221030_182724-3a9cfe45.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512/maskformer_r50-d32_8xb2-160k_ade20k-512x512_20221030_182724.json) |
+| MaskFormer | R-101-D32 | 512x512 | 160000 | 4.12 | 34.90 | A100 | 45.11 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512/maskformer_r101-d32_8xb2-160k_ade20k-512x512_20221031_223053-84adbfcb.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512/maskformer_r101-d32_8xb2-160k_ade20k-512x512_20221031_223053.json) |
+| MaskFormer | Swin-T | 512x512 | 160000 | 3.73 | 40.53 | A100 | 46.69 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512_20221114_232813-f14e7ce0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512_20221114_232813.json) |
+| MaskFormer | Swin-S | 512x512 | 160000 | 5.33 | 26.98 | A100 | 49.36 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512_20221115_114710-723512c7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512_20221115_114710.json) |
+
+Note:
+
+- All experiments of MaskFormer are implemented with 8 V100 (32G) GPUs with 2 samples per GPU.
+- The results of MaskFormer are relatively unstable.
+
+## Results and models
+
+### ADE20K
+
+| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download |
+| ---------- | --------- | --------- | ------- | -------- | -------------- | ------ | ----- | ------------- | ------ | -------- |
+| MaskFormer | R-50-D32 | 512x512 | 160000 | 3.29 | 42.20 | A100 | 44.29 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512/maskformer_r50-d32_8xb2-160k_ade20k-512x512_20221030_182724-3a9cfe45.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512/maskformer_r50-d32_8xb2-160k_ade20k-512x512_20221030_182724.json) |
+| MaskFormer | R-101-D32 | 512x512 | 160000 | 4.12 | 34.90 | A100 | 45.11 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512/maskformer_r101-d32_8xb2-160k_ade20k-512x512_20221031_223053-84adbfcb.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512/maskformer_r101-d32_8xb2-160k_ade20k-512x512_20221031_223053.json) |
+| MaskFormer | Swin-T | 512x512 | 160000 | 3.73 | 40.53 | A100 | 46.69 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512_20221114_232813-f14e7ce0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512_20221114_232813.json) |
+| MaskFormer | Swin-S | 512x512 | 160000 | 5.33 | 26.98 | A100 | 49.36 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512_20221115_114710-723512c7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512_20221115_114710.json) |
+
+Note:
+
+- All experiments of MaskFormer are implemented with 8 V100 (32G) GPUs with 2 samples per GPU.
+- The results of MaskFormer are relatively unstable: across training runs, the accuracy (mIoU) with `R-101-D32` ranges from 44.7 to 46.0, and with `Swin-S` from 49.0 to 49.8.
+- The ResNet backbones utilized in MaskFormer models are standard `ResNet` rather than `ResNetV1c`.
+- Test-time augmentation is not supported in MMSegmentation 1.x yet; we will add the "ms+flip" results as soon as possible.
+
+## Citation
+
+```bibtex
+@article{cheng2021per,
+  title={Per-pixel classification is not all you need for semantic segmentation},
+  author={Cheng, Bowen and Schwing, Alex and Kirillov, Alexander},
+  journal={Advances in Neural Information Processing Systems},
+  volume={34},
+  pages={17864--17875},
+  year={2021}
+}
+```
diff --git a/configs/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512.py b/configs/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512.py
new file mode 100644
index 0000000000..04bd37546a
--- /dev/null
+++ b/configs/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512.py
@@ -0,0 +1,7 @@
+_base_ = './maskformer_r50-d32_8xb2-160k_ade20k-512x512.py'
+
+model = dict(
+    backbone=dict(
+        depth=101,
+        init_cfg=dict(type='Pretrained',
+                      checkpoint='torchvision://resnet101')))
diff --git a/configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py b/configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py
new file mode 100644
index 0000000000..2a83746171
--- /dev/null
+++ b/configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py
@@ -0,0 +1,141 @@
+_base_ = [
+    '../_base_/datasets/ade20k.py', '../_base_/default_runtime.py',
+    '../_base_/schedules/schedule_160k.py'
+]
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+crop_size = (512, 512)
+data_preprocessor = dict(
+    type='SegDataPreProcessor',
+    size=crop_size,
+    mean=[123.675, 116.28, 103.53],
+    std=[58.395, 57.12, 57.375],
+    bgr_to_rgb=True,
+    pad_val=0,
+    seg_pad_val=255)
+# model_cfg
+num_classes = 150
+model = dict(
+    type='EncoderDecoder',
+    data_preprocessor=data_preprocessor,
+    backbone=dict(
+        type='ResNet',
+        depth=50,
+        num_stages=4,
+        out_indices=(0, 1, 2, 3),
+        dilations=(1, 1, 1, 1),
+        strides=(1, 2, 2, 2),
+        norm_cfg=norm_cfg,
+        norm_eval=True,
+        style='pytorch',
+        contract_dilation=True,
+        init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50')),
+    decode_head=dict(
+        type='MaskFormerHead',
+        in_channels=[256, 512, 1024,
+                     2048],  # input channels of pixel_decoder modules
+        feat_channels=256,
+        in_index=[0, 1, 2, 3],
+        num_classes=150,
+        out_channels=256,
+        num_queries=100,
+        pixel_decoder=dict(
+            type='mmdet.PixelDecoder',
+            norm_cfg=dict(type='GN', num_groups=32),
+            act_cfg=dict(type='ReLU')),
+        enforce_decoder_input_project=False,
+        positional_encoding=dict(  # SinePositionalEncoding
+            num_feats=128, normalize=True),
+        transformer_decoder=dict(  # DetrTransformerDecoder
+            return_intermediate=True,
+            num_layers=6,
+            layer_cfg=dict(  # DetrTransformerDecoderLayer
+                self_attn_cfg=dict(  # MultiheadAttention
+                    embed_dims=256,
+                    num_heads=8,
+                    attn_drop=0.1,
+                    proj_drop=0.1,
+                    dropout_layer=None,
+                    batch_first=True),
+                cross_attn_cfg=dict(  # MultiheadAttention
+                    embed_dims=256,
+                    num_heads=8,
+                    attn_drop=0.1,
+                    proj_drop=0.1,
+                    dropout_layer=None,
+                    batch_first=True),
+                ffn_cfg=dict(
+                    embed_dims=256,
+                    feedforward_channels=2048,
+                    num_fcs=2,
+                    act_cfg=dict(type='ReLU', inplace=True),
+                    ffn_drop=0.1,
+                    dropout_layer=None,
+                    add_identity=True)),
+            init_cfg=None),
+        loss_cls=dict(
+            type='mmdet.CrossEntropyLoss',
+            use_sigmoid=False,
+            loss_weight=1.0,
+            reduction='mean',
+            class_weight=[1.0] * num_classes + [0.1]),
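+        # The decode-head losses below pair with the Hungarian matching costs
+        # in train_cfg (classification / focal mask / dice) and use the same
+        # 1.0 / 20.0 / 1.0 weights, so matching and optimization agree.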
+        loss_mask=dict(
+            type='mmdet.FocalLoss',
+            use_sigmoid=True,
+            gamma=2.0,
+            alpha=0.25,
+            reduction='mean',
+            loss_weight=20.0),
+        loss_dice=dict(
+            type='mmdet.DiceLoss',
+            use_sigmoid=True,
+            activate=True,
+            reduction='mean',
+            naive_dice=True,
+            eps=1.0,
+            loss_weight=1.0),
+        train_cfg=dict(
+            assigner=dict(
+                type='mmdet.HungarianAssigner',
+                match_costs=[
+                    dict(type='mmdet.ClassificationCost', weight=1.0),
+                    dict(
+                        type='mmdet.FocalLossCost',
+                        weight=20.0,
+                        binary_input=True),
+                    dict(
+                        type='mmdet.DiceCost',
+                        weight=1.0,
+                        pred_act=True,
+                        eps=1.0)
+                ]),
+            sampler=dict(type='mmdet.MaskPseudoSampler'))),
+    # training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(mode='whole'),
+)
+# optimizer
+optimizer = dict(
+    type='AdamW', lr=0.0001, betas=(0.9, 0.999), weight_decay=0.0001)
+optim_wrapper = dict(
+    _delete_=True,
+    type='OptimWrapper',
+    optimizer=optimizer,
+    clip_grad=dict(max_norm=0.01, norm_type=2),
+    paramwise_cfg=dict(custom_keys={
+        'backbone': dict(lr_mult=0.1),
+    }))
+# learning policy
+param_scheduler = [
+    dict(
+        type='PolyLR',
+        eta_min=0,
+        power=0.9,
+        begin=0,
+        end=160000,
+        by_epoch=False)
+]
+
+# In the MaskFormer implementation we use a batch size of 2 per GPU by default
+train_dataloader = dict(batch_size=2, num_workers=2)
+val_dataloader = dict(batch_size=1, num_workers=4)
+test_dataloader = val_dataloader
diff --git a/configs/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512.py b/configs/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512.py
new file mode 100644
index 0000000000..2cbc038ac2
--- /dev/null
+++ b/configs/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512.py
@@ -0,0 +1,79 @@
+checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_small_patch4_window7_224_20220317-7ba6d6dd.pth'  # noqa
+_base_ = './maskformer_r50-d32_8xb2-160k_ade20k-512x512.py'
+backbone_norm_cfg = dict(type='LN', requires_grad=True)
+depths = [2, 2, 18, 2]
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='SwinTransformer',
+        pretrain_img_size=224,
+        embed_dims=96,
+        patch_size=4,
+        window_size=7,
+        mlp_ratio=4,
+        depths=depths,
+        num_heads=[3, 6, 12, 24],
+        strides=(4, 2, 2, 2),
+        out_indices=(0, 1, 2, 3),
+        qkv_bias=True,
+        qk_scale=None,
+        patch_norm=True,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.3,
+        use_abs_pos_embed=False,
+        act_cfg=dict(type='GELU'),
+        norm_cfg=backbone_norm_cfg,
+        init_cfg=dict(type='Pretrained', checkpoint=checkpoint_file)),
+    decode_head=dict(
+        type='MaskFormerHead',
+        in_channels=[96, 192, 384,
+                     768],  # input channels of pixel_decoder modules
+    ))
+
+# optimizer
+optimizer = dict(
+    type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.01)
+# set all layers in backbone to lr_mult=1.0
+# set all norm layers, position embedding,
+# and query embedding to decay_mult=0.0
+backbone_norm_multi = dict(lr_mult=1.0, decay_mult=0.0)
+backbone_embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
+embed_multi = dict(decay_mult=0.0)
+custom_keys = {
+    'backbone': dict(lr_mult=1.0),
+    'backbone.patch_embed.norm': backbone_norm_multi,
+    'backbone.norm': backbone_norm_multi,
+    'relative_position_bias_table': backbone_embed_multi,
+    'query_embed': embed_multi,
+}
+custom_keys.update({
+    f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
+    for stage_id, num_blocks in enumerate(depths)
+    for block_id in range(num_blocks)
+})
+custom_keys.update({
+    f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
+    for stage_id in range(len(depths) - 1)
+})
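+# Each key in `custom_keys` is matched as a substring of parameter names by
+# MMEngine's optimizer constructor, with longer (more specific) keys taking
+# precedence, so the norm/embedding entries above get zero weight decay while
+# the plain 'backbone' entry sets the base lr multiplier.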
+# optimizer
+optim_wrapper = dict(
+    _delete_=True,
+    type='OptimWrapper',
+    optimizer=optimizer,
+    clip_grad=dict(max_norm=0.01, norm_type=2),
+    paramwise_cfg=dict(custom_keys=custom_keys))
+
+# learning policy
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=1e-6, by_epoch=False, begin=0, end=1500),
+    dict(
+        type='PolyLR',
+        eta_min=0.0,
+        power=1.0,
+        begin=1500,
+        end=160000,
+        by_epoch=False,
+    )
+]
diff --git a/configs/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512.py b/configs/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512.py
new file mode 100644
index 0000000000..aa242dbe31
--- /dev/null
+++ b/configs/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512.py
@@ -0,0 +1,81 @@
+_base_ = './maskformer_r50-d32_8xb2-160k_ade20k-512x512.py'
+
+checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_tiny_patch4_window7_224_20220317-1cdeb081.pth'  # noqa
+backbone_norm_cfg = dict(type='LN', requires_grad=True)
+depths = [2, 2, 6, 2]
+model = dict(
+    backbone=dict(
+        _delete_=True,
+        type='SwinTransformer',
+        pretrain_img_size=224,
+        embed_dims=96,
+        patch_size=4,
+        window_size=7,
+        mlp_ratio=4,
+        depths=depths,
+        num_heads=[3, 6, 12, 24],
+        strides=(4, 2, 2, 2),
+        out_indices=(0, 1, 2, 3),
+        qkv_bias=True,
+        qk_scale=None,
+        patch_norm=True,
+        drop_rate=0.,
+        attn_drop_rate=0.,
+        drop_path_rate=0.3,
+        use_abs_pos_embed=False,
+        act_cfg=dict(type='GELU'),
+        norm_cfg=backbone_norm_cfg,
+        init_cfg=dict(type='Pretrained', checkpoint=checkpoint_file)),
+    decode_head=dict(
+        type='MaskFormerHead',
+        in_channels=[96, 192, 384,
+                     768],  # input channels of pixel_decoder modules
+    ))
+
+# optimizer
+optimizer = dict(
+    type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.01)
+
+# set all layers in backbone to lr_mult=1.0
+# set all norm layers, position embedding,
+# and query embedding to decay_mult=0.0
+backbone_norm_multi = dict(lr_mult=1.0, decay_mult=0.0)
+backbone_embed_multi = dict(lr_mult=1.0, decay_mult=0.0)
+embed_multi = dict(decay_mult=0.0)
+custom_keys = {
+    'backbone': dict(lr_mult=1.0),
+    'backbone.patch_embed.norm': backbone_norm_multi,
+    'backbone.norm': backbone_norm_multi,
+    'relative_position_bias_table': backbone_embed_multi,
+    'query_embed': embed_multi,
+}
+custom_keys.update({
+    f'backbone.stages.{stage_id}.blocks.{block_id}.norm': backbone_norm_multi
+    for stage_id, num_blocks in enumerate(depths)
+    for block_id in range(num_blocks)
+})
+custom_keys.update({
+    f'backbone.stages.{stage_id}.downsample.norm': backbone_norm_multi
+    for stage_id in range(len(depths) - 1)
+})
+# optimizer
+optim_wrapper = dict(
+    _delete_=True,
+    type='OptimWrapper',
+    optimizer=optimizer,
+    clip_grad=dict(max_norm=0.01, norm_type=2),
+    paramwise_cfg=dict(custom_keys=custom_keys))
+
+# learning policy
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=1e-6, by_epoch=False, begin=0, end=1500),
+    dict(
+        type='PolyLR',
+        eta_min=0.0,
+        power=1.0,
+        begin=1500,
+        end=160000,
+        by_epoch=False,
+    )
+]
diff --git a/configs/maskformer/metafile.yaml b/configs/maskformer/metafile.yaml
new file mode 100644
index 0000000000..c9853e131f
--- /dev/null
+++ b/configs/maskformer/metafile.yaml
@@ -0,0 +1,111 @@
+Collections:
+- Name: MaskFormer
+  License: Apache License 2.0
+  Metadata:
+    Training Data:
+    - ADE20K
+  Paper:
+    Title: 'MaskFormer: Per-Pixel Classification is Not All You Need for Semantic
+      Segmentation'
+    URL: https://arxiv.org/abs/2107.06278
+  README: configs/maskformer/README.md
+  Frameworks:
+  - PyTorch
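+# One model entry per released checkpoint; the fields follow the OpenMMLab
+# model-index schema (config, weights, training log, and source paper/code).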
+Models:
+- Name: maskformer_r50-d32_8xb2-160k_ade20k-512x512
+  In Collection: MaskFormer
+  Results:
+    Task: Semantic Segmentation
+    Dataset: ADE20K
+    Metrics:
+      mIoU: 44.29
+  Config: configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py
+  Metadata:
+    Training Data: ADE20K
+    Batch Size: 16
+    Architecture:
+    - R-50-D32
+    - MaskFormer
+    Training Resources: 8x A100 GPUS
+    Memory (GB): 3.29
+    Weights: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512/maskformer_r50-d32_8xb2-160k_ade20k-512x512_20221030_182724-3a9cfe45.pth
+    Training log: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512/maskformer_r50-d32_8xb2-160k_ade20k-512x512_20221030_182724.json
+  Paper:
+    Title: 'MaskFormer: Per-Pixel Classification is Not All You Need for Semantic
+      Segmentation'
+    URL: https://arxiv.org/abs/2107.06278
+  Code: https://github.com/open-mmlab/mmdetection/blob/dev-3.x/mmdet/models/dense_heads/maskformer_head.py#L21
+  Framework: PyTorch
+- Name: maskformer_r101-d32_8xb2-160k_ade20k-512x512
+  In Collection: MaskFormer
+  Results:
+    Task: Semantic Segmentation
+    Dataset: ADE20K
+    Metrics:
+      mIoU: 45.11
+  Config: configs/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512.py
+  Metadata:
+    Training Data: ADE20K
+    Batch Size: 16
+    Architecture:
+    - R-101-D32
+    - MaskFormer
+    Training Resources: 8x A100 GPUS
+    Memory (GB): 4.12
+    Weights: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512/maskformer_r101-d32_8xb2-160k_ade20k-512x512_20221031_223053-84adbfcb.pth
+    Training log: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512/maskformer_r101-d32_8xb2-160k_ade20k-512x512_20221031_223053.json
+  Paper:
+    Title: 'MaskFormer: Per-Pixel Classification is Not All You Need for Semantic
+      Segmentation'
+    URL: https://arxiv.org/abs/2107.06278
+  Code: https://github.com/open-mmlab/mmdetection/blob/dev-3.x/mmdet/models/dense_heads/maskformer_head.py#L21
+  Framework: PyTorch
+- Name: maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512
+  In Collection: MaskFormer
+  Results:
+    Task: Semantic Segmentation
+    Dataset: ADE20K
+    Metrics:
+      mIoU: 46.69
+  Config: configs/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512.py
+  Metadata:
+    Training Data: ADE20K
+    Batch Size: 16
+    Architecture:
+    - Swin-T
+    - MaskFormer
+    Training Resources: 8x A100 GPUS
+    Memory (GB): 3.73
+    Weights: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512_20221114_232813-f14e7ce0.pth
+    Training log: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512_20221114_232813.json
+  Paper:
+    Title: 'MaskFormer: Per-Pixel Classification is Not All You Need for Semantic
+      Segmentation'
+    URL: https://arxiv.org/abs/2107.06278
+  Code: https://github.com/open-mmlab/mmdetection/blob/dev-3.x/mmdet/models/dense_heads/maskformer_head.py#L21
+  Framework: PyTorch
+- Name: maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512
+  In Collection: MaskFormer
+  Results:
+    Task: Semantic Segmentation
+    Dataset: ADE20K
+    Metrics:
+      mIoU: 49.36
+  Config: configs/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512.py
+  Metadata:
+    Training Data: ADE20K
+    Batch Size: 16
+    Architecture:
+    - Swin-S
+    - MaskFormer
+    Training Resources: 8x A100 GPUS
+    Memory (GB): 5.33
+    Weights: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512_20221115_114710-723512c7.pth
+    Training log: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512_20221115_114710.json
+  Paper:
+    Title: 'MaskFormer: Per-Pixel Classification is Not All You Need for Semantic
+      Segmentation'
+    URL: https://arxiv.org/abs/2107.06278
+  Code: https://github.com/open-mmlab/mmdetection/blob/dev-3.x/mmdet/models/dense_heads/maskformer_head.py#L21
+  Framework: PyTorch
diff --git a/configs/mobilenet_v2/README.md b/configs/mobilenet_v2/README.md
index 3ea8a463ae..bff5259129 100644
--- a/configs/mobilenet_v2/README.md
+++ b/configs/mobilenet_v2/README.md
@@ -1,6 +1,6 @@
 # MobileNetV2
 
-[MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381)
+> [MobileNetV2: Inverted Residuals and Linear Bottlenecks](https://arxiv.org/abs/1801.04381)
 
 ## Introduction
 
@@ -23,6 +23,26 @@ The MobileNetV2 architecture is based on an inverted residual structure where th
 
+## Results and models
+
+### Cityscapes
+
+| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download |
+| ---------- | -------- | --------- | ------: | -------: | -------------- | ------ | ----: | ------------- | ------ | -------- |
+| FCN | M-V2-D8 | 512x1024 | 80000 | 3.4 | 14.2 | A100 | 71.19 | 73.34 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/mobilenet_v2/mobilenet-v2-d8_fcn_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/mobilenet-v2-d8_fcn_4xb2-80k_cityscapes-512x1024/mobilenet-v2-d8_fcn_4xb2-80k_cityscapes-512x1024-20230224_185436-13fef4ea.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/mobilenet-v2-d8_fcn_4xb2-80k_cityscapes-512x1024/mobilenet-v2-d8_fcn_4xb2-80k_cityscapes-512x1024_20230224_185436.json) |
+| PSPNet | M-V2-D8 | 512x1024 | 80000 | 3.6 | 11.2 | V100 | 70.23 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/mobilenet_v2/mobilenet-v2-d8_pspnet_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/pspnet_m-v2-d8_512x1024_80k_cityscapes/pspnet_m-v2-d8_512x1024_80k_cityscapes_20200825_124817-19e81d51.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/pspnet_m-v2-d8_512x1024_80k_cityscapes/pspnet_m-v2-d8_512x1024_80k_cityscapes-20200825_124817.log.json) |
+| DeepLabV3 | M-V2-D8 | 512x1024 | 80000 | 3.9 | 8.4 | V100 | 73.84 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/mobilenet_v2/mobilenet-v2-d8_deeplabv3_4xb2-80k_cityscapes-512x1024.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/deeplabv3_m-v2-d8_512x1024_80k_cityscapes/deeplabv3_m-v2-d8_512x1024_80k_cityscapes_20200825_124836-bef03590.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/deeplabv3_m-v2-d8_512x1024_80k_cityscapes/deeplabv3_m-v2-d8_512x1024_80k_cityscapes-20200825_124836.log.json) | +| DeepLabV3+ | M-V2-D8 | 512x1024 | 80000 | 5.1 | 8.4 | V100 | 75.20 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/mobilenet_v2/mobilenet-v2-d8_deeplabv3plus_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/deeplabv3plus_m-v2-d8_512x1024_80k_cityscapes/deeplabv3plus_m-v2-d8_512x1024_80k_cityscapes_20200825_124836-d256dd4b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/deeplabv3plus_m-v2-d8_512x1024_80k_cityscapes/deeplabv3plus_m-v2-d8_512x1024_80k_cityscapes-20200825_124836.log.json) | + +### ADE20K + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ---------- | -------- | --------- | ------: | -------: | -------------- | ------ | ----: | ------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| FCN | M-V2-D8 | 512x512 | 160000 | 6.5 | 64.4 | V100 | 19.71 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/mobilenet_v2/mobilenet-v2-d8_fcn_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/fcn_m-v2-d8_512x512_160k_ade20k/fcn_m-v2-d8_512x512_160k_ade20k_20200825_214953-c40e1095.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/fcn_m-v2-d8_512x512_160k_ade20k/fcn_m-v2-d8_512x512_160k_ade20k-20200825_214953.log.json) | +| PSPNet | M-V2-D8 | 512x512 | 160000 | 6.5 | 57.7 | V100 | 29.68 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/mobilenet_v2/mobilenet-v2-d8_pspnet_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/pspnet_m-v2-d8_512x512_160k_ade20k/pspnet_m-v2-d8_512x512_160k_ade20k_20200825_214953-f5942f7a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/pspnet_m-v2-d8_512x512_160k_ade20k/pspnet_m-v2-d8_512x512_160k_ade20k-20200825_214953.log.json) | +| DeepLabV3 | M-V2-D8 | 512x512 | 160000 | 6.8 | 39.9 | V100 | 34.08 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/mobilenet_v2/mobilenet-v2-d8_deeplabv3_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/deeplabv3_m-v2-d8_512x512_160k_ade20k/deeplabv3_m-v2-d8_512x512_160k_ade20k_20200825_223255-63986343.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/deeplabv3_m-v2-d8_512x512_160k_ade20k/deeplabv3_m-v2-d8_512x512_160k_ade20k-20200825_223255.log.json) | +| DeepLabV3+ | M-V2-D8 | 512x512 | 160000 | 8.2 | 43.1 | V100 | 34.02 | - | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/mobilenet_v2/mobilenet-v2-d8_deeplabv3plus_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/deeplabv3plus_m-v2-d8_512x512_160k_ade20k/deeplabv3plus_m-v2-d8_512x512_160k_ade20k_20200825_223255-465a01d4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/deeplabv3plus_m-v2-d8_512x512_160k_ade20k/deeplabv3plus_m-v2-d8_512x512_160k_ade20k-20200825_223255.log.json) | + ## Citation ```bibtex @@ -34,23 +54,3 @@ The MobileNetV2 architecture is based on an inverted residual structure where th year={2018} } ``` - -## Results and models - -### Cityscapes - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ---------- | -------- | --------- | ------: | -------: | -------------- | ----: | ------------- | ---------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| FCN | M-V2-D8 | 512x1024 | 80000 | 3.4 | 14.2 | 61.54 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/mobilenet_v2/fcn_m-v2-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/fcn_m-v2-d8_512x1024_80k_cityscapes/fcn_m-v2-d8_512x1024_80k_cityscapes_20200825_124817-d24c28c1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/fcn_m-v2-d8_512x1024_80k_cityscapes/fcn_m-v2-d8_512x1024_80k_cityscapes-20200825_124817.log.json) | -| PSPNet | M-V2-D8 | 512x1024 | 80000 | 3.6 | 11.2 | 70.23 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/mobilenet_v2/pspnet_m-v2-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/pspnet_m-v2-d8_512x1024_80k_cityscapes/pspnet_m-v2-d8_512x1024_80k_cityscapes_20200825_124817-19e81d51.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/pspnet_m-v2-d8_512x1024_80k_cityscapes/pspnet_m-v2-d8_512x1024_80k_cityscapes-20200825_124817.log.json) | -| DeepLabV3 | M-V2-D8 | 512x1024 | 80000 | 3.9 | 8.4 | 73.84 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/mobilenet_v2/deeplabv3_m-v2-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/deeplabv3_m-v2-d8_512x1024_80k_cityscapes/deeplabv3_m-v2-d8_512x1024_80k_cityscapes_20200825_124836-bef03590.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/deeplabv3_m-v2-d8_512x1024_80k_cityscapes/deeplabv3_m-v2-d8_512x1024_80k_cityscapes-20200825_124836.log.json) | -| DeepLabV3+ | M-V2-D8 | 512x1024 | 80000 | 5.1 | 8.4 | 75.20 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/mobilenet_v2/deeplabv3plus_m-v2-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/deeplabv3plus_m-v2-d8_512x1024_80k_cityscapes/deeplabv3plus_m-v2-d8_512x1024_80k_cityscapes_20200825_124836-d256dd4b.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/deeplabv3plus_m-v2-d8_512x1024_80k_cityscapes/deeplabv3plus_m-v2-d8_512x1024_80k_cityscapes-20200825_124836.log.json) | - -### ADE20K - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ---------- | -------- | --------- | ------: | -------: | -------------- | ----: | ------------- | ------------------------------------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| FCN | M-V2-D8 | 512x512 | 160000 | 6.5 | 64.4 | 19.71 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/mobilenet_v2/fcn_m-v2-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/fcn_m-v2-d8_512x512_160k_ade20k/fcn_m-v2-d8_512x512_160k_ade20k_20200825_214953-c40e1095.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/fcn_m-v2-d8_512x512_160k_ade20k/fcn_m-v2-d8_512x512_160k_ade20k-20200825_214953.log.json) | -| PSPNet | M-V2-D8 | 512x512 | 160000 | 6.5 | 57.7 | 29.68 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/mobilenet_v2/pspnet_m-v2-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/pspnet_m-v2-d8_512x512_160k_ade20k/pspnet_m-v2-d8_512x512_160k_ade20k_20200825_214953-f5942f7a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/pspnet_m-v2-d8_512x512_160k_ade20k/pspnet_m-v2-d8_512x512_160k_ade20k-20200825_214953.log.json) | -| DeepLabV3 | M-V2-D8 | 512x512 | 160000 | 6.8 | 39.9 | 34.08 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/mobilenet_v2/deeplabv3_m-v2-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/deeplabv3_m-v2-d8_512x512_160k_ade20k/deeplabv3_m-v2-d8_512x512_160k_ade20k_20200825_223255-63986343.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/deeplabv3_m-v2-d8_512x512_160k_ade20k/deeplabv3_m-v2-d8_512x512_160k_ade20k-20200825_223255.log.json) | -| DeepLabV3+ | M-V2-D8 | 512x512 | 160000 | 8.2 | 43.1 | 34.02 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/mobilenet_v2/deeplabv3plus_m-v2-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/deeplabv3plus_m-v2-d8_512x512_160k_ade20k/deeplabv3plus_m-v2-d8_512x512_160k_ade20k_20200825_223255-465a01d4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/deeplabv3plus_m-v2-d8_512x512_160k_ade20k/deeplabv3plus_m-v2-d8_512x512_160k_ade20k-20200825_223255.log.json) | diff --git a/configs/mobilenet_v2/deeplabv3_m-v2-d8_512x1024_80k_cityscapes.py b/configs/mobilenet_v2/deeplabv3_m-v2-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index 267483d88f..0000000000 --- a/configs/mobilenet_v2/deeplabv3_m-v2-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,12 +0,0 @@ -_base_ = '../deeplabv3/deeplabv3_r101-d8_512x1024_80k_cityscapes.py' -model = dict( - pretrained='mmcls://mobilenet_v2', - 
backbone=dict( - _delete_=True, - type='MobileNetV2', - widen_factor=1., - strides=(1, 2, 2, 1, 1, 1, 1), - dilations=(1, 1, 1, 2, 2, 4, 4), - out_indices=(1, 2, 4, 6)), - decode_head=dict(in_channels=320), - auxiliary_head=dict(in_channels=96)) diff --git a/configs/mobilenet_v2/deeplabv3_m-v2-d8_512x512_160k_ade20k.py b/configs/mobilenet_v2/deeplabv3_m-v2-d8_512x512_160k_ade20k.py deleted file mode 100644 index e15b8cc82b..0000000000 --- a/configs/mobilenet_v2/deeplabv3_m-v2-d8_512x512_160k_ade20k.py +++ /dev/null @@ -1,12 +0,0 @@ -_base_ = '../deeplabv3/deeplabv3_r101-d8_512x512_160k_ade20k.py' -model = dict( - pretrained='mmcls://mobilenet_v2', - backbone=dict( - _delete_=True, - type='MobileNetV2', - widen_factor=1., - strides=(1, 2, 2, 1, 1, 1, 1), - dilations=(1, 1, 1, 2, 2, 4, 4), - out_indices=(1, 2, 4, 6)), - decode_head=dict(in_channels=320), - auxiliary_head=dict(in_channels=96)) diff --git a/configs/mobilenet_v2/deeplabv3plus_m-v2-d8_512x1024_80k_cityscapes.py b/configs/mobilenet_v2/deeplabv3plus_m-v2-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index d4533d79a2..0000000000 --- a/configs/mobilenet_v2/deeplabv3plus_m-v2-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,12 +0,0 @@ -_base_ = '../deeplabv3plus/deeplabv3plus_r101-d8_512x1024_80k_cityscapes.py' -model = dict( - pretrained='mmcls://mobilenet_v2', - backbone=dict( - _delete_=True, - type='MobileNetV2', - widen_factor=1., - strides=(1, 2, 2, 1, 1, 1, 1), - dilations=(1, 1, 1, 2, 2, 4, 4), - out_indices=(1, 2, 4, 6)), - decode_head=dict(in_channels=320, c1_in_channels=24), - auxiliary_head=dict(in_channels=96)) diff --git a/configs/mobilenet_v2/deeplabv3plus_m-v2-d8_512x512_160k_ade20k.py b/configs/mobilenet_v2/deeplabv3plus_m-v2-d8_512x512_160k_ade20k.py deleted file mode 100644 index 7615a7c19a..0000000000 --- a/configs/mobilenet_v2/deeplabv3plus_m-v2-d8_512x512_160k_ade20k.py +++ /dev/null @@ -1,12 +0,0 @@ -_base_ = '../deeplabv3plus/deeplabv3plus_r101-d8_512x512_160k_ade20k.py' -model = dict( - pretrained='mmcls://mobilenet_v2', - backbone=dict( - _delete_=True, - type='MobileNetV2', - widen_factor=1., - strides=(1, 2, 2, 1, 1, 1, 1), - dilations=(1, 1, 1, 2, 2, 4, 4), - out_indices=(1, 2, 4, 6)), - decode_head=dict(in_channels=320, c1_in_channels=24), - auxiliary_head=dict(in_channels=96)) diff --git a/configs/mobilenet_v2/fcn_m-v2-d8_512x1024_80k_cityscapes.py b/configs/mobilenet_v2/fcn_m-v2-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index a535bd0ed8..0000000000 --- a/configs/mobilenet_v2/fcn_m-v2-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,12 +0,0 @@ -_base_ = '../fcn/fcn_r101-d8_512x1024_80k_cityscapes.py' -model = dict( - pretrained='mmcls://mobilenet_v2', - backbone=dict( - _delete_=True, - type='MobileNetV2', - widen_factor=1., - strides=(1, 2, 2, 1, 1, 1, 1), - dilations=(1, 1, 1, 2, 2, 4, 4), - out_indices=(1, 2, 4, 6)), - decode_head=dict(in_channels=320), - auxiliary_head=dict(in_channels=96)) diff --git a/configs/mobilenet_v2/fcn_m-v2-d8_512x512_160k_ade20k.py b/configs/mobilenet_v2/fcn_m-v2-d8_512x512_160k_ade20k.py deleted file mode 100644 index c5f6ab0d62..0000000000 --- a/configs/mobilenet_v2/fcn_m-v2-d8_512x512_160k_ade20k.py +++ /dev/null @@ -1,12 +0,0 @@ -_base_ = '../fcn/fcn_r101-d8_512x512_160k_ade20k.py' -model = dict( - pretrained='mmcls://mobilenet_v2', - backbone=dict( - _delete_=True, - type='MobileNetV2', - widen_factor=1., - strides=(1, 2, 2, 1, 1, 1, 1), - dilations=(1, 1, 1, 2, 2, 4, 4), - out_indices=(1, 2, 4, 6)), - decode_head=dict(in_channels=320), - 
auxiliary_head=dict(in_channels=96)) diff --git a/configs/mobilenet_v2/metafile.yaml b/configs/mobilenet_v2/metafile.yaml new file mode 100644 index 0000000000..119c9ae7d6 --- /dev/null +++ b/configs/mobilenet_v2/metafile.yaml @@ -0,0 +1,186 @@ +Models: +- Name: mobilenet-v2-d8_fcn_4xb2-80k_cityscapes-512x1024 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 71.19 + mIoU(ms+flip): 73.34 + Config: configs/mobilenet_v2/mobilenet-v2-d8_fcn_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - M-V2-D8 + - FCN + Training Resources: 4x A100 GPUS + Memory (GB): 3.4 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/mobilenet-v2-d8_fcn_4xb2-80k_cityscapes-512x1024/mobilenet-v2-d8_fcn_4xb2-80k_cityscapes-512x1024-20230224_185436-13fef4ea.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/mobilenet-v2-d8_fcn_4xb2-80k_cityscapes-512x1024/mobilenet-v2-d8_fcn_4xb2-80k_cityscapes-512x1024_20230224_185436.json + Paper: + Title: 'MobileNetV2: Inverted Residuals and Linear Bottlenecks' + URL: https://arxiv.org/abs/1801.04381 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/mobilenet_v2.py#L14 + Framework: PyTorch +- Name: mobilenet-v2-d8_pspnet_4xb2-80k_cityscapes-512x1024 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 70.23 + Config: configs/mobilenet_v2/mobilenet-v2-d8_pspnet_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - M-V2-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 3.6 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/pspnet_m-v2-d8_512x1024_80k_cityscapes/pspnet_m-v2-d8_512x1024_80k_cityscapes_20200825_124817-19e81d51.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/pspnet_m-v2-d8_512x1024_80k_cityscapes/pspnet_m-v2-d8_512x1024_80k_cityscapes-20200825_124817.log.json + Paper: + Title: 'MobileNetV2: Inverted Residuals and Linear Bottlenecks' + URL: https://arxiv.org/abs/1801.04381 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/mobilenet_v2.py#L14 + Framework: PyTorch +- Name: mobilenet-v2-d8_deeplabv3_4xb2-80k_cityscapes-512x1024 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 73.84 + Config: configs/mobilenet_v2/mobilenet-v2-d8_deeplabv3_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - M-V2-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 3.9 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/deeplabv3_m-v2-d8_512x1024_80k_cityscapes/deeplabv3_m-v2-d8_512x1024_80k_cityscapes_20200825_124836-bef03590.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/deeplabv3_m-v2-d8_512x1024_80k_cityscapes/deeplabv3_m-v2-d8_512x1024_80k_cityscapes-20200825_124836.log.json + Paper: + Title: 'MobileNetV2: Inverted Residuals and Linear Bottlenecks' + URL: https://arxiv.org/abs/1801.04381 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/mobilenet_v2.py#L14 + Framework: PyTorch +- Name: mobilenet-v2-d8_deeplabv3plus_4xb2-80k_cityscapes-512x1024 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + 
mIoU: 75.2 + Config: configs/mobilenet_v2/mobilenet-v2-d8_deeplabv3plus_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - M-V2-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Memory (GB): 5.1 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/deeplabv3plus_m-v2-d8_512x1024_80k_cityscapes/deeplabv3plus_m-v2-d8_512x1024_80k_cityscapes_20200825_124836-d256dd4b.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/deeplabv3plus_m-v2-d8_512x1024_80k_cityscapes/deeplabv3plus_m-v2-d8_512x1024_80k_cityscapes-20200825_124836.log.json + Paper: + Title: 'MobileNetV2: Inverted Residuals and Linear Bottlenecks' + URL: https://arxiv.org/abs/1801.04381 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/mobilenet_v2.py#L14 + Framework: PyTorch +- Name: mobilenet-v2-d8_fcn_4xb4-160k_ade20k-512x512 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 19.71 + Config: configs/mobilenet_v2/mobilenet-v2-d8_fcn_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - M-V2-D8 + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 6.5 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/fcn_m-v2-d8_512x512_160k_ade20k/fcn_m-v2-d8_512x512_160k_ade20k_20200825_214953-c40e1095.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/fcn_m-v2-d8_512x512_160k_ade20k/fcn_m-v2-d8_512x512_160k_ade20k-20200825_214953.log.json + Paper: + Title: 'MobileNetV2: Inverted Residuals and Linear Bottlenecks' + URL: https://arxiv.org/abs/1801.04381 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/mobilenet_v2.py#L14 + Framework: PyTorch +- Name: mobilenet-v2-d8_pspnet_4xb4-160k_ade20k-512x512 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 29.68 + Config: configs/mobilenet_v2/mobilenet-v2-d8_pspnet_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - M-V2-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 6.5 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/pspnet_m-v2-d8_512x512_160k_ade20k/pspnet_m-v2-d8_512x512_160k_ade20k_20200825_214953-f5942f7a.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/pspnet_m-v2-d8_512x512_160k_ade20k/pspnet_m-v2-d8_512x512_160k_ade20k-20200825_214953.log.json + Paper: + Title: 'MobileNetV2: Inverted Residuals and Linear Bottlenecks' + URL: https://arxiv.org/abs/1801.04381 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/mobilenet_v2.py#L14 + Framework: PyTorch +- Name: mobilenet-v2-d8_deeplabv3_4xb4-160k_ade20k-512x512 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 34.08 + Config: configs/mobilenet_v2/mobilenet-v2-d8_deeplabv3_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - M-V2-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 6.8 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/deeplabv3_m-v2-d8_512x512_160k_ade20k/deeplabv3_m-v2-d8_512x512_160k_ade20k_20200825_223255-63986343.pth + Training log: 
https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/deeplabv3_m-v2-d8_512x512_160k_ade20k/deeplabv3_m-v2-d8_512x512_160k_ade20k-20200825_223255.log.json + Paper: + Title: 'MobileNetV2: Inverted Residuals and Linear Bottlenecks' + URL: https://arxiv.org/abs/1801.04381 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/mobilenet_v2.py#L14 + Framework: PyTorch +- Name: mobilenet-v2-d8_deeplabv3plus_4xb4-160k_ade20k-512x512 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 34.02 + Config: configs/mobilenet_v2/mobilenet-v2-d8_deeplabv3plus_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - M-V2-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Memory (GB): 8.2 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/deeplabv3plus_m-v2-d8_512x512_160k_ade20k/deeplabv3plus_m-v2-d8_512x512_160k_ade20k_20200825_223255-465a01d4.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/deeplabv3plus_m-v2-d8_512x512_160k_ade20k/deeplabv3plus_m-v2-d8_512x512_160k_ade20k-20200825_223255.log.json + Paper: + Title: 'MobileNetV2: Inverted Residuals and Linear Bottlenecks' + URL: https://arxiv.org/abs/1801.04381 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/mobilenet_v2.py#L14 + Framework: PyTorch diff --git a/configs/mobilenet_v2/mobilenet-v2-d8_deeplabv3_4xb2-80k_cityscapes-512x1024.py b/configs/mobilenet_v2/mobilenet-v2-d8_deeplabv3_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..ece9b0bf8f --- /dev/null +++ b/configs/mobilenet_v2/mobilenet-v2-d8_deeplabv3_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,13 @@ +_base_ = '../deeplabv3/deeplabv3_r101-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict( + pretrained='mmcls://mobilenet_v2', + backbone=dict( + _delete_=True, + type='MobileNetV2', + widen_factor=1., + strides=(1, 2, 2, 1, 1, 1, 1), + dilations=(1, 1, 1, 2, 2, 4, 4), + out_indices=(1, 2, 4, 6), + norm_cfg=dict(type='SyncBN', requires_grad=True)), + decode_head=dict(in_channels=320), + auxiliary_head=dict(in_channels=96)) diff --git a/configs/mobilenet_v2/mobilenet-v2-d8_deeplabv3_4xb4-160k_ade20k-512x512.py b/configs/mobilenet_v2/mobilenet-v2-d8_deeplabv3_4xb4-160k_ade20k-512x512.py new file mode 100644 index 0000000000..86eec0d948 --- /dev/null +++ b/configs/mobilenet_v2/mobilenet-v2-d8_deeplabv3_4xb4-160k_ade20k-512x512.py @@ -0,0 +1,13 @@ +_base_ = '../deeplabv3/deeplabv3_r101-d8_4xb4-160k_ade20k-512x512.py' +model = dict( + pretrained='mmcls://mobilenet_v2', + backbone=dict( + _delete_=True, + type='MobileNetV2', + widen_factor=1., + strides=(1, 2, 2, 1, 1, 1, 1), + dilations=(1, 1, 1, 2, 2, 4, 4), + out_indices=(1, 2, 4, 6), + norm_cfg=dict(type='SyncBN', requires_grad=True)), + decode_head=dict(in_channels=320), + auxiliary_head=dict(in_channels=96)) diff --git a/configs/mobilenet_v2/mobilenet-v2-d8_deeplabv3plus_4xb2-80k_cityscapes-512x1024.py b/configs/mobilenet_v2/mobilenet-v2-d8_deeplabv3plus_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..195046edc4 --- /dev/null +++ b/configs/mobilenet_v2/mobilenet-v2-d8_deeplabv3plus_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,15 @@ +_base_ = [ + '../deeplabv3plus/deeplabv3plus_r101-d8_4xb2-80k_cityscapes-512x1024.py' +] +model = dict( + pretrained='mmcls://mobilenet_v2', + backbone=dict( + _delete_=True, + type='MobileNetV2', + widen_factor=1., + strides=(1, 2, 2, 1, 
1, 1, 1), + dilations=(1, 1, 1, 2, 2, 4, 4), + out_indices=(1, 2, 4, 6), + norm_cfg=dict(type='SyncBN', requires_grad=True)), + decode_head=dict(in_channels=320, c1_in_channels=24), + auxiliary_head=dict(in_channels=96)) diff --git a/configs/mobilenet_v2/mobilenet-v2-d8_deeplabv3plus_4xb4-160k_ade20k-512x512.py b/configs/mobilenet_v2/mobilenet-v2-d8_deeplabv3plus_4xb4-160k_ade20k-512x512.py new file mode 100644 index 0000000000..d4f669f163 --- /dev/null +++ b/configs/mobilenet_v2/mobilenet-v2-d8_deeplabv3plus_4xb4-160k_ade20k-512x512.py @@ -0,0 +1,13 @@ +_base_ = '../deeplabv3plus/deeplabv3plus_r101-d8_4xb4-160k_ade20k-512x512.py' +model = dict( + pretrained='mmcls://mobilenet_v2', + backbone=dict( + _delete_=True, + type='MobileNetV2', + widen_factor=1., + strides=(1, 2, 2, 1, 1, 1, 1), + dilations=(1, 1, 1, 2, 2, 4, 4), + out_indices=(1, 2, 4, 6), + norm_cfg=dict(type='SyncBN', requires_grad=True)), + decode_head=dict(in_channels=320, c1_in_channels=24), + auxiliary_head=dict(in_channels=96)) diff --git a/configs/mobilenet_v2/mobilenet-v2-d8_fcn_4xb2-80k_cityscapes-512x1024.py b/configs/mobilenet_v2/mobilenet-v2-d8_fcn_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..0829f438a7 --- /dev/null +++ b/configs/mobilenet_v2/mobilenet-v2-d8_fcn_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,13 @@ +_base_ = '../fcn/fcn_r101-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict( + pretrained='mmcls://mobilenet_v2', + backbone=dict( + _delete_=True, + type='MobileNetV2', + widen_factor=1., + strides=(1, 2, 2, 1, 1, 1, 1), + dilations=(1, 1, 1, 2, 2, 4, 4), + out_indices=(1, 2, 4, 6), + norm_cfg=dict(type='SyncBN', requires_grad=True)), + decode_head=dict(in_channels=320), + auxiliary_head=dict(in_channels=96)) diff --git a/configs/mobilenet_v2/mobilenet-v2-d8_fcn_4xb4-160k_ade20k-512x512.py b/configs/mobilenet_v2/mobilenet-v2-d8_fcn_4xb4-160k_ade20k-512x512.py new file mode 100644 index 0000000000..015fa6f201 --- /dev/null +++ b/configs/mobilenet_v2/mobilenet-v2-d8_fcn_4xb4-160k_ade20k-512x512.py @@ -0,0 +1,13 @@ +_base_ = '../fcn/fcn_r101-d8_4xb4-160k_ade20k-512x512.py' +model = dict( + pretrained='mmcls://mobilenet_v2', + backbone=dict( + _delete_=True, + type='MobileNetV2', + widen_factor=1., + strides=(1, 2, 2, 1, 1, 1, 1), + dilations=(1, 1, 1, 2, 2, 4, 4), + out_indices=(1, 2, 4, 6), + norm_cfg=dict(type='SyncBN', requires_grad=True)), + decode_head=dict(in_channels=320), + auxiliary_head=dict(in_channels=96)) diff --git a/configs/mobilenet_v2/mobilenet-v2-d8_pspnet_4xb2-80k_cityscapes-512x1024.py b/configs/mobilenet_v2/mobilenet-v2-d8_pspnet_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..8542e02669 --- /dev/null +++ b/configs/mobilenet_v2/mobilenet-v2-d8_pspnet_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,13 @@ +_base_ = '../pspnet/pspnet_r101-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict( + pretrained='mmcls://mobilenet_v2', + backbone=dict( + _delete_=True, + type='MobileNetV2', + widen_factor=1., + strides=(1, 2, 2, 1, 1, 1, 1), + dilations=(1, 1, 1, 2, 2, 4, 4), + out_indices=(1, 2, 4, 6), + norm_cfg=dict(type='SyncBN', requires_grad=True)), + decode_head=dict(in_channels=320), + auxiliary_head=dict(in_channels=96)) diff --git a/configs/mobilenet_v2/mobilenet-v2-d8_pspnet_4xb4-160k_ade20k-512x512.py b/configs/mobilenet_v2/mobilenet-v2-d8_pspnet_4xb4-160k_ade20k-512x512.py new file mode 100644 index 0000000000..73db59beae --- /dev/null +++ b/configs/mobilenet_v2/mobilenet-v2-d8_pspnet_4xb4-160k_ade20k-512x512.py @@ -0,0 +1,13 @@ +_base_ = 
'../pspnet/pspnet_r101-d8_4xb4-160k_ade20k-512x512.py' +model = dict( + pretrained='mmcls://mobilenet_v2', + backbone=dict( + _delete_=True, + type='MobileNetV2', + widen_factor=1., + strides=(1, 2, 2, 1, 1, 1, 1), + dilations=(1, 1, 1, 2, 2, 4, 4), + out_indices=(1, 2, 4, 6), + norm_cfg=dict(type='SyncBN', requires_grad=True)), + decode_head=dict(in_channels=320), + auxiliary_head=dict(in_channels=96)) diff --git a/configs/mobilenet_v2/mobilenet_v2.yml b/configs/mobilenet_v2/mobilenet_v2.yml deleted file mode 100644 index 5527ba82ba..0000000000 --- a/configs/mobilenet_v2/mobilenet_v2.yml +++ /dev/null @@ -1,169 +0,0 @@ -Models: -- Name: fcn_m-v2-d8_512x1024_80k_cityscapes - In Collection: FCN - Metadata: - backbone: M-V2-D8 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 70.42 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 3.4 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 61.54 - Config: configs/mobilenet_v2/fcn_m-v2-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/fcn_m-v2-d8_512x1024_80k_cityscapes/fcn_m-v2-d8_512x1024_80k_cityscapes_20200825_124817-d24c28c1.pth -- Name: pspnet_m-v2-d8_512x1024_80k_cityscapes - In Collection: PSPNet - Metadata: - backbone: M-V2-D8 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 89.29 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 3.6 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 70.23 - Config: configs/mobilenet_v2/pspnet_m-v2-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/pspnet_m-v2-d8_512x1024_80k_cityscapes/pspnet_m-v2-d8_512x1024_80k_cityscapes_20200825_124817-19e81d51.pth -- Name: deeplabv3_m-v2-d8_512x1024_80k_cityscapes - In Collection: DeepLabV3 - Metadata: - backbone: M-V2-D8 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 119.05 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 3.9 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 73.84 - Config: configs/mobilenet_v2/deeplabv3_m-v2-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/deeplabv3_m-v2-d8_512x1024_80k_cityscapes/deeplabv3_m-v2-d8_512x1024_80k_cityscapes_20200825_124836-bef03590.pth -- Name: deeplabv3plus_m-v2-d8_512x1024_80k_cityscapes - In Collection: DeepLabV3+ - Metadata: - backbone: M-V2-D8 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 119.05 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 5.1 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 75.2 - Config: configs/mobilenet_v2/deeplabv3plus_m-v2-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/deeplabv3plus_m-v2-d8_512x1024_80k_cityscapes/deeplabv3plus_m-v2-d8_512x1024_80k_cityscapes_20200825_124836-d256dd4b.pth -- Name: fcn_m-v2-d8_512x512_160k_ade20k - In Collection: FCN - Metadata: - backbone: M-V2-D8 - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 15.53 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training 
Memory (GB): 6.5 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 19.71 - Config: configs/mobilenet_v2/fcn_m-v2-d8_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/fcn_m-v2-d8_512x512_160k_ade20k/fcn_m-v2-d8_512x512_160k_ade20k_20200825_214953-c40e1095.pth -- Name: pspnet_m-v2-d8_512x512_160k_ade20k - In Collection: PSPNet - Metadata: - backbone: M-V2-D8 - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 17.33 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 6.5 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 29.68 - Config: configs/mobilenet_v2/pspnet_m-v2-d8_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/pspnet_m-v2-d8_512x512_160k_ade20k/pspnet_m-v2-d8_512x512_160k_ade20k_20200825_214953-f5942f7a.pth -- Name: deeplabv3_m-v2-d8_512x512_160k_ade20k - In Collection: DeepLabV3 - Metadata: - backbone: M-V2-D8 - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 25.06 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 6.8 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 34.08 - Config: configs/mobilenet_v2/deeplabv3_m-v2-d8_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/deeplabv3_m-v2-d8_512x512_160k_ade20k/deeplabv3_m-v2-d8_512x512_160k_ade20k_20200825_223255-63986343.pth -- Name: deeplabv3plus_m-v2-d8_512x512_160k_ade20k - In Collection: DeepLabV3+ - Metadata: - backbone: M-V2-D8 - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 23.2 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 8.2 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 34.02 - Config: configs/mobilenet_v2/deeplabv3plus_m-v2-d8_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v2/deeplabv3plus_m-v2-d8_512x512_160k_ade20k/deeplabv3plus_m-v2-d8_512x512_160k_ade20k_20200825_223255-465a01d4.pth diff --git a/configs/mobilenet_v2/pspnet_m-v2-d8_512x1024_80k_cityscapes.py b/configs/mobilenet_v2/pspnet_m-v2-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index 7403bee864..0000000000 --- a/configs/mobilenet_v2/pspnet_m-v2-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,12 +0,0 @@ -_base_ = '../pspnet/pspnet_r101-d8_512x1024_80k_cityscapes.py' -model = dict( - pretrained='mmcls://mobilenet_v2', - backbone=dict( - _delete_=True, - type='MobileNetV2', - widen_factor=1., - strides=(1, 2, 2, 1, 1, 1, 1), - dilations=(1, 1, 1, 2, 2, 4, 4), - out_indices=(1, 2, 4, 6)), - decode_head=dict(in_channels=320), - auxiliary_head=dict(in_channels=96)) diff --git a/configs/mobilenet_v2/pspnet_m-v2-d8_512x512_160k_ade20k.py b/configs/mobilenet_v2/pspnet_m-v2-d8_512x512_160k_ade20k.py deleted file mode 100644 index 5b72ac830b..0000000000 --- a/configs/mobilenet_v2/pspnet_m-v2-d8_512x512_160k_ade20k.py +++ /dev/null @@ -1,12 +0,0 @@ -_base_ = '../pspnet/pspnet_r101-d8_512x512_160k_ade20k.py' -model = dict( - pretrained='mmcls://mobilenet_v2', - backbone=dict( - _delete_=True, - type='MobileNetV2', - widen_factor=1., - strides=(1, 2, 2, 1, 1, 1, 1), - dilations=(1, 1, 1, 2, 2, 4, 4), - out_indices=(1, 2, 4, 6)), - decode_head=dict(in_channels=320), - 
auxiliary_head=dict(in_channels=96)) diff --git a/configs/mobilenet_v3/README.md b/configs/mobilenet_v3/README.md index 66f20688b9..8ed0a5692a 100644 --- a/configs/mobilenet_v3/README.md +++ b/configs/mobilenet_v3/README.md @@ -1,6 +1,6 @@ # MobileNetV3 -[Searching for MobileNetV3](https://arxiv.org/abs/1905.02244) +> [Searching for MobileNetV3](https://arxiv.org/abs/1905.02244) ## Introduction @@ -24,6 +24,17 @@ We present the next generation of MobileNets based on a combination of complemen +## Results and models + +### Cityscapes + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | ------------------ | --------- | ------: | -------: | -------------- | ------ | ----: | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| LRASPP | M-V3-D8 | 512x1024 | 320000 | 8.9 | 15.22 | V100 | 69.54 | 70.89 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/mobilenet_v3/mobilenet-v3-d8_lraspp_4xb4-320k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v3/lraspp_m-v3-d8_512x1024_320k_cityscapes/lraspp_m-v3-d8_512x1024_320k_cityscapes_20201224_220337-cfe8fb07.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v3/lraspp_m-v3-d8_512x1024_320k_cityscapes/lraspp_m-v3-d8_512x1024_320k_cityscapes-20201224_220337.log.json) | +| LRASPP | M-V3-D8 (scratch) | 512x1024 | 320000 | 8.9 | 14.77 | V100 | 67.87 | 69.78 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/mobilenet_v3/mobilenet-v3-d8-scratch_lraspp_4xb4-320k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v3/lraspp_m-v3-d8_scratch_512x1024_320k_cityscapes/lraspp_m-v3-d8_scratch_512x1024_320k_cityscapes_20201224_220337-9f29cd72.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v3/lraspp_m-v3-d8_scratch_512x1024_320k_cityscapes/lraspp_m-v3-d8_scratch_512x1024_320k_cityscapes-20201224_220337.log.json) | +| LRASPP | M-V3s-D8 | 512x1024 | 320000 | 5.3 | 23.64 | V100 | 64.11 | 66.42 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/mobilenet_v3/mobilenet-v3-d8-s_lraspp_4xb4-320k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v3/lraspp_m-v3s-d8_512x1024_320k_cityscapes/lraspp_m-v3s-d8_512x1024_320k_cityscapes_20201224_223935-61565b34.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v3/lraspp_m-v3s-d8_512x1024_320k_cityscapes/lraspp_m-v3s-d8_512x1024_320k_cityscapes-20201224_223935.log.json) | +| LRASPP | M-V3s-D8 (scratch) | 512x1024 | 320000 | 5.3 | 24.50 | V100 | 62.74 | 65.01 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/mobilenet_v3/mobilenet-v3-d8-scratch-s_lraspp_4xb4-320k_cityscapes-512x1024.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v3/lraspp_m-v3s-d8_scratch_512x1024_320k_cityscapes/lraspp_m-v3s-d8_scratch_512x1024_320k_cityscapes_20201224_223935-03daeabb.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v3/lraspp_m-v3s-d8_scratch_512x1024_320k_cityscapes/lraspp_m-v3s-d8_scratch_512x1024_320k_cityscapes-20201224_223935.log.json) | + ## Citation ```bibtex @@ -37,14 +48,3 @@ We present the next generation of MobileNets based on a combination of complemen doi={10.1109/ICCV.2019.00140}} } ``` - -## Results and models - -### Cityscapes - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | ------------------ | --------- | ------: | -------: | -------------- | ----: | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| LRASPP | M-V3-D8 | 512x1024 | 320000 | 8.9 | 15.22 | 69.54 | 70.89 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/mobilenet_v3/lraspp_m-v3-d8_512x1024_320k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v3/lraspp_m-v3-d8_512x1024_320k_cityscapes/lraspp_m-v3-d8_512x1024_320k_cityscapes_20201224_220337-cfe8fb07.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v3/lraspp_m-v3-d8_512x1024_320k_cityscapes/lraspp_m-v3-d8_512x1024_320k_cityscapes-20201224_220337.log.json) | -| LRASPP | M-V3-D8 (scratch) | 512x1024 | 320000 | 8.9 | 14.77 | 67.87 | 69.78 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/mobilenet_v3/lraspp_m-v3-d8_scratch_512x1024_320k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v3/lraspp_m-v3-d8_scratch_512x1024_320k_cityscapes/lraspp_m-v3-d8_scratch_512x1024_320k_cityscapes_20201224_220337-9f29cd72.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v3/lraspp_m-v3-d8_scratch_512x1024_320k_cityscapes/lraspp_m-v3-d8_scratch_512x1024_320k_cityscapes-20201224_220337.log.json) | -| LRASPP | M-V3s-D8 | 512x1024 | 320000 | 5.3 | 23.64 | 64.11 | 66.42 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/mobilenet_v3/lraspp_m-v3s-d8_512x1024_320k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v3/lraspp_m-v3s-d8_512x1024_320k_cityscapes/lraspp_m-v3s-d8_512x1024_320k_cityscapes_20201224_223935-61565b34.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v3/lraspp_m-v3s-d8_512x1024_320k_cityscapes/lraspp_m-v3s-d8_512x1024_320k_cityscapes-20201224_223935.log.json) | -| LRASPP | M-V3s-D8 (scratch) | 512x1024 | 320000 | 5.3 | 24.50 | 62.74 | 65.01 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/mobilenet_v3/lraspp_m-v3s-d8_scratch_512x1024_320k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v3/lraspp_m-v3s-d8_scratch_512x1024_320k_cityscapes/lraspp_m-v3s-d8_scratch_512x1024_320k_cityscapes_20201224_223935-03daeabb.pth) 
diff --git a/configs/mobilenet_v3/lraspp_m-v3s-d8_512x1024_320k_cityscapes.py b/configs/mobilenet_v3/lraspp_m-v3s-d8_512x1024_320k_cityscapes.py
deleted file mode 100644
index d4e368b2a1..0000000000
--- a/configs/mobilenet_v3/lraspp_m-v3s-d8_512x1024_320k_cityscapes.py
+++ /dev/null
@@ -1,23 +0,0 @@
-_base_ = './lraspp_m-v3-d8_512x1024_320k_cityscapes.py'
-norm_cfg = dict(type='SyncBN', eps=0.001, requires_grad=True)
-model = dict(
-    type='EncoderDecoder',
-    pretrained='open-mmlab://contrib/mobilenet_v3_small',
-    backbone=dict(
-        type='MobileNetV3',
-        arch='small',
-        out_indices=(0, 1, 12),
-        norm_cfg=norm_cfg),
-    decode_head=dict(
-        type='LRASPPHead',
-        in_channels=(16, 16, 576),
-        in_index=(0, 1, 2),
-        channels=128,
-        input_transform='multiple_select',
-        dropout_ratio=0.1,
-        num_classes=19,
-        norm_cfg=norm_cfg,
-        act_cfg=dict(type='ReLU'),
-        align_corners=False,
-        loss_decode=dict(
-            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)))
diff --git a/configs/mobilenet_v3/lraspp_m-v3s-d8_scratch_512x1024_320k_cityscapes.py b/configs/mobilenet_v3/lraspp_m-v3s-d8_scratch_512x1024_320k_cityscapes.py
deleted file mode 100644
index 0c5f707200..0000000000
--- a/configs/mobilenet_v3/lraspp_m-v3s-d8_scratch_512x1024_320k_cityscapes.py
+++ /dev/null
@@ -1,22 +0,0 @@
-_base_ = './lraspp_m-v3-d8_scratch_512x1024_320k_cityscapes.py'
-norm_cfg = dict(type='SyncBN', eps=0.001, requires_grad=True)
-model = dict(
-    type='EncoderDecoder',
-    backbone=dict(
-        type='MobileNetV3',
-        arch='small',
-        out_indices=(0, 1, 12),
-        norm_cfg=norm_cfg),
-    decode_head=dict(
-        type='LRASPPHead',
-        in_channels=(16, 16, 576),
-        in_index=(0, 1, 2),
-        channels=128,
-        input_transform='multiple_select',
-        dropout_ratio=0.1,
-        num_classes=19,
-        norm_cfg=norm_cfg,
-        act_cfg=dict(type='ReLU'),
-        align_corners=False,
-        loss_decode=dict(
-            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)))
diff --git a/configs/mobilenet_v3/metafile.yaml b/configs/mobilenet_v3/metafile.yaml
new file mode 100644
index 0000000000..0351d3b8e4
--- /dev/null
+++ b/configs/mobilenet_v3/metafile.yaml
@@ -0,0 +1,109 @@
+Collections:
+- Name: LRASPP
+  License: Apache License 2.0
+  Metadata:
+    Training Data:
+    - Cityscapes
+  Paper:
+    Title: Searching for MobileNetV3
+    URL: https://arxiv.org/abs/1905.02244
+  README: configs/mobilenet_v3/README.md
+  Frameworks:
+  - PyTorch
+Models:
+- Name: mobilenet-v3-d8_lraspp_4xb4-320k_cityscapes-512x1024
+  In Collection: LRASPP
+  Results:
+    Task: Semantic Segmentation
+    Dataset: Cityscapes
+    Metrics:
+      mIoU: 69.54
+      mIoU(ms+flip): 70.89
+  Config: configs/mobilenet_v3/mobilenet-v3-d8_lraspp_4xb4-320k_cityscapes-512x1024.py
+  Metadata:
+    Training Data: Cityscapes
+    Batch Size: 16
+    Architecture:
+    - M-V3-D8
+    - LRASPP
+    Training Resources: 4x V100 GPUS
+    Memory (GB): 8.9
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v3/lraspp_m-v3-d8_512x1024_320k_cityscapes/lraspp_m-v3-d8_512x1024_320k_cityscapes_20201224_220337-cfe8fb07.pth
+  Training log: https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v3/lraspp_m-v3-d8_512x1024_320k_cityscapes/lraspp_m-v3-d8_512x1024_320k_cityscapes-20201224_220337.log.json
+  Paper:
+    Title: Searching for MobileNetV3
+    URL: https://arxiv.org/abs/1905.02244
+  Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/mobilenet_v3.py#L15
+  Framework: PyTorch
+- Name: mobilenet-v3-d8-scratch_lraspp_4xb4-320k_cityscapes-512x1024
+  In Collection: LRASPP
+  Results:
+    Task: Semantic Segmentation
+    Dataset: Cityscapes
+    Metrics:
+      mIoU: 67.87
+      mIoU(ms+flip): 69.78
+  Config: configs/mobilenet_v3/mobilenet-v3-d8-scratch_lraspp_4xb4-320k_cityscapes-512x1024.py
+  Metadata:
+    Training Data: Cityscapes
+    Batch Size: 16
+    Architecture:
+    - M-V3-D8
+    - LRASPP
+    Training Resources: 4x V100 GPUS
+    Memory (GB): 8.9
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v3/lraspp_m-v3-d8_scratch_512x1024_320k_cityscapes/lraspp_m-v3-d8_scratch_512x1024_320k_cityscapes_20201224_220337-9f29cd72.pth
+  Training log: https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v3/lraspp_m-v3-d8_scratch_512x1024_320k_cityscapes/lraspp_m-v3-d8_scratch_512x1024_320k_cityscapes-20201224_220337.log.json
+  Paper:
+    Title: Searching for MobileNetV3
+    URL: https://arxiv.org/abs/1905.02244
+  Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/mobilenet_v3.py#L15
+  Framework: PyTorch
+- Name: mobilenet-v3-d8-s_lraspp_4xb4-320k_cityscapes-512x1024
+  In Collection: LRASPP
+  Results:
+    Task: Semantic Segmentation
+    Dataset: Cityscapes
+    Metrics:
+      mIoU: 64.11
+      mIoU(ms+flip): 66.42
+  Config: configs/mobilenet_v3/mobilenet-v3-d8-s_lraspp_4xb4-320k_cityscapes-512x1024.py
+  Metadata:
+    Training Data: Cityscapes
+    Batch Size: 16
+    Architecture:
+    - M-V3s-D8
+    - LRASPP
+    Training Resources: 4x V100 GPUS
+    Memory (GB): 5.3
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v3/lraspp_m-v3s-d8_512x1024_320k_cityscapes/lraspp_m-v3s-d8_512x1024_320k_cityscapes_20201224_223935-61565b34.pth
+  Training log: https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v3/lraspp_m-v3s-d8_512x1024_320k_cityscapes/lraspp_m-v3s-d8_512x1024_320k_cityscapes-20201224_223935.log.json
+  Paper:
+    Title: Searching for MobileNetV3
+    URL: https://arxiv.org/abs/1905.02244
+  Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/mobilenet_v3.py#L15
+  Framework: PyTorch
+- Name: mobilenet-v3-d8-scratch-s_lraspp_4xb4-320k_cityscapes-512x1024
+  In Collection: LRASPP
+  Results:
+    Task: Semantic Segmentation
+    Dataset: Cityscapes
+    Metrics:
+      mIoU: 62.74
+      mIoU(ms+flip): 65.01
+  Config: configs/mobilenet_v3/mobilenet-v3-d8-scratch-s_lraspp_4xb4-320k_cityscapes-512x1024.py
+  Metadata:
+    Training Data: Cityscapes
+    Batch Size: 16
+    Architecture:
+    - M-V3s-D8
+    - LRASPP
+    Training Resources: 4x V100 GPUS
+    Memory (GB): 5.3
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v3/lraspp_m-v3s-d8_scratch_512x1024_320k_cityscapes/lraspp_m-v3s-d8_scratch_512x1024_320k_cityscapes_20201224_223935-03daeabb.pth
+  Training log: https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v3/lraspp_m-v3s-d8_scratch_512x1024_320k_cityscapes/lraspp_m-v3s-d8_scratch_512x1024_320k_cityscapes-20201224_223935.log.json
+  Paper:
+    Title: Searching for MobileNetV3
+    URL: https://arxiv.org/abs/1905.02244
+  Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/mobilenet_v3.py#L15
+  Framework: PyTorch
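The metafile above follows the flat `Collections`/`Models` schema this refactor introduces, so model metadata becomes machine-readable. A minimal sketch of consuming it — assuming only PyYAML and the schema shown in this diff; the lookup script itself is illustrative, not part of the patch:

```python
import yaml

# Load the metafile added above and index its model entries by name.
with open('configs/mobilenet_v3/metafile.yaml') as f:
    meta = yaml.safe_load(f)
models = {m['Name']: m for m in meta['Models']}

# Resolve one entry to its config path, checkpoint URL, and reported mIoU.
entry = models['mobilenet-v3-d8_lraspp_4xb4-320k_cityscapes-512x1024']
print(entry['Config'])                      # configs/mobilenet_v3/...py
print(entry['Weights'])                     # https://download.openmmlab.com/...pth
print(entry['Results']['Metrics']['mIoU'])  # 69.54
```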
diff --git a/configs/mobilenet_v3/mobilenet-v3-d8-s_lraspp_4xb4-320k_cityscapes-512x1024.py b/configs/mobilenet_v3/mobilenet-v3-d8-s_lraspp_4xb4-320k_cityscapes-512x1024.py
new file mode 100644
index 0000000000..bc6322fe40
--- /dev/null
+++ b/configs/mobilenet_v3/mobilenet-v3-d8-s_lraspp_4xb4-320k_cityscapes-512x1024.py
@@ -0,0 +1,23 @@
+_base_ = './mobilenet-v3-d8_lraspp_4xb4-320k_cityscapes-512x1024.py'
+norm_cfg = dict(type='SyncBN', eps=0.001, requires_grad=True)
+model = dict(
+    type='EncoderDecoder',
+    pretrained='open-mmlab://contrib/mobilenet_v3_small',
+    backbone=dict(
+        type='MobileNetV3',
+        arch='small',
+        out_indices=(0, 1, 12),
+        norm_cfg=norm_cfg),
+    decode_head=dict(
+        type='LRASPPHead',
+        in_channels=(16, 16, 576),
+        in_index=(0, 1, 2),
+        channels=128,
+        input_transform='multiple_select',
+        dropout_ratio=0.1,
+        num_classes=19,
+        norm_cfg=norm_cfg,
+        act_cfg=dict(type='ReLU'),
+        align_corners=False,
+        loss_decode=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)))
diff --git a/configs/mobilenet_v3/mobilenet-v3-d8-scratch-s_lraspp_4xb4-320k_cityscapes-512x1024.py b/configs/mobilenet_v3/mobilenet-v3-d8-scratch-s_lraspp_4xb4-320k_cityscapes-512x1024.py
new file mode 100644
index 0000000000..7260936e60
--- /dev/null
+++ b/configs/mobilenet_v3/mobilenet-v3-d8-scratch-s_lraspp_4xb4-320k_cityscapes-512x1024.py
@@ -0,0 +1,22 @@
+_base_ = './mobilenet-v3-d8-scratch_lraspp_4xb4-320k_cityscapes-512x1024.py'
+norm_cfg = dict(type='SyncBN', eps=0.001, requires_grad=True)
+model = dict(
+    type='EncoderDecoder',
+    backbone=dict(
+        type='MobileNetV3',
+        arch='small',
+        out_indices=(0, 1, 12),
+        norm_cfg=norm_cfg),
+    decode_head=dict(
+        type='LRASPPHead',
+        in_channels=(16, 16, 576),
+        in_index=(0, 1, 2),
+        channels=128,
+        input_transform='multiple_select',
+        dropout_ratio=0.1,
+        num_classes=19,
+        norm_cfg=norm_cfg,
+        act_cfg=dict(type='ReLU'),
+        align_corners=False,
+        loss_decode=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)))
diff --git a/configs/mobilenet_v3/lraspp_m-v3-d8_scratch_512x1024_320k_cityscapes.py b/configs/mobilenet_v3/mobilenet-v3-d8-scratch_lraspp_4xb4-320k_cityscapes-512x1024.py
similarity index 100%
rename from configs/mobilenet_v3/lraspp_m-v3-d8_scratch_512x1024_320k_cityscapes.py
rename to configs/mobilenet_v3/mobilenet-v3-d8-scratch_lraspp_4xb4-320k_cityscapes-512x1024.py
diff --git a/configs/mobilenet_v3/lraspp_m-v3-d8_512x1024_320k_cityscapes.py b/configs/mobilenet_v3/mobilenet-v3-d8_lraspp_4xb4-320k_cityscapes-512x1024.py
similarity index 100%
rename from configs/mobilenet_v3/lraspp_m-v3-d8_512x1024_320k_cityscapes.py
rename to configs/mobilenet_v3/mobilenet-v3-d8_lraspp_4xb4-320k_cityscapes-512x1024.py
diff --git a/configs/mobilenet_v3/mobilenet_v3.yml b/configs/mobilenet_v3/mobilenet_v3.yml
deleted file mode 100644
index 003cbe530c..0000000000
--- a/configs/mobilenet_v3/mobilenet_v3.yml
+++ /dev/null
@@ -1,103 +0,0 @@
-Collections:
-- Name: LRASPP
-  Metadata:
-    Training Data:
-    - Cityscapes
-  Paper:
-    URL: https://arxiv.org/abs/1905.02244
-    Title: Searching for MobileNetV3
-  README: configs/mobilenet_v3/README.md
-  Code:
-    URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/mobilenet_v3.py#L15
-    Version: v0.17.0
-  Converted From:
-    Code: https://github.com/tensorflow/models/tree/master/research/deeplab
-Models:
-- Name: lraspp_m-v3-d8_512x1024_320k_cityscapes
-  In Collection: LRASPP
-  Metadata:
-    backbone: M-V3-D8
-    crop size: (512,1024)
-    lr schd: 320000
-    inference time (ms/im):
-    - value: 65.7
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (512,1024)
-    Training Memory (GB): 8.9
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 69.54
-      mIoU(ms+flip): 70.89
-  Config: configs/mobilenet_v3/lraspp_m-v3-d8_512x1024_320k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v3/lraspp_m-v3-d8_512x1024_320k_cityscapes/lraspp_m-v3-d8_512x1024_320k_cityscapes_20201224_220337-cfe8fb07.pth
-- Name: lraspp_m-v3-d8_scratch_512x1024_320k_cityscapes
-  In Collection: LRASPP
-  Metadata:
-    backbone: M-V3-D8 (scratch)
-    crop size: (512,1024)
-    lr schd: 320000
-    inference time (ms/im):
-    - value: 67.7
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (512,1024)
-    Training Memory (GB): 8.9
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 67.87
-      mIoU(ms+flip): 69.78
-  Config: configs/mobilenet_v3/lraspp_m-v3-d8_scratch_512x1024_320k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v3/lraspp_m-v3-d8_scratch_512x1024_320k_cityscapes/lraspp_m-v3-d8_scratch_512x1024_320k_cityscapes_20201224_220337-9f29cd72.pth
-- Name: lraspp_m-v3s-d8_512x1024_320k_cityscapes
-  In Collection: LRASPP
-  Metadata:
-    backbone: M-V3s-D8
-    crop size: (512,1024)
-    lr schd: 320000
-    inference time (ms/im):
-    - value: 42.3
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (512,1024)
-    Training Memory (GB): 5.3
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 64.11
-      mIoU(ms+flip): 66.42
-  Config: configs/mobilenet_v3/lraspp_m-v3s-d8_512x1024_320k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v3/lraspp_m-v3s-d8_512x1024_320k_cityscapes/lraspp_m-v3s-d8_512x1024_320k_cityscapes_20201224_223935-61565b34.pth
-- Name: lraspp_m-v3s-d8_scratch_512x1024_320k_cityscapes
-  In Collection: LRASPP
-  Metadata:
-    backbone: M-V3s-D8 (scratch)
-    crop size: (512,1024)
-    lr schd: 320000
-    inference time (ms/im):
-    - value: 40.82
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (512,1024)
-    Training Memory (GB): 5.3
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: Cityscapes
-    Metrics:
-      mIoU: 62.74
-      mIoU(ms+flip): 65.01
-  Config: configs/mobilenet_v3/lraspp_m-v3s-d8_scratch_512x1024_320k_cityscapes.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mobilenet_v3/lraspp_m-v3s-d8_scratch_512x1024_320k_cityscapes/lraspp_m-v3s-d8_scratch_512x1024_320k_cityscapes_20201224_223935-03daeabb.pth
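A note on units when comparing the deleted `.yml` above with the new README tables: the old metadata records latency as `inference time (ms/im)`, while the tables report `Inf time (fps)`; the two are simply reciprocals. A quick sanity check in Python against the values in this diff:

```python
# The old yml stores ms/im; the README tables store fps. Verify the
# correspondence for the four LRASPP entries above.
def ms_per_im_to_fps(ms: float) -> float:
    return 1000.0 / ms

print(round(ms_per_im_to_fps(65.7), 2))    # 15.22 (M-V3-D8)
print(round(ms_per_im_to_fps(67.7), 2))    # 14.77 (M-V3-D8 scratch)
print(round(ms_per_im_to_fps(42.3), 2))    # 23.64 (M-V3s-D8)
print(round(ms_per_im_to_fps(40.82), 2))   # 24.5  (M-V3s-D8 scratch)
```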
diff --git a/configs/nonlocal_net/README.md b/configs/nonlocal_net/README.md
index 1109599332..4c3f49f981
--- a/configs/nonlocal_net/README.md
+++ b/configs/nonlocal_net/README.md
@@ -1,6 +1,6 @@
 # NonLocal Net
-[Non-local Neural Networks](https://arxiv.org/abs/1711.07971)
+> [Non-local Neural Networks](https://arxiv.org/abs/1711.07971)
 ## Introduction
@@ -22,6 +22,39 @@ Both convolutional and recurrent operations are building blocks that process one
+## Results and models
+
+### Cityscapes
+
+| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download |
+| ----------- | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------- | ------ | -------- |
+| NonLocalNet | R-50-D8 | 512x1024 | 40000 | 7.4 | 2.72 | V100 | 78.24 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/nonlocal_net/nonlocal_r50-d8_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x1024_40k_cityscapes/nonlocal_r50-d8_512x1024_40k_cityscapes_20200605_210748-c75e81e3.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x1024_40k_cityscapes/nonlocal_r50-d8_512x1024_40k_cityscapes_20200605_210748.log.json) |
+| NonLocalNet | R-101-D8 | 512x1024 | 40000 | 10.9 | 1.95 | V100 | 78.66 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/nonlocal_net/nonlocal_r101-d8_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x1024_40k_cityscapes/nonlocal_r101-d8_512x1024_40k_cityscapes_20200605_210748-d63729fa.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x1024_40k_cityscapes/nonlocal_r101-d8_512x1024_40k_cityscapes_20200605_210748.log.json) |
+| NonLocalNet | R-50-D8 | 769x769 | 40000 | 8.9 | 1.52 | V100 | 78.33 | 79.92 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/nonlocal_net/nonlocal_r50-d8_4xb2-40k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_769x769_40k_cityscapes/nonlocal_r50-d8_769x769_40k_cityscapes_20200530_045243-82ef6749.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_769x769_40k_cityscapes/nonlocal_r50-d8_769x769_40k_cityscapes_20200530_045243.log.json) |
+| NonLocalNet | R-101-D8 | 769x769 | 40000 | 12.8 | 1.05 | V100 | 78.57 | 80.29 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/nonlocal_net/nonlocal_r101-d8_4xb2-40k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_769x769_40k_cityscapes/nonlocal_r101-d8_769x769_40k_cityscapes_20200530_045348-8fe9a9dc.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_769x769_40k_cityscapes/nonlocal_r101-d8_769x769_40k_cityscapes_20200530_045348.log.json) |
+| NonLocalNet | R-50-D8 | 512x1024 | 80000 | - | - | V100 | 78.01 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/nonlocal_net/nonlocal_r50-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x1024_80k_cityscapes/nonlocal_r50-d8_512x1024_80k_cityscapes_20200607_193518-d6839fae.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x1024_80k_cityscapes/nonlocal_r50-d8_512x1024_80k_cityscapes_20200607_193518.log.json) |
+| NonLocalNet | R-101-D8 | 512x1024 | 80000 | - | - | V100 | 78.93 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/nonlocal_net/nonlocal_r101-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x1024_80k_cityscapes/nonlocal_r101-d8_512x1024_80k_cityscapes_20200607_183411-32700183.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x1024_80k_cityscapes/nonlocal_r101-d8_512x1024_80k_cityscapes_20200607_183411.log.json) |
+| NonLocalNet | R-50-D8 | 769x769 | 80000 | - | - | V100 | 79.05 | 80.68 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/nonlocal_net/nonlocal_r50-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_769x769_80k_cityscapes/nonlocal_r50-d8_769x769_80k_cityscapes_20200607_193506-1f9792f6.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_769x769_80k_cityscapes/nonlocal_r50-d8_769x769_80k_cityscapes_20200607_193506.log.json) |
+| NonLocalNet | R-101-D8 | 769x769 | 80000 | - | - | V100 | 79.40 | 80.85 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/nonlocal_net/nonlocal_r101-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_769x769_80k_cityscapes/nonlocal_r101-d8_769x769_80k_cityscapes_20200607_183428-0e1fa4f9.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_769x769_80k_cityscapes/nonlocal_r101-d8_769x769_80k_cityscapes_20200607_183428.log.json) |
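The `mIoU(ms+flip)` columns in these tables report accuracy under multi-scale and horizontal-flip test-time augmentation. As an illustrative plain-PyTorch sketch of the flip half (a hypothetical helper, not the mmseg evaluator): run the model on the image and its mirror, un-flip the second set of logits, and average.

```python
import torch

def flip_tta_logits(model, img: torch.Tensor) -> torch.Tensor:
    """Average logits over an image and its horizontal mirror.

    `model` is assumed to map an (N, C, H, W) batch to per-pixel class
    logits of the same spatial shape; multi-scale testing would repeat
    this over several resized copies of `img`.
    """
    logits = model(img)
    logits_flipped = model(torch.flip(img, dims=[3]))  # mirror width axis
    return (logits + torch.flip(logits_flipped, dims=[3])) / 2
```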
+
+### ADE20K
+
+| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download |
+| ----------- | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ------ | -------- |
+| NonLocalNet | R-50-D8 | 512x512 | 80000 | 9.1 | 21.37 | V100 | 40.75 | 42.05 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/nonlocal_net/nonlocal_r50-d8_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x512_80k_ade20k/nonlocal_r50-d8_512x512_80k_ade20k_20200615_015801-5ae0aa33.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x512_80k_ade20k/nonlocal_r50-d8_512x512_80k_ade20k_20200615_015801.log.json) |
+| NonLocalNet | R-101-D8 | 512x512 | 80000 | 12.6 | 13.97 | V100 | 42.90 | 44.27 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/nonlocal_net/nonlocal_r101-d8_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x512_80k_ade20k/nonlocal_r101-d8_512x512_80k_ade20k_20200615_015758-24105919.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x512_80k_ade20k/nonlocal_r101-d8_512x512_80k_ade20k_20200615_015758.log.json) |
+| NonLocalNet | R-50-D8 | 512x512 | 160000 | - | - | V100 | 42.03 | 43.04 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/nonlocal_net/nonlocal_r50-d8_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x512_160k_ade20k/nonlocal_r50-d8_512x512_160k_ade20k_20200616_005410-baef45e3.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x512_160k_ade20k/nonlocal_r50-d8_512x512_160k_ade20k_20200616_005410.log.json) |
+| NonLocalNet | R-101-D8 | 512x512 | 160000 | - | - | V100 | 44.63 | 45.79 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/nonlocal_net/nonlocal_r101-d8_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x512_160k_ade20k/nonlocal_r101-d8_512x512_160k_ade20k_20210827_221502-7881aa1a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x512_160k_ade20k/nonlocal_r101-d8_512x512_160k_ade20k_20210827_221502.log.json) |
+
+### Pascal VOC 2012 + Aug
+
+| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download |
+| ----------- | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ------ | -------- |
+| NonLocalNet | R-50-D8 | 512x512 | 20000 | 6.4 | 21.21 | V100 | 76.20 | 77.12 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/nonlocal_net/nonlocal_r50-d8_4xb4-20k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x512_20k_voc12aug/nonlocal_r50-d8_512x512_20k_voc12aug_20200617_222613-07f2a57c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x512_20k_voc12aug/nonlocal_r50-d8_512x512_20k_voc12aug_20200617_222613.log.json) |
+| NonLocalNet | R-101-D8 | 512x512 | 20000 | 9.8 | 14.01 | V100 | 78.15 | 78.86 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/nonlocal_net/nonlocal_r101-d8_4xb4-20k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x512_20k_voc12aug/nonlocal_r101-d8_512x512_20k_voc12aug_20200617_222615-948c68ab.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x512_20k_voc12aug/nonlocal_r101-d8_512x512_20k_voc12aug_20200617_222615.log.json) |
+| NonLocalNet | R-50-D8 | 512x512 | 40000 | - | - | V100 | 76.65 | 77.47 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/nonlocal_net/nonlocal_r50-d8_4xb4-40k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x512_40k_voc12aug/nonlocal_r50-d8_512x512_40k_voc12aug_20200614_000028-0139d4a9.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x512_40k_voc12aug/nonlocal_r50-d8_512x512_40k_voc12aug_20200614_000028.log.json) |
+| NonLocalNet | R-101-D8 | 512x512 | 40000 | - | - | V100
| 78.27 | 79.12 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/nonlocal_net/nonlocal_r101-d8_4xb4-40k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x512_40k_voc12aug/nonlocal_r101-d8_512x512_40k_voc12aug_20200614_000028-7e5ff470.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x512_40k_voc12aug/nonlocal_r101-d8_512x512_40k_voc12aug_20200614_000028.log.json) | + ## Citation ```bibtex @@ -33,36 +66,3 @@ Both convolutional and recurrent operations are building blocks that process one year={2018} } ``` - -## Results and models - -### Cityscapes - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ----------- | -------- | --------- | ------: | -------- | -------------- | ----: | ------------- | ----------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| NonLocalNet | R-50-D8 | 512x1024 | 40000 | 7.4 | 2.72 | 78.24 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/nonlocal_net/nonlocal_r50-d8_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x1024_40k_cityscapes/nonlocal_r50-d8_512x1024_40k_cityscapes_20200605_210748-c75e81e3.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x1024_40k_cityscapes/nonlocal_r50-d8_512x1024_40k_cityscapes_20200605_210748.log.json) | -| NonLocalNet | R-101-D8 | 512x1024 | 40000 | 10.9 | 1.95 | 78.66 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/nonlocal_net/nonlocal_r101-d8_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x1024_40k_cityscapes/nonlocal_r101-d8_512x1024_40k_cityscapes_20200605_210748-d63729fa.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x1024_40k_cityscapes/nonlocal_r101-d8_512x1024_40k_cityscapes_20200605_210748.log.json) | -| NonLocalNet | R-50-D8 | 769x769 | 40000 | 8.9 | 1.52 | 78.33 | 79.92 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/nonlocal_net/nonlocal_r50-d8_769x769_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_769x769_40k_cityscapes/nonlocal_r50-d8_769x769_40k_cityscapes_20200530_045243-82ef6749.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_769x769_40k_cityscapes/nonlocal_r50-d8_769x769_40k_cityscapes_20200530_045243.log.json) | -| NonLocalNet | R-101-D8 | 769x769 | 40000 | 12.8 | 1.05 | 78.57 | 80.29 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/nonlocal_net/nonlocal_r101-d8_769x769_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_769x769_40k_cityscapes/nonlocal_r101-d8_769x769_40k_cityscapes_20200530_045348-8fe9a9dc.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_769x769_40k_cityscapes/nonlocal_r101-d8_769x769_40k_cityscapes_20200530_045348.log.json) | -| NonLocalNet | R-50-D8 | 512x1024 | 80000 | - | - | 78.01 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/nonlocal_net/nonlocal_r50-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x1024_80k_cityscapes/nonlocal_r50-d8_512x1024_80k_cityscapes_20200607_193518-d6839fae.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x1024_80k_cityscapes/nonlocal_r50-d8_512x1024_80k_cityscapes_20200607_193518.log.json) | -| NonLocalNet | R-101-D8 | 512x1024 | 80000 | - | - | 78.93 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/nonlocal_net/nonlocal_r101-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x1024_80k_cityscapes/nonlocal_r101-d8_512x1024_80k_cityscapes_20200607_183411-32700183.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x1024_80k_cityscapes/nonlocal_r101-d8_512x1024_80k_cityscapes_20200607_183411.log.json) | -| NonLocalNet | R-50-D8 | 769x769 | 80000 | - | - | 79.05 | 80.68 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/nonlocal_net/nonlocal_r50-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_769x769_80k_cityscapes/nonlocal_r50-d8_769x769_80k_cityscapes_20200607_193506-1f9792f6.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_769x769_80k_cityscapes/nonlocal_r50-d8_769x769_80k_cityscapes_20200607_193506.log.json) | -| NonLocalNet | R-101-D8 | 769x769 | 80000 | - | - | 79.40 | 80.85 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/nonlocal_net/nonlocal_r101-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_769x769_80k_cityscapes/nonlocal_r101-d8_769x769_80k_cityscapes_20200607_183428-0e1fa4f9.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_769x769_80k_cityscapes/nonlocal_r101-d8_769x769_80k_cityscapes_20200607_183428.log.json) | - -### ADE20K - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ----------- | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| NonLocalNet | R-50-D8 | 512x512 | 80000 | 9.1 | 21.37 | 40.75 | 42.05 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/nonlocal_net/nonlocal_r50-d8_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x512_80k_ade20k/nonlocal_r50-d8_512x512_80k_ade20k_20200615_015801-5ae0aa33.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x512_80k_ade20k/nonlocal_r50-d8_512x512_80k_ade20k_20200615_015801.log.json) | -| NonLocalNet | R-101-D8 | 512x512 | 80000 | 12.6 | 13.97 | 42.90 | 44.27 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/nonlocal_net/nonlocal_r101-d8_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x512_80k_ade20k/nonlocal_r101-d8_512x512_80k_ade20k_20200615_015758-24105919.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x512_80k_ade20k/nonlocal_r101-d8_512x512_80k_ade20k_20200615_015758.log.json) | -| NonLocalNet | R-50-D8 | 512x512 | 160000 | - | - | 42.03 | 43.04 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/nonlocal_net/nonlocal_r50-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x512_160k_ade20k/nonlocal_r50-d8_512x512_160k_ade20k_20200616_005410-baef45e3.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x512_160k_ade20k/nonlocal_r50-d8_512x512_160k_ade20k_20200616_005410.log.json) | -| NonLocalNet | R-101-D8 | 512x512 | 160000 | - | - | 44.63 | 45.79 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/nonlocal_net/nonlocal_r101-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x512_160k_ade20k/nonlocal_r101-d8_512x512_160k_ade20k_20210827_221502-7881aa1a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x512_160k_ade20k/nonlocal_r101-d8_512x512_160k_ade20k_20210827_221502.log.json) | - -### Pascal VOC 2012 + Aug - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ----------- | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | -------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| NonLocalNet | R-50-D8 | 512x512 | 20000 | 6.4 | 21.21 | 76.20 | 77.12 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/nonlocal_net/nonlocal_r50-d8_512x512_20k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x512_20k_voc12aug/nonlocal_r50-d8_512x512_20k_voc12aug_20200617_222613-07f2a57c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x512_20k_voc12aug/nonlocal_r50-d8_512x512_20k_voc12aug_20200617_222613.log.json) | -| NonLocalNet | R-101-D8 | 512x512 | 20000 | 9.8 | 14.01 | 78.15 | 78.86 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/nonlocal_net/nonlocal_r101-d8_512x512_20k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x512_20k_voc12aug/nonlocal_r101-d8_512x512_20k_voc12aug_20200617_222615-948c68ab.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x512_20k_voc12aug/nonlocal_r101-d8_512x512_20k_voc12aug_20200617_222615.log.json) | -| NonLocalNet | R-50-D8 | 512x512 | 40000 | - | - | 76.65 | 77.47 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/nonlocal_net/nonlocal_r50-d8_512x512_40k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x512_40k_voc12aug/nonlocal_r50-d8_512x512_40k_voc12aug_20200614_000028-0139d4a9.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x512_40k_voc12aug/nonlocal_r50-d8_512x512_40k_voc12aug_20200614_000028.log.json) | -| NonLocalNet | R-101-D8 | 512x512 | 40000 | - | - | 78.27 | 79.12 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/nonlocal_net/nonlocal_r101-d8_512x512_40k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x512_40k_voc12aug/nonlocal_r101-d8_512x512_40k_voc12aug_20200614_000028-7e5ff470.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x512_40k_voc12aug/nonlocal_r101-d8_512x512_40k_voc12aug_20200614_000028.log.json) | diff --git a/configs/nonlocal_net/metafile.yaml b/configs/nonlocal_net/metafile.yaml new file mode 100644 index 0000000000..69bd72570b --- /dev/null +++ b/configs/nonlocal_net/metafile.yaml @@ -0,0 +1,387 @@ +Collections: +- Name: NonLocalNet + License: Apache License 2.0 + Metadata: + Training Data: + - Cityscapes + - ADE20K + - Pascal VOC 2012 + Aug + Paper: + Title: Non-local Neural Networks + URL: https://arxiv.org/abs/1711.07971 + README: configs/nonlocal_net/README.md + Frameworks: + - PyTorch +Models: +- Name: nonlocal_r50-d8_4xb2-40k_cityscapes-512x1024 + In Collection: NonLocalNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.24 + Config: configs/nonlocal_net/nonlocal_r50-d8_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - NonLocalNet + Training Resources: 4x V100 GPUS + Memory (GB): 7.4 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x1024_40k_cityscapes/nonlocal_r50-d8_512x1024_40k_cityscapes_20200605_210748-c75e81e3.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x1024_40k_cityscapes/nonlocal_r50-d8_512x1024_40k_cityscapes_20200605_210748.log.json + Paper: + Title: Non-local Neural Networks + URL: https://arxiv.org/abs/1711.07971 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/nl_head.py#L10 + Framework: PyTorch +- Name: nonlocal_r101-d8_4xb2-40k_cityscapes-512x1024 + In Collection: NonLocalNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.66 + Config: configs/nonlocal_net/nonlocal_r101-d8_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - NonLocalNet + Training Resources: 4x V100 GPUS + Memory (GB): 10.9 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x1024_40k_cityscapes/nonlocal_r101-d8_512x1024_40k_cityscapes_20200605_210748-d63729fa.pth + Training log: 
https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x1024_40k_cityscapes/nonlocal_r101-d8_512x1024_40k_cityscapes_20200605_210748.log.json + Paper: + Title: Non-local Neural Networks + URL: https://arxiv.org/abs/1711.07971 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/nl_head.py#L10 + Framework: PyTorch +- Name: nonlocal_r50-d8_4xb2-40k_cityscapes-769x769 + In Collection: NonLocalNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.33 + mIoU(ms+flip): 79.92 + Config: configs/nonlocal_net/nonlocal_r50-d8_4xb2-40k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - NonLocalNet + Training Resources: 4x V100 GPUS + Memory (GB): 8.9 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_769x769_40k_cityscapes/nonlocal_r50-d8_769x769_40k_cityscapes_20200530_045243-82ef6749.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_769x769_40k_cityscapes/nonlocal_r50-d8_769x769_40k_cityscapes_20200530_045243.log.json + Paper: + Title: Non-local Neural Networks + URL: https://arxiv.org/abs/1711.07971 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/nl_head.py#L10 + Framework: PyTorch +- Name: nonlocal_r101-d8_4xb2-40k_cityscapes-769x769 + In Collection: NonLocalNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.57 + mIoU(ms+flip): 80.29 + Config: configs/nonlocal_net/nonlocal_r101-d8_4xb2-40k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - NonLocalNet + Training Resources: 4x V100 GPUS + Memory (GB): 12.8 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_769x769_40k_cityscapes/nonlocal_r101-d8_769x769_40k_cityscapes_20200530_045348-8fe9a9dc.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_769x769_40k_cityscapes/nonlocal_r101-d8_769x769_40k_cityscapes_20200530_045348.log.json + Paper: + Title: Non-local Neural Networks + URL: https://arxiv.org/abs/1711.07971 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/nl_head.py#L10 + Framework: PyTorch +- Name: nonlocal_r50-d8_4xb2-80k_cityscapes-512x1024 + In Collection: NonLocalNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.01 + Config: configs/nonlocal_net/nonlocal_r50-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - NonLocalNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x1024_80k_cityscapes/nonlocal_r50-d8_512x1024_80k_cityscapes_20200607_193518-d6839fae.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x1024_80k_cityscapes/nonlocal_r50-d8_512x1024_80k_cityscapes_20200607_193518.log.json + Paper: + Title: Non-local Neural Networks + URL: https://arxiv.org/abs/1711.07971 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/nl_head.py#L10 + Framework: PyTorch +- Name: nonlocal_r101-d8_4xb2-80k_cityscapes-512x1024 + In Collection: NonLocalNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.93 + Config: 
configs/nonlocal_net/nonlocal_r101-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - NonLocalNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x1024_80k_cityscapes/nonlocal_r101-d8_512x1024_80k_cityscapes_20200607_183411-32700183.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x1024_80k_cityscapes/nonlocal_r101-d8_512x1024_80k_cityscapes_20200607_183411.log.json + Paper: + Title: Non-local Neural Networks + URL: https://arxiv.org/abs/1711.07971 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/nl_head.py#L10 + Framework: PyTorch +- Name: nonlocal_r50-d8_4xb2-80k_cityscapes-769x769 + In Collection: NonLocalNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.05 + mIoU(ms+flip): 80.68 + Config: configs/nonlocal_net/nonlocal_r50-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - NonLocalNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_769x769_80k_cityscapes/nonlocal_r50-d8_769x769_80k_cityscapes_20200607_193506-1f9792f6.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_769x769_80k_cityscapes/nonlocal_r50-d8_769x769_80k_cityscapes_20200607_193506.log.json + Paper: + Title: Non-local Neural Networks + URL: https://arxiv.org/abs/1711.07971 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/nl_head.py#L10 + Framework: PyTorch +- Name: nonlocal_r101-d8_4xb2-80k_cityscapes-769x769 + In Collection: NonLocalNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.4 + mIoU(ms+flip): 80.85 + Config: configs/nonlocal_net/nonlocal_r101-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - NonLocalNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_769x769_80k_cityscapes/nonlocal_r101-d8_769x769_80k_cityscapes_20200607_183428-0e1fa4f9.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_769x769_80k_cityscapes/nonlocal_r101-d8_769x769_80k_cityscapes_20200607_183428.log.json + Paper: + Title: Non-local Neural Networks + URL: https://arxiv.org/abs/1711.07971 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/nl_head.py#L10 + Framework: PyTorch +- Name: nonlocal_r50-d8_4xb4-80k_ade20k-512x512 + In Collection: NonLocalNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 40.75 + mIoU(ms+flip): 42.05 + Config: configs/nonlocal_net/nonlocal_r50-d8_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D8 + - NonLocalNet + Training Resources: 4x V100 GPUS + Memory (GB): 9.1 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x512_80k_ade20k/nonlocal_r50-d8_512x512_80k_ade20k_20200615_015801-5ae0aa33.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x512_80k_ade20k/nonlocal_r50-d8_512x512_80k_ade20k_20200615_015801.log.json + Paper: + Title: Non-local 
Neural Networks + URL: https://arxiv.org/abs/1711.07971 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/nl_head.py#L10 + Framework: PyTorch +- Name: nonlocal_r101-d8_4xb4-80k_ade20k-512x512 + In Collection: NonLocalNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 42.9 + mIoU(ms+flip): 44.27 + Config: configs/nonlocal_net/nonlocal_r101-d8_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-101-D8 + - NonLocalNet + Training Resources: 4x V100 GPUS + Memory (GB): 12.6 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x512_80k_ade20k/nonlocal_r101-d8_512x512_80k_ade20k_20200615_015758-24105919.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x512_80k_ade20k/nonlocal_r101-d8_512x512_80k_ade20k_20200615_015758.log.json + Paper: + Title: Non-local Neural Networks + URL: https://arxiv.org/abs/1711.07971 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/nl_head.py#L10 + Framework: PyTorch +- Name: nonlocal_r50-d8_4xb4-160k_ade20k-512x512 + In Collection: NonLocalNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 42.03 + mIoU(ms+flip): 43.04 + Config: configs/nonlocal_net/nonlocal_r50-d8_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D8 + - NonLocalNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x512_160k_ade20k/nonlocal_r50-d8_512x512_160k_ade20k_20200616_005410-baef45e3.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x512_160k_ade20k/nonlocal_r50-d8_512x512_160k_ade20k_20200616_005410.log.json + Paper: + Title: Non-local Neural Networks + URL: https://arxiv.org/abs/1711.07971 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/nl_head.py#L10 + Framework: PyTorch +- Name: nonlocal_r101-d8_4xb4-160k_ade20k-512x512 + In Collection: NonLocalNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 44.63 + mIoU(ms+flip): 45.79 + Config: configs/nonlocal_net/nonlocal_r101-d8_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-101-D8 + - NonLocalNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x512_160k_ade20k/nonlocal_r101-d8_512x512_160k_ade20k_20210827_221502-7881aa1a.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x512_160k_ade20k/nonlocal_r101-d8_512x512_160k_ade20k_20210827_221502.log.json + Paper: + Title: Non-local Neural Networks + URL: https://arxiv.org/abs/1711.07971 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/nl_head.py#L10 + Framework: PyTorch +- Name: nonlocal_r50-d8_4xb4-20k_voc12aug-512x512 + In Collection: NonLocalNet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 76.2 + mIoU(ms+flip): 77.12 + Config: configs/nonlocal_net/nonlocal_r50-d8_4xb4-20k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-50-D8 + - NonLocalNet + Training Resources: 4x V100 GPUS + Memory (GB): 6.4 + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x512_20k_voc12aug/nonlocal_r50-d8_512x512_20k_voc12aug_20200617_222613-07f2a57c.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x512_20k_voc12aug/nonlocal_r50-d8_512x512_20k_voc12aug_20200617_222613.log.json + Paper: + Title: Non-local Neural Networks + URL: https://arxiv.org/abs/1711.07971 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/nl_head.py#L10 + Framework: PyTorch +- Name: nonlocal_r101-d8_4xb4-20k_voc12aug-512x512 + In Collection: NonLocalNet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 78.15 + mIoU(ms+flip): 78.86 + Config: configs/nonlocal_net/nonlocal_r101-d8_4xb4-20k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-101-D8 + - NonLocalNet + Training Resources: 4x V100 GPUS + Memory (GB): 9.8 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x512_20k_voc12aug/nonlocal_r101-d8_512x512_20k_voc12aug_20200617_222615-948c68ab.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x512_20k_voc12aug/nonlocal_r101-d8_512x512_20k_voc12aug_20200617_222615.log.json + Paper: + Title: Non-local Neural Networks + URL: https://arxiv.org/abs/1711.07971 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/nl_head.py#L10 + Framework: PyTorch +- Name: nonlocal_r50-d8_4xb4-40k_voc12aug-512x512 + In Collection: NonLocalNet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 76.65 + mIoU(ms+flip): 77.47 + Config: configs/nonlocal_net/nonlocal_r50-d8_4xb4-40k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-50-D8 + - NonLocalNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x512_40k_voc12aug/nonlocal_r50-d8_512x512_40k_voc12aug_20200614_000028-0139d4a9.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x512_40k_voc12aug/nonlocal_r50-d8_512x512_40k_voc12aug_20200614_000028.log.json + Paper: + Title: Non-local Neural Networks + URL: https://arxiv.org/abs/1711.07971 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/nl_head.py#L10 + Framework: PyTorch +- Name: nonlocal_r101-d8_4xb4-40k_voc12aug-512x512 + In Collection: NonLocalNet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 78.27 + mIoU(ms+flip): 79.12 + Config: configs/nonlocal_net/nonlocal_r101-d8_4xb4-40k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-101-D8 + - NonLocalNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x512_40k_voc12aug/nonlocal_r101-d8_512x512_40k_voc12aug_20200614_000028-7e5ff470.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x512_40k_voc12aug/nonlocal_r101-d8_512x512_40k_voc12aug_20200614_000028.log.json + Paper: + Title: Non-local Neural Networks + URL: https://arxiv.org/abs/1711.07971 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/nl_head.py#L10 + 
Framework: PyTorch diff --git a/configs/nonlocal_net/nonlocal_net.yml b/configs/nonlocal_net/nonlocal_net.yml deleted file mode 100644 index bab38ce9c2..0000000000 --- a/configs/nonlocal_net/nonlocal_net.yml +++ /dev/null @@ -1,301 +0,0 @@ -Collections: -- Name: NonLocalNet - Metadata: - Training Data: - - Cityscapes - - ADE20K - - Pascal VOC 2012 + Aug - Paper: - URL: https://arxiv.org/abs/1711.07971 - Title: Non-local Neural Networks - README: configs/nonlocal_net/README.md - Code: - URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/nl_head.py#L10 - Version: v0.17.0 - Converted From: - Code: https://github.com/facebookresearch/video-nonlocal-net -Models: -- Name: nonlocal_r50-d8_512x1024_40k_cityscapes - In Collection: NonLocalNet - Metadata: - backbone: R-50-D8 - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 367.65 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 7.4 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.24 - Config: configs/nonlocal_net/nonlocal_r50-d8_512x1024_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x1024_40k_cityscapes/nonlocal_r50-d8_512x1024_40k_cityscapes_20200605_210748-c75e81e3.pth -- Name: nonlocal_r101-d8_512x1024_40k_cityscapes - In Collection: NonLocalNet - Metadata: - backbone: R-101-D8 - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 512.82 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 10.9 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.66 - Config: configs/nonlocal_net/nonlocal_r101-d8_512x1024_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x1024_40k_cityscapes/nonlocal_r101-d8_512x1024_40k_cityscapes_20200605_210748-d63729fa.pth -- Name: nonlocal_r50-d8_769x769_40k_cityscapes - In Collection: NonLocalNet - Metadata: - backbone: R-50-D8 - crop size: (769,769) - lr schd: 40000 - inference time (ms/im): - - value: 657.89 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 8.9 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.33 - mIoU(ms+flip): 79.92 - Config: configs/nonlocal_net/nonlocal_r50-d8_769x769_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_769x769_40k_cityscapes/nonlocal_r50-d8_769x769_40k_cityscapes_20200530_045243-82ef6749.pth -- Name: nonlocal_r101-d8_769x769_40k_cityscapes - In Collection: NonLocalNet - Metadata: - backbone: R-101-D8 - crop size: (769,769) - lr schd: 40000 - inference time (ms/im): - - value: 952.38 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 12.8 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.57 - mIoU(ms+flip): 80.29 - Config: configs/nonlocal_net/nonlocal_r101-d8_769x769_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_769x769_40k_cityscapes/nonlocal_r101-d8_769x769_40k_cityscapes_20200530_045348-8fe9a9dc.pth -- Name: nonlocal_r50-d8_512x1024_80k_cityscapes - In Collection: NonLocalNet - Metadata: - backbone: R-50-D8 - crop size: (512,1024) - lr schd: 80000 - 
Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.01 - Config: configs/nonlocal_net/nonlocal_r50-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x1024_80k_cityscapes/nonlocal_r50-d8_512x1024_80k_cityscapes_20200607_193518-d6839fae.pth -- Name: nonlocal_r101-d8_512x1024_80k_cityscapes - In Collection: NonLocalNet - Metadata: - backbone: R-101-D8 - crop size: (512,1024) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.93 - Config: configs/nonlocal_net/nonlocal_r101-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x1024_80k_cityscapes/nonlocal_r101-d8_512x1024_80k_cityscapes_20200607_183411-32700183.pth -- Name: nonlocal_r50-d8_769x769_80k_cityscapes - In Collection: NonLocalNet - Metadata: - backbone: R-50-D8 - crop size: (769,769) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.05 - mIoU(ms+flip): 80.68 - Config: configs/nonlocal_net/nonlocal_r50-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_769x769_80k_cityscapes/nonlocal_r50-d8_769x769_80k_cityscapes_20200607_193506-1f9792f6.pth -- Name: nonlocal_r101-d8_769x769_80k_cityscapes - In Collection: NonLocalNet - Metadata: - backbone: R-101-D8 - crop size: (769,769) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.4 - mIoU(ms+flip): 80.85 - Config: configs/nonlocal_net/nonlocal_r101-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_769x769_80k_cityscapes/nonlocal_r101-d8_769x769_80k_cityscapes_20200607_183428-0e1fa4f9.pth -- Name: nonlocal_r50-d8_512x512_80k_ade20k - In Collection: NonLocalNet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 46.79 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 9.1 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 40.75 - mIoU(ms+flip): 42.05 - Config: configs/nonlocal_net/nonlocal_r50-d8_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x512_80k_ade20k/nonlocal_r50-d8_512x512_80k_ade20k_20200615_015801-5ae0aa33.pth -- Name: nonlocal_r101-d8_512x512_80k_ade20k - In Collection: NonLocalNet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 71.58 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 12.6 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 42.9 - mIoU(ms+flip): 44.27 - Config: configs/nonlocal_net/nonlocal_r101-d8_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x512_80k_ade20k/nonlocal_r101-d8_512x512_80k_ade20k_20200615_015758-24105919.pth -- Name: nonlocal_r50-d8_512x512_160k_ade20k - In Collection: NonLocalNet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 42.03 - mIoU(ms+flip): 43.04 - Config: configs/nonlocal_net/nonlocal_r50-d8_512x512_160k_ade20k.py - Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x512_160k_ade20k/nonlocal_r50-d8_512x512_160k_ade20k_20200616_005410-baef45e3.pth -- Name: nonlocal_r101-d8_512x512_160k_ade20k - In Collection: NonLocalNet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 44.63 - mIoU(ms+flip): 45.79 - Config: configs/nonlocal_net/nonlocal_r101-d8_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x512_160k_ade20k/nonlocal_r101-d8_512x512_160k_ade20k_20210827_221502-7881aa1a.pth -- Name: nonlocal_r50-d8_512x512_20k_voc12aug - In Collection: NonLocalNet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 20000 - inference time (ms/im): - - value: 47.15 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 6.4 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 76.2 - mIoU(ms+flip): 77.12 - Config: configs/nonlocal_net/nonlocal_r50-d8_512x512_20k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x512_20k_voc12aug/nonlocal_r50-d8_512x512_20k_voc12aug_20200617_222613-07f2a57c.pth -- Name: nonlocal_r101-d8_512x512_20k_voc12aug - In Collection: NonLocalNet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 20000 - inference time (ms/im): - - value: 71.38 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 9.8 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 78.15 - mIoU(ms+flip): 78.86 - Config: configs/nonlocal_net/nonlocal_r101-d8_512x512_20k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x512_20k_voc12aug/nonlocal_r101-d8_512x512_20k_voc12aug_20200617_222615-948c68ab.pth -- Name: nonlocal_r50-d8_512x512_40k_voc12aug - In Collection: NonLocalNet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 40000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 76.65 - mIoU(ms+flip): 77.47 - Config: configs/nonlocal_net/nonlocal_r50-d8_512x512_40k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r50-d8_512x512_40k_voc12aug/nonlocal_r50-d8_512x512_40k_voc12aug_20200614_000028-0139d4a9.pth -- Name: nonlocal_r101-d8_512x512_40k_voc12aug - In Collection: NonLocalNet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 40000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 78.27 - mIoU(ms+flip): 79.12 - Config: configs/nonlocal_net/nonlocal_r101-d8_512x512_40k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/nonlocal_net/nonlocal_r101-d8_512x512_40k_voc12aug/nonlocal_r101-d8_512x512_40k_voc12aug_20200614_000028-7e5ff470.pth diff --git a/configs/nonlocal_net/nonlocal_r101-d8_4xb2-40k_cityscapes-512x1024.py b/configs/nonlocal_net/nonlocal_r101-d8_4xb2-40k_cityscapes-512x1024.py new file mode 100644 index 0000000000..5fcf7bcb16 --- /dev/null +++ b/configs/nonlocal_net/nonlocal_r101-d8_4xb2-40k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './nonlocal_r50-d8_4xb2-40k_cityscapes-512x1024.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git 
a/configs/nonlocal_net/nonlocal_r101-d8_4xb2-40k_cityscapes-769x769.py b/configs/nonlocal_net/nonlocal_r101-d8_4xb2-40k_cityscapes-769x769.py new file mode 100644 index 0000000000..ee984c2bbd --- /dev/null +++ b/configs/nonlocal_net/nonlocal_r101-d8_4xb2-40k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './nonlocal_r50-d8_4xb2-40k_cityscapes-769x769.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/nonlocal_net/nonlocal_r101-d8_4xb2-80k_cityscapes-512x1024.py b/configs/nonlocal_net/nonlocal_r101-d8_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..aca80d676a --- /dev/null +++ b/configs/nonlocal_net/nonlocal_r101-d8_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './nonlocal_r50-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/nonlocal_net/nonlocal_r101-d8_4xb2-80k_cityscapes-769x769.py b/configs/nonlocal_net/nonlocal_r101-d8_4xb2-80k_cityscapes-769x769.py new file mode 100644 index 0000000000..8a7aeea7f6 --- /dev/null +++ b/configs/nonlocal_net/nonlocal_r101-d8_4xb2-80k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './nonlocal_r50-d8_4xb2-80k_cityscapes-769x769.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/nonlocal_net/nonlocal_r101-d8_4xb4-160k_ade20k-512x512.py b/configs/nonlocal_net/nonlocal_r101-d8_4xb4-160k_ade20k-512x512.py new file mode 100644 index 0000000000..0cdb3caaf3 --- /dev/null +++ b/configs/nonlocal_net/nonlocal_r101-d8_4xb4-160k_ade20k-512x512.py @@ -0,0 +1,2 @@ +_base_ = './nonlocal_r50-d8_4xb4-160k_ade20k-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/nonlocal_net/nonlocal_r101-d8_4xb4-20k_voc12aug-512x512.py b/configs/nonlocal_net/nonlocal_r101-d8_4xb4-20k_voc12aug-512x512.py new file mode 100644 index 0000000000..a7cacea517 --- /dev/null +++ b/configs/nonlocal_net/nonlocal_r101-d8_4xb4-20k_voc12aug-512x512.py @@ -0,0 +1,2 @@ +_base_ = './nonlocal_r50-d8_4xb4-20k_voc12aug-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/nonlocal_net/nonlocal_r101-d8_4xb4-40k_voc12aug-512x512.py b/configs/nonlocal_net/nonlocal_r101-d8_4xb4-40k_voc12aug-512x512.py new file mode 100644 index 0000000000..ec475443e8 --- /dev/null +++ b/configs/nonlocal_net/nonlocal_r101-d8_4xb4-40k_voc12aug-512x512.py @@ -0,0 +1,2 @@ +_base_ = './nonlocal_r50-d8_4xb4-40k_voc12aug-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/nonlocal_net/nonlocal_r101-d8_4xb4-80k_ade20k-512x512.py b/configs/nonlocal_net/nonlocal_r101-d8_4xb4-80k_ade20k-512x512.py new file mode 100644 index 0000000000..ca79f6fdc0 --- /dev/null +++ b/configs/nonlocal_net/nonlocal_r101-d8_4xb4-80k_ade20k-512x512.py @@ -0,0 +1,2 @@ +_base_ = './nonlocal_r50-d8_4xb4-80k_ade20k-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/nonlocal_net/nonlocal_r101-d8_512x1024_40k_cityscapes.py b/configs/nonlocal_net/nonlocal_r101-d8_512x1024_40k_cityscapes.py deleted file mode 100644 index ef7b06dd38..0000000000 --- a/configs/nonlocal_net/nonlocal_r101-d8_512x1024_40k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './nonlocal_r50-d8_512x1024_40k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git 
a/configs/nonlocal_net/nonlocal_r101-d8_512x1024_80k_cityscapes.py b/configs/nonlocal_net/nonlocal_r101-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index 7a1e66cf1c..0000000000 --- a/configs/nonlocal_net/nonlocal_r101-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './nonlocal_r50-d8_512x1024_80k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/nonlocal_net/nonlocal_r101-d8_512x512_160k_ade20k.py b/configs/nonlocal_net/nonlocal_r101-d8_512x512_160k_ade20k.py deleted file mode 100644 index df9c2aca9c..0000000000 --- a/configs/nonlocal_net/nonlocal_r101-d8_512x512_160k_ade20k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './nonlocal_r50-d8_512x512_160k_ade20k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/nonlocal_net/nonlocal_r101-d8_512x512_20k_voc12aug.py b/configs/nonlocal_net/nonlocal_r101-d8_512x512_20k_voc12aug.py deleted file mode 100644 index 490f9873a2..0000000000 --- a/configs/nonlocal_net/nonlocal_r101-d8_512x512_20k_voc12aug.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './nonlocal_r50-d8_512x512_20k_voc12aug.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/nonlocal_net/nonlocal_r101-d8_512x512_40k_voc12aug.py b/configs/nonlocal_net/nonlocal_r101-d8_512x512_40k_voc12aug.py deleted file mode 100644 index 40d9190fba..0000000000 --- a/configs/nonlocal_net/nonlocal_r101-d8_512x512_40k_voc12aug.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './nonlocal_r50-d8_512x512_40k_voc12aug.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/nonlocal_net/nonlocal_r101-d8_512x512_80k_ade20k.py b/configs/nonlocal_net/nonlocal_r101-d8_512x512_80k_ade20k.py deleted file mode 100644 index 0c6f60dac7..0000000000 --- a/configs/nonlocal_net/nonlocal_r101-d8_512x512_80k_ade20k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './nonlocal_r50-d8_512x512_80k_ade20k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/nonlocal_net/nonlocal_r101-d8_769x769_40k_cityscapes.py b/configs/nonlocal_net/nonlocal_r101-d8_769x769_40k_cityscapes.py deleted file mode 100644 index 23e6da7f23..0000000000 --- a/configs/nonlocal_net/nonlocal_r101-d8_769x769_40k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './nonlocal_r50-d8_769x769_40k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/nonlocal_net/nonlocal_r101-d8_769x769_80k_cityscapes.py b/configs/nonlocal_net/nonlocal_r101-d8_769x769_80k_cityscapes.py deleted file mode 100644 index 0627e2b5a7..0000000000 --- a/configs/nonlocal_net/nonlocal_r101-d8_769x769_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './nonlocal_r50-d8_769x769_80k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/nonlocal_net/nonlocal_r50-d8_512x1024_40k_cityscapes.py b/configs/nonlocal_net/nonlocal_r50-d8_4xb2-40k_cityscapes-512x1024.py similarity index 100% rename from configs/nonlocal_net/nonlocal_r50-d8_512x1024_40k_cityscapes.py rename to configs/nonlocal_net/nonlocal_r50-d8_4xb2-40k_cityscapes-512x1024.py diff --git a/configs/nonlocal_net/nonlocal_r50-d8_769x769_40k_cityscapes.py b/configs/nonlocal_net/nonlocal_r50-d8_4xb2-40k_cityscapes-769x769.py similarity index 100% rename from configs/nonlocal_net/nonlocal_r50-d8_769x769_40k_cityscapes.py 
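Every one of the added r101 configs above is a two-liner: it inherits the full training setup from the corresponding r50 config via `_base_` and overrides only the pretrained checkpoint and the backbone depth. The renamed files also encode the run setup directly in the filename: `4xb2` in `nonlocal_r50-d8_4xb2-40k_cityscapes-512x1024.py` reads as 4 GPUs with 2 samples per GPU, followed by the schedule, dataset, and crop size. Below is a minimal sketch of the recursive dict merge that this `_base_` inheritance performs (illustrative only; the real loader is MMEngine's `Config`, and the base values shown are abridged assumptions rather than the actual file contents):

```python
def merge_cfg(base: dict, override: dict) -> dict:
    """Recursively merge `override` into a copy of `base`: leaves named in
    `override` win, everything else is inherited unchanged."""
    merged = dict(base)
    for key, value in override.items():
        if isinstance(value, dict) and isinstance(merged.get(key), dict):
            merged[key] = merge_cfg(merged[key], value)
        else:
            merged[key] = value
    return merged

# Abridged, assumed stand-in for nonlocal_r50-d8_4xb2-40k_cityscapes-512x1024.py:
base = dict(model=dict(pretrained='open-mmlab://resnet50_v1c',
                       backbone=dict(type='ResNetV1c', depth=50)))

# The entire body of each r101 config in this diff:
override = dict(model=dict(pretrained='open-mmlab://resnet101_v1c',
                           backbone=dict(depth=101)))

cfg = merge_cfg(base, override)
assert cfg['model']['backbone'] == dict(type='ResNetV1c', depth=101)  # leaf overridden, sibling inherited
assert cfg['model']['pretrained'] == 'open-mmlab://resnet101_v1c'
```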
rename to configs/nonlocal_net/nonlocal_r50-d8_4xb2-40k_cityscapes-769x769.py diff --git a/configs/nonlocal_net/nonlocal_r50-d8_512x1024_80k_cityscapes.py b/configs/nonlocal_net/nonlocal_r50-d8_4xb2-80k_cityscapes-512x1024.py similarity index 100% rename from configs/nonlocal_net/nonlocal_r50-d8_512x1024_80k_cityscapes.py rename to configs/nonlocal_net/nonlocal_r50-d8_4xb2-80k_cityscapes-512x1024.py diff --git a/configs/nonlocal_net/nonlocal_r50-d8_769x769_80k_cityscapes.py b/configs/nonlocal_net/nonlocal_r50-d8_4xb2-80k_cityscapes-769x769.py similarity index 100% rename from configs/nonlocal_net/nonlocal_r50-d8_769x769_80k_cityscapes.py rename to configs/nonlocal_net/nonlocal_r50-d8_4xb2-80k_cityscapes-769x769.py diff --git a/configs/nonlocal_net/nonlocal_r50-d8_512x512_160k_ade20k.py b/configs/nonlocal_net/nonlocal_r50-d8_4xb4-160k_ade20k-512x512.py similarity index 100% rename from configs/nonlocal_net/nonlocal_r50-d8_512x512_160k_ade20k.py rename to configs/nonlocal_net/nonlocal_r50-d8_4xb4-160k_ade20k-512x512.py diff --git a/configs/nonlocal_net/nonlocal_r50-d8_512x512_20k_voc12aug.py b/configs/nonlocal_net/nonlocal_r50-d8_4xb4-20k_voc12aug-512x512.py similarity index 100% rename from configs/nonlocal_net/nonlocal_r50-d8_512x512_20k_voc12aug.py rename to configs/nonlocal_net/nonlocal_r50-d8_4xb4-20k_voc12aug-512x512.py diff --git a/configs/nonlocal_net/nonlocal_r50-d8_512x512_40k_voc12aug.py b/configs/nonlocal_net/nonlocal_r50-d8_4xb4-40k_voc12aug-512x512.py similarity index 100% rename from configs/nonlocal_net/nonlocal_r50-d8_512x512_40k_voc12aug.py rename to configs/nonlocal_net/nonlocal_r50-d8_4xb4-40k_voc12aug-512x512.py diff --git a/configs/nonlocal_net/nonlocal_r50-d8_512x512_80k_ade20k.py b/configs/nonlocal_net/nonlocal_r50-d8_4xb4-80k_ade20k-512x512.py similarity index 100% rename from configs/nonlocal_net/nonlocal_r50-d8_512x512_80k_ade20k.py rename to configs/nonlocal_net/nonlocal_r50-d8_4xb4-80k_ade20k-512x512.py diff --git a/configs/ocrnet/README.md b/configs/ocrnet/README.md index 1c3dba2b62..628a3b1597 100644 --- a/configs/ocrnet/README.md +++ b/configs/ocrnet/README.md @@ -1,6 +1,6 @@ # OCRNet -[Object-Contextual Representations for Semantic Segmentation](https://arxiv.org/abs/1909.11065) +> [Object-Contextual Representations for Semantic Segmentation](https://arxiv.org/abs/1909.11065) ## Introduction @@ -22,6 +22,54 @@ In this paper, we address the problem of semantic segmentation and focus on the +## Results and models + +### Cityscapes + +#### HRNet backbone + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | ------------------ | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ----------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| OCRNet | HRNetV2p-W18-Small | 512x1024 | 40000 | 3.5 | 10.45 | A100 | 76.61 | 78.01 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ocrnet/ocrnet_hr18s_4xb2-40k_cityscapes-512x1024.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_4xb2-40k_cityscapes-512x1024/ocrnet_hr18s_4xb2-40k_cityscapes-512x1024_20230227_145026-6c052a14.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_4xb2-40k_cityscapes-512x1024/ocrnet_hr18s_4xb2-40k_cityscapes-512x1024_20230227_145026.json) | +| OCRNet | HRNetV2p-W18 | 512x1024 | 40000 | 4.7 | 7.50 | V100 | 77.72 | 79.49 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ocrnet/ocrnet_hr18_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x1024_40k_cityscapes/ocrnet_hr18_512x1024_40k_cityscapes_20200601_033320-401c5bdd.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x1024_40k_cityscapes/ocrnet_hr18_512x1024_40k_cityscapes_20200601_033320.log.json) | +| OCRNet | HRNetV2p-W48 | 512x1024 | 40000 | 8 | 4.22 | V100 | 80.58 | 81.79 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ocrnet/ocrnet_hr48_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x1024_40k_cityscapes/ocrnet_hr48_512x1024_40k_cityscapes_20200601_033336-55b32491.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x1024_40k_cityscapes/ocrnet_hr48_512x1024_40k_cityscapes_20200601_033336.log.json) | +| OCRNet | HRNetV2p-W18-Small | 512x1024 | 80000 | - | - | V100 | 77.16 | 78.66 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ocrnet/ocrnet_hr18s_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x1024_80k_cityscapes/ocrnet_hr18s_512x1024_80k_cityscapes_20200601_222735-55979e63.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x1024_80k_cityscapes/ocrnet_hr18s_512x1024_80k_cityscapes_20200601_222735.log.json) | +| OCRNet | HRNetV2p-W18 | 512x1024 | 80000 | - | - | V100 | 78.57 | 80.46 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ocrnet/ocrnet_hr18_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x1024_80k_cityscapes/ocrnet_hr18_512x1024_80k_cityscapes_20200614_230521-c2e1dd4a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x1024_80k_cityscapes/ocrnet_hr18_512x1024_80k_cityscapes_20200614_230521.log.json) | +| OCRNet | HRNetV2p-W48 | 512x1024 | 80000 | - | - | V100 | 80.70 | 81.87 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ocrnet/ocrnet_hr48_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x1024_80k_cityscapes/ocrnet_hr48_512x1024_80k_cityscapes_20200601_222752-9076bcdf.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x1024_80k_cityscapes/ocrnet_hr48_512x1024_80k_cityscapes_20200601_222752.log.json) | +| OCRNet | HRNetV2p-W18-Small | 512x1024 | 160000 | - | - | V100 | 78.45 | 79.97 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ocrnet/ocrnet_hr18s_4xb2-160k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x1024_160k_cityscapes/ocrnet_hr18s_512x1024_160k_cityscapes_20200602_191005-f4a7af28.pth) \|
[log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x1024_160k_cityscapes/ocrnet_hr18s_512x1024_160k_cityscapes_20200602_191005.log.json) | +| OCRNet | HRNetV2p-W18 | 512x1024 | 160000 | - | - | V100 | 79.47 | 80.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ocrnet/ocrnet_hr18_4xb2-160k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x1024_160k_cityscapes/ocrnet_hr18_512x1024_160k_cityscapes_20200602_191001-b9172d0c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x1024_160k_cityscapes/ocrnet_hr18_512x1024_160k_cityscapes_20200602_191001.log.json) | +| OCRNet | HRNetV2p-W48 | 512x1024 | 160000 | - | - | V100 | 81.35 | 82.70 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ocrnet/ocrnet_hr48_4xb2-160k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x1024_160k_cityscapes/ocrnet_hr48_512x1024_160k_cityscapes_20200602_191037-dfbf1b0c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x1024_160k_cityscapes/ocrnet_hr48_512x1024_160k_cityscapes_20200602_191037.log.json) | + +#### ResNet backbone + +| Method | Backbone | Crop Size | Batch Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ---------- | ------- | -------- | -------------- | ------ | ----- | ------------: | ------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| OCRNet | R-101-D8 | 512x1024 | 8 | 40000 | - | - | V100 | 80.09 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ocrnet/ocrnet_r101-d8_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_r101-d8_512x1024_40k_b8_cityscapes/ocrnet_r101-d8_512x1024_40k_b8_cityscapes_20200717_110721-02ac0f13.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_r101-d8_512x1024_40k_b8_cityscapes/ocrnet_r101-d8_512x1024_40k_b8_cityscapes_20200717_110721.log.json) | +| OCRNet | R-101-D8 | 512x1024 | 16 | 40000 | 8.8 | 3.02 | V100 | 80.30 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ocrnet/ocrnet_r101-d8_8xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_r101-d8_512x1024_40k_b16_cityscapes/ocrnet_r101-d8_512x1024_40k_b16_cityscapes_20200723_193726-db500f80.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_r101-d8_512x1024_40k_b16_cityscapes/ocrnet_r101-d8_512x1024_40k_b16_cityscapes_20200723_193726.log.json) | +| OCRNet | R-101-D8 | 512x1024 | 16 | 80000 | 8.8 | 3.02 | V100 | 80.81 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ocrnet/ocrnet_r101-d8_8xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_r101-d8_512x1024_80k_b16_cityscapes/ocrnet_r101-d8_512x1024_80k_b16_cityscapes_20200723_192421-78688424.pth) 
\| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_r101-d8_512x1024_80k_b16_cityscapes/ocrnet_r101-d8_512x1024_80k_b16_cityscapes_20200723_192421.log.json) | + +### ADE20K + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | ------------------ | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| OCRNet | HRNetV2p-W18-Small | 512x512 | 80000 | 6.7 | 28.98 | V100 | 35.06 | 35.80 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ocrnet/ocrnet_hr18s_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x512_80k_ade20k/ocrnet_hr18s_512x512_80k_ade20k_20200615_055600-e80b62af.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x512_80k_ade20k/ocrnet_hr18s_512x512_80k_ade20k_20200615_055600.log.json) | +| OCRNet | HRNetV2p-W18 | 512x512 | 80000 | 7.9 | 18.93 | V100 | 37.79 | 39.16 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ocrnet/ocrnet_hr18_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x512_80k_ade20k/ocrnet_hr18_512x512_80k_ade20k_20200615_053157-d173d83b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x512_80k_ade20k/ocrnet_hr18_512x512_80k_ade20k_20200615_053157.log.json) | +| OCRNet | HRNetV2p-W48 | 512x512 | 80000 | 11.2 | 16.99 | V100 | 43.00 | 44.30 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ocrnet/ocrnet_hr48_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x512_80k_ade20k/ocrnet_hr48_512x512_80k_ade20k_20200615_021518-d168c2d1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x512_80k_ade20k/ocrnet_hr48_512x512_80k_ade20k_20200615_021518.log.json) | +| OCRNet | HRNetV2p-W18-Small | 512x512 | 160000 | - | - | V100 | 37.19 | 38.40 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ocrnet/ocrnet_hr18s_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x512_160k_ade20k/ocrnet_hr18s_512x512_160k_ade20k_20200615_184505-8e913058.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x512_160k_ade20k/ocrnet_hr18s_512x512_160k_ade20k_20200615_184505.log.json) | +| OCRNet | HRNetV2p-W18 | 512x512 | 160000 | - | - | V100 | 39.32 | 40.80 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ocrnet/ocrnet_hr18_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x512_160k_ade20k/ocrnet_hr18_512x512_160k_ade20k_20200615_200940-d8fcd9d1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x512_160k_ade20k/ocrnet_hr18_512x512_160k_ade20k_20200615_200940.log.json) | +| OCRNet | HRNetV2p-W48 | 512x512 | 160000 | - | - | V100 |
43.25 | 44.88 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ocrnet/ocrnet_hr48_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x512_160k_ade20k/ocrnet_hr48_512x512_160k_ade20k_20200615_184705-a073726d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x512_160k_ade20k/ocrnet_hr48_512x512_160k_ade20k_20200615_184705.log.json) | + +### Pascal VOC 2012 + Aug + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | ------------------ | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| OCRNet | HRNetV2p-W18-Small | 512x512 | 20000 | 3.5 | 31.55 | V100 | 71.70 | 73.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ocrnet/ocrnet_hr18s_4xb4-20k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x512_20k_voc12aug/ocrnet_hr18s_512x512_20k_voc12aug_20200617_233913-02b04fcb.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x512_20k_voc12aug/ocrnet_hr18s_512x512_20k_voc12aug_20200617_233913.log.json) | +| OCRNet | HRNetV2p-W18 | 512x512 | 20000 | 4.7 | 19.91 | V100 | 74.75 | 77.11 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ocrnet/ocrnet_hr18_4xb4-20k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x512_20k_voc12aug/ocrnet_hr18_512x512_20k_voc12aug_20200617_233932-8954cbb7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x512_20k_voc12aug/ocrnet_hr18_512x512_20k_voc12aug_20200617_233932.log.json) | +| OCRNet | HRNetV2p-W48 | 512x512 | 20000 | 8.1 | 17.83 | V100 | 77.72 | 79.87 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ocrnet/ocrnet_hr48_4xb4-20k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x512_20k_voc12aug/ocrnet_hr48_512x512_20k_voc12aug_20200617_233932-9e82080a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x512_20k_voc12aug/ocrnet_hr48_512x512_20k_voc12aug_20200617_233932.log.json) | +| OCRNet | HRNetV2p-W18-Small | 512x512 | 40000 | - | - | V100 | 72.76 | 74.60 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ocrnet/ocrnet_hr18s_4xb4-40k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x512_40k_voc12aug/ocrnet_hr18s_512x512_40k_voc12aug_20200614_002025-42b587ac.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x512_40k_voc12aug/ocrnet_hr18s_512x512_40k_voc12aug_20200614_002025.log.json) | +| OCRNet | HRNetV2p-W18 | 512x512 | 40000 | - | - | V100 | 74.98 | 77.40 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ocrnet/ocrnet_hr18_4xb4-40k_voc12aug-512x512.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x512_40k_voc12aug/ocrnet_hr18_512x512_40k_voc12aug_20200614_015958-714302be.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x512_40k_voc12aug/ocrnet_hr18_512x512_40k_voc12aug_20200614_015958.log.json) | +| OCRNet | HRNetV2p-W48 | 512x512 | 40000 | - | - | V100 | 77.14 | 79.71 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ocrnet/ocrnet_hr48_4xb4-40k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x512_40k_voc12aug/ocrnet_hr48_512x512_40k_voc12aug_20200614_015958-255bc5ce.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x512_40k_voc12aug/ocrnet_hr48_512x512_40k_voc12aug_20200614_015958.log.json) | + ## Citation ```bibtex @@ -39,51 +87,3 @@ In this paper, we address the problem of semantic segmentation and focus on the year={2020} } ``` - -## Results and models - -### Cityscapes - -#### HRNet backbone - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | ------------------ | --------- | ------: | -------- | -------------- | ----: | ------------: | -------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| OCRNet | HRNetV2p-W18-Small | 512x1024 | 40000 | 3.5 | 10.45 | 74.30 | 75.95 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ocrnet/ocrnet_hr18s_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x1024_40k_cityscapes/ocrnet_hr18s_512x1024_40k_cityscapes_20200601_033304-fa2436c2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x1024_40k_cityscapes/ocrnet_hr18s_512x1024_40k_cityscapes_20200601_033304.log.json) | -| OCRNet | HRNetV2p-W18 | 512x1024 | 40000 | 4.7 | 7.50 | 77.72 | 79.49 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ocrnet/ocrnet_hr18_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x1024_40k_cityscapes/ocrnet_hr18_512x1024_40k_cityscapes_20200601_033320-401c5bdd.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x1024_40k_cityscapes/ocrnet_hr18_512x1024_40k_cityscapes_20200601_033320.log.json) | -| OCRNet | HRNetV2p-W48 | 512x1024 | 40000 | 8 | 4.22 | 80.58 | 81.79 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ocrnet/ocrnet_hr48_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x1024_40k_cityscapes/ocrnet_hr48_512x1024_40k_cityscapes_20200601_033336-55b32491.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x1024_40k_cityscapes/ocrnet_hr48_512x1024_40k_cityscapes_20200601_033336.log.json) | -| OCRNet | HRNetV2p-W18-Small | 512x1024 | 80000 | - | - | 77.16 | 78.66 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ocrnet/ocrnet_hr18s_512x1024_80k_cityscapes.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x1024_80k_cityscapes/ocrnet_hr18s_512x1024_80k_cityscapes_20200601_222735-55979e63.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x1024_80k_cityscapes/ocrnet_hr18s_512x1024_80k_cityscapes_20200601_222735.log.json) | -| OCRNet | HRNetV2p-W18 | 512x1024 | 80000 | - | - | 78.57 | 80.46 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ocrnet/ocrnet_hr18_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x1024_80k_cityscapes/ocrnet_hr18_512x1024_80k_cityscapes_20200614_230521-c2e1dd4a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x1024_80k_cityscapes/ocrnet_hr18_512x1024_80k_cityscapes_20200614_230521.log.json) | -| OCRNet | HRNetV2p-W48 | 512x1024 | 80000 | - | - | 80.70 | 81.87 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ocrnet/ocrnet_hr48_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x1024_80k_cityscapes/ocrnet_hr48_512x1024_80k_cityscapes_20200601_222752-9076bcdf.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x1024_80k_cityscapes/ocrnet_hr48_512x1024_80k_cityscapes_20200601_222752.log.json) | -| OCRNet | HRNetV2p-W18-Small | 512x1024 | 160000 | - | - | 78.45 | 79.97 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ocrnet/ocrnet_hr18s_512x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x1024_160k_cityscapes/ocrnet_hr18s_512x1024_160k_cityscapes_20200602_191005-f4a7af28.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x1024_160k_cityscapes/ocrnet_hr18s_512x1024_160k_cityscapes_20200602_191005.log.json) | -| OCRNet | HRNetV2p-W18 | 512x1024 | 160000 | - | - | 79.47 | 80.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ocrnet/ocrnet_hr18_512x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x1024_160k_cityscapes/ocrnet_hr18_512x1024_160k_cityscapes_20200602_191001-b9172d0c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x1024_160k_cityscapes/ocrnet_hr18_512x1024_160k_cityscapes_20200602_191001.log.json) | -| OCRNet | HRNetV2p-W48 | 512x1024 | 160000 | - | - | 81.35 | 82.70 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ocrnet/ocrnet_hr48_512x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x1024_160k_cityscapes/ocrnet_hr48_512x1024_160k_cityscapes_20200602_191037-dfbf1b0c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x1024_160k_cityscapes/ocrnet_hr48_512x1024_160k_cityscapes_20200602_191037.log.json) | - -#### ResNet backbone - -| Method | Backbone | Crop Size | Batch Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ---------- | ------- | -------- | -------------- | ----- | ------------: | ------------------------------------------------------------------------------------------------------------------------------- | 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| OCRNet | R-101-D8 | 512x1024 | 8 | 40000 | - | - | 80.09 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ocrnet/ocrnet_r101-d8_512x1024_40k_b8_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_r101-d8_512x1024_40k_b8_cityscapes/ocrnet_r101-d8_512x1024_40k_b8_cityscapes_20200717_110721-02ac0f13.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_r101-d8_512x1024_40k_b8_cityscapes/ocrnet_r101-d8_512x1024_40k_b8_cityscapes_20200717_110721.log.json) | -| OCRNet | R-101-D8 | 512x1024 | 16 | 40000 | 8.8 | 3.02 | 80.30 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ocrnet/ocrnet_r101-d8_512x1024_40k_b16_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_r101-d8_512x1024_40k_b16_cityscapes/ocrnet_r101-d8_512x1024_40k_b16_cityscapes_20200723_193726-db500f80.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_r101-d8_512x1024_40k_b16_cityscapes/ocrnet_r101-d8_512x1024_40k_b16_cityscapes_20200723_193726.log.json) | -| OCRNet | R-101-D8 | 512x1024 | 16 | 80000 | 8.8 | 3.02 | 80.81 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ocrnet/ocrnet_r101-d8_512x1024_80k_b16_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_r101-d8_512x1024_80k_b16_cityscapes/ocrnet_r101-d8_512x1024_80k_b16_cityscapes_20200723_192421-78688424.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_r101-d8_512x1024_80k_b16_cityscapes/ocrnet_r101-d8_512x1024_80k_b16_cityscapes_20200723_192421.log.json) | - -### ADE20K - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | ------------------ | --------- | ------: | -------- | -------------- | ----: | ------------: | --------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| OCRNet | HRNetV2p-W18-Small | 512x512 | 80000 | 6.7 | 28.98 | 35.06 | 35.80 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ocrnet/ocrnet_hr18s_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x512_80k_ade20k/ocrnet_hr18s_512x512_80k_ade20k_20200615_055600-e80b62af.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x512_80k_ade20k/ocrnet_hr18s_512x512_80k_ade20k_20200615_055600.log.json) | -| OCRNet | HRNetV2p-W18 | 512x512 | 80000 | 7.9 | 18.93 | 37.79 | 39.16 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ocrnet/ocrnet_hr18_512x512_80k_ade20k.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x512_80k_ade20k/ocrnet_hr18_512x512_80k_ade20k_20200615_053157-d173d83b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x512_80k_ade20k/ocrnet_hr18_512x512_80k_ade20k_20200615_053157.log.json) | -| OCRNet | HRNetV2p-W48 | 512x512 | 80000 | 11.2 | 16.99 | 43.00 | 44.30 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ocrnet/ocrnet_hr48_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x512_80k_ade20k/ocrnet_hr48_512x512_80k_ade20k_20200615_021518-d168c2d1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x512_80k_ade20k/ocrnet_hr48_512x512_80k_ade20k_20200615_021518.log.json) | -| OCRNet | HRNetV2p-W18-Small | 512x512 | 160000 | - | - | 37.19 | 38.40 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ocrnet/ocrnet_hr18s_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x512_160k_ade20k/ocrnet_hr18s_512x512_160k_ade20k_20200615_184505-8e913058.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x512_160k_ade20k/ocrnet_hr18s_512x512_160k_ade20k_20200615_184505.log.json) | -| OCRNet | HRNetV2p-W18 | 512x512 | 160000 | - | - | 39.32 | 40.80 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ocrnet/ocrnet_hr18_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x512_160k_ade20k/ocrnet_hr18_512x512_160k_ade20k_20200615_200940-d8fcd9d1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x512_160k_ade20k/ocrnet_hr18_512x512_160k_ade20k_20200615_200940.log.json) | -| OCRNet | HRNetV2p-W48 | 512x512 | 160000 | - | - | 43.25 | 44.88 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ocrnet/ocrnet_hr48_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x512_160k_ade20k/ocrnet_hr48_512x512_160k_ade20k_20200615_184705-a073726d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x512_160k_ade20k/ocrnet_hr48_512x512_160k_ade20k_20200615_184705.log.json) | - -### Pascal VOC 2012 + Aug - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | ------------------ | --------- | ------: | -------- | -------------- | ----: | ------------: | ---------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| OCRNet | HRNetV2p-W18-Small | 512x512 | 20000 | 3.5 | 31.55 | 71.70 | 73.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ocrnet/ocrnet_hr18s_512x512_20k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x512_20k_voc12aug/ocrnet_hr18s_512x512_20k_voc12aug_20200617_233913-02b04fcb.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x512_20k_voc12aug/ocrnet_hr18s_512x512_20k_voc12aug_20200617_233913.log.json) | -| OCRNet | HRNetV2p-W18 | 512x512 | 20000 | 4.7 | 19.91 | 74.75 | 77.11 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ocrnet/ocrnet_hr18_512x512_20k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x512_20k_voc12aug/ocrnet_hr18_512x512_20k_voc12aug_20200617_233932-8954cbb7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x512_20k_voc12aug/ocrnet_hr18_512x512_20k_voc12aug_20200617_233932.log.json) | -| OCRNet | HRNetV2p-W48 | 512x512 | 20000 | 8.1 | 17.83 | 77.72 | 79.87 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ocrnet/ocrnet_hr48_512x512_20k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x512_20k_voc12aug/ocrnet_hr48_512x512_20k_voc12aug_20200617_233932-9e82080a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x512_20k_voc12aug/ocrnet_hr48_512x512_20k_voc12aug_20200617_233932.log.json) | -| OCRNet | HRNetV2p-W18-Small | 512x512 | 40000 | - | - | 72.76 | 74.60 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ocrnet/ocrnet_hr18s_512x512_40k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x512_40k_voc12aug/ocrnet_hr18s_512x512_40k_voc12aug_20200614_002025-42b587ac.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x512_40k_voc12aug/ocrnet_hr18s_512x512_40k_voc12aug_20200614_002025.log.json) | -| OCRNet | HRNetV2p-W18 | 512x512 | 40000 | - | - | 74.98 | 77.40 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ocrnet/ocrnet_hr18_512x512_40k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x512_40k_voc12aug/ocrnet_hr18_512x512_40k_voc12aug_20200614_015958-714302be.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x512_40k_voc12aug/ocrnet_hr18_512x512_40k_voc12aug_20200614_015958.log.json) | -| OCRNet | HRNetV2p-W48 | 512x512 | 40000 | - | - | 77.14 | 79.71 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ocrnet/ocrnet_hr48_512x512_40k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x512_40k_voc12aug/ocrnet_hr48_512x512_40k_voc12aug_20200614_015958-255bc5ce.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x512_40k_voc12aug/ocrnet_hr48_512x512_40k_voc12aug_20200614_015958.log.json) | diff --git a/configs/ocrnet/metafile.yaml b/configs/ocrnet/metafile.yaml new file mode 100644 index 0000000000..5467feb975 --- /dev/null +++ b/configs/ocrnet/metafile.yaml @@ -0,0 +1,575 @@ +Collections: +- Name: OCRNet + License: Apache License 2.0 + Metadata: + Training Data: + - Cityscapes + - ADE20K + - Pascal VOC 2012 + Aug + Paper: + Title: Object-Contextual Representations for Semantic Segmentation + URL: https://arxiv.org/abs/1909.11065 + README: configs/ocrnet/README.md + Frameworks: + - PyTorch +Models: +- Name: ocrnet_hr18s_4xb2-40k_cityscapes-512x1024 + In Collection: OCRNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 76.61 + mIoU(ms+flip): 78.01 + Config:
configs/ocrnet/ocrnet_hr18s_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - HRNetV2p-W18-Small + - OCRNet + Training Resources: 4x A100 GPUS + Memory (GB): 3.5 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_4xb2-40k_cityscapes-512x1024/ocrnet_hr18s_4xb2-40k_cityscapes-512x1024_20230227_145026-6c052a14.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_4xb2-40k_cityscapes-512x1024/ocrnet_hr18s_4xb2-40k_cityscapes-512x1024_20230227_145026.json + Paper: + Title: Object-Contextual Representations for Semantic Segmentation + URL: https://arxiv.org/abs/1909.11065 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ocr_head.py#L86 + Framework: PyTorch +- Name: ocrnet_hr18_4xb2-40k_cityscapes-512x1024 + In Collection: OCRNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 77.72 + mIoU(ms+flip): 79.49 + Config: configs/ocrnet/ocrnet_hr18_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - HRNetV2p-W18 + - OCRNet + Training Resources: 4x V100 GPUS + Memory (GB): 4.7 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x1024_40k_cityscapes/ocrnet_hr18_512x1024_40k_cityscapes_20200601_033320-401c5bdd.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x1024_40k_cityscapes/ocrnet_hr18_512x1024_40k_cityscapes_20200601_033320.log.json + Paper: + Title: Object-Contextual Representations for Semantic Segmentation + URL: https://arxiv.org/abs/1909.11065 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ocr_head.py#L86 + Framework: PyTorch +- Name: ocrnet_hr48_4xb2-40k_cityscapes-512x1024 + In Collection: OCRNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 80.58 + mIoU(ms+flip): 81.79 + Config: configs/ocrnet/ocrnet_hr48_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - HRNetV2p-W48 + - OCRNet + Training Resources: 4x V100 GPUS + Memory (GB): 8.0 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x1024_40k_cityscapes/ocrnet_hr48_512x1024_40k_cityscapes_20200601_033336-55b32491.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x1024_40k_cityscapes/ocrnet_hr48_512x1024_40k_cityscapes_20200601_033336.log.json + Paper: + Title: Object-Contextual Representations for Semantic Segmentation + URL: https://arxiv.org/abs/1909.11065 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ocr_head.py#L86 + Framework: PyTorch +- Name: ocrnet_hr18s_4xb2-80k_cityscapes-512x1024 + In Collection: OCRNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 77.16 + mIoU(ms+flip): 78.66 + Config: configs/ocrnet/ocrnet_hr18s_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - HRNetV2p-W18-Small + - OCRNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x1024_80k_cityscapes/ocrnet_hr18s_512x1024_80k_cityscapes_20200601_222735-55979e63.pth + Training log:
https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x1024_80k_cityscapes/ocrnet_hr18s_512x1024_80k_cityscapes_20200601_222735.log.json + Paper: + Title: Object-Contextual Representations for Semantic Segmentation + URL: https://arxiv.org/abs/1909.11065 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ocr_head.py#L86 + Framework: PyTorch +- Name: ocrnet_hr18_4xb2-80k_cityscapes-512x1024 + In Collection: OCRNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.57 + mIoU(ms+flip): 80.46 + Config: configs/ocrnet/ocrnet_hr18_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - HRNetV2p-W18 + - OCRNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x1024_80k_cityscapes/ocrnet_hr18_512x1024_80k_cityscapes_20200614_230521-c2e1dd4a.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x1024_80k_cityscapes/ocrnet_hr18_512x1024_80k_cityscapes_20200614_230521.log.json + Paper: + Title: Object-Contextual Representations for Semantic Segmentation + URL: https://arxiv.org/abs/1909.11065 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ocr_head.py#L86 + Framework: PyTorch +- Name: ocrnet_hr48_4xb2-80k_cityscapes-512x1024 + In Collection: OCRNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 80.7 + mIoU(ms+flip): 81.87 + Config: configs/ocrnet/ocrnet_hr48_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - HRNetV2p-W48 + - OCRNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x1024_80k_cityscapes/ocrnet_hr48_512x1024_80k_cityscapes_20200601_222752-9076bcdf.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x1024_80k_cityscapes/ocrnet_hr48_512x1024_80k_cityscapes_20200601_222752.log.json + Paper: + Title: Object-Contextual Representations for Semantic Segmentation + URL: https://arxiv.org/abs/1909.11065 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ocr_head.py#L86 + Framework: PyTorch +- Name: ocrnet_hr18s_4xb2-160k_cityscapes-512x1024 + In Collection: OCRNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.45 + mIoU(ms+flip): 79.97 + Config: configs/ocrnet/ocrnet_hr18s_4xb2-160k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - HRNetV2p-W18-Small + - OCRNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x1024_160k_cityscapes/ocrnet_hr18s_512x1024_160k_cityscapes_20200602_191005-f4a7af28.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x1024_160k_cityscapes/ocrnet_hr18s_512x1024_160k_cityscapes_20200602_191005.log.json + Paper: + Title: Object-Contextual Representations for Semantic Segmentation + URL: https://arxiv.org/abs/1909.11065 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ocr_head.py#L86 + Framework: PyTorch +- Name: ocrnet_hr18_4xb2-160k_cityscapes-512x1024 + In Collection: OCRNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes +
Metrics: + mIoU: 79.47 + mIoU(ms+flip): 80.91 + Config: configs/ocrnet/ocrnet_hr18_4xb2-160k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - HRNetV2p-W18 + - OCRNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x1024_160k_cityscapes/ocrnet_hr18_512x1024_160k_cityscapes_20200602_191001-b9172d0c.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x1024_160k_cityscapes/ocrnet_hr18_512x1024_160k_cityscapes_20200602_191001.log.json + Paper: + Title: Object-Contextual Representations for Semantic Segmentation + URL: https://arxiv.org/abs/1909.11065 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ocr_head.py#L86 + Framework: PyTorch +- Name: ocrnet_hr48_4xb2-160k_cityscapes-512x1024 + In Collection: OCRNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 81.35 + mIoU(ms+flip): 82.7 + Config: configs/ocrnet/ocrnet_hr48_4xb2-160k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - HRNetV2p-W48 + - OCRNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x1024_160k_cityscapes/ocrnet_hr48_512x1024_160k_cityscapes_20200602_191037-dfbf1b0c.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x1024_160k_cityscapes/ocrnet_hr48_512x1024_160k_cityscapes_20200602_191037.log.json + Paper: + Title: Object-Contextual Representations for Semantic Segmentation + URL: https://arxiv.org/abs/1909.11065 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ocr_head.py#L86 + Framework: PyTorch +- Name: ocrnet_r101-d8_4xb2-40k_cityscapes-512x1024 + In Collection: OCRNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 80.09 + Config: configs/ocrnet/ocrnet_r101-d8_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - OCRNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_r101-d8_512x1024_40k_b8_cityscapes/ocrnet_r101-d8_512x1024_40k_b8_cityscapes_20200717_110721-02ac0f13.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_r101-d8_512x1024_40k_b8_cityscapes/ocrnet_r101-d8_512x1024_40k_b8_cityscapes_20200717_110721.log.json + Paper: + Title: Object-Contextual Representations for Semantic Segmentation + URL: https://arxiv.org/abs/1909.11065 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ocr_head.py#L86 + Framework: PyTorch +- Name: ocrnet_r101-d8_8xb2-40k_cityscapes-512x1024 + In Collection: OCRNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 80.3 + Config: configs/ocrnet/ocrnet_r101-d8_8xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 16 + Architecture: + - R-101-D8 + - OCRNet + Training Resources: 8x V100 GPUS + Memory (GB): 8.8 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_r101-d8_512x1024_40k_b16_cityscapes/ocrnet_r101-d8_512x1024_40k_b16_cityscapes_20200723_193726-db500f80.pth + Training log:
https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_r101-d8_512x1024_40k_b16_cityscapes/ocrnet_r101-d8_512x1024_40k_b16_cityscapes_20200723_193726.log.json + Paper: + Title: Object-Contextual Representations for Semantic Segmentation + URL: https://arxiv.org/abs/1909.11065 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ocr_head.py#L86 + Framework: PyTorch +- Name: ocrnet_r101-d8_8xb2-80k_cityscapes-512x1024 + In Collection: OCRNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 80.81 + Config: configs/ocrnet/ocrnet_r101-d8_8xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 16 + Architecture: + - R-101-D8 + - OCRNet + Training Resources: 8x V100 GPUS + Memory (GB): 8.8 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_r101-d8_512x1024_80k_b16_cityscapes/ocrnet_r101-d8_512x1024_80k_b16_cityscapes_20200723_192421-78688424.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_r101-d8_512x1024_80k_b16_cityscapes/ocrnet_r101-d8_512x1024_80k_b16_cityscapes_20200723_192421.log.json + Paper: + Title: Object-Contextual Representations for Semantic Segmentation + URL: https://arxiv.org/abs/1909.11065 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ocr_head.py#L86 + Framework: PyTorch +- Name: ocrnet_hr18s_4xb4-80k_ade20k-512x512 + In Collection: OCRNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 35.06 + mIoU(ms+flip): 35.8 + Config: configs/ocrnet/ocrnet_hr18s_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - HRNetV2p-W18-Small + - OCRNet + Training Resources: 4x V100 GPUS + Memory (GB): 6.7 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x512_80k_ade20k/ocrnet_hr18s_512x512_80k_ade20k_20200615_055600-e80b62af.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x512_80k_ade20k/ocrnet_hr18s_512x512_80k_ade20k_20200615_055600.log.json + Paper: + Title: Object-Contextual Representations for Semantic Segmentation + URL: https://arxiv.org/abs/1909.11065 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ocr_head.py#L86 + Framework: PyTorch +- Name: ocrnet_hr18_4xb4-80k_ade20k-512x512 + In Collection: OCRNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 37.79 + mIoU(ms+flip): 39.16 + Config: configs/ocrnet/ocrnet_hr18_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - HRNetV2p-W18 + - OCRNet + Training Resources: 4x V100 GPUS + Memory (GB): 7.9 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x512_80k_ade20k/ocrnet_hr18_512x512_80k_ade20k_20200615_053157-d173d83b.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x512_80k_ade20k/ocrnet_hr18_512x512_80k_ade20k_20200615_053157.log.json + Paper: + Title: Object-Contextual Representations for Semantic Segmentation + URL: https://arxiv.org/abs/1909.11065 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ocr_head.py#L86 + Framework: PyTorch +- Name: ocrnet_hr48_4xb4-80k_ade20k-512x512 + In Collection: OCRNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 43.0 + mIoU(ms+flip): 44.3 + Config:
configs/ocrnet/ocrnet_hr48_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - HRNetV2p-W48 + - OCRNet + Training Resources: 4x V100 GPUS + Memory (GB): 11.2 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x512_80k_ade20k/ocrnet_hr48_512x512_80k_ade20k_20200615_021518-d168c2d1.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x512_80k_ade20k/ocrnet_hr48_512x512_80k_ade20k_20200615_021518.log.json + Paper: + Title: Object-Contextual Representations for Semantic Segmentation + URL: https://arxiv.org/abs/1909.11065 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ocr_head.py#L86 + Framework: PyTorch +- Name: ocrnet_hr18s_4xb4-160k_ade20k-512x512 + In Collection: OCRNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 37.19 + mIoU(ms+flip): 38.4 + Config: configs/ocrnet/ocrnet_hr18s_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - HRNetV2p-W18-Small + - OCRNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x512_160k_ade20k/ocrnet_hr18s_512x512_160k_ade20k_20200615_184505-8e913058.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x512_160k_ade20k/ocrnet_hr18s_512x512_160k_ade20k_20200615_184505.log.json + Paper: + Title: Object-Contextual Representations for Semantic Segmentation + URL: https://arxiv.org/abs/1909.11065 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ocr_head.py#L86 + Framework: PyTorch +- Name: ocrnet_hr18_4xb4-160k_ade20k-512x512 + In Collection: OCRNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 39.32 + mIoU(ms+flip): 40.8 + Config: configs/ocrnet/ocrnet_hr18_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - HRNetV2p-W18 + - OCRNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x512_160k_ade20k/ocrnet_hr18_512x512_160k_ade20k_20200615_200940-d8fcd9d1.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x512_160k_ade20k/ocrnet_hr18_512x512_160k_ade20k_20200615_200940.log.json + Paper: + Title: Object-Contextual Representations for Semantic Segmentation + URL: https://arxiv.org/abs/1909.11065 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ocr_head.py#L86 + Framework: PyTorch +- Name: ocrnet_hr48_4xb4-160k_ade20k-512x512 + In Collection: OCRNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 43.25 + mIoU(ms+flip): 44.88 + Config: configs/ocrnet/ocrnet_hr48_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - HRNetV2p-W48 + - OCRNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x512_160k_ade20k/ocrnet_hr48_512x512_160k_ade20k_20200615_184705-a073726d.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x512_160k_ade20k/ocrnet_hr48_512x512_160k_ade20k_20200615_184705.log.json + Paper: + Title: Object-Contextual Representations for Semantic Segmentation + URL: https://arxiv.org/abs/1909.11065 + Code: 
https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ocr_head.py#L86 + Framework: PyTorch +- Name: ocrnet_hr18s_4xb4-20k_voc12aug-512x512 + In Collection: OCRNet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 71.7 + mIoU(ms+flip): 73.84 + Config: configs/ocrnet/ocrnet_hr18s_4xb4-20k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - HRNetV2p-W18-Small + - OCRNet + Training Resources: 4x V100 GPUS + Memory (GB): 3.5 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x512_20k_voc12aug/ocrnet_hr18s_512x512_20k_voc12aug_20200617_233913-02b04fcb.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x512_20k_voc12aug/ocrnet_hr18s_512x512_20k_voc12aug_20200617_233913.log.json + Paper: + Title: Object-Contextual Representations for Semantic Segmentation + URL: https://arxiv.org/abs/1909.11065 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ocr_head.py#L86 + Framework: PyTorch +- Name: ocrnet_hr18_4xb4-20k_voc12aug-512x512 + In Collection: OCRNet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 74.75 + mIoU(ms+flip): 77.11 + Config: configs/ocrnet/ocrnet_hr18_4xb4-20k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - HRNetV2p-W18 + - OCRNet + Training Resources: 4x V100 GPUS + Memory (GB): 4.7 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x512_20k_voc12aug/ocrnet_hr18_512x512_20k_voc12aug_20200617_233932-8954cbb7.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x512_20k_voc12aug/ocrnet_hr18_512x512_20k_voc12aug_20200617_233932.log.json + Paper: + Title: Object-Contextual Representations for Semantic Segmentation + URL: https://arxiv.org/abs/1909.11065 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ocr_head.py#L86 + Framework: PyTorch +- Name: ocrnet_hr48_4xb4-20k_voc12aug-512x512 + In Collection: OCRNet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 77.72 + mIoU(ms+flip): 79.87 + Config: configs/ocrnet/ocrnet_hr48_4xb4-20k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - HRNetV2p-W48 + - OCRNet + Training Resources: 4x V100 GPUS + Memory (GB): 8.1 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x512_20k_voc12aug/ocrnet_hr48_512x512_20k_voc12aug_20200617_233932-9e82080a.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x512_20k_voc12aug/ocrnet_hr48_512x512_20k_voc12aug_20200617_233932.log.json + Paper: + Title: Object-Contextual Representations for Semantic Segmentation + URL: https://arxiv.org/abs/1909.11065 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ocr_head.py#L86 + Framework: PyTorch +- Name: ocrnet_hr18s_4xb4-40k_voc12aug-512x512 + In Collection: OCRNet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 72.76 + mIoU(ms+flip): 74.6 + Config: configs/ocrnet/ocrnet_hr18s_4xb4-40k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - HRNetV2p-W18-Small + - OCRNet + Training Resources: 4x V100 GPUS + 
Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x512_40k_voc12aug/ocrnet_hr18s_512x512_40k_voc12aug_20200614_002025-42b587ac.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x512_40k_voc12aug/ocrnet_hr18s_512x512_40k_voc12aug_20200614_002025.log.json + Paper: + Title: Object-Contextual Representations for Semantic Segmentation + URL: https://arxiv.org/abs/1909.11065 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ocr_head.py#L86 + Framework: PyTorch +- Name: ocrnet_hr18_4xb4-40k_voc12aug-512x512 + In Collection: OCRNet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 74.98 + mIoU(ms+flip): 77.4 + Config: configs/ocrnet/ocrnet_hr18_4xb4-40k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - HRNetV2p-W18 + - OCRNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x512_40k_voc12aug/ocrnet_hr18_512x512_40k_voc12aug_20200614_015958-714302be.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x512_40k_voc12aug/ocrnet_hr18_512x512_40k_voc12aug_20200614_015958.log.json + Paper: + Title: Object-Contextual Representations for Semantic Segmentation + URL: https://arxiv.org/abs/1909.11065 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ocr_head.py#L86 + Framework: PyTorch +- Name: ocrnet_hr48_4xb4-40k_voc12aug-512x512 + In Collection: OCRNet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 77.14 + mIoU(ms+flip): 79.71 + Config: configs/ocrnet/ocrnet_hr48_4xb4-40k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - HRNetV2p-W48 + - OCRNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x512_40k_voc12aug/ocrnet_hr48_512x512_40k_voc12aug_20200614_015958-255bc5ce.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x512_40k_voc12aug/ocrnet_hr48_512x512_40k_voc12aug_20200614_015958.log.json + Paper: + Title: Object-Contextual Representations for Semantic Segmentation + URL: https://arxiv.org/abs/1909.11065 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ocr_head.py#L86 + Framework: PyTorch diff --git a/configs/ocrnet/ocrnet.yml b/configs/ocrnet/ocrnet.yml deleted file mode 100644 index d599f0a533..0000000000 --- a/configs/ocrnet/ocrnet.yml +++ /dev/null @@ -1,438 +0,0 @@ -Collections: -- Name: OCRNet - Metadata: - Training Data: - - Cityscapes - - ADE20K - - Pascal VOC 2012 + Aug - Paper: - URL: https://arxiv.org/abs/1909.11065 - Title: Object-Contextual Representations for Semantic Segmentation - README: configs/ocrnet/README.md - Code: - URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/ocr_head.py#L86 - Version: v0.17.0 - Converted From: - Code: https://github.com/openseg-group/OCNet.pytorch -Models: -- Name: ocrnet_hr18s_512x1024_40k_cityscapes - In Collection: OCRNet - Metadata: - backbone: HRNetV2p-W18-Small - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 95.69 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 3.5 - Results: - - Task: Semantic 
Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 74.3 - mIoU(ms+flip): 75.95 - Config: configs/ocrnet/ocrnet_hr18s_512x1024_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x1024_40k_cityscapes/ocrnet_hr18s_512x1024_40k_cityscapes_20200601_033304-fa2436c2.pth -- Name: ocrnet_hr18_512x1024_40k_cityscapes - In Collection: OCRNet - Metadata: - backbone: HRNetV2p-W18 - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 133.33 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 4.7 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 77.72 - mIoU(ms+flip): 79.49 - Config: configs/ocrnet/ocrnet_hr18_512x1024_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x1024_40k_cityscapes/ocrnet_hr18_512x1024_40k_cityscapes_20200601_033320-401c5bdd.pth -- Name: ocrnet_hr48_512x1024_40k_cityscapes - In Collection: OCRNet - Metadata: - backbone: HRNetV2p-W48 - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 236.97 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 8.0 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 80.58 - mIoU(ms+flip): 81.79 - Config: configs/ocrnet/ocrnet_hr48_512x1024_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x1024_40k_cityscapes/ocrnet_hr48_512x1024_40k_cityscapes_20200601_033336-55b32491.pth -- Name: ocrnet_hr18s_512x1024_80k_cityscapes - In Collection: OCRNet - Metadata: - backbone: HRNetV2p-W18-Small - crop size: (512,1024) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 77.16 - mIoU(ms+flip): 78.66 - Config: configs/ocrnet/ocrnet_hr18s_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x1024_80k_cityscapes/ocrnet_hr18s_512x1024_80k_cityscapes_20200601_222735-55979e63.pth -- Name: ocrnet_hr18_512x1024_80k_cityscapes - In Collection: OCRNet - Metadata: - backbone: HRNetV2p-W18 - crop size: (512,1024) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.57 - mIoU(ms+flip): 80.46 - Config: configs/ocrnet/ocrnet_hr18_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x1024_80k_cityscapes/ocrnet_hr18_512x1024_80k_cityscapes_20200614_230521-c2e1dd4a.pth -- Name: ocrnet_hr48_512x1024_80k_cityscapes - In Collection: OCRNet - Metadata: - backbone: HRNetV2p-W48 - crop size: (512,1024) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 80.7 - mIoU(ms+flip): 81.87 - Config: configs/ocrnet/ocrnet_hr48_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x1024_80k_cityscapes/ocrnet_hr48_512x1024_80k_cityscapes_20200601_222752-9076bcdf.pth -- Name: ocrnet_hr18s_512x1024_160k_cityscapes - In Collection: OCRNet - Metadata: - backbone: HRNetV2p-W18-Small - crop size: (512,1024) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.45 - mIoU(ms+flip): 79.97 - Config: configs/ocrnet/ocrnet_hr18s_512x1024_160k_cityscapes.py - Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x1024_160k_cityscapes/ocrnet_hr18s_512x1024_160k_cityscapes_20200602_191005-f4a7af28.pth -- Name: ocrnet_hr18_512x1024_160k_cityscapes - In Collection: OCRNet - Metadata: - backbone: HRNetV2p-W18 - crop size: (512,1024) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.47 - mIoU(ms+flip): 80.91 - Config: configs/ocrnet/ocrnet_hr18_512x1024_160k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x1024_160k_cityscapes/ocrnet_hr18_512x1024_160k_cityscapes_20200602_191001-b9172d0c.pth -- Name: ocrnet_hr48_512x1024_160k_cityscapes - In Collection: OCRNet - Metadata: - backbone: HRNetV2p-W48 - crop size: (512,1024) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 81.35 - mIoU(ms+flip): 82.7 - Config: configs/ocrnet/ocrnet_hr48_512x1024_160k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x1024_160k_cityscapes/ocrnet_hr48_512x1024_160k_cityscapes_20200602_191037-dfbf1b0c.pth -- Name: ocrnet_r101-d8_512x1024_40k_b8_cityscapes - In Collection: OCRNet - Metadata: - backbone: R-101-D8 - crop size: (512,1024) - lr schd: 40000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 80.09 - Config: configs/ocrnet/ocrnet_r101-d8_512x1024_40k_b8_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_r101-d8_512x1024_40k_b8_cityscapes/ocrnet_r101-d8_512x1024_40k_b8_cityscapes_20200717_110721-02ac0f13.pth -- Name: ocrnet_r101-d8_512x1024_40k_b16_cityscapes - In Collection: OCRNet - Metadata: - backbone: R-101-D8 - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 331.13 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 8.8 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 80.3 - Config: configs/ocrnet/ocrnet_r101-d8_512x1024_40k_b16_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_r101-d8_512x1024_40k_b16_cityscapes/ocrnet_r101-d8_512x1024_40k_b16_cityscapes_20200723_193726-db500f80.pth -- Name: ocrnet_r101-d8_512x1024_80k_b16_cityscapes - In Collection: OCRNet - Metadata: - backbone: R-101-D8 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 331.13 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 8.8 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 80.81 - Config: configs/ocrnet/ocrnet_r101-d8_512x1024_80k_b16_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_r101-d8_512x1024_80k_b16_cityscapes/ocrnet_r101-d8_512x1024_80k_b16_cityscapes_20200723_192421-78688424.pth -- Name: ocrnet_hr18s_512x512_80k_ade20k - In Collection: OCRNet - Metadata: - backbone: HRNetV2p-W18-Small - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 34.51 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 6.7 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 35.06 - mIoU(ms+flip): 35.8 - Config: configs/ocrnet/ocrnet_hr18s_512x512_80k_ade20k.py - Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x512_80k_ade20k/ocrnet_hr18s_512x512_80k_ade20k_20200615_055600-e80b62af.pth -- Name: ocrnet_hr18_512x512_80k_ade20k - In Collection: OCRNet - Metadata: - backbone: HRNetV2p-W18 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 52.83 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 7.9 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 37.79 - mIoU(ms+flip): 39.16 - Config: configs/ocrnet/ocrnet_hr18_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x512_80k_ade20k/ocrnet_hr18_512x512_80k_ade20k_20200615_053157-d173d83b.pth -- Name: ocrnet_hr48_512x512_80k_ade20k - In Collection: OCRNet - Metadata: - backbone: HRNetV2p-W48 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 58.86 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 11.2 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 43.0 - mIoU(ms+flip): 44.3 - Config: configs/ocrnet/ocrnet_hr48_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x512_80k_ade20k/ocrnet_hr48_512x512_80k_ade20k_20200615_021518-d168c2d1.pth -- Name: ocrnet_hr18s_512x512_160k_ade20k - In Collection: OCRNet - Metadata: - backbone: HRNetV2p-W18-Small - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 37.19 - mIoU(ms+flip): 38.4 - Config: configs/ocrnet/ocrnet_hr18s_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x512_160k_ade20k/ocrnet_hr18s_512x512_160k_ade20k_20200615_184505-8e913058.pth -- Name: ocrnet_hr18_512x512_160k_ade20k - In Collection: OCRNet - Metadata: - backbone: HRNetV2p-W18 - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 39.32 - mIoU(ms+flip): 40.8 - Config: configs/ocrnet/ocrnet_hr18_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x512_160k_ade20k/ocrnet_hr18_512x512_160k_ade20k_20200615_200940-d8fcd9d1.pth -- Name: ocrnet_hr48_512x512_160k_ade20k - In Collection: OCRNet - Metadata: - backbone: HRNetV2p-W48 - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 43.25 - mIoU(ms+flip): 44.88 - Config: configs/ocrnet/ocrnet_hr48_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x512_160k_ade20k/ocrnet_hr48_512x512_160k_ade20k_20200615_184705-a073726d.pth -- Name: ocrnet_hr18s_512x512_20k_voc12aug - In Collection: OCRNet - Metadata: - backbone: HRNetV2p-W18-Small - crop size: (512,512) - lr schd: 20000 - inference time (ms/im): - - value: 31.7 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 3.5 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 71.7 - mIoU(ms+flip): 73.84 - Config: configs/ocrnet/ocrnet_hr18s_512x512_20k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x512_20k_voc12aug/ocrnet_hr18s_512x512_20k_voc12aug_20200617_233913-02b04fcb.pth -- Name: ocrnet_hr18_512x512_20k_voc12aug - In 
Collection: OCRNet - Metadata: - backbone: HRNetV2p-W18 - crop size: (512,512) - lr schd: 20000 - inference time (ms/im): - - value: 50.23 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 4.7 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 74.75 - mIoU(ms+flip): 77.11 - Config: configs/ocrnet/ocrnet_hr18_512x512_20k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x512_20k_voc12aug/ocrnet_hr18_512x512_20k_voc12aug_20200617_233932-8954cbb7.pth -- Name: ocrnet_hr48_512x512_20k_voc12aug - In Collection: OCRNet - Metadata: - backbone: HRNetV2p-W48 - crop size: (512,512) - lr schd: 20000 - inference time (ms/im): - - value: 56.09 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 8.1 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 77.72 - mIoU(ms+flip): 79.87 - Config: configs/ocrnet/ocrnet_hr48_512x512_20k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x512_20k_voc12aug/ocrnet_hr48_512x512_20k_voc12aug_20200617_233932-9e82080a.pth -- Name: ocrnet_hr18s_512x512_40k_voc12aug - In Collection: OCRNet - Metadata: - backbone: HRNetV2p-W18-Small - crop size: (512,512) - lr schd: 40000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 72.76 - mIoU(ms+flip): 74.6 - Config: configs/ocrnet/ocrnet_hr18s_512x512_40k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18s_512x512_40k_voc12aug/ocrnet_hr18s_512x512_40k_voc12aug_20200614_002025-42b587ac.pth -- Name: ocrnet_hr18_512x512_40k_voc12aug - In Collection: OCRNet - Metadata: - backbone: HRNetV2p-W18 - crop size: (512,512) - lr schd: 40000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 74.98 - mIoU(ms+flip): 77.4 - Config: configs/ocrnet/ocrnet_hr18_512x512_40k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr18_512x512_40k_voc12aug/ocrnet_hr18_512x512_40k_voc12aug_20200614_015958-714302be.pth -- Name: ocrnet_hr48_512x512_40k_voc12aug - In Collection: OCRNet - Metadata: - backbone: HRNetV2p-W48 - crop size: (512,512) - lr schd: 40000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 77.14 - mIoU(ms+flip): 79.71 - Config: configs/ocrnet/ocrnet_hr48_512x512_40k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/ocrnet/ocrnet_hr48_512x512_40k_voc12aug/ocrnet_hr48_512x512_40k_voc12aug_20200614_015958-255bc5ce.pth diff --git a/configs/ocrnet/ocrnet_hr18_512x1024_160k_cityscapes.py b/configs/ocrnet/ocrnet_hr18_4xb2-160k_cityscapes-512x1024.py similarity index 100% rename from configs/ocrnet/ocrnet_hr18_512x1024_160k_cityscapes.py rename to configs/ocrnet/ocrnet_hr18_4xb2-160k_cityscapes-512x1024.py diff --git a/configs/ocrnet/ocrnet_hr18_512x1024_40k_cityscapes.py b/configs/ocrnet/ocrnet_hr18_4xb2-40k_cityscapes-512x1024.py similarity index 100% rename from configs/ocrnet/ocrnet_hr18_512x1024_40k_cityscapes.py rename to configs/ocrnet/ocrnet_hr18_4xb2-40k_cityscapes-512x1024.py diff --git a/configs/ocrnet/ocrnet_hr18_512x1024_80k_cityscapes.py b/configs/ocrnet/ocrnet_hr18_4xb2-80k_cityscapes-512x1024.py similarity index 100% rename from configs/ocrnet/ocrnet_hr18_512x1024_80k_cityscapes.py rename to 
configs/ocrnet/ocrnet_hr18_4xb2-80k_cityscapes-512x1024.py diff --git a/configs/ocrnet/ocrnet_hr18_512x512_160k_ade20k.py b/configs/ocrnet/ocrnet_hr18_4xb4-160k_ade20k-512x512.py similarity index 100% rename from configs/ocrnet/ocrnet_hr18_512x512_160k_ade20k.py rename to configs/ocrnet/ocrnet_hr18_4xb4-160k_ade20k-512x512.py diff --git a/configs/ocrnet/ocrnet_hr18_512x512_20k_voc12aug.py b/configs/ocrnet/ocrnet_hr18_4xb4-20k_voc12aug-512x512.py similarity index 100% rename from configs/ocrnet/ocrnet_hr18_512x512_20k_voc12aug.py rename to configs/ocrnet/ocrnet_hr18_4xb4-20k_voc12aug-512x512.py diff --git a/configs/ocrnet/ocrnet_hr18_512x512_40k_voc12aug.py b/configs/ocrnet/ocrnet_hr18_4xb4-40k_voc12aug-512x512.py similarity index 100% rename from configs/ocrnet/ocrnet_hr18_512x512_40k_voc12aug.py rename to configs/ocrnet/ocrnet_hr18_4xb4-40k_voc12aug-512x512.py diff --git a/configs/ocrnet/ocrnet_hr18_512x512_80k_ade20k.py b/configs/ocrnet/ocrnet_hr18_4xb4-80k_ade20k-512x512.py similarity index 100% rename from configs/ocrnet/ocrnet_hr18_512x512_80k_ade20k.py rename to configs/ocrnet/ocrnet_hr18_4xb4-80k_ade20k-512x512.py diff --git a/configs/ocrnet/ocrnet_hr18s_4xb2-160k_cityscapes-512x1024.py b/configs/ocrnet/ocrnet_hr18s_4xb2-160k_cityscapes-512x1024.py new file mode 100644 index 0000000000..c5388fb751 --- /dev/null +++ b/configs/ocrnet/ocrnet_hr18s_4xb2-160k_cityscapes-512x1024.py @@ -0,0 +1,9 @@ +_base_ = './ocrnet_hr18_4xb2-160k_cityscapes-512x1024.py' +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w18_small', + backbone=dict( + extra=dict( + stage1=dict(num_blocks=(2, )), + stage2=dict(num_blocks=(2, 2)), + stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), + stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/ocrnet/ocrnet_hr18s_4xb2-40k_cityscapes-512x1024.py b/configs/ocrnet/ocrnet_hr18s_4xb2-40k_cityscapes-512x1024.py new file mode 100644 index 0000000000..2335f3b762 --- /dev/null +++ b/configs/ocrnet/ocrnet_hr18s_4xb2-40k_cityscapes-512x1024.py @@ -0,0 +1,9 @@ +_base_ = './ocrnet_hr18_4xb2-40k_cityscapes-512x1024.py' +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w18_small', + backbone=dict( + extra=dict( + stage1=dict(num_blocks=(2, )), + stage2=dict(num_blocks=(2, 2)), + stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), + stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/ocrnet/ocrnet_hr18s_4xb2-80k_cityscapes-512x1024.py b/configs/ocrnet/ocrnet_hr18s_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..b2d1a8fa84 --- /dev/null +++ b/configs/ocrnet/ocrnet_hr18s_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,9 @@ +_base_ = './ocrnet_hr18_4xb2-80k_cityscapes-512x1024.py' +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w18_small', + backbone=dict( + extra=dict( + stage1=dict(num_blocks=(2, )), + stage2=dict(num_blocks=(2, 2)), + stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), + stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/ocrnet/ocrnet_hr18s_4xb4-160k_ade20k-512x512.py b/configs/ocrnet/ocrnet_hr18s_4xb4-160k_ade20k-512x512.py new file mode 100644 index 0000000000..fabf5826cd --- /dev/null +++ b/configs/ocrnet/ocrnet_hr18s_4xb4-160k_ade20k-512x512.py @@ -0,0 +1,9 @@ +_base_ = './ocrnet_hr18_4xb4-160k_ade20k-512x512.py' +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w18_small', + backbone=dict( + extra=dict( + stage1=dict(num_blocks=(2, )), + stage2=dict(num_blocks=(2, 2)), + stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), + stage4=dict(num_modules=2, 
num_blocks=(2, 2, 2, 2))))) diff --git a/configs/ocrnet/ocrnet_hr18s_4xb4-20k_voc12aug-512x512.py b/configs/ocrnet/ocrnet_hr18s_4xb4-20k_voc12aug-512x512.py new file mode 100644 index 0000000000..0eca655cfc --- /dev/null +++ b/configs/ocrnet/ocrnet_hr18s_4xb4-20k_voc12aug-512x512.py @@ -0,0 +1,9 @@ +_base_ = './ocrnet_hr18_4xb4-20k_voc12aug-512x512.py' +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w18_small', + backbone=dict( + extra=dict( + stage1=dict(num_blocks=(2, )), + stage2=dict(num_blocks=(2, 2)), + stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), + stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/ocrnet/ocrnet_hr18s_4xb4-40k_voc12aug-512x512.py b/configs/ocrnet/ocrnet_hr18s_4xb4-40k_voc12aug-512x512.py new file mode 100644 index 0000000000..13b02b9df6 --- /dev/null +++ b/configs/ocrnet/ocrnet_hr18s_4xb4-40k_voc12aug-512x512.py @@ -0,0 +1,9 @@ +_base_ = './ocrnet_hr18_4xb4-40k_voc12aug-512x512.py' +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w18_small', + backbone=dict( + extra=dict( + stage1=dict(num_blocks=(2, )), + stage2=dict(num_blocks=(2, 2)), + stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), + stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/ocrnet/ocrnet_hr18s_4xb4-80k_ade20k-512x512.py b/configs/ocrnet/ocrnet_hr18s_4xb4-80k_ade20k-512x512.py new file mode 100644 index 0000000000..60c79c2dc5 --- /dev/null +++ b/configs/ocrnet/ocrnet_hr18s_4xb4-80k_ade20k-512x512.py @@ -0,0 +1,9 @@ +_base_ = './ocrnet_hr18_4xb4-80k_ade20k-512x512.py' +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w18_small', + backbone=dict( + extra=dict( + stage1=dict(num_blocks=(2, )), + stage2=dict(num_blocks=(2, 2)), + stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), + stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/ocrnet/ocrnet_hr18s_512x1024_160k_cityscapes.py b/configs/ocrnet/ocrnet_hr18s_512x1024_160k_cityscapes.py deleted file mode 100644 index fc7909785f..0000000000 --- a/configs/ocrnet/ocrnet_hr18s_512x1024_160k_cityscapes.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './ocrnet_hr18_512x1024_160k_cityscapes.py' -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w18_small', - backbone=dict( - extra=dict( - stage1=dict(num_blocks=(2, )), - stage2=dict(num_blocks=(2, 2)), - stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), - stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/ocrnet/ocrnet_hr18s_512x1024_40k_cityscapes.py b/configs/ocrnet/ocrnet_hr18s_512x1024_40k_cityscapes.py deleted file mode 100644 index 923731f74f..0000000000 --- a/configs/ocrnet/ocrnet_hr18s_512x1024_40k_cityscapes.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './ocrnet_hr18_512x1024_40k_cityscapes.py' -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w18_small', - backbone=dict( - extra=dict( - stage1=dict(num_blocks=(2, )), - stage2=dict(num_blocks=(2, 2)), - stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), - stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/ocrnet/ocrnet_hr18s_512x1024_80k_cityscapes.py b/configs/ocrnet/ocrnet_hr18s_512x1024_80k_cityscapes.py deleted file mode 100644 index be6bf16a2f..0000000000 --- a/configs/ocrnet/ocrnet_hr18s_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './ocrnet_hr18_512x1024_80k_cityscapes.py' -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w18_small', - backbone=dict( - extra=dict( - stage1=dict(num_blocks=(2, )), - stage2=dict(num_blocks=(2, 2)), - stage3=dict(num_modules=3, num_blocks=(2, 
2, 2)), - stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/ocrnet/ocrnet_hr18s_512x512_160k_ade20k.py b/configs/ocrnet/ocrnet_hr18s_512x512_160k_ade20k.py deleted file mode 100644 index 81f3d5cb91..0000000000 --- a/configs/ocrnet/ocrnet_hr18s_512x512_160k_ade20k.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './ocrnet_hr18_512x512_160k_ade20k.py' -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w18_small', - backbone=dict( - extra=dict( - stage1=dict(num_blocks=(2, )), - stage2=dict(num_blocks=(2, 2)), - stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), - stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/ocrnet/ocrnet_hr18s_512x512_20k_voc12aug.py b/configs/ocrnet/ocrnet_hr18s_512x512_20k_voc12aug.py deleted file mode 100644 index ceb944815b..0000000000 --- a/configs/ocrnet/ocrnet_hr18s_512x512_20k_voc12aug.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './ocrnet_hr18_512x512_20k_voc12aug.py' -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w18_small', - backbone=dict( - extra=dict( - stage1=dict(num_blocks=(2, )), - stage2=dict(num_blocks=(2, 2)), - stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), - stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/ocrnet/ocrnet_hr18s_512x512_40k_voc12aug.py b/configs/ocrnet/ocrnet_hr18s_512x512_40k_voc12aug.py deleted file mode 100644 index 70babc91c9..0000000000 --- a/configs/ocrnet/ocrnet_hr18s_512x512_40k_voc12aug.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './ocrnet_hr18_512x512_40k_voc12aug.py' -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w18_small', - backbone=dict( - extra=dict( - stage1=dict(num_blocks=(2, )), - stage2=dict(num_blocks=(2, 2)), - stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), - stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/ocrnet/ocrnet_hr18s_512x512_80k_ade20k.py b/configs/ocrnet/ocrnet_hr18s_512x512_80k_ade20k.py deleted file mode 100644 index 36e77219ac..0000000000 --- a/configs/ocrnet/ocrnet_hr18s_512x512_80k_ade20k.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './ocrnet_hr18_512x512_80k_ade20k.py' -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w18_small', - backbone=dict( - extra=dict( - stage1=dict(num_blocks=(2, )), - stage2=dict(num_blocks=(2, 2)), - stage3=dict(num_modules=3, num_blocks=(2, 2, 2)), - stage4=dict(num_modules=2, num_blocks=(2, 2, 2, 2))))) diff --git a/configs/ocrnet/ocrnet_hr48_4xb2-160k_cityscapes-512x1024.py b/configs/ocrnet/ocrnet_hr48_4xb2-160k_cityscapes-512x1024.py new file mode 100644 index 0000000000..184d38dd2c --- /dev/null +++ b/configs/ocrnet/ocrnet_hr48_4xb2-160k_cityscapes-512x1024.py @@ -0,0 +1,39 @@ +_base_ = './ocrnet_hr18_4xb2-160k_cityscapes-512x1024.py' +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w48', + backbone=dict( + extra=dict( + stage2=dict(num_channels=(48, 96)), + stage3=dict(num_channels=(48, 96, 192)), + stage4=dict(num_channels=(48, 96, 192, 384)))), + decode_head=[ + dict( + type='FCNHead', + in_channels=[48, 96, 192, 384], + channels=sum([48, 96, 192, 384]), + input_transform='resize_concat', + in_index=(0, 1, 2, 3), + kernel_size=1, + num_convs=1, + norm_cfg=norm_cfg, + concat_input=False, + dropout_ratio=-1, + num_classes=19, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + dict( + type='OCRHead', + in_channels=[48, 96, 192, 384], + channels=512, + ocr_channels=256, + input_transform='resize_concat', + in_index=(0, 1, 2, 
3), + norm_cfg=norm_cfg, + dropout_ratio=-1, + num_classes=19, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)) + ]) diff --git a/configs/ocrnet/ocrnet_hr48_4xb2-40k_cityscapes-512x1024.py b/configs/ocrnet/ocrnet_hr48_4xb2-40k_cityscapes-512x1024.py new file mode 100644 index 0000000000..7025ee9e77 --- /dev/null +++ b/configs/ocrnet/ocrnet_hr48_4xb2-40k_cityscapes-512x1024.py @@ -0,0 +1,39 @@ +_base_ = './ocrnet_hr18_4xb2-40k_cityscapes-512x1024.py' +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w48', + backbone=dict( + extra=dict( + stage2=dict(num_channels=(48, 96)), + stage3=dict(num_channels=(48, 96, 192)), + stage4=dict(num_channels=(48, 96, 192, 384)))), + decode_head=[ + dict( + type='FCNHead', + in_channels=[48, 96, 192, 384], + channels=sum([48, 96, 192, 384]), + input_transform='resize_concat', + in_index=(0, 1, 2, 3), + kernel_size=1, + num_convs=1, + norm_cfg=norm_cfg, + concat_input=False, + dropout_ratio=-1, + num_classes=19, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + dict( + type='OCRHead', + in_channels=[48, 96, 192, 384], + channels=512, + ocr_channels=256, + input_transform='resize_concat', + in_index=(0, 1, 2, 3), + norm_cfg=norm_cfg, + dropout_ratio=-1, + num_classes=19, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)) + ]) diff --git a/configs/ocrnet/ocrnet_hr48_4xb2-80k_cityscapes-512x1024.py b/configs/ocrnet/ocrnet_hr48_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..9c68a15fc5 --- /dev/null +++ b/configs/ocrnet/ocrnet_hr48_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,39 @@ +_base_ = './ocrnet_hr18_4xb2-80k_cityscapes-512x1024.py' +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w48', + backbone=dict( + extra=dict( + stage2=dict(num_channels=(48, 96)), + stage3=dict(num_channels=(48, 96, 192)), + stage4=dict(num_channels=(48, 96, 192, 384)))), + decode_head=[ + dict( + type='FCNHead', + in_channels=[48, 96, 192, 384], + channels=sum([48, 96, 192, 384]), + input_transform='resize_concat', + in_index=(0, 1, 2, 3), + kernel_size=1, + num_convs=1, + norm_cfg=norm_cfg, + concat_input=False, + dropout_ratio=-1, + num_classes=19, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + dict( + type='OCRHead', + in_channels=[48, 96, 192, 384], + channels=512, + ocr_channels=256, + input_transform='resize_concat', + in_index=(0, 1, 2, 3), + norm_cfg=norm_cfg, + dropout_ratio=-1, + num_classes=19, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)) + ]) diff --git a/configs/ocrnet/ocrnet_hr48_4xb4-160k_ade20k-512x512.py b/configs/ocrnet/ocrnet_hr48_4xb4-160k_ade20k-512x512.py new file mode 100644 index 0000000000..e74976c805 --- /dev/null +++ b/configs/ocrnet/ocrnet_hr48_4xb4-160k_ade20k-512x512.py @@ -0,0 +1,39 @@ +_base_ = './ocrnet_hr18_4xb4-160k_ade20k-512x512.py' +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w48', + backbone=dict( + extra=dict( + stage2=dict(num_channels=(48, 96)), + stage3=dict(num_channels=(48, 96, 192)), + stage4=dict(num_channels=(48, 96, 192, 384)))), + decode_head=[ + dict( + type='FCNHead', + in_channels=[48, 96, 192, 384], + channels=sum([48, 96, 192, 384]), + 
input_transform='resize_concat', + in_index=(0, 1, 2, 3), + kernel_size=1, + num_convs=1, + norm_cfg=norm_cfg, + concat_input=False, + dropout_ratio=-1, + num_classes=150, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + dict( + type='OCRHead', + in_channels=[48, 96, 192, 384], + channels=512, + ocr_channels=256, + input_transform='resize_concat', + in_index=(0, 1, 2, 3), + norm_cfg=norm_cfg, + dropout_ratio=-1, + num_classes=150, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)) + ]) diff --git a/configs/ocrnet/ocrnet_hr48_4xb4-20k_voc12aug-512x512.py b/configs/ocrnet/ocrnet_hr48_4xb4-20k_voc12aug-512x512.py new file mode 100644 index 0000000000..f015b920e1 --- /dev/null +++ b/configs/ocrnet/ocrnet_hr48_4xb4-20k_voc12aug-512x512.py @@ -0,0 +1,39 @@ +_base_ = './ocrnet_hr18_4xb4-20k_voc12aug-512x512.py' +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w48', + backbone=dict( + extra=dict( + stage2=dict(num_channels=(48, 96)), + stage3=dict(num_channels=(48, 96, 192)), + stage4=dict(num_channels=(48, 96, 192, 384)))), + decode_head=[ + dict( + type='FCNHead', + in_channels=[48, 96, 192, 384], + channels=sum([48, 96, 192, 384]), + input_transform='resize_concat', + in_index=(0, 1, 2, 3), + kernel_size=1, + num_convs=1, + norm_cfg=norm_cfg, + concat_input=False, + dropout_ratio=-1, + num_classes=21, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + dict( + type='OCRHead', + in_channels=[48, 96, 192, 384], + channels=512, + ocr_channels=256, + input_transform='resize_concat', + in_index=(0, 1, 2, 3), + norm_cfg=norm_cfg, + dropout_ratio=-1, + num_classes=21, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)) + ]) diff --git a/configs/ocrnet/ocrnet_hr48_4xb4-40k_voc12aug-512x512.py b/configs/ocrnet/ocrnet_hr48_4xb4-40k_voc12aug-512x512.py new file mode 100644 index 0000000000..baafa380d4 --- /dev/null +++ b/configs/ocrnet/ocrnet_hr48_4xb4-40k_voc12aug-512x512.py @@ -0,0 +1,39 @@ +_base_ = './ocrnet_hr18_4xb4-40k_voc12aug-512x512.py' +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w48', + backbone=dict( + extra=dict( + stage2=dict(num_channels=(48, 96)), + stage3=dict(num_channels=(48, 96, 192)), + stage4=dict(num_channels=(48, 96, 192, 384)))), + decode_head=[ + dict( + type='FCNHead', + in_channels=[48, 96, 192, 384], + channels=sum([48, 96, 192, 384]), + input_transform='resize_concat', + in_index=(0, 1, 2, 3), + kernel_size=1, + num_convs=1, + norm_cfg=norm_cfg, + concat_input=False, + dropout_ratio=-1, + num_classes=21, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + dict( + type='OCRHead', + in_channels=[48, 96, 192, 384], + channels=512, + ocr_channels=256, + input_transform='resize_concat', + in_index=(0, 1, 2, 3), + norm_cfg=norm_cfg, + dropout_ratio=-1, + num_classes=21, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)) + ]) diff --git a/configs/ocrnet/ocrnet_hr48_4xb4-80k_ade20k-512x512.py b/configs/ocrnet/ocrnet_hr48_4xb4-80k_ade20k-512x512.py new file mode 100644 index 0000000000..85514b9d7e --- /dev/null +++ b/configs/ocrnet/ocrnet_hr48_4xb4-80k_ade20k-512x512.py @@ -0,0 +1,39 @@ +_base_ = 
'./ocrnet_hr18_4xb4-80k_ade20k-512x512.py' +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w48', + backbone=dict( + extra=dict( + stage2=dict(num_channels=(48, 96)), + stage3=dict(num_channels=(48, 96, 192)), + stage4=dict(num_channels=(48, 96, 192, 384)))), + decode_head=[ + dict( + type='FCNHead', + in_channels=[48, 96, 192, 384], + channels=sum([48, 96, 192, 384]), + input_transform='resize_concat', + in_index=(0, 1, 2, 3), + kernel_size=1, + num_convs=1, + norm_cfg=norm_cfg, + concat_input=False, + dropout_ratio=-1, + num_classes=150, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + dict( + type='OCRHead', + in_channels=[48, 96, 192, 384], + channels=512, + ocr_channels=256, + input_transform='resize_concat', + in_index=(0, 1, 2, 3), + norm_cfg=norm_cfg, + dropout_ratio=-1, + num_classes=150, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)) + ]) diff --git a/configs/ocrnet/ocrnet_hr48_512x1024_160k_cityscapes.py b/configs/ocrnet/ocrnet_hr48_512x1024_160k_cityscapes.py deleted file mode 100644 index c094391b1d..0000000000 --- a/configs/ocrnet/ocrnet_hr48_512x1024_160k_cityscapes.py +++ /dev/null @@ -1,39 +0,0 @@ -_base_ = './ocrnet_hr18_512x1024_160k_cityscapes.py' -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w48', - backbone=dict( - extra=dict( - stage2=dict(num_channels=(48, 96)), - stage3=dict(num_channels=(48, 96, 192)), - stage4=dict(num_channels=(48, 96, 192, 384)))), - decode_head=[ - dict( - type='FCNHead', - in_channels=[48, 96, 192, 384], - channels=sum([48, 96, 192, 384]), - input_transform='resize_concat', - in_index=(0, 1, 2, 3), - kernel_size=1, - num_convs=1, - norm_cfg=norm_cfg, - concat_input=False, - dropout_ratio=-1, - num_classes=19, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), - dict( - type='OCRHead', - in_channels=[48, 96, 192, 384], - channels=512, - ocr_channels=256, - input_transform='resize_concat', - in_index=(0, 1, 2, 3), - norm_cfg=norm_cfg, - dropout_ratio=-1, - num_classes=19, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)) - ]) diff --git a/configs/ocrnet/ocrnet_hr48_512x1024_40k_cityscapes.py b/configs/ocrnet/ocrnet_hr48_512x1024_40k_cityscapes.py deleted file mode 100644 index 0aada9d8dc..0000000000 --- a/configs/ocrnet/ocrnet_hr48_512x1024_40k_cityscapes.py +++ /dev/null @@ -1,39 +0,0 @@ -_base_ = './ocrnet_hr18_512x1024_40k_cityscapes.py' -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w48', - backbone=dict( - extra=dict( - stage2=dict(num_channels=(48, 96)), - stage3=dict(num_channels=(48, 96, 192)), - stage4=dict(num_channels=(48, 96, 192, 384)))), - decode_head=[ - dict( - type='FCNHead', - in_channels=[48, 96, 192, 384], - channels=sum([48, 96, 192, 384]), - input_transform='resize_concat', - in_index=(0, 1, 2, 3), - kernel_size=1, - num_convs=1, - norm_cfg=norm_cfg, - concat_input=False, - dropout_ratio=-1, - num_classes=19, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), - dict( - type='OCRHead', - in_channels=[48, 96, 192, 384], - channels=512, - ocr_channels=256, - input_transform='resize_concat', - in_index=(0, 1, 2, 3), - norm_cfg=norm_cfg, - dropout_ratio=-1, - 
num_classes=19, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)) - ]) diff --git a/configs/ocrnet/ocrnet_hr48_512x1024_80k_cityscapes.py b/configs/ocrnet/ocrnet_hr48_512x1024_80k_cityscapes.py deleted file mode 100644 index 1b2e009439..0000000000 --- a/configs/ocrnet/ocrnet_hr48_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,39 +0,0 @@ -_base_ = './ocrnet_hr18_512x1024_80k_cityscapes.py' -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w48', - backbone=dict( - extra=dict( - stage2=dict(num_channels=(48, 96)), - stage3=dict(num_channels=(48, 96, 192)), - stage4=dict(num_channels=(48, 96, 192, 384)))), - decode_head=[ - dict( - type='FCNHead', - in_channels=[48, 96, 192, 384], - channels=sum([48, 96, 192, 384]), - input_transform='resize_concat', - in_index=(0, 1, 2, 3), - kernel_size=1, - num_convs=1, - norm_cfg=norm_cfg, - concat_input=False, - dropout_ratio=-1, - num_classes=19, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), - dict( - type='OCRHead', - in_channels=[48, 96, 192, 384], - channels=512, - ocr_channels=256, - input_transform='resize_concat', - in_index=(0, 1, 2, 3), - norm_cfg=norm_cfg, - dropout_ratio=-1, - num_classes=19, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)) - ]) diff --git a/configs/ocrnet/ocrnet_hr48_512x512_160k_ade20k.py b/configs/ocrnet/ocrnet_hr48_512x512_160k_ade20k.py deleted file mode 100644 index 3b3e8af953..0000000000 --- a/configs/ocrnet/ocrnet_hr48_512x512_160k_ade20k.py +++ /dev/null @@ -1,39 +0,0 @@ -_base_ = './ocrnet_hr18_512x512_160k_ade20k.py' -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w48', - backbone=dict( - extra=dict( - stage2=dict(num_channels=(48, 96)), - stage3=dict(num_channels=(48, 96, 192)), - stage4=dict(num_channels=(48, 96, 192, 384)))), - decode_head=[ - dict( - type='FCNHead', - in_channels=[48, 96, 192, 384], - channels=sum([48, 96, 192, 384]), - input_transform='resize_concat', - in_index=(0, 1, 2, 3), - kernel_size=1, - num_convs=1, - norm_cfg=norm_cfg, - concat_input=False, - dropout_ratio=-1, - num_classes=150, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), - dict( - type='OCRHead', - in_channels=[48, 96, 192, 384], - channels=512, - ocr_channels=256, - input_transform='resize_concat', - in_index=(0, 1, 2, 3), - norm_cfg=norm_cfg, - dropout_ratio=-1, - num_classes=150, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)) - ]) diff --git a/configs/ocrnet/ocrnet_hr48_512x512_20k_voc12aug.py b/configs/ocrnet/ocrnet_hr48_512x512_20k_voc12aug.py deleted file mode 100644 index c2dd6d1158..0000000000 --- a/configs/ocrnet/ocrnet_hr48_512x512_20k_voc12aug.py +++ /dev/null @@ -1,39 +0,0 @@ -_base_ = './ocrnet_hr18_512x512_20k_voc12aug.py' -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w48', - backbone=dict( - extra=dict( - stage2=dict(num_channels=(48, 96)), - stage3=dict(num_channels=(48, 96, 192)), - stage4=dict(num_channels=(48, 96, 192, 384)))), - decode_head=[ - dict( - type='FCNHead', - in_channels=[48, 96, 192, 384], - channels=sum([48, 96, 192, 384]), - input_transform='resize_concat', - in_index=(0, 1, 2, 3), - kernel_size=1, - num_convs=1, - norm_cfg=norm_cfg, 
- concat_input=False, - dropout_ratio=-1, - num_classes=21, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), - dict( - type='OCRHead', - in_channels=[48, 96, 192, 384], - channels=512, - ocr_channels=256, - input_transform='resize_concat', - in_index=(0, 1, 2, 3), - norm_cfg=norm_cfg, - dropout_ratio=-1, - num_classes=21, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)) - ]) diff --git a/configs/ocrnet/ocrnet_hr48_512x512_40k_voc12aug.py b/configs/ocrnet/ocrnet_hr48_512x512_40k_voc12aug.py deleted file mode 100644 index 89e6309f55..0000000000 --- a/configs/ocrnet/ocrnet_hr48_512x512_40k_voc12aug.py +++ /dev/null @@ -1,39 +0,0 @@ -_base_ = './ocrnet_hr18_512x512_40k_voc12aug.py' -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w48', - backbone=dict( - extra=dict( - stage2=dict(num_channels=(48, 96)), - stage3=dict(num_channels=(48, 96, 192)), - stage4=dict(num_channels=(48, 96, 192, 384)))), - decode_head=[ - dict( - type='FCNHead', - in_channels=[48, 96, 192, 384], - channels=sum([48, 96, 192, 384]), - input_transform='resize_concat', - in_index=(0, 1, 2, 3), - kernel_size=1, - num_convs=1, - norm_cfg=norm_cfg, - concat_input=False, - dropout_ratio=-1, - num_classes=21, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), - dict( - type='OCRHead', - in_channels=[48, 96, 192, 384], - channels=512, - ocr_channels=256, - input_transform='resize_concat', - in_index=(0, 1, 2, 3), - norm_cfg=norm_cfg, - dropout_ratio=-1, - num_classes=21, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)) - ]) diff --git a/configs/ocrnet/ocrnet_hr48_512x512_80k_ade20k.py b/configs/ocrnet/ocrnet_hr48_512x512_80k_ade20k.py deleted file mode 100644 index 04971226eb..0000000000 --- a/configs/ocrnet/ocrnet_hr48_512x512_80k_ade20k.py +++ /dev/null @@ -1,39 +0,0 @@ -_base_ = './ocrnet_hr18_512x512_80k_ade20k.py' -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w48', - backbone=dict( - extra=dict( - stage2=dict(num_channels=(48, 96)), - stage3=dict(num_channels=(48, 96, 192)), - stage4=dict(num_channels=(48, 96, 192, 384)))), - decode_head=[ - dict( - type='FCNHead', - in_channels=[48, 96, 192, 384], - channels=sum([48, 96, 192, 384]), - input_transform='resize_concat', - in_index=(0, 1, 2, 3), - kernel_size=1, - num_convs=1, - norm_cfg=norm_cfg, - concat_input=False, - dropout_ratio=-1, - num_classes=150, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), - dict( - type='OCRHead', - in_channels=[48, 96, 192, 384], - channels=512, - ocr_channels=256, - input_transform='resize_concat', - in_index=(0, 1, 2, 3), - norm_cfg=norm_cfg, - dropout_ratio=-1, - num_classes=150, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)) - ]) diff --git a/configs/ocrnet/ocrnet_r101-d8_512x1024_40k_b8_cityscapes.py b/configs/ocrnet/ocrnet_r101-d8_4xb2-40k_cityscapes-512x1024.py similarity index 100% rename from configs/ocrnet/ocrnet_r101-d8_512x1024_40k_b8_cityscapes.py rename to configs/ocrnet/ocrnet_r101-d8_4xb2-40k_cityscapes-512x1024.py diff --git a/configs/ocrnet/ocrnet_r101-d8_512x1024_40k_b16_cityscapes.py b/configs/ocrnet/ocrnet_r101-d8_8xb2-40k_cityscapes-512x1024.py similarity index 
100% rename from configs/ocrnet/ocrnet_r101-d8_512x1024_40k_b16_cityscapes.py rename to configs/ocrnet/ocrnet_r101-d8_8xb2-40k_cityscapes-512x1024.py diff --git a/configs/ocrnet/ocrnet_r101-d8_512x1024_80k_b16_cityscapes.py b/configs/ocrnet/ocrnet_r101-d8_8xb2-80k_cityscapes-512x1024.py similarity index 100% rename from configs/ocrnet/ocrnet_r101-d8_512x1024_80k_b16_cityscapes.py rename to configs/ocrnet/ocrnet_r101-d8_8xb2-80k_cityscapes-512x1024.py diff --git a/configs/pidnet/README.md b/configs/pidnet/README.md new file mode 100644 index 0000000000..e23efbd3f3 --- /dev/null +++ b/configs/pidnet/README.md @@ -0,0 +1,50 @@ +# PIDNet + +> [PIDNet: A Real-time Semantic Segmentation Network Inspired from PID Controller](https://arxiv.org/pdf/2206.02066.pdf) + +## Introduction + +[Official Repo](https://github.com/XuJiacong/PIDNet) + +[Code Snippet](https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/models/backbones/pidnet.py) + +## Abstract + +Two-branch network architecture has shown its efficiency and effectiveness for real-time semantic segmentation tasks. However, direct fusion of low-level details and high-level semantics will lead to a phenomenon that the detailed features are easily overwhelmed by surrounding contextual information, namely overshoot in this paper, which limits the improvement of the accuracy of existed two-branch models. In this paper, we bridge a connection between Convolutional Neural Network (CNN) and Proportional-Integral-Derivative (PID) controller and reveal that the two-branch network is nothing but a Proportional-Integral (PI) controller, which inherently suffers from the similar overshoot issue. To alleviate this issue, we propose a novel three-branch network architecture: PIDNet, which possesses three branches to parse the detailed, context and boundary information (derivative of semantics), respectively, and employs boundary attention to guide the fusion of detailed and context branches in final stage. The family of PIDNets achieve the best trade-off between inference speed and accuracy and their test accuracy surpasses all the existed models with similar inference speed on Cityscapes, CamVid and COCO-Stuff datasets. Especially, PIDNet-S achieves 78.6% mIOU with inference speed of 93.2 FPS on Cityscapes test set and 80.1% mIOU with speed of 153.7 FPS on CamVid test set. +
+## Results and models
+
+### Cityscapes
+
+| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download |
+| ------ | -------- | --------- | ------- | -------- | -------------- | ------ | ----- | ------------- | ------ | -------- |
+| PIDNet | PIDNet-S | 1024x1024 | 120000 | 3.38 | 80.82 | A100 | 78.74 | 80.87 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pidnet/pidnet-s_2xb6-120k_1024x1024-cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pidnet/pidnet-s_2xb6-120k_1024x1024-cityscapes/pidnet-s_2xb6-120k_1024x1024-cityscapes_20230302_191700-bb8e3bcc.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pidnet/pidnet-s_2xb6-120k_1024x1024-cityscapes/pidnet-s_2xb6-120k_1024x1024-cityscapes_20230302_191700.json) |
+| PIDNet | PIDNet-M | 1024x1024 | 120000 | 5.14 | 71.98 | A100 | 80.22 | 82.05 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pidnet/pidnet-m_2xb6-120k_1024x1024-cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pidnet/pidnet-m_2xb6-120k_1024x1024-cityscapes/pidnet-m_2xb6-120k_1024x1024-cityscapes_20230301_143452-f9bcdbf3.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pidnet/pidnet-m_2xb6-120k_1024x1024-cityscapes/pidnet-m_2xb6-120k_1024x1024-cityscapes_20230301_143452.json) |
+| PIDNet | PIDNet-L | 1024x1024 | 120000 | 5.83 | 60.06 | A100 | 80.89 | 82.37 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pidnet/pidnet-l_2xb6-120k_1024x1024-cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pidnet/pidnet-l_2xb6-120k_1024x1024-cityscapes/pidnet-l_2xb6-120k_1024x1024-cityscapes_20230303_114514-0783ca6b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pidnet/pidnet-l_2xb6-120k_1024x1024-cityscapes/pidnet-l_2xb6-120k_1024x1024-cityscapes_20230303_114514.json) |
+
+## Notes
+
+The pretrained weights in config files are converted from [the official repo](https://github.com/XuJiacong/PIDNet#models).
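+
+## Usage
+
+As a quick sanity check of a released checkpoint, the snippet below loads the PIDNet-S config and runs single-image inference. This is a minimal sketch, assuming the MMSegmentation 1.x Python API (`init_model`/`inference_model`); the test image `demo.png` and the device string are placeholders, not part of this repo:
+
+```python
+from mmengine.config import Config
+from mmseg.apis import inference_model, init_model
+
+config_path = 'configs/pidnet/pidnet-s_2xb6-120k_1024x1024-cityscapes.py'
+checkpoint = ('https://download.openmmlab.com/mmsegmentation/v0.5/pidnet/'
+              'pidnet-s_2xb6-120k_1024x1024-cityscapes/'
+              'pidnet-s_2xb6-120k_1024x1024-cityscapes_20230302_191700-bb8e3bcc.pth')
+
+# mmengine resolves `_base_` chains at load time, which is why the PIDNet-M/L
+# configs can stay a few lines long: they only override fields of this base.
+cfg = Config.fromfile(config_path)
+print(cfg.model.backbone.type)  # -> 'PIDNet'
+
+# Build the model, load the released weights, and segment one image.
+model = init_model(config_path, checkpoint, device='cuda:0')
+result = inference_model(model, 'demo.png')
+print(result.pred_sem_seg.data.shape)  # per-pixel class indices, (1, H, W)
+```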
Bhattacharyya}, + year={2022}, + eprint={2206.02066}, + archivePrefix={arXiv}, + primaryClass={cs.CV} +} +``` diff --git a/configs/pidnet/metafile.yaml b/configs/pidnet/metafile.yaml new file mode 100644 index 0000000000..51b514a487 --- /dev/null +++ b/configs/pidnet/metafile.yaml @@ -0,0 +1,85 @@ +Collections: +- Name: PIDNet + License: Apache License 2.0 + Metadata: + Training Data: + - Cityscapes + Paper: + Title: 'PIDNet: A Real-time Semantic Segmentation Network Inspired from PID Controller' + URL: https://arxiv.org/pdf/2206.02066.pdf + README: configs/pidnet/README.md + Frameworks: + - PyTorch +Models: +- Name: pidnet-s_2xb6-120k_1024x1024-cityscapes + In Collection: PIDNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.74 + mIoU(ms+flip): 80.87 + Config: configs/pidnet/pidnet-s_2xb6-120k_1024x1024-cityscapes.py + Metadata: + Training Data: Cityscapes + Batch Size: 12 + Architecture: + - PIDNet-S + - PIDNet + Training Resources: 2x A100 GPUS + Memory (GB): 3.38 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pidnet/pidnet-s_2xb6-120k_1024x1024-cityscapes/pidnet-s_2xb6-120k_1024x1024-cityscapes_20230302_191700-bb8e3bcc.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pidnet/pidnet-s_2xb6-120k_1024x1024-cityscapes/pidnet-s_2xb6-120k_1024x1024-cityscapes_20230302_191700.json + Paper: + Title: 'PIDNet: A Real-time Semantic Segmentation Network Inspired from PID Controller' + URL: https://arxiv.org/pdf/2206.02066.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/models/backbones/pidnet.py + Framework: PyTorch +- Name: pidnet-m_2xb6-120k_1024x1024-cityscapes + In Collection: PIDNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 80.22 + mIoU(ms+flip): 82.05 + Config: configs/pidnet/pidnet-m_2xb6-120k_1024x1024-cityscapes.py + Metadata: + Training Data: Cityscapes + Batch Size: 12 + Architecture: + - PIDNet-M + - PIDNet + Training Resources: 2x A100 GPUS + Memory (GB): 5.14 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pidnet/pidnet-m_2xb6-120k_1024x1024-cityscapes/pidnet-m_2xb6-120k_1024x1024-cityscapes_20230301_143452-f9bcdbf3.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pidnet/pidnet-m_2xb6-120k_1024x1024-cityscapes/pidnet-m_2xb6-120k_1024x1024-cityscapes_20230301_143452.json + Paper: + Title: 'PIDNet: A Real-time Semantic Segmentation Network Inspired from PID Controller' + URL: https://arxiv.org/pdf/2206.02066.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/models/backbones/pidnet.py + Framework: PyTorch +- Name: pidnet-l_2xb6-120k_1024x1024-cityscapes + In Collection: PIDNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 80.89 + mIoU(ms+flip): 82.37 + Config: configs/pidnet/pidnet-l_2xb6-120k_1024x1024-cityscapes.py + Metadata: + Training Data: Cityscapes + Batch Size: 12 + Architecture: + - PIDNet-L + - PIDNet + Training Resources: 2x A100 GPUS + Memory (GB): 5.83 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pidnet/pidnet-l_2xb6-120k_1024x1024-cityscapes/pidnet-l_2xb6-120k_1024x1024-cityscapes_20230303_114514-0783ca6b.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pidnet/pidnet-l_2xb6-120k_1024x1024-cityscapes/pidnet-l_2xb6-120k_1024x1024-cityscapes_20230303_114514.json + Paper: + Title: 'PIDNet: A Real-time Semantic Segmentation Network Inspired from PID Controller' + URL: https://arxiv.org/pdf/2206.02066.pdf + 
Code: https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/models/backbones/pidnet.py + Framework: PyTorch diff --git a/configs/pidnet/pidnet-l_2xb6-120k_1024x1024-cityscapes.py b/configs/pidnet/pidnet-l_2xb6-120k_1024x1024-cityscapes.py new file mode 100644 index 0000000000..1955c91e05 --- /dev/null +++ b/configs/pidnet/pidnet-l_2xb6-120k_1024x1024-cityscapes.py @@ -0,0 +1,10 @@ +_base_ = './pidnet-s_2xb6-120k_1024x1024-cityscapes.py' +checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/pidnet/pidnet-l_imagenet1k_20230306-67889109.pth' # noqa +model = dict( + backbone=dict( + channels=64, + ppm_channels=112, + num_stem_blocks=3, + num_branch_blocks=4, + init_cfg=dict(checkpoint=checkpoint_file)), + decode_head=dict(in_channels=256, channels=256)) diff --git a/configs/pidnet/pidnet-m_2xb6-120k_1024x1024-cityscapes.py b/configs/pidnet/pidnet-m_2xb6-120k_1024x1024-cityscapes.py new file mode 100644 index 0000000000..38a69c1c45 --- /dev/null +++ b/configs/pidnet/pidnet-m_2xb6-120k_1024x1024-cityscapes.py @@ -0,0 +1,5 @@ +_base_ = './pidnet-s_2xb6-120k_1024x1024-cityscapes.py' +checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/pidnet/pidnet-m_imagenet1k_20230306-39893c52.pth' # noqa +model = dict( + backbone=dict(channels=64, init_cfg=dict(checkpoint=checkpoint_file)), + decode_head=dict(in_channels=256)) diff --git a/configs/pidnet/pidnet-s_2xb6-120k_1024x1024-cityscapes.py b/configs/pidnet/pidnet-s_2xb6-120k_1024x1024-cityscapes.py new file mode 100644 index 0000000000..f70ca4287a --- /dev/null +++ b/configs/pidnet/pidnet-s_2xb6-120k_1024x1024-cityscapes.py @@ -0,0 +1,113 @@ +_base_ = [ + '../_base_/datasets/cityscapes_1024x1024.py', + '../_base_/default_runtime.py' +] + +# The class_weight is borrowed from https://github.com/openseg-group/OCNet.pytorch/issues/14 # noqa +# Licensed under the MIT License +class_weight = [ + 0.8373, 0.918, 0.866, 1.0345, 1.0166, 0.9969, 0.9754, 1.0489, 0.8786, + 1.0023, 0.9539, 0.9843, 1.1116, 0.9037, 1.0865, 1.0955, 1.0865, 1.1529, + 1.0507 +] +checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/pidnet/pidnet-s_imagenet1k_20230306-715e6273.pth' # noqa +crop_size = (1024, 1024) +data_preprocessor = dict( + type='SegDataPreProcessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + size=crop_size) +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + data_preprocessor=data_preprocessor, + backbone=dict( + type='PIDNet', + in_channels=3, + channels=32, + ppm_channels=96, + num_stem_blocks=2, + num_branch_blocks=3, + align_corners=False, + norm_cfg=norm_cfg, + act_cfg=dict(type='ReLU', inplace=True), + init_cfg=dict(type='Pretrained', checkpoint=checkpoint_file)), + decode_head=dict( + type='PIDHead', + in_channels=128, + channels=128, + num_classes=19, + norm_cfg=norm_cfg, + act_cfg=dict(type='ReLU', inplace=True), + align_corners=True, + loss_decode=[ + dict( + type='CrossEntropyLoss', + use_sigmoid=False, + class_weight=class_weight, + loss_weight=0.4), + dict( + type='OhemCrossEntropy', + thres=0.9, + min_kept=131072, + class_weight=class_weight, + loss_weight=1.0), + dict(type='BoundaryLoss', loss_weight=20.0), + dict( + type='OhemCrossEntropy', + thres=0.9, + min_kept=131072, + class_weight=class_weight, + loss_weight=1.0) + ]), + train_cfg=dict(), + test_cfg=dict(mode='whole')) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + 
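+    # the boundary branch is supervised by BoundaryLoss (see loss_decode above);
+    # its edge target is produced from the loaded segmentation map by
+    # GenerateEdge further down this pipeline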
dict(type='LoadAnnotations'), + dict( + type='RandomResize', + scale=(2048, 1024), + ratio_range=(0.5, 2.0), + keep_ratio=True), + dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='GenerateEdge', edge_width=4), + dict(type='PackSegInputs') +] +train_dataloader = dict(batch_size=6, dataset=dict(pipeline=train_pipeline)) + +iters = 120000 +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) +optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer, clip_grad=None) +# learning policy +param_scheduler = [ + dict( + type='PolyLR', + eta_min=0, + power=0.9, + begin=0, + end=iters, + by_epoch=False) +] +# training schedule for 120k +train_cfg = dict( + type='IterBasedTrainLoop', max_iters=iters, val_interval=iters // 10) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict( + type='CheckpointHook', by_epoch=False, interval=iters // 10), + sampler_seed=dict(type='DistSamplerSeedHook'), + visualization=dict(type='SegVisualizationHook')) + +randomness = dict(seed=304) diff --git a/configs/point_rend/README.md b/configs/point_rend/README.md index 2644f46c6b..487d3bcc7f 100644 --- a/configs/point_rend/README.md +++ b/configs/point_rend/README.md @@ -1,6 +1,6 @@ # PointRend -[PointRend: Image Segmentation as Rendering](https://arxiv.org/abs/1912.08193) +> [PointRend: Image Segmentation as Rendering](https://arxiv.org/abs/1912.08193) ## Introduction @@ -22,6 +22,22 @@ We present a new method for efficient high-quality image segmentation of objects +## Results and models + +### Cityscapes + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| --------- | -------- | --------- | ------: | -------: | -------------- | ------ | ----: | ------------- | ---------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| PointRend | R-50 | 512x1024 | 80000 | 3.1 | 8.48 | V100 | 76.47 | 78.13 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/point_rend/pointrend_r50_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/point_rend/pointrend_r50_512x1024_80k_cityscapes/pointrend_r50_512x1024_80k_cityscapes_20200711_015821-bb1ff523.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/point_rend/pointrend_r50_512x1024_80k_cityscapes/pointrend_r50_512x1024_80k_cityscapes-20200715_214714.log.json) | +| PointRend | R-101 | 512x1024 | 80000 | 4.2 | 7.00 | V100 | 78.30 | 79.97 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/point_rend/pointrend_r101_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/point_rend/pointrend_r101_512x1024_80k_cityscapes/pointrend_r101_512x1024_80k_cityscapes_20200711_170850-d0ca84be.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/point_rend/pointrend_r101_512x1024_80k_cityscapes/pointrend_r101_512x1024_80k_cityscapes-20200715_214824.log.json) | + +### ADE20K + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| --------- | -------- | --------- | ------: | -------: | -------------- | ------ | ----: | ------------- | ------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| PointRend | R-50 | 512x512 | 160000 | 5.1 | 17.31 | V100 | 37.64 | 39.17 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/point_rend/pointrend_r50_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/point_rend/pointrend_r50_512x512_160k_ade20k/pointrend_r50_512x512_160k_ade20k_20200807_232644-ac3febf2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/point_rend/pointrend_r50_512x512_160k_ade20k/pointrend_r50_512x512_160k_ade20k-20200807_232644.log.json) | +| PointRend | R-101 | 512x512 | 160000 | 6.1 | 15.50 | V100 | 40.02 | 41.60 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/point_rend/pointrend_r101_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/point_rend/pointrend_r101_512x512_160k_ade20k/pointrend_r101_512x512_160k_ade20k_20200808_030852-8834902a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/point_rend/pointrend_r101_512x512_160k_ade20k/pointrend_r101_512x512_160k_ade20k-20200808_030852.log.json) | + ## Citation ```bibtex @@ -33,19 +49,3 @@ We present a new method for efficient high-quality image segmentation of objects year={2020} } ``` - -## Results and models - -### Cityscapes - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| --------- | -------- | --------- | ------: | -------: | -------------- | ----: | ------------- | ------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| PointRend | R-50 | 512x1024 | 80000 | 3.1 | 8.48 | 76.47 | 78.13 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/point_rend/pointrend_r50_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/point_rend/pointrend_r50_512x1024_80k_cityscapes/pointrend_r50_512x1024_80k_cityscapes_20200711_015821-bb1ff523.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/point_rend/pointrend_r50_512x1024_80k_cityscapes/pointrend_r50_512x1024_80k_cityscapes-20200715_214714.log.json) | -| PointRend | R-101 | 512x1024 | 80000 | 4.2 | 7.00 | 78.30 | 79.97 | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/point_rend/pointrend_r101_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/point_rend/pointrend_r101_512x1024_80k_cityscapes/pointrend_r101_512x1024_80k_cityscapes_20200711_170850-d0ca84be.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/point_rend/pointrend_r101_512x1024_80k_cityscapes/pointrend_r101_512x1024_80k_cityscapes-20200715_214824.log.json) | - -### ADE20K - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| --------- | -------- | --------- | ------: | -------: | -------------- | ----: | ------------- | --------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| PointRend | R-50 | 512x512 | 160000 | 5.1 | 17.31 | 37.64 | 39.17 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/point_rend/pointrend_r50_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/point_rend/pointrend_r50_512x512_160k_ade20k/pointrend_r50_512x512_160k_ade20k_20200807_232644-ac3febf2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/point_rend/pointrend_r50_512x512_160k_ade20k/pointrend_r50_512x512_160k_ade20k-20200807_232644.log.json) | -| PointRend | R-101 | 512x512 | 160000 | 6.1 | 15.50 | 40.02 | 41.60 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/point_rend/pointrend_r101_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/point_rend/pointrend_r101_512x512_160k_ade20k/pointrend_r101_512x512_160k_ade20k_20200808_030852-8834902a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/point_rend/pointrend_r101_512x512_160k_ade20k/pointrend_r101_512x512_160k_ade20k-20200808_030852.log.json) | diff --git a/configs/point_rend/metafile.yaml b/configs/point_rend/metafile.yaml new file mode 100644 index 0000000000..064717c9df --- /dev/null +++ b/configs/point_rend/metafile.yaml @@ -0,0 +1,110 @@ +Collections: +- Name: PointRend + License: Apache License 2.0 + Metadata: + Training Data: + - Cityscapes + - ADE20K + Paper: + Title: 'PointRend: Image Segmentation as Rendering' + URL: https://arxiv.org/abs/1912.08193 + README: configs/point_rend/README.md + Frameworks: + - PyTorch +Models: +- Name: pointrend_r50_4xb2-80k_cityscapes-512x1024 + In Collection: PointRend + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 76.47 + mIoU(ms+flip): 78.13 + Config: configs/point_rend/pointrend_r50_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50 + - PointRend + Training Resources: 4x V100 GPUS + Memory (GB): 3.1 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/point_rend/pointrend_r50_512x1024_80k_cityscapes/pointrend_r50_512x1024_80k_cityscapes_20200711_015821-bb1ff523.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/point_rend/pointrend_r50_512x1024_80k_cityscapes/pointrend_r50_512x1024_80k_cityscapes-20200715_214714.log.json + Paper: + Title: 'PointRend: Image 
Segmentation as Rendering' + URL: https://arxiv.org/abs/1912.08193 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/point_head.py#L36 + Framework: PyTorch +- Name: pointrend_r101_4xb2-80k_cityscapes-512x1024 + In Collection: PointRend + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.3 + mIoU(ms+flip): 79.97 + Config: configs/point_rend/pointrend_r101_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101 + - PointRend + Training Resources: 4x V100 GPUS + Memory (GB): 4.2 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/point_rend/pointrend_r101_512x1024_80k_cityscapes/pointrend_r101_512x1024_80k_cityscapes_20200711_170850-d0ca84be.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/point_rend/pointrend_r101_512x1024_80k_cityscapes/pointrend_r101_512x1024_80k_cityscapes-20200715_214824.log.json + Paper: + Title: 'PointRend: Image Segmentation as Rendering' + URL: https://arxiv.org/abs/1912.08193 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/point_head.py#L36 + Framework: PyTorch +- Name: pointrend_r50_4xb4-160k_ade20k-512x512 + In Collection: PointRend + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 37.64 + mIoU(ms+flip): 39.17 + Config: configs/point_rend/pointrend_r50_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50 + - PointRend + Training Resources: 4x V100 GPUS + Memory (GB): 5.1 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/point_rend/pointrend_r50_512x512_160k_ade20k/pointrend_r50_512x512_160k_ade20k_20200807_232644-ac3febf2.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/point_rend/pointrend_r50_512x512_160k_ade20k/pointrend_r50_512x512_160k_ade20k-20200807_232644.log.json + Paper: + Title: 'PointRend: Image Segmentation as Rendering' + URL: https://arxiv.org/abs/1912.08193 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/point_head.py#L36 + Framework: PyTorch +- Name: pointrend_r101_4xb4-160k_ade20k-512x512 + In Collection: PointRend + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 40.02 + mIoU(ms+flip): 41.6 + Config: configs/point_rend/pointrend_r101_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-101 + - PointRend + Training Resources: 4x V100 GPUS + Memory (GB): 6.1 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/point_rend/pointrend_r101_512x512_160k_ade20k/pointrend_r101_512x512_160k_ade20k_20200808_030852-8834902a.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/point_rend/pointrend_r101_512x512_160k_ade20k/pointrend_r101_512x512_160k_ade20k-20200808_030852.log.json + Paper: + Title: 'PointRend: Image Segmentation as Rendering' + URL: https://arxiv.org/abs/1912.08193 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/point_head.py#L36 + Framework: PyTorch diff --git a/configs/point_rend/point_rend.yml b/configs/point_rend/point_rend.yml deleted file mode 100644 index 3abe81d7d6..0000000000 --- a/configs/point_rend/point_rend.yml +++ /dev/null @@ -1,104 +0,0 @@ -Collections: -- Name: PointRend - Metadata: - Training Data: - - Cityscapes - - ADE20K - Paper: - URL: https://arxiv.org/abs/1912.08193 - Title: 'PointRend: Image Segmentation as 
Rendering' - README: configs/point_rend/README.md - Code: - URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/point_head.py#L36 - Version: v0.17.0 - Converted From: - Code: https://github.com/facebookresearch/detectron2/tree/master/projects/PointRend -Models: -- Name: pointrend_r50_512x1024_80k_cityscapes - In Collection: PointRend - Metadata: - backbone: R-50 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 117.92 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 3.1 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 76.47 - mIoU(ms+flip): 78.13 - Config: configs/point_rend/pointrend_r50_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/point_rend/pointrend_r50_512x1024_80k_cityscapes/pointrend_r50_512x1024_80k_cityscapes_20200711_015821-bb1ff523.pth -- Name: pointrend_r101_512x1024_80k_cityscapes - In Collection: PointRend - Metadata: - backbone: R-101 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 142.86 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 4.2 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.3 - mIoU(ms+flip): 79.97 - Config: configs/point_rend/pointrend_r101_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/point_rend/pointrend_r101_512x1024_80k_cityscapes/pointrend_r101_512x1024_80k_cityscapes_20200711_170850-d0ca84be.pth -- Name: pointrend_r50_512x512_160k_ade20k - In Collection: PointRend - Metadata: - backbone: R-50 - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 57.77 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 5.1 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 37.64 - mIoU(ms+flip): 39.17 - Config: configs/point_rend/pointrend_r50_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/point_rend/pointrend_r50_512x512_160k_ade20k/pointrend_r50_512x512_160k_ade20k_20200807_232644-ac3febf2.pth -- Name: pointrend_r101_512x512_160k_ade20k - In Collection: PointRend - Metadata: - backbone: R-101 - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 64.52 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 6.1 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 40.02 - mIoU(ms+flip): 41.6 - Config: configs/point_rend/pointrend_r101_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/point_rend/pointrend_r101_512x512_160k_ade20k/pointrend_r101_512x512_160k_ade20k_20200808_030852-8834902a.pth diff --git a/configs/point_rend/pointrend_r101_4xb2-80k_cityscapes-512x1024.py b/configs/point_rend/pointrend_r101_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..ca2a19a196 --- /dev/null +++ b/configs/point_rend/pointrend_r101_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './pointrend_r50_4xb2-80k_cityscapes-512x1024.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/point_rend/pointrend_r101_4xb4-160k_ade20k-512x512.py b/configs/point_rend/pointrend_r101_4xb4-160k_ade20k-512x512.py new file mode 100644 index 
0000000000..6729d3b672 --- /dev/null +++ b/configs/point_rend/pointrend_r101_4xb4-160k_ade20k-512x512.py @@ -0,0 +1,2 @@ +_base_ = './pointrend_r50_4xb4-160k_ade20k-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/point_rend/pointrend_r101_512x1024_80k_cityscapes.py b/configs/point_rend/pointrend_r101_512x1024_80k_cityscapes.py deleted file mode 100644 index a8c14c8cf9..0000000000 --- a/configs/point_rend/pointrend_r101_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './pointrend_r50_512x1024_80k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/point_rend/pointrend_r101_512x512_160k_ade20k.py b/configs/point_rend/pointrend_r101_512x512_160k_ade20k.py deleted file mode 100644 index 4d1f8c8154..0000000000 --- a/configs/point_rend/pointrend_r101_512x512_160k_ade20k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './pointrend_r50_512x512_160k_ade20k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/point_rend/pointrend_r50_512x1024_80k_cityscapes.py b/configs/point_rend/pointrend_r50_4xb2-80k_cityscapes-512x1024.py similarity index 100% rename from configs/point_rend/pointrend_r50_512x1024_80k_cityscapes.py rename to configs/point_rend/pointrend_r50_4xb2-80k_cityscapes-512x1024.py diff --git a/configs/point_rend/pointrend_r50_512x512_160k_ade20k.py b/configs/point_rend/pointrend_r50_4xb4-160k_ade20k-512x512.py similarity index 100% rename from configs/point_rend/pointrend_r50_512x512_160k_ade20k.py rename to configs/point_rend/pointrend_r50_4xb4-160k_ade20k-512x512.py diff --git a/configs/poolformer/README.md b/configs/poolformer/README.md new file mode 100644 index 0000000000..e6e2eac210 --- /dev/null +++ b/configs/poolformer/README.md @@ -0,0 +1,65 @@ +# PoolFormer + +> [MetaFormer is Actually What You Need for Vision](https://arxiv.org/abs/2111.11418) + +## Introduction + + + +Official Repo + +Code Snippet + +## Abstract + + + +Transformers have shown great potential in computer vision tasks. A common belief is their attention-based token mixer module contributes most to their competence. However, recent works show the attention-based module in transformers can be replaced by spatial MLPs and the resulted models still perform quite well. Based on this observation, we hypothesize that the general architecture of the transformers, instead of the specific token mixer module, is more essential to the model's performance. To verify this, we deliberately replace the attention module in transformers with an embarrassingly simple spatial pooling operator to conduct only the most basic token mixing. Surprisingly, we observe that the derived model, termed as PoolFormer, achieves competitive performance on multiple computer vision tasks. For example, on ImageNet-1K, PoolFormer achieves 82.1% top-1 accuracy, surpassing well-tuned vision transformer/MLP-like baselines DeiT-B/ResMLP-B24 by 0.3%/1.1% accuracy with 35%/52% fewer parameters and 48%/60% fewer MACs. The effectiveness of PoolFormer verifies our hypothesis and urges us to initiate the concept of "MetaFormer", a general architecture abstracted from transformers without specifying the token mixer. Based on the extensive experiments, we argue that MetaFormer is the key player in achieving superior results for recent transformer and MLP-like models on vision tasks. 
This work calls for more future research dedicated to improving MetaFormer instead of focusing on the token mixer modules. Additionally, our proposed PoolFormer could serve as a starting baseline for future MetaFormer architecture design. Code is available at [this https URL](https://github.com/sail-sg/poolformer) + + + +
+ +
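+The configs added further below in this diff reuse ImageNet-1K checkpoints from MMClassification as the segmentor's backbone. A minimal sketch of the wiring, mirroring `fpn_poolformer_s24_8xb4-40k_ade20k-512x512.py` (the checkpoint URL is the one that config references):
+
+```python
+checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-s24_3rdparty_32xb128_in1k_20220414-d7055904.pth'  # noqa
+model = dict(
+    backbone=dict(
+        arch='s24',
+        init_cfg=dict(
+            type='Pretrained',
+            checkpoint=checkpoint_file,
+            # the classification checkpoint stores weights under 'backbone.';
+            # only that subtree is loaded into the segmentor's backbone
+            prefix='backbone.')))
+```
+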
+ +## Citation + +```bibtex +@inproceedings{yu2022metaformer, + title={Metaformer is actually what you need for vision}, + author={Yu, Weihao and Luo, Mi and Zhou, Pan and Si, Chenyang and Zhou, Yichen and Wang, Xinchao and Feng, Jiashi and Yan, Shuicheng}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={10819--10829}, + year={2022} +} +``` + +### Usage + +- The PoolFormer backbone requires [MMClassification](https://github.com/open-mmlab/mmclassification) to be installed first; it provides abundant backbones for downstream tasks. + +```shell +pip install "mmpretrain>=1.0.0rc7" +``` + +- The pretrained models can also be downloaded from the [PoolFormer config of MMClassification](https://github.com/open-mmlab/mmclassification/tree/master/configs/poolformer). + +## Results and models + +### ADE20K + +| Method | Backbone | Crop Size | pretrain | Batch Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | mIoU\* | mIoU\*(ms+flip) | config | download | +| ------ | -------------- | --------- | ----------- | ---------- | ------- | -------- | -------------- | ------ | ----- | ------------: | ------ | --------------: | --------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| FPN | PoolFormer-S12 | 512x512 | ImageNet-1K | 32 | 40000 | 4.17 | 23.48 | V100 | 36.68 | - | 37.07 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/poolformer/fpn_poolformer_s12_8xb4-40k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/poolformer/fpn_poolformer_s12_8x4_512x512_40k_ade20k/fpn_poolformer_s12_8x4_512x512_40k_ade20k_20220501_115154-b5aa2f49.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/poolformer/fpn_poolformer_s12_8x4_512x512_40k_ade20k/fpn_poolformer_s12_8x4_512x512_40k_ade20k_20220501_115154.log.json) | +| FPN | PoolFormer-S24 | 512x512 | ImageNet-1K | 32 | 40000 | 5.47 | 15.74 | V100 | 40.12 | - | 40.36 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/poolformer/fpn_poolformer_s24_8xb4-40k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/poolformer/fpn_poolformer_s24_8x4_512x512_40k_ade20k/fpn_poolformer_s24_8x4_512x512_40k_ade20k_20220503_222049-394a7cf7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/poolformer/fpn_poolformer_s24_8x4_512x512_40k_ade20k/fpn_poolformer_s24_8x4_512x512_40k_ade20k_20220503_222049.log.json) | +| FPN | PoolFormer-S36 | 512x512 | ImageNet-1K | 32 | 40000 | 6.77 | 11.34 | V100 | 41.61 | - | 41.81 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/poolformer/fpn_poolformer_s36_8xb4-40k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/poolformer/fpn_poolformer_s36_8x4_512x512_40k_ade20k/fpn_poolformer_s36_8x4_512x512_40k_ade20k_20220501_151122-b47e607d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/poolformer/fpn_poolformer_s36_8x4_512x512_40k_ade20k/fpn_poolformer_s36_8x4_512x512_40k_ade20k_20220501_151122.log.json) | +| FPN | PoolFormer-M36 | 512x512 |
ImageNet-1K | 32 | 40000 | 8.59 | 8.97 | V100 | 41.95 | - | 42.35 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/poolformer/fpn_poolformer_m36_8xb4-40k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/poolformer/fpn_poolformer_m36_8x4_512x512_40k_ade20k/fpn_poolformer_m36_8x4_512x512_40k_ade20k_20220501_164230-3dc83921.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/poolformer/fpn_poolformer_m36_8x4_512x512_40k_ade20k/fpn_poolformer_m36_8x4_512x512_40k_ade20k_20220501_164230.log.json) | +| FPN | PoolFormer-M48 | 512x512 | ImageNet-1K | 32 | 40000 | 10.48 | 6.69 | V100 | 42.43 | - | 42.76 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/poolformer/fpn_poolformer_m48_8xb4-40k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/poolformer/fpn_poolformer_m48_8x4_512x512_40k_ade20k/fpn_poolformer_m48_8x4_512x512_40k_ade20k_20220504_003923-64168d3b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/poolformer/fpn_poolformer_m48_8x4_512x512_40k_ade20k/fpn_poolformer_m48_8x4_512x512_40k_ade20k_20220504_003923.log.json) | + +Note: + +- We replace `AlignedResize` in the original PoolFormer implementation with `Resize + ResizeToMultiple`. + +- `mIoU` with \* is collected when `Resize + ResizeToMultiple` is adopted in the `test_pipeline`, as is the `mIoU` reported in the logs. + +- Test-time augmentation, i.e. "ms+flip", is still under development in MMSegmentation v1.x; stay tuned! diff --git a/configs/poolformer/fpn_poolformer_m36_8xb4-40k_ade20k-512x512.py b/configs/poolformer/fpn_poolformer_m36_8xb4-40k_ade20k-512x512.py new file mode 100644 index 0000000000..4100eb9923 --- /dev/null +++ b/configs/poolformer/fpn_poolformer_m36_8xb4-40k_ade20k-512x512.py @@ -0,0 +1,11 @@ +_base_ = './fpn_poolformer_s12_8xb4-40k_ade20k-512x512.py' +checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-m36_3rdparty_32xb128_in1k_20220414-c55e0949.pth' # noqa + +# model settings +model = dict( + backbone=dict( + arch='m36', + init_cfg=dict( + type='Pretrained', checkpoint=checkpoint_file, + prefix='backbone.')), + neck=dict(in_channels=[96, 192, 384, 768])) diff --git a/configs/poolformer/fpn_poolformer_m48_8xb4-40k_ade20k-512x512.py b/configs/poolformer/fpn_poolformer_m48_8xb4-40k_ade20k-512x512.py new file mode 100644 index 0000000000..cfc49ccbdb --- /dev/null +++ b/configs/poolformer/fpn_poolformer_m48_8xb4-40k_ade20k-512x512.py @@ -0,0 +1,11 @@ +_base_ = './fpn_poolformer_s12_8xb4-40k_ade20k-512x512.py' +checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-m48_3rdparty_32xb128_in1k_20220414-9378f3eb.pth' # noqa + +# model settings +model = dict( + backbone=dict( + arch='m48', + init_cfg=dict( + type='Pretrained', checkpoint=checkpoint_file, + prefix='backbone.')), + neck=dict(in_channels=[96, 192, 384, 768])) diff --git a/configs/poolformer/fpn_poolformer_s12_8xb4-40k_ade20k-512x512.py b/configs/poolformer/fpn_poolformer_s12_8xb4-40k_ade20k-512x512.py new file mode 100644 index 0000000000..c0b15312fe --- /dev/null +++ b/configs/poolformer/fpn_poolformer_s12_8xb4-40k_ade20k-512x512.py @@ -0,0 +1,91 @@ +_base_ = [ + '../_base_/models/fpn_poolformer_s12.py', '../_base_/default_runtime.py', + '../_base_/schedules/schedule_40k.py' +] + +# dataset settings +dataset_type = 'ADE20KDataset' +data_root = 'data/ade/ADEChallengeData2016' +img_norm_cfg = dict( + mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
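+# NOTE: in MMSegmentation 1.x, mean/std normalization is applied by the model's
+# data_preprocessor rather than by a Normalize step in the pipeline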
+crop_size = (512, 512) +data_preprocessor = dict(size=crop_size) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', reduce_zero_label=True), + dict( + type='RandomResize', + scale=(2048, 512), + ratio_range=(0.5, 2.0), + keep_ratio=True), + dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(2048, 512), keep_ratio=True), + dict(type='ResizeToMultiple', size_divisor=32), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type='LoadAnnotations', reduce_zero_label=True), + dict(type='PackSegInputs') +] + +train_dataloader = dict( + batch_size=4, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type='RepeatDataset', + times=50, + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='images/training', + seg_map_path='annotations/training'), + pipeline=train_pipeline))) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='images/validation', + seg_map_path='annotations/validation'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) +test_evaluator = val_evaluator + +# model settings +model = dict( + data_preprocessor=data_preprocessor, + neck=dict(in_channels=[64, 128, 320, 512]), + decode_head=dict(num_classes=150)) + +# optimizer +# optimizer = dict(_delete_=True, type='AdamW', lr=0.0002, weight_decay=0.0001) +# optimizer_config = dict() +# # learning policy +# lr_config = dict(policy='poly', power=0.9, min_lr=0.0, by_epoch=False) +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='AdamW', lr=0.0002, weight_decay=0.0001)) +param_scheduler = [ + dict( + type='PolyLR', + power=0.9, + begin=0, + end=40000, + eta_min=0.0, + by_epoch=False, + ) +] diff --git a/configs/poolformer/fpn_poolformer_s24_8xb4-40k_ade20k-512x512.py b/configs/poolformer/fpn_poolformer_s24_8xb4-40k_ade20k-512x512.py new file mode 100644 index 0000000000..1f9d24cd41 --- /dev/null +++ b/configs/poolformer/fpn_poolformer_s24_8xb4-40k_ade20k-512x512.py @@ -0,0 +1,9 @@ +_base_ = './fpn_poolformer_s12_8xb4-40k_ade20k-512x512.py' +checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-s24_3rdparty_32xb128_in1k_20220414-d7055904.pth' # noqa +# model settings +model = dict( + backbone=dict( + arch='s24', + init_cfg=dict( + type='Pretrained', checkpoint=checkpoint_file, + prefix='backbone.'))) diff --git a/configs/poolformer/fpn_poolformer_s36_8x4_512x512_40k_ade20k.py b/configs/poolformer/fpn_poolformer_s36_8x4_512x512_40k_ade20k.py new file mode 100644 index 0000000000..231dcf6c20 --- /dev/null +++ b/configs/poolformer/fpn_poolformer_s36_8x4_512x512_40k_ade20k.py @@ -0,0 +1,10 @@ +_base_ = './fpn_poolformer_s12_8xb4-40k_ade20k-512x512.py' +checkpoint_file = 'https://download.openmmlab.com/mmclassification/v0/poolformer/poolformer-s36_3rdparty_32xb128_in1k_20220414-d78ff3e8.pth' # noqa + +# model settings +model = dict( + backbone=dict( + arch='s36', + init_cfg=dict( + type='Pretrained', checkpoint=checkpoint_file, 
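+            # reuse only the 'backbone.'-prefixed weights from the ImageNet checkpoint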
+ prefix='backbone.'))) diff --git a/configs/poolformer/metafile.yaml b/configs/poolformer/metafile.yaml new file mode 100644 index 0000000000..12f402be65 --- /dev/null +++ b/configs/poolformer/metafile.yaml @@ -0,0 +1,116 @@ +Models: +- Name: fpn_poolformer_s12_8xb4-40k_ade20k-512x512 + In Collection: FPN + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 36.68 + Config: configs/poolformer/fpn_poolformer_s12_8xb4-40k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 32 + Architecture: + - PoolFormer-S12 + - FPN + Training Resources: 8x V100 GPUS + Memory (GB): 4.17 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/poolformer/fpn_poolformer_s12_8x4_512x512_40k_ade20k/fpn_poolformer_s12_8x4_512x512_40k_ade20k_20220501_115154-b5aa2f49.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/poolformer/fpn_poolformer_s12_8x4_512x512_40k_ade20k/fpn_poolformer_s12_8x4_512x512_40k_ade20k_20220501_115154.log.json + Paper: + Title: MetaFormer is Actually What You Need for Vision + URL: https://arxiv.org/abs/2111.11418 + Code: https://github.com/open-mmlab/mmclassification/blob/v0.23.0/mmcls/models/backbones/poolformer.py#L198 + Framework: PyTorch +- Name: fpn_poolformer_s24_8xb4-40k_ade20k-512x512 + In Collection: FPN + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 40.12 + Config: configs/poolformer/fpn_poolformer_s24_8xb4-40k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 32 + Architecture: + - PoolFormer-S24 + - FPN + Training Resources: 8x V100 GPUS + Memory (GB): 5.47 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/poolformer/fpn_poolformer_s24_8x4_512x512_40k_ade20k/fpn_poolformer_s24_8x4_512x512_40k_ade20k_20220503_222049-394a7cf7.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/poolformer/fpn_poolformer_s24_8x4_512x512_40k_ade20k/fpn_poolformer_s24_8x4_512x512_40k_ade20k_20220503_222049.log.json + Paper: + Title: MetaFormer is Actually What You Need for Vision + URL: https://arxiv.org/abs/2111.11418 + Code: https://github.com/open-mmlab/mmclassification/blob/v0.23.0/mmcls/models/backbones/poolformer.py#L198 + Framework: PyTorch +- Name: fpn_poolformer_s36_8xb4-40k_ade20k-512x512 + In Collection: FPN + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 41.61 + Config: configs/poolformer/fpn_poolformer_s36_8xb4-40k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 32 + Architecture: + - PoolFormer-S36 + - FPN + Training Resources: 8x V100 GPUS + Memory (GB): 6.77 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/poolformer/fpn_poolformer_s36_8x4_512x512_40k_ade20k/fpn_poolformer_s36_8x4_512x512_40k_ade20k_20220501_151122-b47e607d.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/poolformer/fpn_poolformer_s36_8x4_512x512_40k_ade20k/fpn_poolformer_s36_8x4_512x512_40k_ade20k_20220501_151122.log.json + Paper: + Title: MetaFormer is Actually What You Need for Vision + URL: https://arxiv.org/abs/2111.11418 + Code: https://github.com/open-mmlab/mmclassification/blob/v0.23.0/mmcls/models/backbones/poolformer.py#L198 + Framework: PyTorch +- Name: fpn_poolformer_m36_8xb4-40k_ade20k-512x512 + In Collection: FPN + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 41.95 + Config: configs/poolformer/fpn_poolformer_m36_8xb4-40k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 32 + Architecture: + - PoolFormer-M36 + - FPN + Training 
Resources: 8x V100 GPUS + Memory (GB): 8.59 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/poolformer/fpn_poolformer_m36_8x4_512x512_40k_ade20k/fpn_poolformer_m36_8x4_512x512_40k_ade20k_20220501_164230-3dc83921.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/poolformer/fpn_poolformer_m36_8x4_512x512_40k_ade20k/fpn_poolformer_m36_8x4_512x512_40k_ade20k_20220501_164230.log.json + Paper: + Title: MetaFormer is Actually What You Need for Vision + URL: https://arxiv.org/abs/2111.11418 + Code: https://github.com/open-mmlab/mmclassification/blob/v0.23.0/mmcls/models/backbones/poolformer.py#L198 + Framework: PyTorch +- Name: fpn_poolformer_m48_8xb4-40k_ade20k-512x512 + In Collection: FPN + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 42.43 + Config: configs/poolformer/fpn_poolformer_m48_8xb4-40k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 32 + Architecture: + - PoolFormer-M48 + - FPN + Training Resources: 8x V100 GPUS + Memory (GB): 10.48 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/poolformer/fpn_poolformer_m48_8x4_512x512_40k_ade20k/fpn_poolformer_m48_8x4_512x512_40k_ade20k_20220504_003923-64168d3b.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/poolformer/fpn_poolformer_m48_8x4_512x512_40k_ade20k/fpn_poolformer_m48_8x4_512x512_40k_ade20k_20220504_003923.log.json + Paper: + Title: MetaFormer is Actually What You Need for Vision + URL: https://arxiv.org/abs/2111.11418 + Code: https://github.com/open-mmlab/mmclassification/blob/v0.23.0/mmcls/models/backbones/poolformer.py#L198 + Framework: PyTorch diff --git a/configs/psanet/README.md b/configs/psanet/README.md index 9f307b2d29..1f5680fbab 100644 --- a/configs/psanet/README.md +++ b/configs/psanet/README.md @@ -1,6 +1,6 @@ # PSANet -[PSANet: Point-wise Spatial Attention Network for Scene Parsing](https://openaccess.thecvf.com/content_ECCV_2018/papers/Hengshuang_Zhao_PSANet_Point-wise_Spatial_ECCV_2018_paper.pdf) +> [PSANet: Point-wise Spatial Attention Network for Scene Parsing](https://openaccess.thecvf.com/content_ECCV_2018/papers/Hengshuang_Zhao_PSANet_Point-wise_Spatial_ECCV_2018_paper.pdf) ## Introduction @@ -22,6 +22,39 @@ We notice information flow in convolutional neural networksis restricted insid +## Results and models + +### Cityscapes + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| PSANet | R-50-D8 | 512x1024 | 40000 | 7 | 3.17 | V100 | 77.63 | 79.04 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/psanet/psanet_r50-d8_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x1024_40k_cityscapes/psanet_r50-d8_512x1024_40k_cityscapes_20200606_103117-99fac37c.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x1024_40k_cityscapes/psanet_r50-d8_512x1024_40k_cityscapes_20200606_103117.log.json) | +| PSANet | R-101-D8 | 512x1024 | 40000 | 10.5 | 2.20 | V100 | 79.14 | 80.19 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/psanet/psanet_r101-d8_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x1024_40k_cityscapes/psanet_r101-d8_512x1024_40k_cityscapes_20200606_001418-27b9cfa7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x1024_40k_cityscapes/psanet_r101-d8_512x1024_40k_cityscapes_20200606_001418.log.json) | +| PSANet | R-50-D8 | 769x769 | 40000 | 7.9 | 1.40 | V100 | 77.99 | 79.64 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/psanet/psanet_r50-d8_4xb2-40k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_769x769_40k_cityscapes/psanet_r50-d8_769x769_40k_cityscapes_20200530_033717-d5365506.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_769x769_40k_cityscapes/psanet_r50-d8_769x769_40k_cityscapes_20200530_033717.log.json) | +| PSANet | R-101-D8 | 769x769 | 40000 | 11.9 | 0.98 | V100 | 78.43 | 80.26 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/psanet/psanet_r101-d8_4xb2-40k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_769x769_40k_cityscapes/psanet_r101-d8_769x769_40k_cityscapes_20200530_035107-997da1e6.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_769x769_40k_cityscapes/psanet_r101-d8_769x769_40k_cityscapes_20200530_035107.log.json) | +| PSANet | R-50-D8 | 512x1024 | 80000 | - | - | V100 | 77.24 | 78.69 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/psanet/psanet_r50-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x1024_80k_cityscapes/psanet_r50-d8_512x1024_80k_cityscapes_20200606_161842-ab60a24f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x1024_80k_cityscapes/psanet_r50-d8_512x1024_80k_cityscapes_20200606_161842.log.json) | +| PSANet | R-101-D8 | 512x1024 | 80000 | - | - | V100 | 79.31 | 80.53 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/psanet/psanet_r101-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x1024_80k_cityscapes/psanet_r101-d8_512x1024_80k_cityscapes_20200606_161823-0f73a169.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x1024_80k_cityscapes/psanet_r101-d8_512x1024_80k_cityscapes_20200606_161823.log.json) | +| PSANet | R-50-D8 | 769x769 | 80000 | - | - | V100 | 79.31 | 80.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/psanet/psanet_r50-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_769x769_80k_cityscapes/psanet_r50-d8_769x769_80k_cityscapes_20200606_225134-fe42f49e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_769x769_80k_cityscapes/psanet_r50-d8_769x769_80k_cityscapes_20200606_225134.log.json) | +| PSANet | R-101-D8 | 769x769 | 80000 | - | - | V100 | 79.69 | 80.89 | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/psanet/psanet_r101-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_769x769_80k_cityscapes/psanet_r101-d8_769x769_80k_cityscapes_20200606_214550-7665827b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_769x769_80k_cityscapes/psanet_r101-d8_769x769_80k_cityscapes_20200606_214550.log.json) | + +### ADE20K + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | -------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| PSANet | R-50-D8 | 512x512 | 80000 | 9 | 18.91 | V100 | 41.14 | 41.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/psanet/psanet_r50-d8_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x512_80k_ade20k/psanet_r50-d8_512x512_80k_ade20k_20200614_144141-835e4b97.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x512_80k_ade20k/psanet_r50-d8_512x512_80k_ade20k_20200614_144141.log.json) | +| PSANet | R-101-D8 | 512x512 | 80000 | 12.5 | 13.13 | V100 | 43.80 | 44.75 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/psanet/psanet_r101-d8_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x512_80k_ade20k/psanet_r101-d8_512x512_80k_ade20k_20200614_185117-1fab60d4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x512_80k_ade20k/psanet_r101-d8_512x512_80k_ade20k_20200614_185117.log.json) | +| PSANet | R-50-D8 | 512x512 | 160000 | - | - | V100 | 41.67 | 42.95 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/psanet/psanet_r50-d8_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x512_160k_ade20k/psanet_r50-d8_512x512_160k_ade20k_20200615_161258-148077dd.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x512_160k_ade20k/psanet_r50-d8_512x512_160k_ade20k_20200615_161258.log.json) | +| PSANet | R-101-D8 | 512x512 | 160000 | - | - | V100 | 43.74 | 45.38 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/psanet/psanet_r101-d8_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x512_160k_ade20k/psanet_r101-d8_512x512_160k_ade20k_20200615_161537-dbfa564c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x512_160k_ade20k/psanet_r101-d8_512x512_160k_ade20k_20200615_161537.log.json) | + +### Pascal VOC 2012 + Aug + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | 
--------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| PSANet | R-50-D8 | 512x512 | 20000 | 6.9 | 18.24 | V100 | 76.39 | 77.34 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/psanet/psanet_r50-d8_4xb4-20k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x512_20k_voc12aug/psanet_r50-d8_512x512_20k_voc12aug_20200617_102413-2f1bbaa1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x512_20k_voc12aug/psanet_r50-d8_512x512_20k_voc12aug_20200617_102413.log.json) | +| PSANet | R-101-D8 | 512x512 | 20000 | 10.4 | 12.63 | V100 | 77.91 | 79.30 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/psanet/psanet_r101-d8_4xb4-20k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x512_20k_voc12aug/psanet_r101-d8_512x512_20k_voc12aug_20200617_110624-946fef11.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x512_20k_voc12aug/psanet_r101-d8_512x512_20k_voc12aug_20200617_110624.log.json) | +| PSANet | R-50-D8 | 512x512 | 40000 | - | - | V100 | 76.30 | 77.35 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/psanet/psanet_r50-d8_4xb4-40k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x512_40k_voc12aug/psanet_r50-d8_512x512_40k_voc12aug_20200613_161946-f596afb5.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x512_40k_voc12aug/psanet_r50-d8_512x512_40k_voc12aug_20200613_161946.log.json) | +| PSANet | R-101-D8 | 512x512 | 40000 | - | - | V100 | 77.73 | 79.05 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/psanet/psanet_r101-d8_4xb4-40k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x512_40k_voc12aug/psanet_r101-d8_512x512_40k_voc12aug_20200613_161946-1f560f9e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x512_40k_voc12aug/psanet_r101-d8_512x512_40k_voc12aug_20200613_161946.log.json) | + ## Citation ```bibtex @@ -33,36 +66,3 @@ We notice information flow in convolutional neural networksis restricted insid year={2018} } ``` - -## Results and models - -### Cityscapes - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | --------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| PSANet | R-50-D8 | 512x1024 | 40000 | 7 | 3.17 | 77.63 | 79.04 | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/psanet/psanet_r50-d8_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x1024_40k_cityscapes/psanet_r50-d8_512x1024_40k_cityscapes_20200606_103117-99fac37c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x1024_40k_cityscapes/psanet_r50-d8_512x1024_40k_cityscapes_20200606_103117.log.json) | -| PSANet | R-101-D8 | 512x1024 | 40000 | 10.5 | 2.20 | 79.14 | 80.19 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/psanet/psanet_r101-d8_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x1024_40k_cityscapes/psanet_r101-d8_512x1024_40k_cityscapes_20200606_001418-27b9cfa7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x1024_40k_cityscapes/psanet_r101-d8_512x1024_40k_cityscapes_20200606_001418.log.json) | -| PSANet | R-50-D8 | 769x769 | 40000 | 7.9 | 1.40 | 77.99 | 79.64 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/psanet/psanet_r50-d8_769x769_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_769x769_40k_cityscapes/psanet_r50-d8_769x769_40k_cityscapes_20200530_033717-d5365506.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_769x769_40k_cityscapes/psanet_r50-d8_769x769_40k_cityscapes_20200530_033717.log.json) | -| PSANet | R-101-D8 | 769x769 | 40000 | 11.9 | 0.98 | 78.43 | 80.26 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/psanet/psanet_r101-d8_769x769_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_769x769_40k_cityscapes/psanet_r101-d8_769x769_40k_cityscapes_20200530_035107-997da1e6.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_769x769_40k_cityscapes/psanet_r101-d8_769x769_40k_cityscapes_20200530_035107.log.json) | -| PSANet | R-50-D8 | 512x1024 | 80000 | - | - | 77.24 | 78.69 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/psanet/psanet_r50-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x1024_80k_cityscapes/psanet_r50-d8_512x1024_80k_cityscapes_20200606_161842-ab60a24f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x1024_80k_cityscapes/psanet_r50-d8_512x1024_80k_cityscapes_20200606_161842.log.json) | -| PSANet | R-101-D8 | 512x1024 | 80000 | - | - | 79.31 | 80.53 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/psanet/psanet_r101-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x1024_80k_cityscapes/psanet_r101-d8_512x1024_80k_cityscapes_20200606_161823-0f73a169.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x1024_80k_cityscapes/psanet_r101-d8_512x1024_80k_cityscapes_20200606_161823.log.json) | -| PSANet | R-50-D8 | 769x769 | 80000 | - | - | 79.31 | 80.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/psanet/psanet_r50-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_769x769_80k_cityscapes/psanet_r50-d8_769x769_80k_cityscapes_20200606_225134-fe42f49e.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_769x769_80k_cityscapes/psanet_r50-d8_769x769_80k_cityscapes_20200606_225134.log.json) | -| PSANet | R-101-D8 | 769x769 | 80000 | - | - | 79.69 | 80.89 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/psanet/psanet_r101-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_769x769_80k_cityscapes/psanet_r101-d8_769x769_80k_cityscapes_20200606_214550-7665827b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_769x769_80k_cityscapes/psanet_r101-d8_769x769_80k_cityscapes_20200606_214550.log.json) | - -### ADE20K - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ----------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| PSANet | R-50-D8 | 512x512 | 80000 | 9 | 18.91 | 41.14 | 41.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/psanet/psanet_r50-d8_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x512_80k_ade20k/psanet_r50-d8_512x512_80k_ade20k_20200614_144141-835e4b97.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x512_80k_ade20k/psanet_r50-d8_512x512_80k_ade20k_20200614_144141.log.json) | -| PSANet | R-101-D8 | 512x512 | 80000 | 12.5 | 13.13 | 43.80 | 44.75 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/psanet/psanet_r101-d8_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x512_80k_ade20k/psanet_r101-d8_512x512_80k_ade20k_20200614_185117-1fab60d4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x512_80k_ade20k/psanet_r101-d8_512x512_80k_ade20k_20200614_185117.log.json) | -| PSANet | R-50-D8 | 512x512 | 160000 | - | - | 41.67 | 42.95 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/psanet/psanet_r50-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x512_160k_ade20k/psanet_r50-d8_512x512_160k_ade20k_20200615_161258-148077dd.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x512_160k_ade20k/psanet_r50-d8_512x512_160k_ade20k_20200615_161258.log.json) | -| PSANet | R-101-D8 | 512x512 | 160000 | - | - | 43.74 | 45.38 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/psanet/psanet_r101-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x512_160k_ade20k/psanet_r101-d8_512x512_160k_ade20k_20200615_161537-dbfa564c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x512_160k_ade20k/psanet_r101-d8_512x512_160k_ade20k_20200615_161537.log.json) | - -### Pascal VOC 2012 + Aug - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | 
mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| PSANet | R-50-D8 | 512x512 | 20000 | 6.9 | 18.24 | 76.39 | 77.34 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/psanet/psanet_r50-d8_512x512_20k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x512_20k_voc12aug/psanet_r50-d8_512x512_20k_voc12aug_20200617_102413-2f1bbaa1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x512_20k_voc12aug/psanet_r50-d8_512x512_20k_voc12aug_20200617_102413.log.json) | -| PSANet | R-101-D8 | 512x512 | 20000 | 10.4 | 12.63 | 77.91 | 79.30 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/psanet/psanet_r101-d8_512x512_20k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x512_20k_voc12aug/psanet_r101-d8_512x512_20k_voc12aug_20200617_110624-946fef11.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x512_20k_voc12aug/psanet_r101-d8_512x512_20k_voc12aug_20200617_110624.log.json) | -| PSANet | R-50-D8 | 512x512 | 40000 | - | - | 76.30 | 77.35 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/psanet/psanet_r50-d8_512x512_40k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x512_40k_voc12aug/psanet_r50-d8_512x512_40k_voc12aug_20200613_161946-f596afb5.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x512_40k_voc12aug/psanet_r50-d8_512x512_40k_voc12aug_20200613_161946.log.json) | -| PSANet | R-101-D8 | 512x512 | 40000 | - | - | 77.73 | 79.05 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/psanet/psanet_r101-d8_512x512_40k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x512_40k_voc12aug/psanet_r101-d8_512x512_40k_voc12aug_20200613_161946-1f560f9e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x512_40k_voc12aug/psanet_r101-d8_512x512_40k_voc12aug_20200613_161946.log.json) | diff --git a/configs/psanet/metafile.yaml b/configs/psanet/metafile.yaml new file mode 100644 index 0000000000..3fbe6f6d3e --- /dev/null +++ b/configs/psanet/metafile.yaml @@ -0,0 +1,391 @@ +Collections: +- Name: PSANet + License: Apache License 2.0 + Metadata: + Training Data: + - Cityscapes + - ADE20K + - Pascal VOC 2012 + Aug + Paper: + Title: 'PSANet: Point-wise Spatial Attention Network for Scene Parsing' + URL: https://openaccess.thecvf.com/content_ECCV_2018/papers/Hengshuang_Zhao_PSANet_Point-wise_Spatial_ECCV_2018_paper.pdf + README: configs/psanet/README.md + Frameworks: + - PyTorch +Models: +- Name: psanet_r50-d8_4xb2-40k_cityscapes-512x1024 + In Collection: PSANet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 77.63 + mIoU(ms+flip): 79.04 + Config: configs/psanet/psanet_r50-d8_4xb2-40k_cityscapes-512x1024.py + 
Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - PSANet + Training Resources: 4x V100 GPUS + Memory (GB): 7.0 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x1024_40k_cityscapes/psanet_r50-d8_512x1024_40k_cityscapes_20200606_103117-99fac37c.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x1024_40k_cityscapes/psanet_r50-d8_512x1024_40k_cityscapes_20200606_103117.log.json + Paper: + Title: 'PSANet: Point-wise Spatial Attention Network for Scene Parsing' + URL: https://openaccess.thecvf.com/content_ECCV_2018/papers/Hengshuang_Zhao_PSANet_Point-wise_Spatial_ECCV_2018_paper.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psa_head.py#L18 + Framework: PyTorch +- Name: psanet_r101-d8_4xb2-40k_cityscapes-512x1024 + In Collection: PSANet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.14 + mIoU(ms+flip): 80.19 + Config: configs/psanet/psanet_r101-d8_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - PSANet + Training Resources: 4x V100 GPUS + Memory (GB): 10.5 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x1024_40k_cityscapes/psanet_r101-d8_512x1024_40k_cityscapes_20200606_001418-27b9cfa7.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x1024_40k_cityscapes/psanet_r101-d8_512x1024_40k_cityscapes_20200606_001418.log.json + Paper: + Title: 'PSANet: Point-wise Spatial Attention Network for Scene Parsing' + URL: https://openaccess.thecvf.com/content_ECCV_2018/papers/Hengshuang_Zhao_PSANet_Point-wise_Spatial_ECCV_2018_paper.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psa_head.py#L18 + Framework: PyTorch +- Name: psanet_r50-d8_4xb2-40k_cityscapes-769x769 + In Collection: PSANet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 77.99 + mIoU(ms+flip): 79.64 + Config: configs/psanet/psanet_r50-d8_4xb2-40k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - PSANet + Training Resources: 4x V100 GPUS + Memory (GB): 7.9 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_769x769_40k_cityscapes/psanet_r50-d8_769x769_40k_cityscapes_20200530_033717-d5365506.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_769x769_40k_cityscapes/psanet_r50-d8_769x769_40k_cityscapes_20200530_033717.log.json + Paper: + Title: 'PSANet: Point-wise Spatial Attention Network for Scene Parsing' + URL: https://openaccess.thecvf.com/content_ECCV_2018/papers/Hengshuang_Zhao_PSANet_Point-wise_Spatial_ECCV_2018_paper.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psa_head.py#L18 + Framework: PyTorch +- Name: psanet_r101-d8_4xb2-40k_cityscapes-769x769 + In Collection: PSANet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.43 + mIoU(ms+flip): 80.26 + Config: configs/psanet/psanet_r101-d8_4xb2-40k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - PSANet + Training Resources: 4x V100 GPUS + Memory (GB): 11.9 + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_769x769_40k_cityscapes/psanet_r101-d8_769x769_40k_cityscapes_20200530_035107-997da1e6.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_769x769_40k_cityscapes/psanet_r101-d8_769x769_40k_cityscapes_20200530_035107.log.json + Paper: + Title: 'PSANet: Point-wise Spatial Attention Network for Scene Parsing' + URL: https://openaccess.thecvf.com/content_ECCV_2018/papers/Hengshuang_Zhao_PSANet_Point-wise_Spatial_ECCV_2018_paper.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psa_head.py#L18 + Framework: PyTorch +- Name: psanet_r50-d8_4xb2-80k_cityscapes-512x1024 + In Collection: PSANet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 77.24 + mIoU(ms+flip): 78.69 + Config: configs/psanet/psanet_r50-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - PSANet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x1024_80k_cityscapes/psanet_r50-d8_512x1024_80k_cityscapes_20200606_161842-ab60a24f.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x1024_80k_cityscapes/psanet_r50-d8_512x1024_80k_cityscapes_20200606_161842.log.json + Paper: + Title: 'PSANet: Point-wise Spatial Attention Network for Scene Parsing' + URL: https://openaccess.thecvf.com/content_ECCV_2018/papers/Hengshuang_Zhao_PSANet_Point-wise_Spatial_ECCV_2018_paper.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psa_head.py#L18 + Framework: PyTorch +- Name: psanet_r101-d8_4xb2-80k_cityscapes-512x1024 + In Collection: PSANet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.31 + mIoU(ms+flip): 80.53 + Config: configs/psanet/psanet_r101-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - PSANet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x1024_80k_cityscapes/psanet_r101-d8_512x1024_80k_cityscapes_20200606_161823-0f73a169.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x1024_80k_cityscapes/psanet_r101-d8_512x1024_80k_cityscapes_20200606_161823.log.json + Paper: + Title: 'PSANet: Point-wise Spatial Attention Network for Scene Parsing' + URL: https://openaccess.thecvf.com/content_ECCV_2018/papers/Hengshuang_Zhao_PSANet_Point-wise_Spatial_ECCV_2018_paper.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psa_head.py#L18 + Framework: PyTorch +- Name: psanet_r50-d8_4xb2-80k_cityscapes-769x769 + In Collection: PSANet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.31 + mIoU(ms+flip): 80.91 + Config: configs/psanet/psanet_r50-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - PSANet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_769x769_80k_cityscapes/psanet_r50-d8_769x769_80k_cityscapes_20200606_225134-fe42f49e.pth + Training log: 
https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_769x769_80k_cityscapes/psanet_r50-d8_769x769_80k_cityscapes_20200606_225134.log.json + Paper: + Title: 'PSANet: Point-wise Spatial Attention Network for Scene Parsing' + URL: https://openaccess.thecvf.com/content_ECCV_2018/papers/Hengshuang_Zhao_PSANet_Point-wise_Spatial_ECCV_2018_paper.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psa_head.py#L18 + Framework: PyTorch +- Name: psanet_r101-d8_4xb2-80k_cityscapes-769x769 + In Collection: PSANet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.69 + mIoU(ms+flip): 80.89 + Config: configs/psanet/psanet_r101-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - PSANet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_769x769_80k_cityscapes/psanet_r101-d8_769x769_80k_cityscapes_20200606_214550-7665827b.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_769x769_80k_cityscapes/psanet_r101-d8_769x769_80k_cityscapes_20200606_214550.log.json + Paper: + Title: 'PSANet: Point-wise Spatial Attention Network for Scene Parsing' + URL: https://openaccess.thecvf.com/content_ECCV_2018/papers/Hengshuang_Zhao_PSANet_Point-wise_Spatial_ECCV_2018_paper.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psa_head.py#L18 + Framework: PyTorch +- Name: psanet_r50-d8_4xb4-80k_ade20k-512x512 + In Collection: PSANet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 41.14 + mIoU(ms+flip): 41.91 + Config: configs/psanet/psanet_r50-d8_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D8 + - PSANet + Training Resources: 4x V100 GPUS + Memory (GB): 9.0 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x512_80k_ade20k/psanet_r50-d8_512x512_80k_ade20k_20200614_144141-835e4b97.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x512_80k_ade20k/psanet_r50-d8_512x512_80k_ade20k_20200614_144141.log.json + Paper: + Title: 'PSANet: Point-wise Spatial Attention Network for Scene Parsing' + URL: https://openaccess.thecvf.com/content_ECCV_2018/papers/Hengshuang_Zhao_PSANet_Point-wise_Spatial_ECCV_2018_paper.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psa_head.py#L18 + Framework: PyTorch +- Name: psanet_r101-d8_4xb4-80k_ade20k-512x512 + In Collection: PSANet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 43.8 + mIoU(ms+flip): 44.75 + Config: configs/psanet/psanet_r101-d8_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-101-D8 + - PSANet + Training Resources: 4x V100 GPUS + Memory (GB): 12.5 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x512_80k_ade20k/psanet_r101-d8_512x512_80k_ade20k_20200614_185117-1fab60d4.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x512_80k_ade20k/psanet_r101-d8_512x512_80k_ade20k_20200614_185117.log.json + Paper: + Title: 'PSANet: Point-wise Spatial Attention Network for Scene Parsing' + URL: https://openaccess.thecvf.com/content_ECCV_2018/papers/Hengshuang_Zhao_PSANet_Point-wise_Spatial_ECCV_2018_paper.pdf 
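Each `Models` entry in the new `configs/psanet/metafile.yaml` ties a config path to its released weights, training log, and paper reference. As a minimal sketch of how such a metafile might be queried downstream (the `find_weights` helper and the chosen model name are illustrative, and PyYAML is an assumed dependency, not something this PR adds):

```python
# Illustrative only: look up the checkpoint URL recorded for a model
# name in configs/psanet/metafile.yaml. Assumes PyYAML is installed.
import yaml


def find_weights(metafile_path: str, model_name: str) -> str:
    """Return the Weights URL of the matching Models entry."""
    with open(metafile_path) as f:
        meta = yaml.safe_load(f)
    for model in meta.get('Models', []):
        if model['Name'] == model_name:
            return model['Weights']
    raise KeyError(f'{model_name} not found in {metafile_path}')


print(find_weights('configs/psanet/metafile.yaml',
                   'psanet_r50-d8_4xb2-40k_cityscapes-512x1024'))
```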
+ Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psa_head.py#L18 + Framework: PyTorch +- Name: psanet_r50-d8_4xb4-160k_ade20k-512x512 + In Collection: PSANet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 41.67 + mIoU(ms+flip): 42.95 + Config: configs/psanet/psanet_r50-d8_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D8 + - PSANet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x512_160k_ade20k/psanet_r50-d8_512x512_160k_ade20k_20200615_161258-148077dd.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x512_160k_ade20k/psanet_r50-d8_512x512_160k_ade20k_20200615_161258.log.json + Paper: + Title: 'PSANet: Point-wise Spatial Attention Network for Scene Parsing' + URL: https://openaccess.thecvf.com/content_ECCV_2018/papers/Hengshuang_Zhao_PSANet_Point-wise_Spatial_ECCV_2018_paper.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psa_head.py#L18 + Framework: PyTorch +- Name: psanet_r101-d8_4xb4-160k_ade20k-512x512 + In Collection: PSANet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 43.74 + mIoU(ms+flip): 45.38 + Config: configs/psanet/psanet_r101-d8_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-101-D8 + - PSANet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x512_160k_ade20k/psanet_r101-d8_512x512_160k_ade20k_20200615_161537-dbfa564c.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x512_160k_ade20k/psanet_r101-d8_512x512_160k_ade20k_20200615_161537.log.json + Paper: + Title: 'PSANet: Point-wise Spatial Attention Network for Scene Parsing' + URL: https://openaccess.thecvf.com/content_ECCV_2018/papers/Hengshuang_Zhao_PSANet_Point-wise_Spatial_ECCV_2018_paper.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psa_head.py#L18 + Framework: PyTorch +- Name: psanet_r50-d8_4xb4-20k_voc12aug-512x512 + In Collection: PSANet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 76.39 + mIoU(ms+flip): 77.34 + Config: configs/psanet/psanet_r50-d8_4xb4-20k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-50-D8 + - PSANet + Training Resources: 4x V100 GPUS + Memory (GB): 6.9 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x512_20k_voc12aug/psanet_r50-d8_512x512_20k_voc12aug_20200617_102413-2f1bbaa1.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x512_20k_voc12aug/psanet_r50-d8_512x512_20k_voc12aug_20200617_102413.log.json + Paper: + Title: 'PSANet: Point-wise Spatial Attention Network for Scene Parsing' + URL: https://openaccess.thecvf.com/content_ECCV_2018/papers/Hengshuang_Zhao_PSANet_Point-wise_Spatial_ECCV_2018_paper.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psa_head.py#L18 + Framework: PyTorch +- Name: psanet_r101-d8_4xb4-20k_voc12aug-512x512 + In Collection: PSANet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 77.91 + mIoU(ms+flip): 79.3 + Config: 
configs/psanet/psanet_r101-d8_4xb4-20k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-101-D8 + - PSANet + Training Resources: 4x V100 GPUS + Memory (GB): 10.4 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x512_20k_voc12aug/psanet_r101-d8_512x512_20k_voc12aug_20200617_110624-946fef11.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x512_20k_voc12aug/psanet_r101-d8_512x512_20k_voc12aug_20200617_110624.log.json + Paper: + Title: 'PSANet: Point-wise Spatial Attention Network for Scene Parsing' + URL: https://openaccess.thecvf.com/content_ECCV_2018/papers/Hengshuang_Zhao_PSANet_Point-wise_Spatial_ECCV_2018_paper.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psa_head.py#L18 + Framework: PyTorch +- Name: psanet_r50-d8_4xb4-40k_voc12aug-512x512 + In Collection: PSANet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 76.3 + mIoU(ms+flip): 77.35 + Config: configs/psanet/psanet_r50-d8_4xb4-40k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-50-D8 + - PSANet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x512_40k_voc12aug/psanet_r50-d8_512x512_40k_voc12aug_20200613_161946-f596afb5.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x512_40k_voc12aug/psanet_r50-d8_512x512_40k_voc12aug_20200613_161946.log.json + Paper: + Title: 'PSANet: Point-wise Spatial Attention Network for Scene Parsing' + URL: https://openaccess.thecvf.com/content_ECCV_2018/papers/Hengshuang_Zhao_PSANet_Point-wise_Spatial_ECCV_2018_paper.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psa_head.py#L18 + Framework: PyTorch +- Name: psanet_r101-d8_4xb4-40k_voc12aug-512x512 + In Collection: PSANet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 77.73 + mIoU(ms+flip): 79.05 + Config: configs/psanet/psanet_r101-d8_4xb4-40k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-101-D8 + - PSANet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x512_40k_voc12aug/psanet_r101-d8_512x512_40k_voc12aug_20200613_161946-1f560f9e.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x512_40k_voc12aug/psanet_r101-d8_512x512_40k_voc12aug_20200613_161946.log.json + Paper: + Title: 'PSANet: Point-wise Spatial Attention Network for Scene Parsing' + URL: https://openaccess.thecvf.com/content_ECCV_2018/papers/Hengshuang_Zhao_PSANet_Point-wise_Spatial_ECCV_2018_paper.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psa_head.py#L18 + Framework: PyTorch diff --git a/configs/psanet/psanet.yml b/configs/psanet/psanet.yml deleted file mode 100644 index 353c890c8f..0000000000 --- a/configs/psanet/psanet.yml +++ /dev/null @@ -1,305 +0,0 @@ -Collections: -- Name: PSANet - Metadata: - Training Data: - - Cityscapes - - ADE20K - - Pascal VOC 2012 + Aug - Paper: - URL: https://openaccess.thecvf.com/content_ECCV_2018/papers/Hengshuang_Zhao_PSANet_Point-wise_Spatial_ECCV_2018_paper.pdf - Title: 'PSANet: Point-wise Spatial Attention Network for Scene 
Parsing' - README: configs/psanet/README.md - Code: - URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psa_head.py#L18 - Version: v0.17.0 - Converted From: - Code: https://github.com/hszhao/PSANet -Models: -- Name: psanet_r50-d8_512x1024_40k_cityscapes - In Collection: PSANet - Metadata: - backbone: R-50-D8 - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 315.46 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 7.0 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 77.63 - mIoU(ms+flip): 79.04 - Config: configs/psanet/psanet_r50-d8_512x1024_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x1024_40k_cityscapes/psanet_r50-d8_512x1024_40k_cityscapes_20200606_103117-99fac37c.pth -- Name: psanet_r101-d8_512x1024_40k_cityscapes - In Collection: PSANet - Metadata: - backbone: R-101-D8 - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 454.55 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 10.5 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.14 - mIoU(ms+flip): 80.19 - Config: configs/psanet/psanet_r101-d8_512x1024_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x1024_40k_cityscapes/psanet_r101-d8_512x1024_40k_cityscapes_20200606_001418-27b9cfa7.pth -- Name: psanet_r50-d8_769x769_40k_cityscapes - In Collection: PSANet - Metadata: - backbone: R-50-D8 - crop size: (769,769) - lr schd: 40000 - inference time (ms/im): - - value: 714.29 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 7.9 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 77.99 - mIoU(ms+flip): 79.64 - Config: configs/psanet/psanet_r50-d8_769x769_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_769x769_40k_cityscapes/psanet_r50-d8_769x769_40k_cityscapes_20200530_033717-d5365506.pth -- Name: psanet_r101-d8_769x769_40k_cityscapes - In Collection: PSANet - Metadata: - backbone: R-101-D8 - crop size: (769,769) - lr schd: 40000 - inference time (ms/im): - - value: 1020.41 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 11.9 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.43 - mIoU(ms+flip): 80.26 - Config: configs/psanet/psanet_r101-d8_769x769_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_769x769_40k_cityscapes/psanet_r101-d8_769x769_40k_cityscapes_20200530_035107-997da1e6.pth -- Name: psanet_r50-d8_512x1024_80k_cityscapes - In Collection: PSANet - Metadata: - backbone: R-50-D8 - crop size: (512,1024) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 77.24 - mIoU(ms+flip): 78.69 - Config: configs/psanet/psanet_r50-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x1024_80k_cityscapes/psanet_r50-d8_512x1024_80k_cityscapes_20200606_161842-ab60a24f.pth -- Name: psanet_r101-d8_512x1024_80k_cityscapes - In Collection: PSANet - Metadata: - backbone: R-101-D8 - crop size: (512,1024) - lr schd: 80000 - Results: - - Task: 
Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.31 - mIoU(ms+flip): 80.53 - Config: configs/psanet/psanet_r101-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x1024_80k_cityscapes/psanet_r101-d8_512x1024_80k_cityscapes_20200606_161823-0f73a169.pth -- Name: psanet_r50-d8_769x769_80k_cityscapes - In Collection: PSANet - Metadata: - backbone: R-50-D8 - crop size: (769,769) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.31 - mIoU(ms+flip): 80.91 - Config: configs/psanet/psanet_r50-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_769x769_80k_cityscapes/psanet_r50-d8_769x769_80k_cityscapes_20200606_225134-fe42f49e.pth -- Name: psanet_r101-d8_769x769_80k_cityscapes - In Collection: PSANet - Metadata: - backbone: R-101-D8 - crop size: (769,769) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.69 - mIoU(ms+flip): 80.89 - Config: configs/psanet/psanet_r101-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_769x769_80k_cityscapes/psanet_r101-d8_769x769_80k_cityscapes_20200606_214550-7665827b.pth -- Name: psanet_r50-d8_512x512_80k_ade20k - In Collection: PSANet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 52.88 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 9.0 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 41.14 - mIoU(ms+flip): 41.91 - Config: configs/psanet/psanet_r50-d8_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x512_80k_ade20k/psanet_r50-d8_512x512_80k_ade20k_20200614_144141-835e4b97.pth -- Name: psanet_r101-d8_512x512_80k_ade20k - In Collection: PSANet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 76.16 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 12.5 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 43.8 - mIoU(ms+flip): 44.75 - Config: configs/psanet/psanet_r101-d8_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x512_80k_ade20k/psanet_r101-d8_512x512_80k_ade20k_20200614_185117-1fab60d4.pth -- Name: psanet_r50-d8_512x512_160k_ade20k - In Collection: PSANet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 41.67 - mIoU(ms+flip): 42.95 - Config: configs/psanet/psanet_r50-d8_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x512_160k_ade20k/psanet_r50-d8_512x512_160k_ade20k_20200615_161258-148077dd.pth -- Name: psanet_r101-d8_512x512_160k_ade20k - In Collection: PSANet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 43.74 - mIoU(ms+flip): 45.38 - Config: configs/psanet/psanet_r101-d8_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x512_160k_ade20k/psanet_r101-d8_512x512_160k_ade20k_20200615_161537-dbfa564c.pth -- 
Name: psanet_r50-d8_512x512_20k_voc12aug - In Collection: PSANet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 20000 - inference time (ms/im): - - value: 54.82 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 6.9 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 76.39 - mIoU(ms+flip): 77.34 - Config: configs/psanet/psanet_r50-d8_512x512_20k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x512_20k_voc12aug/psanet_r50-d8_512x512_20k_voc12aug_20200617_102413-2f1bbaa1.pth -- Name: psanet_r101-d8_512x512_20k_voc12aug - In Collection: PSANet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 20000 - inference time (ms/im): - - value: 79.18 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 10.4 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 77.91 - mIoU(ms+flip): 79.3 - Config: configs/psanet/psanet_r101-d8_512x512_20k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x512_20k_voc12aug/psanet_r101-d8_512x512_20k_voc12aug_20200617_110624-946fef11.pth -- Name: psanet_r50-d8_512x512_40k_voc12aug - In Collection: PSANet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 40000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 76.3 - mIoU(ms+flip): 77.35 - Config: configs/psanet/psanet_r50-d8_512x512_40k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r50-d8_512x512_40k_voc12aug/psanet_r50-d8_512x512_40k_voc12aug_20200613_161946-f596afb5.pth -- Name: psanet_r101-d8_512x512_40k_voc12aug - In Collection: PSANet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 40000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 77.73 - mIoU(ms+flip): 79.05 - Config: configs/psanet/psanet_r101-d8_512x512_40k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/psanet/psanet_r101-d8_512x512_40k_voc12aug/psanet_r101-d8_512x512_40k_voc12aug_20200613_161946-1f560f9e.pth diff --git a/configs/psanet/psanet_r101-d8_4xb2-40k_cityscapes-512x1024.py b/configs/psanet/psanet_r101-d8_4xb2-40k_cityscapes-512x1024.py new file mode 100644 index 0000000000..e69cf42703 --- /dev/null +++ b/configs/psanet/psanet_r101-d8_4xb2-40k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './psanet_r50-d8_4xb2-40k_cityscapes-512x1024.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/psanet/psanet_r101-d8_4xb2-40k_cityscapes-769x769.py b/configs/psanet/psanet_r101-d8_4xb2-40k_cityscapes-769x769.py new file mode 100644 index 0000000000..e543099842 --- /dev/null +++ b/configs/psanet/psanet_r101-d8_4xb2-40k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './psanet_r50-d8_4xb2-40k_cityscapes-769x769.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/psanet/psanet_r101-d8_4xb2-80k_cityscapes-512x1024.py b/configs/psanet/psanet_r101-d8_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..b8636384d0 --- /dev/null +++ b/configs/psanet/psanet_r101-d8_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './psanet_r50-d8_4xb2-80k_cityscapes-512x1024.py' +model = 
dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/psanet/psanet_r101-d8_4xb2-80k_cityscapes-769x769.py b/configs/psanet/psanet_r101-d8_4xb2-80k_cityscapes-769x769.py new file mode 100644 index 0000000000..097b1c58ce --- /dev/null +++ b/configs/psanet/psanet_r101-d8_4xb2-80k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './psanet_r50-d8_4xb2-80k_cityscapes-769x769.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/psanet/psanet_r101-d8_4xb4-160k_ade20k-512x512.py b/configs/psanet/psanet_r101-d8_4xb4-160k_ade20k-512x512.py new file mode 100644 index 0000000000..ac86306cb6 --- /dev/null +++ b/configs/psanet/psanet_r101-d8_4xb4-160k_ade20k-512x512.py @@ -0,0 +1,2 @@ +_base_ = './psanet_r50-d8_4xb4-160k_ade20k-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/psanet/psanet_r101-d8_4xb4-20k_voc12aug-512x512.py b/configs/psanet/psanet_r101-d8_4xb4-20k_voc12aug-512x512.py new file mode 100644 index 0000000000..abd8e56512 --- /dev/null +++ b/configs/psanet/psanet_r101-d8_4xb4-20k_voc12aug-512x512.py @@ -0,0 +1,2 @@ +_base_ = './psanet_r50-d8_4xb4-20k_voc12aug-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/psanet/psanet_r101-d8_4xb4-40k_voc12aug-512x512.py b/configs/psanet/psanet_r101-d8_4xb4-40k_voc12aug-512x512.py new file mode 100644 index 0000000000..d3154a8f14 --- /dev/null +++ b/configs/psanet/psanet_r101-d8_4xb4-40k_voc12aug-512x512.py @@ -0,0 +1,2 @@ +_base_ = './psanet_r50-d8_4xb4-40k_voc12aug-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/psanet/psanet_r101-d8_4xb4-80k_ade20k-512x512.py b/configs/psanet/psanet_r101-d8_4xb4-80k_ade20k-512x512.py new file mode 100644 index 0000000000..b34d4248e8 --- /dev/null +++ b/configs/psanet/psanet_r101-d8_4xb4-80k_ade20k-512x512.py @@ -0,0 +1,2 @@ +_base_ = './psanet_r50-d8_4xb4-80k_ade20k-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/psanet/psanet_r101-d8_512x1024_40k_cityscapes.py b/configs/psanet/psanet_r101-d8_512x1024_40k_cityscapes.py deleted file mode 100644 index 69d212f158..0000000000 --- a/configs/psanet/psanet_r101-d8_512x1024_40k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './psanet_r50-d8_512x1024_40k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/psanet/psanet_r101-d8_512x1024_80k_cityscapes.py b/configs/psanet/psanet_r101-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index bc25d6aaf6..0000000000 --- a/configs/psanet/psanet_r101-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './psanet_r50-d8_512x1024_80k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/psanet/psanet_r101-d8_512x512_160k_ade20k.py b/configs/psanet/psanet_r101-d8_512x512_160k_ade20k.py deleted file mode 100644 index 7f6795e5ef..0000000000 --- a/configs/psanet/psanet_r101-d8_512x512_160k_ade20k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './psanet_r50-d8_512x512_160k_ade20k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/psanet/psanet_r101-d8_512x512_20k_voc12aug.py b/configs/psanet/psanet_r101-d8_512x512_20k_voc12aug.py deleted file mode 100644 index 1a3c43495b..0000000000 --- 
a/configs/psanet/psanet_r101-d8_512x512_20k_voc12aug.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './psanet_r50-d8_512x512_20k_voc12aug.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/psanet/psanet_r101-d8_512x512_40k_voc12aug.py b/configs/psanet/psanet_r101-d8_512x512_40k_voc12aug.py deleted file mode 100644 index f62eef9773..0000000000 --- a/configs/psanet/psanet_r101-d8_512x512_40k_voc12aug.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './psanet_r50-d8_512x512_40k_voc12aug.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/psanet/psanet_r101-d8_512x512_80k_ade20k.py b/configs/psanet/psanet_r101-d8_512x512_80k_ade20k.py deleted file mode 100644 index f8865a7c4d..0000000000 --- a/configs/psanet/psanet_r101-d8_512x512_80k_ade20k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './psanet_r50-d8_512x512_80k_ade20k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/psanet/psanet_r101-d8_769x769_40k_cityscapes.py b/configs/psanet/psanet_r101-d8_769x769_40k_cityscapes.py deleted file mode 100644 index ffc99f0109..0000000000 --- a/configs/psanet/psanet_r101-d8_769x769_40k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './psanet_r50-d8_769x769_40k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/psanet/psanet_r101-d8_769x769_80k_cityscapes.py b/configs/psanet/psanet_r101-d8_769x769_80k_cityscapes.py deleted file mode 100644 index 6a9efc55ad..0000000000 --- a/configs/psanet/psanet_r101-d8_769x769_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './psanet_r50-d8_769x769_80k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/psanet/psanet_r50-d8_512x1024_40k_cityscapes.py b/configs/psanet/psanet_r50-d8_4xb2-40k_cityscapes-512x1024.py similarity index 100% rename from configs/psanet/psanet_r50-d8_512x1024_40k_cityscapes.py rename to configs/psanet/psanet_r50-d8_4xb2-40k_cityscapes-512x1024.py diff --git a/configs/psanet/psanet_r50-d8_769x769_40k_cityscapes.py b/configs/psanet/psanet_r50-d8_4xb2-40k_cityscapes-769x769.py similarity index 100% rename from configs/psanet/psanet_r50-d8_769x769_40k_cityscapes.py rename to configs/psanet/psanet_r50-d8_4xb2-40k_cityscapes-769x769.py diff --git a/configs/psanet/psanet_r50-d8_512x1024_80k_cityscapes.py b/configs/psanet/psanet_r50-d8_4xb2-80k_cityscapes-512x1024.py similarity index 100% rename from configs/psanet/psanet_r50-d8_512x1024_80k_cityscapes.py rename to configs/psanet/psanet_r50-d8_4xb2-80k_cityscapes-512x1024.py diff --git a/configs/psanet/psanet_r50-d8_769x769_80k_cityscapes.py b/configs/psanet/psanet_r50-d8_4xb2-80k_cityscapes-769x769.py similarity index 100% rename from configs/psanet/psanet_r50-d8_769x769_80k_cityscapes.py rename to configs/psanet/psanet_r50-d8_4xb2-80k_cityscapes-769x769.py diff --git a/configs/psanet/psanet_r50-d8_512x512_160k_ade20k.py b/configs/psanet/psanet_r50-d8_4xb4-160k_ade20k-512x512.py similarity index 100% rename from configs/psanet/psanet_r50-d8_512x512_160k_ade20k.py rename to configs/psanet/psanet_r50-d8_4xb4-160k_ade20k-512x512.py diff --git a/configs/psanet/psanet_r50-d8_512x512_20k_voc12aug.py b/configs/psanet/psanet_r50-d8_4xb4-20k_voc12aug-512x512.py similarity index 100% rename from configs/psanet/psanet_r50-d8_512x512_20k_voc12aug.py rename to configs/psanet/psanet_r50-d8_4xb4-20k_voc12aug-512x512.py diff 
--git a/configs/psanet/psanet_r50-d8_512x512_40k_voc12aug.py b/configs/psanet/psanet_r50-d8_4xb4-40k_voc12aug-512x512.py similarity index 100% rename from configs/psanet/psanet_r50-d8_512x512_40k_voc12aug.py rename to configs/psanet/psanet_r50-d8_4xb4-40k_voc12aug-512x512.py diff --git a/configs/psanet/psanet_r50-d8_512x512_80k_ade20k.py b/configs/psanet/psanet_r50-d8_4xb4-80k_ade20k-512x512.py similarity index 100% rename from configs/psanet/psanet_r50-d8_512x512_80k_ade20k.py rename to configs/psanet/psanet_r50-d8_4xb4-80k_ade20k-512x512.py diff --git a/configs/pspnet/README.md b/configs/pspnet/README.md index 83da76bc34..4209d259b7 100644 --- a/configs/pspnet/README.md +++ b/configs/pspnet/README.md @@ -1,6 +1,6 @@ # PSPNet -[Pyramid Scene Parsing Network](https://arxiv.org/abs/1612.01105) +> [Pyramid Scene Parsing Network](https://arxiv.org/abs/1612.01105) ## Introduction @@ -22,152 +22,137 @@ Scene parsing is challenging for unrestricted open vocabulary and diverse scenes -## Citation - -```bibtex -@inproceedings{zhao2017pspnet, - title={Pyramid Scene Parsing Network}, - author={Zhao, Hengshuang and Shi, Jianping and Qi, Xiaojuan and Wang, Xiaogang and Jia, Jiaya}, - booktitle={CVPR}, - year={2017} -} -``` - -```bibtex -@article{wightman2021resnet, - title={Resnet strikes back: An improved training procedure in timm}, - author={Wightman, Ross and Touvron, Hugo and J{\'e}gou, Herv{\'e}}, - journal={arXiv preprint arXiv:2110.00476}, - year={2021} -} -``` +
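The renamed PSANet configs above stay at two lines because MMSegmentation composes configurations through `_base_` inheritance: each R-101 variant only overrides the backbone of its R-50 base file. A minimal sketch of inspecting the resolved result, assuming an MMSeg 1.x environment where MMEngine is available (the printed values follow from the overrides shown in this diff):

```python
# Illustrative only: MMEngine's Config resolves the `_base_` chain, so a
# two-line R-101 config expands to the full R-50 base plus overrides.
from mmengine.config import Config

cfg = Config.fromfile(
    'configs/psanet/psanet_r101-d8_4xb2-40k_cityscapes-512x1024.py')
print(cfg.model.backbone.depth)  # 101, overriding depth=50 from the base
print(cfg.model.pretrained)      # open-mmlab://resnet101_v1c
```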
+PSPNet-R50-D8 +PSPNet-R50 D8 model structure +
## Results and models ### Cityscapes -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------------- | ------------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ---------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| PSPNet | R-50-D8 | 512x1024 | 40000 | 6.1 | 4.07 | 77.85 | 79.18 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338.log.json) | -| PSPNet | R-101-D8 | 512x1024 | 40000 | 9.6 | 2.68 | 78.34 | 79.74 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r101-d8_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x1024_40k_cityscapes/pspnet_r101-d8_512x1024_40k_cityscapes_20200604_232751-467e7cf4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x1024_40k_cityscapes/pspnet_r101-d8_512x1024_40k_cityscapes_20200604_232751.log.json) | -| PSPNet | R-50-D8 | 769x769 | 40000 | 6.9 | 1.76 | 78.26 | 79.88 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r50-d8_769x769_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_769x769_40k_cityscapes/pspnet_r50-d8_769x769_40k_cityscapes_20200606_112725-86638686.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_769x769_40k_cityscapes/pspnet_r50-d8_769x769_40k_cityscapes_20200606_112725.log.json) | -| PSPNet | R-101-D8 | 769x769 | 40000 | 10.9 | 1.15 | 79.08 | 80.28 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r101-d8_769x769_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_769x769_40k_cityscapes/pspnet_r101-d8_769x769_40k_cityscapes_20200606_112753-61c6f5be.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_769x769_40k_cityscapes/pspnet_r101-d8_769x769_40k_cityscapes_20200606_112753.log.json) | -| PSPNet | R-18-D8 | 512x1024 | 80000 | 1.7 | 15.71 | 74.87 | 76.04 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r18-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_512x1024_80k_cityscapes/pspnet_r18-d8_512x1024_80k_cityscapes_20201225_021458-09ffa746.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_512x1024_80k_cityscapes/pspnet_r18-d8_512x1024_80k_cityscapes-20201225_021458.log.json) | -| PSPNet | R-50-D8 | 512x1024 | 
80000 | - | - | 78.55 | 79.79 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r50-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x1024_80k_cityscapes/pspnet_r50-d8_512x1024_80k_cityscapes_20200606_112131-2376f12b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x1024_80k_cityscapes/pspnet_r50-d8_512x1024_80k_cityscapes_20200606_112131.log.json) | -| PSPNet | R-50b-D8 rsb | 512x1024 | 80000 | 6.2 | 3.82 | 78.47 | 79.45 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r50-d8_rsb-pretrain_512x1024_adamw_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x1024_80k_cityscapes/pspnet_r50-d8_rsb-pretrain_512x1024_adamw_80k_cityscapes_20220315_123238-588c30be.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x1024_80k_cityscapes/pspnet_r50-d8_rsb-pretrain_512x1024_adamw_80k_cityscapes_20220315_123238.log.json) | -| PSPNet | R-101-D8 | 512x1024 | 80000 | - | - | 79.76 | 81.01 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r101-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x1024_80k_cityscapes/pspnet_r101-d8_512x1024_80k_cityscapes_20200606_112211-e1e1100f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x1024_80k_cityscapes/pspnet_r101-d8_512x1024_80k_cityscapes_20200606_112211.log.json) | -| PSPNet (FP16) | R-101-D8 | 512x1024 | 80000 | 5.34 | 8.77 | 79.46 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r101-d8_fp16_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_fp16_512x1024_80k_cityscapes/pspnet_r101-d8_fp16_512x1024_80k_cityscapes_20200717_230919-a0875e5c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_fp16_512x1024_80k_cityscapes/pspnet_r101-d8_fp16_512x1024_80k_cityscapes_20200717_230919.log.json) | -| PSPNet | R-18-D8 | 769x769 | 80000 | 1.9 | 6.20 | 75.90 | 77.86 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r18-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_769x769_80k_cityscapes/pspnet_r18-d8_769x769_80k_cityscapes_20201225_021458-3deefc62.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_769x769_80k_cityscapes/pspnet_r18-d8_769x769_80k_cityscapes-20201225_021458.log.json) | -| PSPNet | R-50-D8 | 769x769 | 80000 | - | - | 79.59 | 80.69 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r50-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_769x769_80k_cityscapes/pspnet_r50-d8_769x769_80k_cityscapes_20200606_210121-5ccf03dd.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_769x769_80k_cityscapes/pspnet_r50-d8_769x769_80k_cityscapes_20200606_210121.log.json) | -| PSPNet | R-101-D8 | 769x769 | 80000 | - | - | 79.77 | 81.06 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r101-d8_769x769_80k_cityscapes.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_769x769_80k_cityscapes/pspnet_r101-d8_769x769_80k_cityscapes_20200606_225055-dba412fa.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_769x769_80k_cityscapes/pspnet_r101-d8_769x769_80k_cityscapes_20200606_225055.log.json) | -| PSPNet | R-18b-D8 | 512x1024 | 80000 | 1.5 | 16.28 | 74.23 | 75.79 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r18b-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18b-d8_512x1024_80k_cityscapes/pspnet_r18b-d8_512x1024_80k_cityscapes_20201226_063116-26928a60.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18b-d8_512x1024_80k_cityscapes/pspnet_r18b-d8_512x1024_80k_cityscapes-20201226_063116.log.json) | -| PSPNet | R-50b-D8 | 512x1024 | 80000 | 6.0 | 4.30 | 78.22 | 79.46 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r50b-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50b-d8_512x1024_80k_cityscapes/pspnet_r50b-d8_512x1024_80k_cityscapes_20201225_094315-6344287a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50b-d8_512x1024_80k_cityscapes/pspnet_r50b-d8_512x1024_80k_cityscapes-20201225_094315.log.json) | -| PSPNet | R-101b-D8 | 512x1024 | 80000 | 9.5 | 2.76 | 79.69 | 80.79 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r101b-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101b-d8_512x1024_80k_cityscapes/pspnet_r101b-d8_512x1024_80k_cityscapes_20201226_170012-3a4d38ab.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101b-d8_512x1024_80k_cityscapes/pspnet_r101b-d8_512x1024_80k_cityscapes-20201226_170012.log.json) | -| PSPNet | R-18b-D8 | 769x769 | 80000 | 1.7 | 6.41 | 74.92 | 76.90 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r18b-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18b-d8_769x769_80k_cityscapes/pspnet_r18b-d8_769x769_80k_cityscapes_20201226_080942-bf98d186.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18b-d8_769x769_80k_cityscapes/pspnet_r18b-d8_769x769_80k_cityscapes-20201226_080942.log.json) | -| PSPNet | R-50b-D8 | 769x769 | 80000 | 6.8 | 1.88 | 78.50 | 79.96 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r50b-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50b-d8_769x769_80k_cityscapes/pspnet_r50b-d8_769x769_80k_cityscapes_20201225_094316-4c643cf6.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50b-d8_769x769_80k_cityscapes/pspnet_r50b-d8_769x769_80k_cityscapes-20201225_094316.log.json) | -| PSPNet | R-101b-D8 | 769x769 | 80000 | 10.8 | 1.17 | 78.87 | 80.04 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r101b-d8_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101b-d8_769x769_80k_cityscapes/pspnet_r101b-d8_769x769_80k_cityscapes_20201226_171823-f0e7c293.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101b-d8_769x769_80k_cityscapes/pspnet_r101b-d8_769x769_80k_cityscapes-20201226_171823.log.json) | -| PSPNet | R-50-D32 | 512x1024 | 80000 | 3.0 | 15.21 | 73.88 | 76.85 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r50-d32_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d32_512x1024_80k_cityscapes/pspnet_r50-d32_512x1024_80k_cityscapes_20220316_224840-9092b254.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d32_512x1024_80k_cityscapes/pspnet_r50-d32_512x1024_80k_cityscapes_20220316_224840.log.json) | -| PSPNet | R-50b-D32 rsb | 512x1024 | 80000 | 3.1 | 16.08 | 74.09 | 77.18 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r50-d32_rsb-pretrain_512x1024_adamw_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d32_rsb-pretrain_512x1024_adamw_80k_cityscapes/pspnet_r50-d32_rsb-pretrain_512x1024_adamw_80k_cityscapes_20220316_141229-dd9c9610.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d32_rsb-pretrain_512x1024_adamw_80k_cityscapes/pspnet_r50-d32_rsb-pretrain_512x1024_adamw_80k_cityscapes_20220316_141229.log.json) | -| PSPNet | R-50b-D32 | 512x1024 | 80000 | 2.9 | 15.41 | 72.61 | 75.51 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r50b-d32_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50b-d32_512x1024_80k_cityscapes/pspnet_r50b-d32_512x1024_80k_cityscapes_20220311_152152-23bcaf8c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50b-d32_512x1024_80k_cityscapes/pspnet_r50b-d32_512x1024_80k_cityscapes_20220311_152152.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------------- | ------------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ---------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| PSPNet | R-50-D8 | 512x1024 | 40000 | 6.1 | 4.07 | V100 | 77.85 | 79.18 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r50-d8_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338.log.json) | +| PSPNet | R-101-D8 | 512x1024 | 40000 | 9.6 | 2.68 | V100 | 78.34 | 79.74 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r101-d8_4xb2-40k_cityscapes-512x1024.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x1024_40k_cityscapes/pspnet_r101-d8_512x1024_40k_cityscapes_20200604_232751-467e7cf4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x1024_40k_cityscapes/pspnet_r101-d8_512x1024_40k_cityscapes_20200604_232751.log.json) | +| PSPNet | R-50-D8 | 769x769 | 40000 | 6.9 | 1.76 | V100 | 78.26 | 79.88 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r50-d8_4xb2-40k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_769x769_40k_cityscapes/pspnet_r50-d8_769x769_40k_cityscapes_20200606_112725-86638686.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_769x769_40k_cityscapes/pspnet_r50-d8_769x769_40k_cityscapes_20200606_112725.log.json) | +| PSPNet | R-101-D8 | 769x769 | 40000 | 10.9 | 1.15 | V100 | 79.08 | 80.28 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r101-d8_4xb2-40k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_769x769_40k_cityscapes/pspnet_r101-d8_769x769_40k_cityscapes_20200606_112753-61c6f5be.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_769x769_40k_cityscapes/pspnet_r101-d8_769x769_40k_cityscapes_20200606_112753.log.json) | +| PSPNet | R-18-D8 | 512x1024 | 80000 | 1.7 | 15.71 | V100 | 74.87 | 76.04 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r18-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_512x1024_80k_cityscapes/pspnet_r18-d8_512x1024_80k_cityscapes_20201225_021458-09ffa746.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_512x1024_80k_cityscapes/pspnet_r18-d8_512x1024_80k_cityscapes-20201225_021458.log.json) | +| PSPNet | R-50-D8 | 512x1024 | 80000 | - | - | V100 | 78.55 | 79.79 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r50-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x1024_80k_cityscapes/pspnet_r50-d8_512x1024_80k_cityscapes_20200606_112131-2376f12b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x1024_80k_cityscapes/pspnet_r50-d8_512x1024_80k_cityscapes_20200606_112131.log.json) | +| PSPNet | R-50b-D8 rsb | 512x1024 | 80000 | 6.2 | 3.82 | V100 | 78.47 | 79.45 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r50-d8-rsb_4xb2-adamw-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x1024_80k_cityscapes/pspnet_r50-d8_rsb-pretrain_512x1024_adamw_80k_cityscapes_20220315_123238-588c30be.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x1024_80k_cityscapes/pspnet_r50-d8_rsb-pretrain_512x1024_adamw_80k_cityscapes_20220315_123238.log.json) | +| PSPNet | R-101-D8 | 512x1024 | 80000 | - | - | V100 | 79.76 | 81.01 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r101-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x1024_80k_cityscapes/pspnet_r101-d8_512x1024_80k_cityscapes_20200606_112211-e1e1100f.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x1024_80k_cityscapes/pspnet_r101-d8_512x1024_80k_cityscapes_20200606_112211.log.json) | +| PSPNet (FP16) | R-101-D8 | 512x1024 | 80000 | 5.34 | 8.77 | V100 | 79.46 | - | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r101-d8_4xb2-amp-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_fp16_512x1024_80k_cityscapes/pspnet_r101-d8_fp16_512x1024_80k_cityscapes_20200717_230919-a0875e5c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_fp16_512x1024_80k_cityscapes/pspnet_r101-d8_fp16_512x1024_80k_cityscapes_20200717_230919.log.json) | +| PSPNet | R-18-D8 | 769x769 | 80000 | 1.9 | 6.20 | V100 | 75.90 | 77.86 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r18-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_769x769_80k_cityscapes/pspnet_r18-d8_769x769_80k_cityscapes_20201225_021458-3deefc62.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_769x769_80k_cityscapes/pspnet_r18-d8_769x769_80k_cityscapes-20201225_021458.log.json) | +| PSPNet | R-50-D8 | 769x769 | 80000 | - | - | V100 | 79.59 | 80.69 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r50-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_769x769_80k_cityscapes/pspnet_r50-d8_769x769_80k_cityscapes_20200606_210121-5ccf03dd.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_769x769_80k_cityscapes/pspnet_r50-d8_769x769_80k_cityscapes_20200606_210121.log.json) | +| PSPNet | R-101-D8 | 769x769 | 80000 | - | - | V100 | 79.77 | 81.06 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r101-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_769x769_80k_cityscapes/pspnet_r101-d8_769x769_80k_cityscapes_20200606_225055-dba412fa.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_769x769_80k_cityscapes/pspnet_r101-d8_769x769_80k_cityscapes_20200606_225055.log.json) | +| PSPNet | R-18b-D8 | 512x1024 | 80000 | 1.5 | 16.28 | V100 | 74.23 | 75.79 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r18b-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18b-d8_512x1024_80k_cityscapes/pspnet_r18b-d8_512x1024_80k_cityscapes_20201226_063116-26928a60.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18b-d8_512x1024_80k_cityscapes/pspnet_r18b-d8_512x1024_80k_cityscapes-20201226_063116.log.json) | +| PSPNet | R-50b-D8 | 512x1024 | 80000 | 6.0 | 4.30 | V100 | 78.22 | 79.46 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r50b-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50b-d8_512x1024_80k_cityscapes/pspnet_r50b-d8_512x1024_80k_cityscapes_20201225_094315-6344287a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50b-d8_512x1024_80k_cityscapes/pspnet_r50b-d8_512x1024_80k_cityscapes-20201225_094315.log.json) | +| PSPNet | R-101b-D8 | 512x1024 | 80000 | 9.5 | 2.76 | V100 |
79.69 | 80.79 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r101b-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101b-d8_512x1024_80k_cityscapes/pspnet_r101b-d8_512x1024_80k_cityscapes_20201226_170012-3a4d38ab.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101b-d8_512x1024_80k_cityscapes/pspnet_r101b-d8_512x1024_80k_cityscapes-20201226_170012.log.json) | +| PSPNet | R-18b-D8 | 769x769 | 80000 | 1.7 | 6.41 | V100 | 74.92 | 76.90 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r18b-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18b-d8_769x769_80k_cityscapes/pspnet_r18b-d8_769x769_80k_cityscapes_20201226_080942-bf98d186.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18b-d8_769x769_80k_cityscapes/pspnet_r18b-d8_769x769_80k_cityscapes-20201226_080942.log.json) | +| PSPNet | R-50b-D8 | 769x769 | 80000 | 6.8 | 1.88 | V100 | 78.50 | 79.96 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r50b-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50b-d8_769x769_80k_cityscapes/pspnet_r50b-d8_769x769_80k_cityscapes_20201225_094316-4c643cf6.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50b-d8_769x769_80k_cityscapes/pspnet_r50b-d8_769x769_80k_cityscapes-20201225_094316.log.json) | +| PSPNet | R-101b-D8 | 769x769 | 80000 | 10.8 | 1.17 | V100 | 78.87 | 80.04 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r101b-d8_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101b-d8_769x769_80k_cityscapes/pspnet_r101b-d8_769x769_80k_cityscapes_20201226_171823-f0e7c293.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101b-d8_769x769_80k_cityscapes/pspnet_r101b-d8_769x769_80k_cityscapes-20201226_171823.log.json) | +| PSPNet | R-50-D32 | 512x1024 | 80000 | 3.0 | 15.21 | V100 | 73.88 | 76.85 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r50-d32_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d32_512x1024_80k_cityscapes/pspnet_r50-d32_512x1024_80k_cityscapes_20220316_224840-9092b254.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d32_512x1024_80k_cityscapes/pspnet_r50-d32_512x1024_80k_cityscapes_20220316_224840.log.json) | +| PSPNet | R-50b-D32 rsb | 512x1024 | 80000 | 3.1 | 16.08 | V100 | 74.09 | 77.18 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r50-d32_rsb_4xb2-adamw-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d32_rsb-pretrain_512x1024_adamw_80k_cityscapes/pspnet_r50-d32_rsb-pretrain_512x1024_adamw_80k_cityscapes_20220316_141229-dd9c9610.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d32_rsb-pretrain_512x1024_adamw_80k_cityscapes/pspnet_r50-d32_rsb-pretrain_512x1024_adamw_80k_cityscapes_20220316_141229.log.json) | +| PSPNet | R-50b-D32 | 512x1024 | 80000 | 2.9 | 15.41 | V100 | 72.61 | 75.51 |
[config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r50b-d32_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50b-d32_512x1024_80k_cityscapes/pspnet_r50b-d32_512x1024_80k_cityscapes_20220311_152152-23bcaf8c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50b-d32_512x1024_80k_cityscapes/pspnet_r50b-d32_512x1024_80k_cityscapes_20220311_152152.log.json) | ### ADE20K -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ----------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| PSPNet | R-50-D8 | 512x512 | 80000 | 8.5 | 23.53 | 41.13 | 41.94 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r50-d8_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_80k_ade20k/pspnet_r50-d8_512x512_80k_ade20k_20200615_014128-15a8b914.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_80k_ade20k/pspnet_r50-d8_512x512_80k_ade20k_20200615_014128.log.json) | -| PSPNet | R-101-D8 | 512x512 | 80000 | 12 | 15.30 | 43.57 | 44.35 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r101-d8_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_80k_ade20k/pspnet_r101-d8_512x512_80k_ade20k_20200614_031423-b6e782f0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_80k_ade20k/pspnet_r101-d8_512x512_80k_ade20k_20200614_031423.log.json) | -| PSPNet | R-50-D8 | 512x512 | 160000 | - | - | 42.48 | 43.44 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r50-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_160k_ade20k/pspnet_r50-d8_512x512_160k_ade20k_20200615_184358-1890b0bd.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_160k_ade20k/pspnet_r50-d8_512x512_160k_ade20k_20200615_184358.log.json) | -| PSPNet | R-101-D8 | 512x512 | 160000 | - | - | 44.39 | 45.35 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r101-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_160k_ade20k/pspnet_r101-d8_512x512_160k_ade20k_20200615_100650-967c316f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_160k_ade20k/pspnet_r101-d8_512x512_160k_ade20k_20200615_100650.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | 
-------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| PSPNet | R-50-D8 | 512x512 | 80000 | 8.5 | 23.53 | V100 | 41.13 | 41.94 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r50-d8_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_80k_ade20k/pspnet_r50-d8_512x512_80k_ade20k_20200615_014128-15a8b914.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_80k_ade20k/pspnet_r50-d8_512x512_80k_ade20k_20200615_014128.log.json) | +| PSPNet | R-101-D8 | 512x512 | 80000 | 12 | 15.30 | V100 | 43.57 | 44.35 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r101-d8_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_80k_ade20k/pspnet_r101-d8_512x512_80k_ade20k_20200614_031423-b6e782f0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_80k_ade20k/pspnet_r101-d8_512x512_80k_ade20k_20200614_031423.log.json) | +| PSPNet | R-50-D8 | 512x512 | 160000 | - | - | V100 | 42.48 | 43.44 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r50-d8_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_160k_ade20k/pspnet_r50-d8_512x512_160k_ade20k_20200615_184358-1890b0bd.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_160k_ade20k/pspnet_r50-d8_512x512_160k_ade20k_20200615_184358.log.json) | +| PSPNet | R-101-D8 | 512x512 | 160000 | - | - | V100 | 44.39 | 45.35 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r101-d8_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_160k_ade20k/pspnet_r101-d8_512x512_160k_ade20k_20200615_100650-967c316f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_160k_ade20k/pspnet_r101-d8_512x512_160k_ade20k_20200615_100650.log.json) | ### Pascal VOC 2012 + Aug -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| PSPNet | R-50-D8 | 512x512 | 20000 | 6.1 | 23.59 | 76.78 | 77.61 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r50-d8_512x512_20k_voc12aug.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_20k_voc12aug/pspnet_r50-d8_512x512_20k_voc12aug_20200617_101958-ed5dfbd9.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_20k_voc12aug/pspnet_r50-d8_512x512_20k_voc12aug_20200617_101958.log.json) | -| PSPNet | R-101-D8 | 512x512 | 20000 | 9.6 | 15.02 | 78.47 | 79.25 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r101-d8_512x512_20k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_20k_voc12aug/pspnet_r101-d8_512x512_20k_voc12aug_20200617_102003-4aef3c9a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_20k_voc12aug/pspnet_r101-d8_512x512_20k_voc12aug_20200617_102003.log.json) | -| PSPNet | R-50-D8 | 512x512 | 40000 | - | - | 77.29 | 78.48 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r50-d8_512x512_40k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_40k_voc12aug/pspnet_r50-d8_512x512_40k_voc12aug_20200613_161222-ae9c1b8c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_40k_voc12aug/pspnet_r50-d8_512x512_40k_voc12aug_20200613_161222.log.json) | -| PSPNet | R-101-D8 | 512x512 | 40000 | - | - | 78.52 | 79.57 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r101-d8_512x512_40k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_40k_voc12aug/pspnet_r101-d8_512x512_40k_voc12aug_20200613_161222-bc933b18.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_40k_voc12aug/pspnet_r101-d8_512x512_40k_voc12aug_20200613_161222.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | --------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| PSPNet | R-50-D8 | 512x512 | 20000 | 6.1 | 23.59 | V100 | 76.78 | 77.61 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r50-d8_4xb4-20k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_20k_voc12aug/pspnet_r50-d8_512x512_20k_voc12aug_20200617_101958-ed5dfbd9.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_20k_voc12aug/pspnet_r50-d8_512x512_20k_voc12aug_20200617_101958.log.json) | +| PSPNet | R-101-D8 | 512x512 | 20000 | 9.6 | 15.02 | V100 | 78.47 | 79.25 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r101-d8_4xb4-20k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_20k_voc12aug/pspnet_r101-d8_512x512_20k_voc12aug_20200617_102003-4aef3c9a.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_20k_voc12aug/pspnet_r101-d8_512x512_20k_voc12aug_20200617_102003.log.json) | +| PSPNet | R-50-D8 | 512x512 | 40000 | - | - | V100 | 77.29 | 78.48 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r50-d8_4xb4-40k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_40k_voc12aug/pspnet_r50-d8_512x512_40k_voc12aug_20200613_161222-ae9c1b8c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_40k_voc12aug/pspnet_r50-d8_512x512_40k_voc12aug_20200613_161222.log.json) | +| PSPNet | R-101-D8 | 512x512 | 40000 | - | - | V100 | 78.52 | 79.57 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r101-d8_4xb4-40k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_40k_voc12aug/pspnet_r101-d8_512x512_40k_voc12aug_20200613_161222-bc933b18.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_40k_voc12aug/pspnet_r101-d8_512x512_40k_voc12aug_20200613_161222.log.json) | ### Pascal Context -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| PSPNet | R-101-D8 | 480x480 | 40000 | 8.8 | 9.68 | 46.60 | 47.78 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r101-d8_480x480_40k_pascal_context.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_480x480_40k_pascal_context/pspnet_r101-d8_480x480_40k_pascal_context_20200911_211210-bf0f5d7c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_480x480_40k_pascal_context/pspnet_r101-d8_480x480_40k_pascal_context-20200911_211210.log.json) | -| PSPNet | R-101-D8 | 480x480 | 80000 | - | - | 46.03 | 47.15 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r101-d8_480x480_80k_pascal_context.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_480x480_80k_pascal_context/pspnet_r101-d8_480x480_80k_pascal_context_20200911_190530-c86d6233.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_480x480_80k_pascal_context/pspnet_r101-d8_480x480_80k_pascal_context-20200911_190530.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | --------------------------------------------------------------------------------------------------------------------------------- | 
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| PSPNet | R-101-D8 | 480x480 | 40000 | 8.8 | 9.68 | V100 | 46.60 | 47.78 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r101-d8_4xb4-40k_pascal-context-480x480.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_480x480_40k_pascal_context/pspnet_r101-d8_480x480_40k_pascal_context_20200911_211210-bf0f5d7c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_480x480_40k_pascal_context/pspnet_r101-d8_480x480_40k_pascal_context-20200911_211210.log.json) | +| PSPNet | R-101-D8 | 480x480 | 80000 | - | - | V100 | 46.03 | 47.15 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r101-d8_4xb4-80k_pascal-context-480x480.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_480x480_80k_pascal_context/pspnet_r101-d8_480x480_80k_pascal_context_20200911_190530-c86d6233.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_480x480_80k_pascal_context/pspnet_r101-d8_480x480_80k_pascal_context-20200911_190530.log.json) | ### Pascal Context 59 -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | --------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| PSPNet | R-101-D8 | 480x480 | 40000 | - | - | 52.02 | 53.54 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r101-d8_480x480_40k_pascal_context_59.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_480x480_40k_pascal_context_59/pspnet_r101-d8_480x480_40k_pascal_context_59_20210416_114524-86d44cd4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_480x480_40k_pascal_context_59/pspnet_r101-d8_480x480_40k_pascal_context_59-20210416_114524.log.json) | -| PSPNet | R-101-D8 | 480x480 | 80000 | - | - | 52.47 | 53.99 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r101-d8_480x480_80k_pascal_context_59.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_480x480_80k_pascal_context_59/pspnet_r101-d8_480x480_80k_pascal_context_59_20210416_114418-fa6caaa2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_480x480_80k_pascal_context_59/pspnet_r101-d8_480x480_80k_pascal_context_59-20210416_114418.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | 
-------------- | ------ | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| PSPNet | R-101-D8 | 480x480 | 40000 | - | - | V100 | 52.02 | 53.54 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r101-d8_4xb4-40k_pascal-context-59-480x480.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_480x480_40k_pascal_context_59/pspnet_r101-d8_480x480_40k_pascal_context_59_20210416_114524-86d44cd4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_480x480_40k_pascal_context_59/pspnet_r101-d8_480x480_40k_pascal_context_59-20210416_114524.log.json) | +| PSPNet | R-101-D8 | 480x480 | 80000 | - | - | V100 | 52.47 | 53.99 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r101-d8_4xb4-80k_pascal-context-59-480x480.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_480x480_80k_pascal_context_59/pspnet_r101-d8_480x480_80k_pascal_context_59_20210416_114418-fa6caaa2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_480x480_80k_pascal_context_59/pspnet_r101-d8_480x480_80k_pascal_context_59-20210416_114418.log.json) | ### Dark Zurich and Nighttime Driving We provide evaluation results on these two datasets for the models above that were trained on the Cityscapes training set.
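To reproduce one of these numbers, the checkpoint from a row can be run through the standard `tools/test.py` entry point. Below is a minimal sketch, assuming the Dark Zurich data has already been prepared as described in the dataset documentation; the config and checkpoint names are copied from the R-50-D8 Dark Zurich row of the table that follows.

```shell
# Minimal evaluation sketch (assumes Dark Zurich is prepared under data/).
# Download the Cityscapes-trained checkpoint linked in the table, then run
# the Dark Zurich test config against it.
wget https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth
python tools/test.py \
    configs/pspnet/pspnet_r50-d8_4xb2-40k_cityscapes-512x1024_dark-zurich-1920x1080.py \
    pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth
```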
-| Method | Backbone | Training Dataset | Test Dataset | mIoU | config | evaluation checkpoint | -| ------ | --------- | ----------------------- | ------------------------- | ----- | ------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| PSPNet | R-50-D8 | Cityscapes Training set | Dark Zurich | 10.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r50-d8_512x1024_40k_dark.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338.log.json) | -| PSPNet | R-50-D8 | Cityscapes Training set | Nighttime Driving | 23.02 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r50-d8_512x1024_40k_night_driving.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338.log.json) | -| PSPNet | R-50-D8 | Cityscapes Training set | Cityscapes Validation set | 77.85 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338.log.json) | -| PSPNet | R-101-D8 | Cityscapes Training set | Dark Zurich | 10.16 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r101-d8_512x1024_40k_dark.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x1024_40k_cityscapes/pspnet_r101-d8_512x1024_40k_cityscapes_20200604_232751-467e7cf4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x1024_40k_cityscapes/pspnet_r101-d8_512x1024_40k_cityscapes_20200604_232751.log.json) | -| PSPNet | R-101-D8 | Cityscapes Training set | Nighttime Driving | 20.25 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r101-d8_512x1024_40k_night_driving.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x1024_40k_cityscapes/pspnet_r101-d8_512x1024_40k_cityscapes_20200604_232751-467e7cf4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x1024_40k_cityscapes/pspnet_r101-d8_512x1024_40k_cityscapes_20200604_232751.log.json) | -| PSPNet | R-101-D8 | Cityscapes Training set | Cityscapes Validation set | 78.34 | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r101-d8_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x1024_40k_cityscapes/pspnet_r101-d8_512x1024_40k_cityscapes_20200604_232751-467e7cf4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x1024_40k_cityscapes/pspnet_r101-d8_512x1024_40k_cityscapes_20200604_232751.log.json) | -| PSPNet | R-101b-D8 | Cityscapes Training set | Dark Zurich | 15.54 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r101b-d8_512x1024_80k_dark.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101b-d8_512x1024_80k_cityscapes/pspnet_r101b-d8_512x1024_80k_cityscapes_20201226_170012-3a4d38ab.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101b-d8_512x1024_80k_cityscapes/pspnet_r101b-d8_512x1024_80k_cityscapes-20201226_170012.log.json) | -| PSPNet | R-101b-D8 | Cityscapes Training set | Nighttime Driving | 22.25 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r101b-d8_512x1024_80k_night_driving.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101b-d8_512x1024_80k_cityscapes/pspnet_r101b-d8_512x1024_80k_cityscapes_20201226_170012-3a4d38ab.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101b-d8_512x1024_80k_cityscapes/pspnet_r101b-d8_512x1024_80k_cityscapes-20201226_170012.log.json) | -| PSPNet | R-101b-D8 | Cityscapes Training set | Cityscapes Validation set | 79.69 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r101b-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101b-d8_512x1024_80k_cityscapes/pspnet_r101b-d8_512x1024_80k_cityscapes_20201226_170012-3a4d38ab.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101b-d8_512x1024_80k_cityscapes/pspnet_r101b-d8_512x1024_80k_cityscapes-20201226_170012.log.json) | +| Method | Backbone | Training Dataset | Test Dataset | mIoU | config | evaluation checkpoint | +| ------ | --------- | ----------------------- | ------------------------- | ----- | ------------------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| PSPNet | R-50-D8 | Cityscapes Training set | Dark Zurich | 10.91 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r50-d8_4xb2-40k_cityscapes-512x1024_dark-zurich-1920x1080.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338.log.json) | +| PSPNet | R-50-D8 | Cityscapes Training set | Nighttime Driving | 23.02 | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r50-d8_4xb2-40k_cityscapes-512x1024_night-driving-1920x1080.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338.log.json) | +| PSPNet | R-50-D8 | Cityscapes Training set | Cityscapes Validation set | 77.85 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r50-d8_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338.log.json) | +| PSPNet | R-101-D8 | Cityscapes Training set | Dark Zurich | 10.16 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r101-d8_4xb2-40k_cityscapes-512x1024_dark-zurich-1920x1080.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x1024_40k_cityscapes/pspnet_r101-d8_512x1024_40k_cityscapes_20200604_232751-467e7cf4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x1024_40k_cityscapes/pspnet_r101-d8_512x1024_40k_cityscapes_20200604_232751.log.json) | +| PSPNet | R-101-D8 | Cityscapes Training set | Nighttime Driving | 20.25 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r101-d8_4xb2-40k_cityscapes-512x1024_night-driving-1920x1080.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x1024_40k_cityscapes/pspnet_r101-d8_512x1024_40k_cityscapes_20200604_232751-467e7cf4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x1024_40k_cityscapes/pspnet_r101-d8_512x1024_40k_cityscapes_20200604_232751.log.json) | +| PSPNet | R-101-D8 | Cityscapes Training set | Cityscapes Validation set | 78.34 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r101-d8_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x1024_40k_cityscapes/pspnet_r101-d8_512x1024_40k_cityscapes_20200604_232751-467e7cf4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x1024_40k_cityscapes/pspnet_r101-d8_512x1024_40k_cityscapes_20200604_232751.log.json) | +| PSPNet | R-101b-D8 | Cityscapes Training set | Dark Zurich | 15.54 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r101b-d8_4xb2-80k_cityscapes-512x1024_dark-zurich-1920x1080.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101b-d8_512x1024_80k_cityscapes/pspnet_r101b-d8_512x1024_80k_cityscapes_20201226_170012-3a4d38ab.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101b-d8_512x1024_80k_cityscapes/pspnet_r101b-d8_512x1024_80k_cityscapes-20201226_170012.log.json) | +| PSPNet | R-101b-D8 | Cityscapes Training set | Nighttime Driving | 22.25 | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r101b-d8_4xb2-80k_cityscapes-512x1024_night-driving-1920x1080.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101b-d8_512x1024_80k_cityscapes/pspnet_r101b-d8_512x1024_80k_cityscapes_20201226_170012-3a4d38ab.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101b-d8_512x1024_80k_cityscapes/pspnet_r101b-d8_512x1024_80k_cityscapes-20201226_170012.log.json) | +| PSPNet | R-101b-D8 | Cityscapes Training set | Cityscapes Validation set | 79.69 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r101b-d8_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101b-d8_512x1024_80k_cityscapes/pspnet_r101b-d8_512x1024_80k_cityscapes_20201226_170012-3a4d38ab.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101b-d8_512x1024_80k_cityscapes/pspnet_r101b-d8_512x1024_80k_cityscapes-20201226_170012.log.json) | ### COCO-Stuff 10k -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | --------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| PSPNet | R-50-D8 | 512x512 | 20000 | 9.6 | 20.5 | 35.69 | 36.62 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r50-d8_512x512_4x4_20k_coco-stuff10k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_4x4_20k_coco-stuff10k/pspnet_r50-d8_512x512_4x4_20k_coco-stuff10k_20210820_203258-b88df27f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_4x4_20k_coco-stuff10k/pspnet_r50-d8_512x512_4x4_20k_coco-stuff10k_20210820_203258.log.json) | -| PSPNet | R-101-D8 | 512x512 | 20000 | 13.2 | 11.1 | 37.26 | 38.52 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r101-d8_512x512_4x4_20k_coco-stuff10k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_4x4_20k_coco-stuff10k/pspnet_r101-d8_512x512_4x4_20k_coco-stuff10k_20210820_232135-76aae482.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_4x4_20k_coco-stuff10k/pspnet_r101-d8_512x512_4x4_20k_coco-stuff10k_20210820_232135.log.json) | -| PSPNet | R-50-D8 | 512x512 | 40000 | - | - | 36.33 | 37.24 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r50-d8_512x512_4x4_40k_coco-stuff10k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_4x4_40k_coco-stuff10k/pspnet_r50-d8_512x512_4x4_40k_coco-stuff10k_20210821_030857-92e2902b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_4x4_40k_coco-stuff10k/pspnet_r50-d8_512x512_4x4_40k_coco-stuff10k_20210821_030857.log.json) | -| PSPNet | R-101-D8 | 512x512 | 40000 | - | 
- | 37.76 | 38.86 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r101-d8_512x512_4x4_40k_coco-stuff10k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_4x4_40k_coco-stuff10k/pspnet_r101-d8_512x512_4x4_40k_coco-stuff10k_20210821_014022-831aec95.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_4x4_40k_coco-stuff10k/pspnet_r101-d8_512x512_4x4_40k_coco-stuff10k_20210821_014022.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | -------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| PSPNet | R-50-D8 | 512x512 | 20000 | 9.6 | 20.5 | V100 | 35.69 | 36.62 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r50-d8_4xb4-20k_coco-stuff10k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_4x4_20k_coco-stuff10k/pspnet_r50-d8_512x512_4x4_20k_coco-stuff10k_20210820_203258-b88df27f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_4x4_20k_coco-stuff10k/pspnet_r50-d8_512x512_4x4_20k_coco-stuff10k_20210820_203258.log.json) | +| PSPNet | R-101-D8 | 512x512 | 20000 | 13.2 | 11.1 | V100 | 37.26 | 38.52 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r101-d8_4xb4-20k_coco-stuff10k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_4x4_20k_coco-stuff10k/pspnet_r101-d8_512x512_4x4_20k_coco-stuff10k_20210820_232135-76aae482.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_4x4_20k_coco-stuff10k/pspnet_r101-d8_512x512_4x4_20k_coco-stuff10k_20210820_232135.log.json) | +| PSPNet | R-50-D8 | 512x512 | 40000 | - | - | V100 | 36.33 | 37.24 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r50-d8_4xb4-40k_coco-stuff10k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_4x4_40k_coco-stuff10k/pspnet_r50-d8_512x512_4x4_40k_coco-stuff10k_20210821_030857-92e2902b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_4x4_40k_coco-stuff10k/pspnet_r50-d8_512x512_4x4_40k_coco-stuff10k_20210821_030857.log.json) | +| PSPNet | R-101-D8 | 512x512 | 40000 | - | - | V100 | 37.76 | 38.86 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r101-d8_4xb4-40k_coco-stuff10k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_4x4_40k_coco-stuff10k/pspnet_r101-d8_512x512_4x4_40k_coco-stuff10k_20210821_014022-831aec95.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_4x4_40k_coco-stuff10k/pspnet_r101-d8_512x512_4x4_40k_coco-stuff10k_20210821_014022.log.json) | ### 
COCO-Stuff 164k -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ----------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| PSPNet | R-50-D8 | 512x512 | 80000 | 9.6 | 20.5 | 38.80 | 39.19 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r50-d8_512x512_4x4_80k_coco-stuff164k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_4x4_80k_coco-stuff164k/pspnet_r50-d8_512x512_4x4_80k_coco-stuff164k_20210707_152034-0e41b2db.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_4x4_80k_coco-stuff164k/pspnet_r50-d8_512x512_4x4_80k_coco-stuff164k_20210707_152034.log.json) | -| PSPNet | R-101-D8 | 512x512 | 80000 | 13.2 | 11.1 | 40.34 | 40.79 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r101-d8_512x512_4x4_80k_coco-stuff164k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_4x4_80k_coco-stuff164k/pspnet_r101-d8_512x512_4x4_80k_coco-stuff164k_20210707_152034-7eb41789.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_4x4_80k_coco-stuff164k/pspnet_r101-d8_512x512_4x4_80k_coco-stuff164k_20210707_152034.log.json) | -| PSPNet | R-50-D8 | 512x512 | 160000 | - | - | 39.64 | 39.97 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r50-d8_512x512_4x4_160k_coco-stuff164k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_4x4_160k_coco-stuff164k/pspnet_r50-d8_512x512_4x4_160k_coco-stuff164k_20210707_152004-51276a57.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_4x4_160k_coco-stuff164k/pspnet_r50-d8_512x512_4x4_160k_coco-stuff164k_20210707_152004.log.json) | -| PSPNet | R-101-D8 | 512x512 | 160000 | - | - | 41.28 | 41.66 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r101-d8_512x512_4x4_160k_coco-stuff164k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_4x4_160k_coco-stuff164k/pspnet_r101-d8_512x512_4x4_160k_coco-stuff164k_20210707_152004-4af9621b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_4x4_160k_coco-stuff164k/pspnet_r101-d8_512x512_4x4_160k_coco-stuff164k_20210707_152004.log.json) | -| PSPNet | R-50-D8 | 512x512 | 320000 | - | - | 40.53 | 40.75 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r50-d8_512x512_4x4_320k_coco-stuff164k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_4x4_320k_coco-stuff164k/pspnet_r50-d8_512x512_4x4_320k_coco-stuff164k_20210707_152004-be9610cc.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_4x4_320k_coco-stuff164k/pspnet_r50-d8_512x512_4x4_320k_coco-stuff164k_20210707_152004.log.json) | -| PSPNet | R-101-D8 | 512x512 | 320000 | - | - | 41.95 | 42.42 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r101-d8_512x512_4x4_320k_coco-stuff164k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_4x4_320k_coco-stuff164k/pspnet_r101-d8_512x512_4x4_320k_coco-stuff164k_20210707_152004-72220c60.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_4x4_320k_coco-stuff164k/pspnet_r101-d8_512x512_4x4_320k_coco-stuff164k_20210707_152004.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ---------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| PSPNet | R-50-D8 | 512x512 | 80000 | 9.6 | 20.5 | V100 | 38.80 | 39.19 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r50-d8_4xb4-80k_coco-stuff164k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_4x4_80k_coco-stuff164k/pspnet_r50-d8_512x512_4x4_80k_coco-stuff164k_20210707_152034-0e41b2db.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_4x4_80k_coco-stuff164k/pspnet_r50-d8_512x512_4x4_80k_coco-stuff164k_20210707_152034.log.json) | +| PSPNet | R-101-D8 | 512x512 | 80000 | 13.2 | 11.1 | V100 | 40.34 | 40.79 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r101-d8_4xb4-80k_coco-stuff164k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_4x4_80k_coco-stuff164k/pspnet_r101-d8_512x512_4x4_80k_coco-stuff164k_20210707_152034-7eb41789.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_4x4_80k_coco-stuff164k/pspnet_r101-d8_512x512_4x4_80k_coco-stuff164k_20210707_152034.log.json) | +| PSPNet | R-50-D8 | 512x512 | 160000 | - | - | V100 | 39.64 | 39.97 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r50-d8_4xb4-160k_coco-stuff164k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_4x4_160k_coco-stuff164k/pspnet_r50-d8_512x512_4x4_160k_coco-stuff164k_20210707_152004-51276a57.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_4x4_160k_coco-stuff164k/pspnet_r50-d8_512x512_4x4_160k_coco-stuff164k_20210707_152004.log.json) | +| PSPNet | R-101-D8 | 512x512 | 160000 | - | - | V100 | 41.28 | 41.66 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r101-d8_4xb4-160k_coco-stuff164k-512x512.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_4x4_160k_coco-stuff164k/pspnet_r101-d8_512x512_4x4_160k_coco-stuff164k_20210707_152004-4af9621b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_4x4_160k_coco-stuff164k/pspnet_r101-d8_512x512_4x4_160k_coco-stuff164k_20210707_152004.log.json) | +| PSPNet | R-50-D8 | 512x512 | 320000 | - | - | V100 | 40.53 | 40.75 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r50-d8_4xb4-320k_coco-stuff164k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_4x4_320k_coco-stuff164k/pspnet_r50-d8_512x512_4x4_320k_coco-stuff164k_20210707_152004-be9610cc.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_4x4_320k_coco-stuff164k/pspnet_r50-d8_512x512_4x4_320k_coco-stuff164k_20210707_152004.log.json) | +| PSPNet | R-101-D8 | 512x512 | 320000 | - | - | V100 | 41.95 | 42.42 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r101-d8_4xb4-320k_coco-stuff164k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_4x4_320k_coco-stuff164k/pspnet_r101-d8_512x512_4x4_320k_coco-stuff164k_20210707_152004-72220c60.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_4x4_320k_coco-stuff164k/pspnet_r101-d8_512x512_4x4_320k_coco-stuff164k_20210707_152004.log.json) | ### LoveDA -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ---------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| PSPNet | R-18-D8 | 512x512 | 80000 | 1.45 | 26.87 | 48.62 | 47.57 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r18-d8_512x512_80k_loveda.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_512x512_80k_loveda/pspnet_r18-d8_512x512_80k_loveda_20211105_052100-b97697f1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_512x512_80k_loveda/pspnet_r18-d8_512x512_80k_loveda_20211105_052100.log.json) | -| PSPNet | R-50-D8 | 512x512 | 80000 | 6.14 | 6.60 | 50.46 | 50.19 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r50-d8_512x512_80k_loveda.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_80k_loveda/pspnet_r50-d8_512x512_80k_loveda_20211104_155728-88610f9f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_80k_loveda/pspnet_r50-d8_512x512_80k_loveda_20211104_155728.log.json) | -| PSPNet | R-101-D8 | 512x512 | 80000 | 9.61 | 4.58 | 51.86 | 51.34 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r101-d8_512x512_80k_loveda.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_80k_loveda/pspnet_r101-d8_512x512_80k_loveda_20211104_153212-1c06c6a8.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_80k_loveda/pspnet_r101-d8_512x512_80k_loveda_20211104_153212.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| PSPNet | R-18-D8 | 512x512 | 80000 | 1.45 | 26.87 | V100 | 48.62 | 47.57 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r18-d8_4xb4-80k_loveda-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_512x512_80k_loveda/pspnet_r18-d8_512x512_80k_loveda_20211105_052100-b97697f1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_512x512_80k_loveda/pspnet_r18-d8_512x512_80k_loveda_20211105_052100.log.json) | +| PSPNet | R-50-D8 | 512x512 | 80000 | 6.14 | 6.60 | V100 | 50.46 | 50.19 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r50-d8_4xb4-80k_loveda-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_80k_loveda/pspnet_r50-d8_512x512_80k_loveda_20211104_155728-88610f9f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_80k_loveda/pspnet_r50-d8_512x512_80k_loveda_20211104_155728.log.json) | +| PSPNet | R-101-D8 | 512x512 | 80000 | 9.61 | 4.58 | V100 | 51.86 | 51.34 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r101-d8_4xb4-80k_loveda-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_80k_loveda/pspnet_r101-d8_512x512_80k_loveda_20211104_153212-1c06c6a8.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_80k_loveda/pspnet_r101-d8_512x512_80k_loveda_20211104_153212.log.json) | ### Potsdam -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | --------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| PSPNet | R-18-D8 | 512x512 | 80000 | 1.50 | 85.12 | 77.09 | 78.30 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r18-d8_4x4_512x512_80k_potsdam.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_4x4_512x512_80k_potsdam/pspnet_r18-d8_4x4_512x512_80k_potsdam_20211220_125612-7cd046e1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_4x4_512x512_80k_potsdam/pspnet_r18-d8_4x4_512x512_80k_potsdam_20211220_125612.log.json) | -| PSPNet | R-50-D8 | 512x512 | 80000 | 6.14 | 30.21 | 78.12 | 78.98 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r50-d8_4x4_512x512_80k_potsdam.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_4x4_512x512_80k_potsdam/pspnet_r50-d8_4x4_512x512_80k_potsdam_20211219_043541-2dd5fe67.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_4x4_512x512_80k_potsdam/pspnet_r50-d8_4x4_512x512_80k_potsdam_20211219_043541.log.json) | -| PSPNet | R-101-D8 | 512x512 | 80000 | 9.61 | 19.40 | 78.62 | 79.47 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r101-d8_4x4_512x512_80k_potsdam.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_4x4_512x512_80k_potsdam/pspnet_r101-d8_4x4_512x512_80k_potsdam_20211220_125612-aed036c4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_4x4_512x512_80k_potsdam/pspnet_r101-d8_4x4_512x512_80k_potsdam_20211220_125612.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | -------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| PSPNet | R-18-D8 | 512x512 | 80000 | 1.50 | 85.12 | V100 | 77.09 | 78.30 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r18-d8_4xb4-80k_potsdam-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_4x4_512x512_80k_potsdam/pspnet_r18-d8_4x4_512x512_80k_potsdam_20211220_125612-7cd046e1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_4x4_512x512_80k_potsdam/pspnet_r18-d8_4x4_512x512_80k_potsdam_20211220_125612.log.json) | +| PSPNet | R-50-D8 | 512x512 | 80000 | 6.14 | 30.21 | V100 | 78.12 | 78.98 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r50-d8_4xb4-80k_potsdam-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_4x4_512x512_80k_potsdam/pspnet_r50-d8_4x4_512x512_80k_potsdam_20211219_043541-2dd5fe67.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_4x4_512x512_80k_potsdam/pspnet_r50-d8_4x4_512x512_80k_potsdam_20211219_043541.log.json) | +| PSPNet | R-101-D8 | 512x512 | 80000 | 9.61 | 19.40 | V100 | 78.62 | 79.47 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r101-d8_4xb4-80k_potsdam-512x512.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_4x4_512x512_80k_potsdam/pspnet_r101-d8_4x4_512x512_80k_potsdam_20211220_125612-aed036c4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_4x4_512x512_80k_potsdam/pspnet_r101-d8_4x4_512x512_80k_potsdam_20211220_125612.log.json) | ### Vaihingen -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ----------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| PSPNet | R-18-D8 | 512x512 | 80000 | 1.45 | 85.06 | 71.46 | 73.36 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r18-d8_4x4_512x512_80k_vaihingen.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_4x4_512x512_80k_vaihingen/pspnet_r18-d8_4x4_512x512_80k_vaihingen_20211228_160355-52a8a6f6.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_4x4_512x512_80k_vaihingen/pspnet_r18-d8_4x4_512x512_80k_vaihingen_20211228_160355.log.json) | -| PSPNet | R-50-D8 | 512x512 | 80000 | 6.14 | 30.29 | 72.36 | 73.75 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r50-d8_4x4_512x512_80k_vaihingen.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_4x4_512x512_80k_vaihingen/pspnet_r50-d8_4x4_512x512_80k_vaihingen_20211228_160355-382f8f5b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_4x4_512x512_80k_vaihingen/pspnet_r50-d8_4x4_512x512_80k_vaihingen_20211228_160355.log.json) | -| PSPNet | R-101-D8 | 512x512 | 80000 | 9.61 | 19.97 | 72.61 | 74.18 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r101-d8_4x4_512x512_80k_vaihingen.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_4x4_512x512_80k_vaihingen/pspnet_r101-d8_4x4_512x512_80k_vaihingen_20211231_230806-8eba0a09.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_4x4_512x512_80k_vaihingen/pspnet_r101-d8_4x4_512x512_80k_vaihingen_20211231_230806.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ---------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| PSPNet | R-18-D8 | 512x512 | 80000 | 1.45 | 85.06 | V100 | 71.46 | 73.36 | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r18-d8_4xb4-80k_vaihingen-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_4x4_512x512_80k_vaihingen/pspnet_r18-d8_4x4_512x512_80k_vaihingen_20211228_160355-52a8a6f6.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_4x4_512x512_80k_vaihingen/pspnet_r18-d8_4x4_512x512_80k_vaihingen_20211228_160355.log.json) | +| PSPNet | R-50-D8 | 512x512 | 80000 | 6.14 | 30.29 | V100 | 72.36 | 73.75 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r50-d8_4xb4-80k_vaihingen-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_4x4_512x512_80k_vaihingen/pspnet_r50-d8_4x4_512x512_80k_vaihingen_20211228_160355-382f8f5b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_4x4_512x512_80k_vaihingen/pspnet_r50-d8_4x4_512x512_80k_vaihingen_20211228_160355.log.json) | +| PSPNet | R-101-D8 | 512x512 | 80000 | 9.61 | 19.97 | V100 | 72.61 | 74.18 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r101-d8_4xb4-80k_vaihingen-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_4x4_512x512_80k_vaihingen/pspnet_r101-d8_4x4_512x512_80k_vaihingen_20211231_230806-8eba0a09.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_4x4_512x512_80k_vaihingen/pspnet_r101-d8_4x4_512x512_80k_vaihingen_20211231_230806.log.json) | ### iSAID -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| PSPNet | R-18-D8 | 896x896 | 80000 | 4.52 | 26.91 | 60.22 | 61.25 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r18-d8_4x4_896x896_80k_isaid.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_4x4_896x896_80k_isaid/pspnet_r18-d8_4x4_896x896_80k_isaid_20220110_180526-e84c0b6a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_4x4_896x896_80k_isaid/pspnet_r18-d8_4x4_896x896_80k_isaid_20220110_180526.log.json) | -| PSPNet | R-50-D8 | 896x896 | 80000 | 16.58 | 8.88 | 65.36 | 66.48 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet/pspnet_r50-d8_4x4_896x896_80k_isaid.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_4x4_896x896_80k_isaid/pspnet_r50-d8_4x4_896x896_80k_isaid_20220110_180629-1f21dc32.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_4x4_896x896_80k_isaid/pspnet_r50-d8_4x4_896x896_80k_isaid_20220110_180629.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | 
------------: | ----------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| PSPNet | R-18-D8 | 896x896 | 80000 | 4.52 | 26.91 | V100 | 60.22 | 61.25 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r18-d8_4xb4-80k_isaid-896x896.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_4x4_896x896_80k_isaid/pspnet_r18-d8_4x4_896x896_80k_isaid_20220110_180526-e84c0b6a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_4x4_896x896_80k_isaid/pspnet_r18-d8_4x4_896x896_80k_isaid_20220110_180526.log.json) | +| PSPNet | R-50-D8 | 896x896 | 80000 | 16.58 | 8.88 | V100 | 65.36 | 66.48 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet/pspnet_r50-d8_4xb4-80k_isaid-896x896.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_4x4_896x896_80k_isaid/pspnet_r50-d8_4x4_896x896_80k_isaid_20220110_180629-1f21dc32.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_4x4_896x896_80k_isaid/pspnet_r50-d8_4x4_896x896_80k_isaid_20220110_180629.log.json) | Note: @@ -175,3 +160,23 @@ Note: - `896x896` is the Crop Size of the iSAID dataset, which follows the implementation of [PointFlow: Flowing Semantics Through Points for Aerial Image Segmentation](https://arxiv.org/pdf/2103.06564.pdf) - `rsb` is short for 'Resnet strikes back'. - The `b` in `R-50b` means ResNetV1b, which is a standard ResNet backbone. In MMSegmentation, the default backbone is ResNetV1c, which usually performs better in semantic segmentation tasks. 
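+
+To run one of the checkpoints above, pass the config file and the `model` URL from the same table row to the MMSegmentation Python API. The snippet below is a minimal sketch rather than a verbatim recipe: it assumes an MMSegmentation 1.x install (where `mmseg.apis` provides `init_model` and `inference_model`) and uses the iSAID R-50-D8 row as the example pair; `demo.png` stands in for any test image you supply.
+
+```python
+# Minimal inference sketch for a PSPNet checkpoint listed above.
+# Assumptions: MMSegmentation 1.x environment; the config/checkpoint pair is
+# taken from the iSAID R-50-D8 row; 'demo.png' is a placeholder input image.
+from mmseg.apis import inference_model, init_model
+
+config = 'configs/pspnet/pspnet_r50-d8_4xb4-80k_isaid-896x896.py'
+checkpoint = (
+    'https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/'
+    'pspnet_r50-d8_4x4_896x896_80k_isaid/'
+    'pspnet_r50-d8_4x4_896x896_80k_isaid_20220110_180629-1f21dc32.pth')
+
+model = init_model(config, checkpoint, device='cuda:0')  # or device='cpu'
+result = inference_model(model, 'demo.png')  # returns a SegDataSample
+```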
+ +## Citation + +```bibtex +@inproceedings{zhao2017pspnet, + title={Pyramid Scene Parsing Network}, + author={Zhao, Hengshuang and Shi, Jianping and Qi, Xiaojuan and Wang, Xiaogang and Jia, Jiaya}, + booktitle={CVPR}, + year={2017} +} +``` + +```bibtex +@article{wightman2021resnet, + title={Resnet strikes back: An improved training procedure in timm}, + author={Wightman, Ross and Touvron, Hugo and J{\'e}gou, Herv{\'e}}, + journal={arXiv preprint arXiv:2110.00476}, + year={2021} +} +``` diff --git a/configs/pspnet/metafile.yaml b/configs/pspnet/metafile.yaml new file mode 100644 index 0000000000..d00b89d5cf --- /dev/null +++ b/configs/pspnet/metafile.yaml @@ -0,0 +1,1303 @@ +Collections: +- Name: PSPNet + License: Apache License 2.0 + Metadata: + Training Data: + - Cityscapes + - ADE20K + - Pascal VOC 2012 + Aug + - Pascal Context + - Pascal Context 59 + - Dark Zurich and Nighttime Driving + - COCO-Stuff 10k + - COCO-Stuff 164k + - LoveDA + - Potsdam + - Vaihingen + - iSAID + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + README: configs/pspnet/README.md + Frameworks: + - PyTorch +Models: +- Name: pspnet_r50-d8_4xb2-40k_cityscapes-512x1024 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 77.85 + mIoU(ms+flip): 79.18 + Config: configs/pspnet/pspnet_r50-d8_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 6.1 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r101-d8_4xb2-40k_cityscapes-512x1024 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.34 + mIoU(ms+flip): 79.74 + Config: configs/pspnet/pspnet_r101-d8_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 9.6 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x1024_40k_cityscapes/pspnet_r101-d8_512x1024_40k_cityscapes_20200604_232751-467e7cf4.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x1024_40k_cityscapes/pspnet_r101-d8_512x1024_40k_cityscapes_20200604_232751.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r50-d8_4xb2-40k_cityscapes-769x769 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.26 + mIoU(ms+flip): 79.88 + Config: configs/pspnet/pspnet_r50-d8_4xb2-40k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 6.9 + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_769x769_40k_cityscapes/pspnet_r50-d8_769x769_40k_cityscapes_20200606_112725-86638686.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_769x769_40k_cityscapes/pspnet_r50-d8_769x769_40k_cityscapes_20200606_112725.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r101-d8_4xb2-40k_cityscapes-769x769 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.08 + mIoU(ms+flip): 80.28 + Config: configs/pspnet/pspnet_r101-d8_4xb2-40k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 10.9 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_769x769_40k_cityscapes/pspnet_r101-d8_769x769_40k_cityscapes_20200606_112753-61c6f5be.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_769x769_40k_cityscapes/pspnet_r101-d8_769x769_40k_cityscapes_20200606_112753.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r18-d8_4xb2-80k_cityscapes-512x1024 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 74.87 + mIoU(ms+flip): 76.04 + Config: configs/pspnet/pspnet_r18-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-18-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 1.7 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_512x1024_80k_cityscapes/pspnet_r18-d8_512x1024_80k_cityscapes_20201225_021458-09ffa746.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_512x1024_80k_cityscapes/pspnet_r18-d8_512x1024_80k_cityscapes-20201225_021458.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r50-d8_4xb2-80k_cityscapes-512x1024 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.55 + mIoU(ms+flip): 79.79 + Config: configs/pspnet/pspnet_r50-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x1024_80k_cityscapes/pspnet_r50-d8_512x1024_80k_cityscapes_20200606_112131-2376f12b.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x1024_80k_cityscapes/pspnet_r50-d8_512x1024_80k_cityscapes_20200606_112131.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r50-d8-rsb_4xb2-adamw-80k_cityscapes-512x1024 + In Collection: PSPNet + Results: + 
Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.47 + mIoU(ms+flip): 79.45 + Config: configs/pspnet/pspnet_r50-d8-rsb_4xb2-adamw-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50b-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 6.2 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x1024_80k_cityscapes/pspnet_r50-d8_rsb-pretrain_512x1024_adamw_80k_cityscapes_20220315_123238-588c30be.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x1024_80k_cityscapes/pspnet_r50-d8_rsb-pretrain_512x1024_adamw_80k_cityscapes_20220315_123238.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r101-d8_4xb2-80k_cityscapes-512x1024 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.76 + mIoU(ms+flip): 81.01 + Config: configs/pspnet/pspnet_r101-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x1024_80k_cityscapes/pspnet_r101-d8_512x1024_80k_cityscapes_20200606_112211-e1e1100f.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x1024_80k_cityscapes/pspnet_r101-d8_512x1024_80k_cityscapes_20200606_112211.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r101-d8_4xb2-amp-80k_cityscapes-512x1024 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.46 + Config: configs/pspnet/pspnet_r101-d8_4xb2-amp-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - PSPNet + - (FP16) + Training Resources: 4x V100 GPUS + Memory (GB): 5.34 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_fp16_512x1024_80k_cityscapes/pspnet_r101-d8_fp16_512x1024_80k_cityscapes_20200717_230919-a0875e5c.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_fp16_512x1024_80k_cityscapes/pspnet_r101-d8_fp16_512x1024_80k_cityscapes_20200717_230919.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r18-d8_4xb2-80k_cityscapes-769x769 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 75.9 + mIoU(ms+flip): 77.86 + Config: configs/pspnet/pspnet_r18-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-18-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 1.9 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_769x769_80k_cityscapes/pspnet_r18-d8_769x769_80k_cityscapes_20201225_021458-3deefc62.pth + Training log: 
https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_769x769_80k_cityscapes/pspnet_r18-d8_769x769_80k_cityscapes-20201225_021458.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r50-d8_4xb2-80k_cityscapes-769x769 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.59 + mIoU(ms+flip): 80.69 + Config: configs/pspnet/pspnet_r50-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_769x769_80k_cityscapes/pspnet_r50-d8_769x769_80k_cityscapes_20200606_210121-5ccf03dd.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_769x769_80k_cityscapes/pspnet_r50-d8_769x769_80k_cityscapes_20200606_210121.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r101-d8_4xb2-80k_cityscapes-769x769 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.77 + mIoU(ms+flip): 81.06 + Config: configs/pspnet/pspnet_r101-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_769x769_80k_cityscapes/pspnet_r101-d8_769x769_80k_cityscapes_20200606_225055-dba412fa.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_769x769_80k_cityscapes/pspnet_r101-d8_769x769_80k_cityscapes_20200606_225055.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r18b-d8_4xb2-80k_cityscapes-512x1024 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 74.23 + mIoU(ms+flip): 75.79 + Config: configs/pspnet/pspnet_r18b-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-18b-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 1.5 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18b-d8_512x1024_80k_cityscapes/pspnet_r18b-d8_512x1024_80k_cityscapes_20201226_063116-26928a60.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18b-d8_512x1024_80k_cityscapes/pspnet_r18b-d8_512x1024_80k_cityscapes-20201226_063116.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r50b-d8_4xb2-80k_cityscapes-512x1024 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.22 + mIoU(ms+flip): 79.46 + Config: configs/pspnet/pspnet_r50b-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: 
Cityscapes + Batch Size: 8 + Architecture: + - R-50b-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 6.0 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50b-d8_512x1024_80k_cityscapes/pspnet_r50b-d8_512x1024_80k_cityscapes_20201225_094315-6344287a.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50b-d8_512x1024_80k_cityscapes/pspnet_r50b-d8_512x1024_80k_cityscapes-20201225_094315.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r101b-d8_4xb2-80k_cityscapes-512x1024 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.69 + mIoU(ms+flip): 80.79 + Config: configs/pspnet/pspnet_r101b-d8_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101b-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 9.5 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101b-d8_512x1024_80k_cityscapes/pspnet_r101b-d8_512x1024_80k_cityscapes_20201226_170012-3a4d38ab.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101b-d8_512x1024_80k_cityscapes/pspnet_r101b-d8_512x1024_80k_cityscapes-20201226_170012.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r18b-d8_4xb2-80k_cityscapes-769x769 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 74.92 + mIoU(ms+flip): 76.9 + Config: configs/pspnet/pspnet_r18b-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-18b-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 1.7 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18b-d8_769x769_80k_cityscapes/pspnet_r18b-d8_769x769_80k_cityscapes_20201226_080942-bf98d186.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18b-d8_769x769_80k_cityscapes/pspnet_r18b-d8_769x769_80k_cityscapes-20201226_080942.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r50b-d8_4xb2-80k_cityscapes-769x769 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.5 + mIoU(ms+flip): 79.96 + Config: configs/pspnet/pspnet_r50b-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50b-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 6.8 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50b-d8_769x769_80k_cityscapes/pspnet_r50b-d8_769x769_80k_cityscapes_20201225_094316-4c643cf6.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50b-d8_769x769_80k_cityscapes/pspnet_r50b-d8_769x769_80k_cityscapes-20201225_094316.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: 
https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r101b-d8_4xb2-80k_cityscapes-769x769 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.87 + mIoU(ms+flip): 80.04 + Config: configs/pspnet/pspnet_r101b-d8_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101b-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 10.8 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101b-d8_769x769_80k_cityscapes/pspnet_r101b-d8_769x769_80k_cityscapes_20201226_171823-f0e7c293.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101b-d8_769x769_80k_cityscapes/pspnet_r101b-d8_769x769_80k_cityscapes-20201226_171823.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r50-d32_4xb2-80k_cityscapes-512x1024 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 73.88 + mIoU(ms+flip): 76.85 + Config: configs/pspnet/pspnet_r50-d32_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50-D32 + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 3.0 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d32_512x1024_80k_cityscapes/pspnet_r50-d32_512x1024_80k_cityscapes_20220316_224840-9092b254.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d32_512x1024_80k_cityscapes/pspnet_r50-d32_512x1024_80k_cityscapes_20220316_224840.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r50-d32_rsb_4xb2-adamw-80k_cityscapes-512x1024 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 74.09 + mIoU(ms+flip): 77.18 + Config: configs/pspnet/pspnet_r50-d32_rsb_4xb2-adamw-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50b-D32 + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 3.1 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d32_rsb-pretrain_512x1024_adamw_80k_cityscapes/pspnet_r50-d32_rsb-pretrain_512x1024_adamw_80k_cityscapes_20220316_141229-dd9c9610.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d32_rsb-pretrain_512x1024_adamw_80k_cityscapes/pspnet_r50-d32_rsb-pretrain_512x1024_adamw_80k_cityscapes_20220316_141229.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r50b-d32_4xb2-80k_cityscapes-512x1024 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 72.61 + mIoU(ms+flip): 75.51 + Config: configs/pspnet/pspnet_r50b-d32_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50b-D32 + - PSPNet + Training Resources: 4x V100 
GPUS + Memory (GB): 2.9 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50b-d32_512x1024_80k_cityscapes/pspnet_r50b-d32_512x1024_80k_cityscapes_20220311_152152-23bcaf8c.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50b-d32_512x1024_80k_cityscapes/pspnet_r50b-d32_512x1024_80k_cityscapes_20220311_152152.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r50-d8_4xb4-80k_ade20k-512x512 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 41.13 + mIoU(ms+flip): 41.94 + Config: configs/pspnet/pspnet_r50-d8_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 8.5 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_80k_ade20k/pspnet_r50-d8_512x512_80k_ade20k_20200615_014128-15a8b914.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_80k_ade20k/pspnet_r50-d8_512x512_80k_ade20k_20200615_014128.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r101-d8_4xb4-80k_ade20k-512x512 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 43.57 + mIoU(ms+flip): 44.35 + Config: configs/pspnet/pspnet_r101-d8_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-101-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 12.0 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_80k_ade20k/pspnet_r101-d8_512x512_80k_ade20k_20200614_031423-b6e782f0.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_80k_ade20k/pspnet_r101-d8_512x512_80k_ade20k_20200614_031423.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r50-d8_4xb4-160k_ade20k-512x512 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 42.48 + mIoU(ms+flip): 43.44 + Config: configs/pspnet/pspnet_r50-d8_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_160k_ade20k/pspnet_r50-d8_512x512_160k_ade20k_20200615_184358-1890b0bd.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_160k_ade20k/pspnet_r50-d8_512x512_160k_ade20k_20200615_184358.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r101-d8_4xb4-160k_ade20k-512x512 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 
44.39 + mIoU(ms+flip): 45.35 + Config: configs/pspnet/pspnet_r101-d8_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-101-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_160k_ade20k/pspnet_r101-d8_512x512_160k_ade20k_20200615_100650-967c316f.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_160k_ade20k/pspnet_r101-d8_512x512_160k_ade20k_20200615_100650.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r50-d8_4xb4-20k_voc12aug-512x512 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 76.78 + mIoU(ms+flip): 77.61 + Config: configs/pspnet/pspnet_r50-d8_4xb4-20k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-50-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 6.1 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_20k_voc12aug/pspnet_r50-d8_512x512_20k_voc12aug_20200617_101958-ed5dfbd9.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_20k_voc12aug/pspnet_r50-d8_512x512_20k_voc12aug_20200617_101958.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r101-d8_4xb4-20k_voc12aug-512x512 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 78.47 + mIoU(ms+flip): 79.25 + Config: configs/pspnet/pspnet_r101-d8_4xb4-20k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-101-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 9.6 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_20k_voc12aug/pspnet_r101-d8_512x512_20k_voc12aug_20200617_102003-4aef3c9a.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_20k_voc12aug/pspnet_r101-d8_512x512_20k_voc12aug_20200617_102003.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r50-d8_4xb4-40k_voc12aug-512x512 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 77.29 + mIoU(ms+flip): 78.48 + Config: configs/pspnet/pspnet_r50-d8_4xb4-40k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-50-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_40k_voc12aug/pspnet_r50-d8_512x512_40k_voc12aug_20200613_161222-ae9c1b8c.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_40k_voc12aug/pspnet_r50-d8_512x512_40k_voc12aug_20200613_161222.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: 
https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r101-d8_4xb4-40k_voc12aug-512x512 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 78.52 + mIoU(ms+flip): 79.57 + Config: configs/pspnet/pspnet_r101-d8_4xb4-40k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-101-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_40k_voc12aug/pspnet_r101-d8_512x512_40k_voc12aug_20200613_161222-bc933b18.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_40k_voc12aug/pspnet_r101-d8_512x512_40k_voc12aug_20200613_161222.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r101-d8_4xb4-40k_pascal-context-480x480 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: Pascal Context + Metrics: + mIoU: 46.6 + mIoU(ms+flip): 47.78 + Config: configs/pspnet/pspnet_r101-d8_4xb4-40k_pascal-context-480x480.py + Metadata: + Training Data: Pascal Context + Batch Size: 16 + Architecture: + - R-101-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 8.8 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_480x480_40k_pascal_context/pspnet_r101-d8_480x480_40k_pascal_context_20200911_211210-bf0f5d7c.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_480x480_40k_pascal_context/pspnet_r101-d8_480x480_40k_pascal_context-20200911_211210.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r101-d8_4xb4-80k_pascal-context-480x480 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: Pascal Context + Metrics: + mIoU: 46.03 + mIoU(ms+flip): 47.15 + Config: configs/pspnet/pspnet_r101-d8_4xb4-80k_pascal-context-480x480.py + Metadata: + Training Data: Pascal Context + Batch Size: 16 + Architecture: + - R-101-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_480x480_80k_pascal_context/pspnet_r101-d8_480x480_80k_pascal_context_20200911_190530-c86d6233.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_480x480_80k_pascal_context/pspnet_r101-d8_480x480_80k_pascal_context-20200911_190530.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r101-d8_4xb4-40k_pascal-context-59-480x480 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: Pascal Context 59 + Metrics: + mIoU: 52.02 + mIoU(ms+flip): 53.54 + Config: configs/pspnet/pspnet_r101-d8_4xb4-40k_pascal-context-59-480x480.py + Metadata: + Training Data: Pascal Context 59 + Batch Size: 16 + Architecture: + - R-101-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_480x480_40k_pascal_context_59/pspnet_r101-d8_480x480_40k_pascal_context_59_20210416_114524-86d44cd4.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_480x480_40k_pascal_context_59/pspnet_r101-d8_480x480_40k_pascal_context_59-20210416_114524.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r101-d8_4xb4-80k_pascal-context-59-480x480 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: Pascal Context 59 + Metrics: + mIoU: 52.47 + mIoU(ms+flip): 53.99 + Config: configs/pspnet/pspnet_r101-d8_4xb4-80k_pascal-context-59-480x480.py + Metadata: + Training Data: Pascal Context 59 + Batch Size: 16 + Architecture: + - R-101-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_480x480_80k_pascal_context_59/pspnet_r101-d8_480x480_80k_pascal_context_59_20210416_114418-fa6caaa2.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_480x480_80k_pascal_context_59/pspnet_r101-d8_480x480_80k_pascal_context_59-20210416_114418.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r50-d8_4xb4-20k_coco-stuff10k-512x512 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: COCO-Stuff 10k + Metrics: + mIoU: 35.69 + mIoU(ms+flip): 36.62 + Config: configs/pspnet/pspnet_r50-d8_4xb4-20k_coco-stuff10k-512x512.py + Metadata: + Training Data: COCO-Stuff 10k + Batch Size: 16 + Architecture: + - R-50-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 9.6 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_4x4_20k_coco-stuff10k/pspnet_r50-d8_512x512_4x4_20k_coco-stuff10k_20210820_203258-b88df27f.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_4x4_20k_coco-stuff10k/pspnet_r50-d8_512x512_4x4_20k_coco-stuff10k_20210820_203258.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r101-d8_4xb4-20k_coco-stuff10k-512x512 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: COCO-Stuff 10k + Metrics: + mIoU: 37.26 + mIoU(ms+flip): 38.52 + Config: configs/pspnet/pspnet_r101-d8_4xb4-20k_coco-stuff10k-512x512.py + Metadata: + Training Data: COCO-Stuff 10k + Batch Size: 16 + Architecture: + - R-101-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 13.2 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_4x4_20k_coco-stuff10k/pspnet_r101-d8_512x512_4x4_20k_coco-stuff10k_20210820_232135-76aae482.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_4x4_20k_coco-stuff10k/pspnet_r101-d8_512x512_4x4_20k_coco-stuff10k_20210820_232135.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: 
https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r50-d8_4xb4-40k_coco-stuff10k-512x512 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: COCO-Stuff 10k + Metrics: + mIoU: 36.33 + mIoU(ms+flip): 37.24 + Config: configs/pspnet/pspnet_r50-d8_4xb4-40k_coco-stuff10k-512x512.py + Metadata: + Training Data: COCO-Stuff 10k + Batch Size: 16 + Architecture: + - R-50-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_4x4_40k_coco-stuff10k/pspnet_r50-d8_512x512_4x4_40k_coco-stuff10k_20210821_030857-92e2902b.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_4x4_40k_coco-stuff10k/pspnet_r50-d8_512x512_4x4_40k_coco-stuff10k_20210821_030857.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r101-d8_4xb4-40k_coco-stuff10k-512x512 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: COCO-Stuff 10k + Metrics: + mIoU: 37.76 + mIoU(ms+flip): 38.86 + Config: configs/pspnet/pspnet_r101-d8_4xb4-40k_coco-stuff10k-512x512.py + Metadata: + Training Data: COCO-Stuff 10k + Batch Size: 16 + Architecture: + - R-101-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_4x4_40k_coco-stuff10k/pspnet_r101-d8_512x512_4x4_40k_coco-stuff10k_20210821_014022-831aec95.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_4x4_40k_coco-stuff10k/pspnet_r101-d8_512x512_4x4_40k_coco-stuff10k_20210821_014022.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r50-d8_4xb4-80k_coco-stuff164k-512x512 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: COCO-Stuff 164k + Metrics: + mIoU: 38.8 + mIoU(ms+flip): 39.19 + Config: configs/pspnet/pspnet_r50-d8_4xb4-80k_coco-stuff164k-512x512.py + Metadata: + Training Data: COCO-Stuff 164k + Batch Size: 16 + Architecture: + - R-50-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 9.6 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_4x4_80k_coco-stuff164k/pspnet_r50-d8_512x512_4x4_80k_coco-stuff164k_20210707_152034-0e41b2db.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_4x4_80k_coco-stuff164k/pspnet_r50-d8_512x512_4x4_80k_coco-stuff164k_20210707_152034.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r101-d8_4xb4-80k_coco-stuff164k-512x512 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: COCO-Stuff 164k + Metrics: + mIoU: 40.34 + mIoU(ms+flip): 40.79 + Config: configs/pspnet/pspnet_r101-d8_4xb4-80k_coco-stuff164k-512x512.py + Metadata: + Training Data: COCO-Stuff 164k + Batch Size: 16 + Architecture: + - R-101-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 13.2 
+ Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_4x4_80k_coco-stuff164k/pspnet_r101-d8_512x512_4x4_80k_coco-stuff164k_20210707_152034-7eb41789.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_4x4_80k_coco-stuff164k/pspnet_r101-d8_512x512_4x4_80k_coco-stuff164k_20210707_152034.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r50-d8_4xb4-160k_coco-stuff164k-512x512 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: COCO-Stuff 164k + Metrics: + mIoU: 39.64 + mIoU(ms+flip): 39.97 + Config: configs/pspnet/pspnet_r50-d8_4xb4-160k_coco-stuff164k-512x512.py + Metadata: + Training Data: COCO-Stuff 164k + Batch Size: 16 + Architecture: + - R-50-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_4x4_160k_coco-stuff164k/pspnet_r50-d8_512x512_4x4_160k_coco-stuff164k_20210707_152004-51276a57.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_4x4_160k_coco-stuff164k/pspnet_r50-d8_512x512_4x4_160k_coco-stuff164k_20210707_152004.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r101-d8_4xb4-160k_coco-stuff164k-512x512 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: COCO-Stuff 164k + Metrics: + mIoU: 41.28 + mIoU(ms+flip): 41.66 + Config: configs/pspnet/pspnet_r101-d8_4xb4-160k_coco-stuff164k-512x512.py + Metadata: + Training Data: COCO-Stuff 164k + Batch Size: 16 + Architecture: + - R-101-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_4x4_160k_coco-stuff164k/pspnet_r101-d8_512x512_4x4_160k_coco-stuff164k_20210707_152004-4af9621b.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_4x4_160k_coco-stuff164k/pspnet_r101-d8_512x512_4x4_160k_coco-stuff164k_20210707_152004.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r50-d8_4xb4-320k_coco-stuff164k-512x512 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: COCO-Stuff 164k + Metrics: + mIoU: 40.53 + mIoU(ms+flip): 40.75 + Config: configs/pspnet/pspnet_r50-d8_4xb4-320k_coco-stuff164k-512x512.py + Metadata: + Training Data: COCO-Stuff 164k + Batch Size: 16 + Architecture: + - R-50-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_4x4_320k_coco-stuff164k/pspnet_r50-d8_512x512_4x4_320k_coco-stuff164k_20210707_152004-be9610cc.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_4x4_320k_coco-stuff164k/pspnet_r50-d8_512x512_4x4_320k_coco-stuff164k_20210707_152004.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: 
https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r101-d8_4xb4-320k_coco-stuff164k-512x512 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: COCO-Stuff 164k + Metrics: + mIoU: 41.95 + mIoU(ms+flip): 42.42 + Config: configs/pspnet/pspnet_r101-d8_4xb4-320k_coco-stuff164k-512x512.py + Metadata: + Training Data: COCO-Stuff 164k + Batch Size: 16 + Architecture: + - R-101-D8 + - PSPNet + Training Resources: 4x V100 GPUs + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_4x4_320k_coco-stuff164k/pspnet_r101-d8_512x512_4x4_320k_coco-stuff164k_20210707_152004-72220c60.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_4x4_320k_coco-stuff164k/pspnet_r101-d8_512x512_4x4_320k_coco-stuff164k_20210707_152004.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r18-d8_4xb4-80k_loveda-512x512 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: LoveDA + Metrics: + mIoU: 48.62 + mIoU(ms+flip): 47.57 + Config: configs/pspnet/pspnet_r18-d8_4xb4-80k_loveda-512x512.py + Metadata: + Training Data: LoveDA + Batch Size: 16 + Architecture: + - R-18-D8 + - PSPNet + Training Resources: 4x V100 GPUs + Memory (GB): 1.45 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_512x512_80k_loveda/pspnet_r18-d8_512x512_80k_loveda_20211105_052100-b97697f1.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_512x512_80k_loveda/pspnet_r18-d8_512x512_80k_loveda_20211105_052100.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r50-d8_4xb4-80k_loveda-512x512 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: LoveDA + Metrics: + mIoU: 50.46 + mIoU(ms+flip): 50.19 + Config: configs/pspnet/pspnet_r50-d8_4xb4-80k_loveda-512x512.py + Metadata: + Training Data: LoveDA + Batch Size: 16 + Architecture: + - R-50-D8 + - PSPNet + Training Resources: 4x V100 GPUs + Memory (GB): 6.14 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_80k_loveda/pspnet_r50-d8_512x512_80k_loveda_20211104_155728-88610f9f.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_80k_loveda/pspnet_r50-d8_512x512_80k_loveda_20211104_155728.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r101-d8_4xb4-80k_loveda-512x512 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: LoveDA + Metrics: + mIoU: 51.86 + mIoU(ms+flip): 51.34 + Config: configs/pspnet/pspnet_r101-d8_4xb4-80k_loveda-512x512.py + Metadata: + Training Data: LoveDA + Batch Size: 16 + Architecture: + - R-101-D8 + - PSPNet + Training Resources: 4x V100 GPUs + Memory (GB): 9.61 + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_80k_loveda/pspnet_r101-d8_512x512_80k_loveda_20211104_153212-1c06c6a8.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_80k_loveda/pspnet_r101-d8_512x512_80k_loveda_20211104_153212.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r18-d8_4xb4-80k_potsdam-512x512 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: Potsdam + Metrics: + mIoU: 77.09 + mIoU(ms+flip): 78.3 + Config: configs/pspnet/pspnet_r18-d8_4xb4-80k_potsdam-512x512.py + Metadata: + Training Data: Potsdam + Batch Size: 16 + Architecture: + - R-18-D8 + - PSPNet + Training Resources: 4x V100 GPUs + Memory (GB): 1.5 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_4x4_512x512_80k_potsdam/pspnet_r18-d8_4x4_512x512_80k_potsdam_20211220_125612-7cd046e1.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_4x4_512x512_80k_potsdam/pspnet_r18-d8_4x4_512x512_80k_potsdam_20211220_125612.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r50-d8_4xb4-80k_potsdam-512x512 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: Potsdam + Metrics: + mIoU: 78.12 + mIoU(ms+flip): 78.98 + Config: configs/pspnet/pspnet_r50-d8_4xb4-80k_potsdam-512x512.py + Metadata: + Training Data: Potsdam + Batch Size: 16 + Architecture: + - R-50-D8 + - PSPNet + Training Resources: 4x V100 GPUs + Memory (GB): 6.14 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_4x4_512x512_80k_potsdam/pspnet_r50-d8_4x4_512x512_80k_potsdam_20211219_043541-2dd5fe67.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_4x4_512x512_80k_potsdam/pspnet_r50-d8_4x4_512x512_80k_potsdam_20211219_043541.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r101-d8_4xb4-80k_potsdam-512x512 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: Potsdam + Metrics: + mIoU: 78.62 + mIoU(ms+flip): 79.47 + Config: configs/pspnet/pspnet_r101-d8_4xb4-80k_potsdam-512x512.py + Metadata: + Training Data: Potsdam + Batch Size: 16 + Architecture: + - R-101-D8 + - PSPNet + Training Resources: 4x V100 GPUs + Memory (GB): 9.61 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_4x4_512x512_80k_potsdam/pspnet_r101-d8_4x4_512x512_80k_potsdam_20211220_125612-aed036c4.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_4x4_512x512_80k_potsdam/pspnet_r101-d8_4x4_512x512_80k_potsdam_20211220_125612.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r18-d8_4xb4-80k_vaihingen-512x512 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: 
Vaihingen + Metrics: + mIoU: 71.46 + mIoU(ms+flip): 73.36 + Config: configs/pspnet/pspnet_r18-d8_4xb4-80k_vaihingen-512x512.py + Metadata: + Training Data: Vaihingen + Batch Size: 16 + Architecture: + - R-18-D8 + - PSPNet + Training Resources: 4x V100 GPUs + Memory (GB): 1.45 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_4x4_512x512_80k_vaihingen/pspnet_r18-d8_4x4_512x512_80k_vaihingen_20211228_160355-52a8a6f6.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_4x4_512x512_80k_vaihingen/pspnet_r18-d8_4x4_512x512_80k_vaihingen_20211228_160355.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r50-d8_4xb4-80k_vaihingen-512x512 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: Vaihingen + Metrics: + mIoU: 72.36 + mIoU(ms+flip): 73.75 + Config: configs/pspnet/pspnet_r50-d8_4xb4-80k_vaihingen-512x512.py + Metadata: + Training Data: Vaihingen + Batch Size: 16 + Architecture: + - R-50-D8 + - PSPNet + Training Resources: 4x V100 GPUs + Memory (GB): 6.14 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_4x4_512x512_80k_vaihingen/pspnet_r50-d8_4x4_512x512_80k_vaihingen_20211228_160355-382f8f5b.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_4x4_512x512_80k_vaihingen/pspnet_r50-d8_4x4_512x512_80k_vaihingen_20211228_160355.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r101-d8_4xb4-80k_vaihingen-512x512 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: Vaihingen + Metrics: + mIoU: 72.61 + mIoU(ms+flip): 74.18 + Config: configs/pspnet/pspnet_r101-d8_4xb4-80k_vaihingen-512x512.py + Metadata: + Training Data: Vaihingen + Batch Size: 16 + Architecture: + - R-101-D8 + - PSPNet + Training Resources: 4x V100 GPUs + Memory (GB): 9.61 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_4x4_512x512_80k_vaihingen/pspnet_r101-d8_4x4_512x512_80k_vaihingen_20211231_230806-8eba0a09.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_4x4_512x512_80k_vaihingen/pspnet_r101-d8_4x4_512x512_80k_vaihingen_20211231_230806.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r18-d8_4xb4-80k_isaid-896x896 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: iSAID + Metrics: + mIoU: 60.22 + mIoU(ms+flip): 61.25 + Config: configs/pspnet/pspnet_r18-d8_4xb4-80k_isaid-896x896.py + Metadata: + Training Data: iSAID + Batch Size: 16 + Architecture: + - R-18-D8 + - PSPNet + Training Resources: 4x V100 GPUs + Memory (GB): 4.52 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_4x4_896x896_80k_isaid/pspnet_r18-d8_4x4_896x896_80k_isaid_20220110_180526-e84c0b6a.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_4x4_896x896_80k_isaid/pspnet_r18-d8_4x4_896x896_80k_isaid_20220110_180526.log.json + Paper: + 
Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch +- Name: pspnet_r50-d8_4xb4-80k_isaid-896x896 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: iSAID + Metrics: + mIoU: 65.36 + mIoU(ms+flip): 66.48 + Config: configs/pspnet/pspnet_r50-d8_4xb4-80k_isaid-896x896.py + Metadata: + Training Data: iSAID + Batch Size: 16 + Architecture: + - R-50-D8 + - PSPNet + Training Resources: 4x V100 GPUs + Memory (GB): 16.58 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_4x4_896x896_80k_isaid/pspnet_r50-d8_4x4_896x896_80k_isaid_20220110_180629-1f21dc32.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_4x4_896x896_80k_isaid/pspnet_r50-d8_4x4_896x896_80k_isaid_20220110_180629.log.json + Paper: + Title: Pyramid Scene Parsing Network + URL: https://arxiv.org/abs/1612.01105 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 + Framework: PyTorch diff --git a/configs/pspnet/pspnet.yml b/configs/pspnet/pspnet.yml deleted file mode 100644 index 2a1fa8882b..0000000000 --- a/configs/pspnet/pspnet.yml +++ /dev/null @@ -1,1077 +0,0 @@ -Collections: -- Name: PSPNet - Metadata: - Training Data: - - Cityscapes - - ADE20K - - Pascal VOC 2012 + Aug - - Pascal Context - - Pascal Context 59 - - Dark Zurich and Nighttime Driving - - COCO-Stuff 10k - - COCO-Stuff 164k - - LoveDA - - Potsdam - - Vaihingen - - iSAID - Paper: - URL: https://arxiv.org/abs/1612.01105 - Title: Pyramid Scene Parsing Network - README: configs/pspnet/README.md - Code: - URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/psp_head.py#L63 - Version: v0.17.0 - Converted From: - Code: https://github.com/hszhao/PSPNet -Models: -- Name: pspnet_r50-d8_512x1024_40k_cityscapes - In Collection: PSPNet - Metadata: - backbone: R-50-D8 - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 245.7 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 6.1 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 77.85 - mIoU(ms+flip): 79.18 - Config: configs/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth -- Name: pspnet_r101-d8_512x1024_40k_cityscapes - In Collection: PSPNet - Metadata: - backbone: R-101-D8 - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 373.13 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 9.6 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.34 - mIoU(ms+flip): 79.74 - Config: configs/pspnet/pspnet_r101-d8_512x1024_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x1024_40k_cityscapes/pspnet_r101-d8_512x1024_40k_cityscapes_20200604_232751-467e7cf4.pth -- Name: pspnet_r50-d8_769x769_40k_cityscapes - In Collection: PSPNet - Metadata: - backbone: R-50-D8 - crop size: (769,769) - lr schd: 40000 - inference time (ms/im): - - value: 568.18 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory 
(GB): 6.9 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.26 - mIoU(ms+flip): 79.88 - Config: configs/pspnet/pspnet_r50-d8_769x769_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_769x769_40k_cityscapes/pspnet_r50-d8_769x769_40k_cityscapes_20200606_112725-86638686.pth -- Name: pspnet_r101-d8_769x769_40k_cityscapes - In Collection: PSPNet - Metadata: - backbone: R-101-D8 - crop size: (769,769) - lr schd: 40000 - inference time (ms/im): - - value: 869.57 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 10.9 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.08 - mIoU(ms+flip): 80.28 - Config: configs/pspnet/pspnet_r101-d8_769x769_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_769x769_40k_cityscapes/pspnet_r101-d8_769x769_40k_cityscapes_20200606_112753-61c6f5be.pth -- Name: pspnet_r18-d8_512x1024_80k_cityscapes - In Collection: PSPNet - Metadata: - backbone: R-18-D8 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 63.65 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 1.7 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 74.87 - mIoU(ms+flip): 76.04 - Config: configs/pspnet/pspnet_r18-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_512x1024_80k_cityscapes/pspnet_r18-d8_512x1024_80k_cityscapes_20201225_021458-09ffa746.pth -- Name: pspnet_r50-d8_512x1024_80k_cityscapes - In Collection: PSPNet - Metadata: - backbone: R-50-D8 - crop size: (512,1024) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.55 - mIoU(ms+flip): 79.79 - Config: configs/pspnet/pspnet_r50-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x1024_80k_cityscapes/pspnet_r50-d8_512x1024_80k_cityscapes_20200606_112131-2376f12b.pth -- Name: pspnet_r50-d8_rsb-pretrain_512x1024_adamw_80k_cityscapes - In Collection: PSPNet - Metadata: - backbone: R-50b-D8 rsb - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 261.78 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 6.2 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.47 - mIoU(ms+flip): 79.45 - Config: configs/pspnet/pspnet_r50-d8_rsb-pretrain_512x1024_adamw_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x1024_80k_cityscapes/pspnet_r50-d8_rsb-pretrain_512x1024_adamw_80k_cityscapes_20220315_123238-588c30be.pth -- Name: pspnet_r101-d8_512x1024_80k_cityscapes - In Collection: PSPNet - Metadata: - backbone: R-101-D8 - crop size: (512,1024) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.76 - mIoU(ms+flip): 81.01 - Config: configs/pspnet/pspnet_r101-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x1024_80k_cityscapes/pspnet_r101-d8_512x1024_80k_cityscapes_20200606_112211-e1e1100f.pth -- Name: pspnet_r101-d8_fp16_512x1024_80k_cityscapes - In Collection: PSPNet - Metadata: - backbone: R-101-D8 - crop size: (512,1024) - lr schd: 80000 - 
inference time (ms/im): - - value: 114.03 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP16 - resolution: (512,1024) - Training Memory (GB): 5.34 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.46 - Config: configs/pspnet/pspnet_r101-d8_fp16_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_fp16_512x1024_80k_cityscapes/pspnet_r101-d8_fp16_512x1024_80k_cityscapes_20200717_230919-a0875e5c.pth -- Name: pspnet_r18-d8_769x769_80k_cityscapes - In Collection: PSPNet - Metadata: - backbone: R-18-D8 - crop size: (769,769) - lr schd: 80000 - inference time (ms/im): - - value: 161.29 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 1.9 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 75.9 - mIoU(ms+flip): 77.86 - Config: configs/pspnet/pspnet_r18-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_769x769_80k_cityscapes/pspnet_r18-d8_769x769_80k_cityscapes_20201225_021458-3deefc62.pth -- Name: pspnet_r50-d8_769x769_80k_cityscapes - In Collection: PSPNet - Metadata: - backbone: R-50-D8 - crop size: (769,769) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.59 - mIoU(ms+flip): 80.69 - Config: configs/pspnet/pspnet_r50-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_769x769_80k_cityscapes/pspnet_r50-d8_769x769_80k_cityscapes_20200606_210121-5ccf03dd.pth -- Name: pspnet_r101-d8_769x769_80k_cityscapes - In Collection: PSPNet - Metadata: - backbone: R-101-D8 - crop size: (769,769) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.77 - mIoU(ms+flip): 81.06 - Config: configs/pspnet/pspnet_r101-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_769x769_80k_cityscapes/pspnet_r101-d8_769x769_80k_cityscapes_20200606_225055-dba412fa.pth -- Name: pspnet_r18b-d8_512x1024_80k_cityscapes - In Collection: PSPNet - Metadata: - backbone: R-18b-D8 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 61.43 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 1.5 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 74.23 - mIoU(ms+flip): 75.79 - Config: configs/pspnet/pspnet_r18b-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18b-d8_512x1024_80k_cityscapes/pspnet_r18b-d8_512x1024_80k_cityscapes_20201226_063116-26928a60.pth -- Name: pspnet_r50b-d8_512x1024_80k_cityscapes - In Collection: PSPNet - Metadata: - backbone: R-50b-D8 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 232.56 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 6.0 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.22 - mIoU(ms+flip): 79.46 - Config: configs/pspnet/pspnet_r50b-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50b-d8_512x1024_80k_cityscapes/pspnet_r50b-d8_512x1024_80k_cityscapes_20201225_094315-6344287a.pth -- Name: pspnet_r101b-d8_512x1024_80k_cityscapes - In Collection: PSPNet - 
Metadata: - backbone: R-101b-D8 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 362.32 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 9.5 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.69 - mIoU(ms+flip): 80.79 - Config: configs/pspnet/pspnet_r101b-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101b-d8_512x1024_80k_cityscapes/pspnet_r101b-d8_512x1024_80k_cityscapes_20201226_170012-3a4d38ab.pth -- Name: pspnet_r18b-d8_769x769_80k_cityscapes - In Collection: PSPNet - Metadata: - backbone: R-18b-D8 - crop size: (769,769) - lr schd: 80000 - inference time (ms/im): - - value: 156.01 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 1.7 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 74.92 - mIoU(ms+flip): 76.9 - Config: configs/pspnet/pspnet_r18b-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18b-d8_769x769_80k_cityscapes/pspnet_r18b-d8_769x769_80k_cityscapes_20201226_080942-bf98d186.pth -- Name: pspnet_r50b-d8_769x769_80k_cityscapes - In Collection: PSPNet - Metadata: - backbone: R-50b-D8 - crop size: (769,769) - lr schd: 80000 - inference time (ms/im): - - value: 531.91 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 6.8 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.5 - mIoU(ms+flip): 79.96 - Config: configs/pspnet/pspnet_r50b-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50b-d8_769x769_80k_cityscapes/pspnet_r50b-d8_769x769_80k_cityscapes_20201225_094316-4c643cf6.pth -- Name: pspnet_r101b-d8_769x769_80k_cityscapes - In Collection: PSPNet - Metadata: - backbone: R-101b-D8 - crop size: (769,769) - lr schd: 80000 - inference time (ms/im): - - value: 854.7 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 10.8 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.87 - mIoU(ms+flip): 80.04 - Config: configs/pspnet/pspnet_r101b-d8_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101b-d8_769x769_80k_cityscapes/pspnet_r101b-d8_769x769_80k_cityscapes_20201226_171823-f0e7c293.pth -- Name: pspnet_r50-d32_512x1024_80k_cityscapes - In Collection: PSPNet - Metadata: - backbone: R-50-D32 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 65.75 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 3.0 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 73.88 - mIoU(ms+flip): 76.85 - Config: configs/pspnet/pspnet_r50-d32_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d32_512x1024_80k_cityscapes/pspnet_r50-d32_512x1024_80k_cityscapes_20220316_224840-9092b254.pth -- Name: pspnet_r50-d32_rsb-pretrain_512x1024_adamw_80k_cityscapes - In Collection: PSPNet - Metadata: - backbone: R-50b-D32 rsb - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 62.19 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory 
(GB): 3.1 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 74.09 - mIoU(ms+flip): 77.18 - Config: configs/pspnet/pspnet_r50-d32_rsb-pretrain_512x1024_adamw_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d32_rsb-pretrain_512x1024_adamw_80k_cityscapes/pspnet_r50-d32_rsb-pretrain_512x1024_adamw_80k_cityscapes_20220316_141229-dd9c9610.pth -- Name: pspnet_r50b-d32_512x1024_80k_cityscapes - In Collection: PSPNet - Metadata: - backbone: R-50b-D32 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 64.89 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 2.9 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 72.61 - mIoU(ms+flip): 75.51 - Config: configs/pspnet/pspnet_r50b-d32_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50b-d32_512x1024_80k_cityscapes/pspnet_r50b-d32_512x1024_80k_cityscapes_20220311_152152-23bcaf8c.pth -- Name: pspnet_r50-d8_512x512_80k_ade20k - In Collection: PSPNet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 42.5 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 8.5 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 41.13 - mIoU(ms+flip): 41.94 - Config: configs/pspnet/pspnet_r50-d8_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_80k_ade20k/pspnet_r50-d8_512x512_80k_ade20k_20200615_014128-15a8b914.pth -- Name: pspnet_r101-d8_512x512_80k_ade20k - In Collection: PSPNet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 65.36 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 12.0 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 43.57 - mIoU(ms+flip): 44.35 - Config: configs/pspnet/pspnet_r101-d8_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_80k_ade20k/pspnet_r101-d8_512x512_80k_ade20k_20200614_031423-b6e782f0.pth -- Name: pspnet_r50-d8_512x512_160k_ade20k - In Collection: PSPNet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 42.48 - mIoU(ms+flip): 43.44 - Config: configs/pspnet/pspnet_r50-d8_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_160k_ade20k/pspnet_r50-d8_512x512_160k_ade20k_20200615_184358-1890b0bd.pth -- Name: pspnet_r101-d8_512x512_160k_ade20k - In Collection: PSPNet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 44.39 - mIoU(ms+flip): 45.35 - Config: configs/pspnet/pspnet_r101-d8_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_160k_ade20k/pspnet_r101-d8_512x512_160k_ade20k_20200615_100650-967c316f.pth -- Name: pspnet_r50-d8_512x512_20k_voc12aug - In Collection: PSPNet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 20000 - inference time (ms/im): - - value: 42.39 - hardware: V100 - backend: PyTorch - batch size: 1 
- mode: FP32 - resolution: (512,512) - Training Memory (GB): 6.1 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 76.78 - mIoU(ms+flip): 77.61 - Config: configs/pspnet/pspnet_r50-d8_512x512_20k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_20k_voc12aug/pspnet_r50-d8_512x512_20k_voc12aug_20200617_101958-ed5dfbd9.pth -- Name: pspnet_r101-d8_512x512_20k_voc12aug - In Collection: PSPNet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 20000 - inference time (ms/im): - - value: 66.58 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 9.6 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 78.47 - mIoU(ms+flip): 79.25 - Config: configs/pspnet/pspnet_r101-d8_512x512_20k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_20k_voc12aug/pspnet_r101-d8_512x512_20k_voc12aug_20200617_102003-4aef3c9a.pth -- Name: pspnet_r50-d8_512x512_40k_voc12aug - In Collection: PSPNet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 40000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 77.29 - mIoU(ms+flip): 78.48 - Config: configs/pspnet/pspnet_r50-d8_512x512_40k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_40k_voc12aug/pspnet_r50-d8_512x512_40k_voc12aug_20200613_161222-ae9c1b8c.pth -- Name: pspnet_r101-d8_512x512_40k_voc12aug - In Collection: PSPNet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 40000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 78.52 - mIoU(ms+flip): 79.57 - Config: configs/pspnet/pspnet_r101-d8_512x512_40k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_40k_voc12aug/pspnet_r101-d8_512x512_40k_voc12aug_20200613_161222-bc933b18.pth -- Name: pspnet_r101-d8_480x480_40k_pascal_context - In Collection: PSPNet - Metadata: - backbone: R-101-D8 - crop size: (480,480) - lr schd: 40000 - inference time (ms/im): - - value: 103.31 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (480,480) - Training Memory (GB): 8.8 - Results: - - Task: Semantic Segmentation - Dataset: Pascal Context - Metrics: - mIoU: 46.6 - mIoU(ms+flip): 47.78 - Config: configs/pspnet/pspnet_r101-d8_480x480_40k_pascal_context.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_480x480_40k_pascal_context/pspnet_r101-d8_480x480_40k_pascal_context_20200911_211210-bf0f5d7c.pth -- Name: pspnet_r101-d8_480x480_80k_pascal_context - In Collection: PSPNet - Metadata: - backbone: R-101-D8 - crop size: (480,480) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal Context - Metrics: - mIoU: 46.03 - mIoU(ms+flip): 47.15 - Config: configs/pspnet/pspnet_r101-d8_480x480_80k_pascal_context.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_480x480_80k_pascal_context/pspnet_r101-d8_480x480_80k_pascal_context_20200911_190530-c86d6233.pth -- Name: pspnet_r101-d8_480x480_40k_pascal_context_59 - In Collection: PSPNet - Metadata: - backbone: R-101-D8 - crop size: (480,480) - lr schd: 40000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal Context 59 - Metrics: - mIoU: 52.02 - mIoU(ms+flip): 53.54 - Config: 
configs/pspnet/pspnet_r101-d8_480x480_40k_pascal_context_59.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_480x480_40k_pascal_context_59/pspnet_r101-d8_480x480_40k_pascal_context_59_20210416_114524-86d44cd4.pth -- Name: pspnet_r101-d8_480x480_80k_pascal_context_59 - In Collection: PSPNet - Metadata: - backbone: R-101-D8 - crop size: (480,480) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal Context 59 - Metrics: - mIoU: 52.47 - mIoU(ms+flip): 53.99 - Config: configs/pspnet/pspnet_r101-d8_480x480_80k_pascal_context_59.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_480x480_80k_pascal_context_59/pspnet_r101-d8_480x480_80k_pascal_context_59_20210416_114418-fa6caaa2.pth -- Name: pspnet_r50-d8_512x512_4x4_20k_coco-stuff10k - In Collection: PSPNet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 20000 - inference time (ms/im): - - value: 48.78 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 9.6 - Results: - - Task: Semantic Segmentation - Dataset: COCO-Stuff 10k - Metrics: - mIoU: 35.69 - mIoU(ms+flip): 36.62 - Config: configs/pspnet/pspnet_r50-d8_512x512_4x4_20k_coco-stuff10k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_4x4_20k_coco-stuff10k/pspnet_r50-d8_512x512_4x4_20k_coco-stuff10k_20210820_203258-b88df27f.pth -- Name: pspnet_r101-d8_512x512_4x4_20k_coco-stuff10k - In Collection: PSPNet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 20000 - inference time (ms/im): - - value: 90.09 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 13.2 - Results: - - Task: Semantic Segmentation - Dataset: COCO-Stuff 10k - Metrics: - mIoU: 37.26 - mIoU(ms+flip): 38.52 - Config: configs/pspnet/pspnet_r101-d8_512x512_4x4_20k_coco-stuff10k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_4x4_20k_coco-stuff10k/pspnet_r101-d8_512x512_4x4_20k_coco-stuff10k_20210820_232135-76aae482.pth -- Name: pspnet_r50-d8_512x512_4x4_40k_coco-stuff10k - In Collection: PSPNet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 40000 - Results: - - Task: Semantic Segmentation - Dataset: COCO-Stuff 10k - Metrics: - mIoU: 36.33 - mIoU(ms+flip): 37.24 - Config: configs/pspnet/pspnet_r50-d8_512x512_4x4_40k_coco-stuff10k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_4x4_40k_coco-stuff10k/pspnet_r50-d8_512x512_4x4_40k_coco-stuff10k_20210821_030857-92e2902b.pth -- Name: pspnet_r101-d8_512x512_4x4_40k_coco-stuff10k - In Collection: PSPNet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 40000 - Results: - - Task: Semantic Segmentation - Dataset: COCO-Stuff 10k - Metrics: - mIoU: 37.76 - mIoU(ms+flip): 38.86 - Config: configs/pspnet/pspnet_r101-d8_512x512_4x4_40k_coco-stuff10k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_4x4_40k_coco-stuff10k/pspnet_r101-d8_512x512_4x4_40k_coco-stuff10k_20210821_014022-831aec95.pth -- Name: pspnet_r50-d8_512x512_4x4_80k_coco-stuff164k - In Collection: PSPNet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 48.78 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 9.6 - Results: - - Task: Semantic 
Segmentation - Dataset: COCO-Stuff 164k - Metrics: - mIoU: 38.8 - mIoU(ms+flip): 39.19 - Config: configs/pspnet/pspnet_r50-d8_512x512_4x4_80k_coco-stuff164k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_4x4_80k_coco-stuff164k/pspnet_r50-d8_512x512_4x4_80k_coco-stuff164k_20210707_152034-0e41b2db.pth -- Name: pspnet_r101-d8_512x512_4x4_80k_coco-stuff164k - In Collection: PSPNet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 90.09 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 13.2 - Results: - - Task: Semantic Segmentation - Dataset: COCO-Stuff 164k - Metrics: - mIoU: 40.34 - mIoU(ms+flip): 40.79 - Config: configs/pspnet/pspnet_r101-d8_512x512_4x4_80k_coco-stuff164k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_4x4_80k_coco-stuff164k/pspnet_r101-d8_512x512_4x4_80k_coco-stuff164k_20210707_152034-7eb41789.pth -- Name: pspnet_r50-d8_512x512_4x4_160k_coco-stuff164k - In Collection: PSPNet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: COCO-Stuff 164k - Metrics: - mIoU: 39.64 - mIoU(ms+flip): 39.97 - Config: configs/pspnet/pspnet_r50-d8_512x512_4x4_160k_coco-stuff164k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_4x4_160k_coco-stuff164k/pspnet_r50-d8_512x512_4x4_160k_coco-stuff164k_20210707_152004-51276a57.pth -- Name: pspnet_r101-d8_512x512_4x4_160k_coco-stuff164k - In Collection: PSPNet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: COCO-Stuff 164k - Metrics: - mIoU: 41.28 - mIoU(ms+flip): 41.66 - Config: configs/pspnet/pspnet_r101-d8_512x512_4x4_160k_coco-stuff164k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_4x4_160k_coco-stuff164k/pspnet_r101-d8_512x512_4x4_160k_coco-stuff164k_20210707_152004-4af9621b.pth -- Name: pspnet_r50-d8_512x512_4x4_320k_coco-stuff164k - In Collection: PSPNet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 320000 - Results: - - Task: Semantic Segmentation - Dataset: COCO-Stuff 164k - Metrics: - mIoU: 40.53 - mIoU(ms+flip): 40.75 - Config: configs/pspnet/pspnet_r50-d8_512x512_4x4_320k_coco-stuff164k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_4x4_320k_coco-stuff164k/pspnet_r50-d8_512x512_4x4_320k_coco-stuff164k_20210707_152004-be9610cc.pth -- Name: pspnet_r101-d8_512x512_4x4_320k_coco-stuff164k - In Collection: PSPNet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 320000 - Results: - - Task: Semantic Segmentation - Dataset: COCO-Stuff 164k - Metrics: - mIoU: 41.95 - mIoU(ms+flip): 42.42 - Config: configs/pspnet/pspnet_r101-d8_512x512_4x4_320k_coco-stuff164k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_4x4_320k_coco-stuff164k/pspnet_r101-d8_512x512_4x4_320k_coco-stuff164k_20210707_152004-72220c60.pth -- Name: pspnet_r18-d8_512x512_80k_loveda - In Collection: PSPNet - Metadata: - backbone: R-18-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 37.22 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 1.45 - Results: - - Task: Semantic Segmentation - Dataset: LoveDA - 
Metrics: - mIoU: 48.62 - mIoU(ms+flip): 47.57 - Config: configs/pspnet/pspnet_r18-d8_512x512_80k_loveda.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_512x512_80k_loveda/pspnet_r18-d8_512x512_80k_loveda_20211105_052100-b97697f1.pth -- Name: pspnet_r50-d8_512x512_80k_loveda - In Collection: PSPNet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 151.52 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 6.14 - Results: - - Task: Semantic Segmentation - Dataset: LoveDA - Metrics: - mIoU: 50.46 - mIoU(ms+flip): 50.19 - Config: configs/pspnet/pspnet_r50-d8_512x512_80k_loveda.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x512_80k_loveda/pspnet_r50-d8_512x512_80k_loveda_20211104_155728-88610f9f.pth -- Name: pspnet_r101-d8_512x512_80k_loveda - In Collection: PSPNet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 218.34 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 9.61 - Results: - - Task: Semantic Segmentation - Dataset: LoveDA - Metrics: - mIoU: 51.86 - mIoU(ms+flip): 51.34 - Config: configs/pspnet/pspnet_r101-d8_512x512_80k_loveda.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_512x512_80k_loveda/pspnet_r101-d8_512x512_80k_loveda_20211104_153212-1c06c6a8.pth -- Name: pspnet_r18-d8_4x4_512x512_80k_potsdam - In Collection: PSPNet - Metadata: - backbone: R-18-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 11.75 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 1.5 - Results: - - Task: Semantic Segmentation - Dataset: Potsdam - Metrics: - mIoU: 77.09 - mIoU(ms+flip): 78.3 - Config: configs/pspnet/pspnet_r18-d8_4x4_512x512_80k_potsdam.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_4x4_512x512_80k_potsdam/pspnet_r18-d8_4x4_512x512_80k_potsdam_20211220_125612-7cd046e1.pth -- Name: pspnet_r50-d8_4x4_512x512_80k_potsdam - In Collection: PSPNet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 33.1 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 6.14 - Results: - - Task: Semantic Segmentation - Dataset: Potsdam - Metrics: - mIoU: 78.12 - mIoU(ms+flip): 78.98 - Config: configs/pspnet/pspnet_r50-d8_4x4_512x512_80k_potsdam.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_4x4_512x512_80k_potsdam/pspnet_r50-d8_4x4_512x512_80k_potsdam_20211219_043541-2dd5fe67.pth -- Name: pspnet_r101-d8_4x4_512x512_80k_potsdam - In Collection: PSPNet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 51.55 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 9.61 - Results: - - Task: Semantic Segmentation - Dataset: Potsdam - Metrics: - mIoU: 78.62 - mIoU(ms+flip): 79.47 - Config: configs/pspnet/pspnet_r101-d8_4x4_512x512_80k_potsdam.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_4x4_512x512_80k_potsdam/pspnet_r101-d8_4x4_512x512_80k_potsdam_20211220_125612-aed036c4.pth -- Name: 
pspnet_r18-d8_4x4_512x512_80k_vaihingen - In Collection: PSPNet - Metadata: - backbone: R-18-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 11.76 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 1.45 - Results: - - Task: Semantic Segmentation - Dataset: Vaihingen - Metrics: - mIoU: 71.46 - mIoU(ms+flip): 73.36 - Config: configs/pspnet/pspnet_r18-d8_4x4_512x512_80k_vaihingen.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_4x4_512x512_80k_vaihingen/pspnet_r18-d8_4x4_512x512_80k_vaihingen_20211228_160355-52a8a6f6.pth -- Name: pspnet_r50-d8_4x4_512x512_80k_vaihingen - In Collection: PSPNet - Metadata: - backbone: R-50-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 33.01 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 6.14 - Results: - - Task: Semantic Segmentation - Dataset: Vaihingen - Metrics: - mIoU: 72.36 - mIoU(ms+flip): 73.75 - Config: configs/pspnet/pspnet_r50-d8_4x4_512x512_80k_vaihingen.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_4x4_512x512_80k_vaihingen/pspnet_r50-d8_4x4_512x512_80k_vaihingen_20211228_160355-382f8f5b.pth -- Name: pspnet_r101-d8_4x4_512x512_80k_vaihingen - In Collection: PSPNet - Metadata: - backbone: R-101-D8 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 50.08 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 9.61 - Results: - - Task: Semantic Segmentation - Dataset: Vaihingen - Metrics: - mIoU: 72.61 - mIoU(ms+flip): 74.18 - Config: configs/pspnet/pspnet_r101-d8_4x4_512x512_80k_vaihingen.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r101-d8_4x4_512x512_80k_vaihingen/pspnet_r101-d8_4x4_512x512_80k_vaihingen_20211231_230806-8eba0a09.pth -- Name: pspnet_r18-d8_4x4_896x896_80k_isaid - In Collection: PSPNet - Metadata: - backbone: R-18-D8 - crop size: (896,896) - lr schd: 80000 - inference time (ms/im): - - value: 37.16 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (896,896) - Training Memory (GB): 4.52 - Results: - - Task: Semantic Segmentation - Dataset: iSAID - Metrics: - mIoU: 60.22 - mIoU(ms+flip): 61.25 - Config: configs/pspnet/pspnet_r18-d8_4x4_896x896_80k_isaid.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r18-d8_4x4_896x896_80k_isaid/pspnet_r18-d8_4x4_896x896_80k_isaid_20220110_180526-e84c0b6a.pth -- Name: pspnet_r50-d8_4x4_896x896_80k_isaid - In Collection: PSPNet - Metadata: - backbone: R-50-D8 - crop size: (896,896) - lr schd: 80000 - inference time (ms/im): - - value: 112.61 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (896,896) - Training Memory (GB): 16.58 - Results: - - Task: Semantic Segmentation - Dataset: iSAID - Metrics: - mIoU: 65.36 - mIoU(ms+flip): 66.48 - Config: configs/pspnet/pspnet_r50-d8_4x4_896x896_80k_isaid.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_4x4_896x896_80k_isaid/pspnet_r50-d8_4x4_896x896_80k_isaid_20220110_180629-1f21dc32.pth diff --git a/configs/pspnet/pspnet_r101-d8_480x480_40k_pascal_context.py b/configs/pspnet/pspnet_r101-d8_480x480_40k_pascal_context.py deleted file mode 100644 index 0b5a990604..0000000000 --- a/configs/pspnet/pspnet_r101-d8_480x480_40k_pascal_context.py +++ /dev/null @@ -1,2 
+0,0 @@ -_base_ = './pspnet_r50-d8_480x480_40k_pascal_context.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_480x480_40k_pascal_context_59.py b/configs/pspnet/pspnet_r101-d8_480x480_40k_pascal_context_59.py deleted file mode 100644 index 081cb3732a..0000000000 --- a/configs/pspnet/pspnet_r101-d8_480x480_40k_pascal_context_59.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './pspnet_r50-d8_480x480_40k_pascal_context_59.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_480x480_80k_pascal_context.py b/configs/pspnet/pspnet_r101-d8_480x480_80k_pascal_context.py deleted file mode 100644 index fda9110603..0000000000 --- a/configs/pspnet/pspnet_r101-d8_480x480_80k_pascal_context.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './pspnet_r50-d8_480x480_80k_pascal_context.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_480x480_80k_pascal_context_59.py b/configs/pspnet/pspnet_r101-d8_480x480_80k_pascal_context_59.py deleted file mode 100644 index 795c51f8cf..0000000000 --- a/configs/pspnet/pspnet_r101-d8_480x480_80k_pascal_context_59.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './pspnet_r50-d8_480x480_80k_pascal_context_59.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_4x4_512x512_80k_potsdam.py b/configs/pspnet/pspnet_r101-d8_4x4_512x512_80k_potsdam.py deleted file mode 100644 index 98343dd76c..0000000000 --- a/configs/pspnet/pspnet_r101-d8_4x4_512x512_80k_potsdam.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './pspnet_r50-d8_4x4_512x512_80k_potsdam.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_4x4_512x512_80k_vaihingen.py b/configs/pspnet/pspnet_r101-d8_4x4_512x512_80k_vaihingen.py deleted file mode 100644 index fd79492e76..0000000000 --- a/configs/pspnet/pspnet_r101-d8_4x4_512x512_80k_vaihingen.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './pspnet_r50-d8_4x4_512x512_80k_vaihingen.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_4xb2-40k_cityscapes-512x1024.py b/configs/pspnet/pspnet_r101-d8_4xb2-40k_cityscapes-512x1024.py new file mode 100644 index 0000000000..f33d653b76 --- /dev/null +++ b/configs/pspnet/pspnet_r101-d8_4xb2-40k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './pspnet_r50-d8_4xb2-40k_cityscapes-512x1024.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_4xb2-40k_cityscapes-512x1024_dark-zurich-1920x1080.py b/configs/pspnet/pspnet_r101-d8_4xb2-40k_cityscapes-512x1024_dark-zurich-1920x1080.py new file mode 100644 index 0000000000..5babaa8851 --- /dev/null +++ b/configs/pspnet/pspnet_r101-d8_4xb2-40k_cityscapes-512x1024_dark-zurich-1920x1080.py @@ -0,0 +1,2 @@ +_base_ = './pspnet_r50-d8_4xb2-40k_cityscapes-512x1024_dark-zurich-1920x1080.py' # noqa +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_4xb2-40k_cityscapes-512x1024_night-driving-1920x1080.py b/configs/pspnet/pspnet_r101-d8_4xb2-40k_cityscapes-512x1024_night-driving-1920x1080.py new file mode 100644 index 0000000000..a9480c52f8 --- /dev/null +++ 
b/configs/pspnet/pspnet_r101-d8_4xb2-40k_cityscapes-512x1024_night-driving-1920x1080.py @@ -0,0 +1,2 @@ +_base_ = './pspnet_r50-d8_4xb2-40k_cityscapes-512x1024_night-driving-1920x1080.py' # noqa +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_4xb2-40k_cityscapes-769x769.py b/configs/pspnet/pspnet_r101-d8_4xb2-40k_cityscapes-769x769.py new file mode 100644 index 0000000000..e05cff6d8e --- /dev/null +++ b/configs/pspnet/pspnet_r101-d8_4xb2-40k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './pspnet_r50-d8_4xb2-40k_cityscapes-769x769.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_4xb2-80k_cityscapes-512x1024.py b/configs/pspnet/pspnet_r101-d8_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..6704cdd5d2 --- /dev/null +++ b/configs/pspnet/pspnet_r101-d8_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './pspnet_r50-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_4xb2-80k_cityscapes-769x769.py b/configs/pspnet/pspnet_r101-d8_4xb2-80k_cityscapes-769x769.py new file mode 100644 index 0000000000..3733e69198 --- /dev/null +++ b/configs/pspnet/pspnet_r101-d8_4xb2-80k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './pspnet_r50-d8_4xb2-80k_cityscapes-769x769.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_4xb2-amp-80k_cityscapes-512x1024.py b/configs/pspnet/pspnet_r101-d8_4xb2-amp-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..52f86b5e75 --- /dev/null +++ b/configs/pspnet/pspnet_r101-d8_4xb2-amp-80k_cityscapes-512x1024.py @@ -0,0 +1,6 @@ +_base_ = './pspnet_r101-d8_4xb2-80k_cityscapes-512x1024.py' +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005), + loss_scale=512.) 
diff --git a/configs/pspnet/pspnet_r101-d8_4xb4-160k_ade20k-512x512.py b/configs/pspnet/pspnet_r101-d8_4xb4-160k_ade20k-512x512.py new file mode 100644 index 0000000000..2231049b8a --- /dev/null +++ b/configs/pspnet/pspnet_r101-d8_4xb4-160k_ade20k-512x512.py @@ -0,0 +1,2 @@ +_base_ = './pspnet_r50-d8_4xb4-160k_ade20k-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_4xb4-160k_coco-stuff164k-512x512.py b/configs/pspnet/pspnet_r101-d8_4xb4-160k_coco-stuff164k-512x512.py new file mode 100644 index 0000000000..f5390f8c76 --- /dev/null +++ b/configs/pspnet/pspnet_r101-d8_4xb4-160k_coco-stuff164k-512x512.py @@ -0,0 +1,2 @@ +_base_ = './pspnet_r50-d8_4xb4-160k_coco-stuff164k-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_4xb4-20k_coco-stuff10k-512x512.py b/configs/pspnet/pspnet_r101-d8_4xb4-20k_coco-stuff10k-512x512.py new file mode 100644 index 0000000000..84a986cd9d --- /dev/null +++ b/configs/pspnet/pspnet_r101-d8_4xb4-20k_coco-stuff10k-512x512.py @@ -0,0 +1,2 @@ +_base_ = './pspnet_r50-d8_4xb4-20k_coco-stuff10k-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_4xb4-20k_voc12aug-512x512.py b/configs/pspnet/pspnet_r101-d8_4xb4-20k_voc12aug-512x512.py new file mode 100644 index 0000000000..71897ddc2d --- /dev/null +++ b/configs/pspnet/pspnet_r101-d8_4xb4-20k_voc12aug-512x512.py @@ -0,0 +1,2 @@ +_base_ = './pspnet_r50-d8_4xb4-20k_voc12aug-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_4xb4-320k_coco-stuff164k-512x512.py b/configs/pspnet/pspnet_r101-d8_4xb4-320k_coco-stuff164k-512x512.py new file mode 100644 index 0000000000..ebaea36da8 --- /dev/null +++ b/configs/pspnet/pspnet_r101-d8_4xb4-320k_coco-stuff164k-512x512.py @@ -0,0 +1,2 @@ +_base_ = './pspnet_r50-d8_4xb4-320k_coco-stuff164k-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_4xb4-40k_coco-stuff10k-512x512.py b/configs/pspnet/pspnet_r101-d8_4xb4-40k_coco-stuff10k-512x512.py new file mode 100644 index 0000000000..2a55f53ee9 --- /dev/null +++ b/configs/pspnet/pspnet_r101-d8_4xb4-40k_coco-stuff10k-512x512.py @@ -0,0 +1,2 @@ +_base_ = './pspnet_r50-d8_4xb4-40k_coco-stuff10k-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_4xb4-40k_pascal-context-480x480.py b/configs/pspnet/pspnet_r101-d8_4xb4-40k_pascal-context-480x480.py new file mode 100644 index 0000000000..205d00bac9 --- /dev/null +++ b/configs/pspnet/pspnet_r101-d8_4xb4-40k_pascal-context-480x480.py @@ -0,0 +1,2 @@ +_base_ = './pspnet_r50-d8_4xb4-40k_pascal-context-480x480.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_4xb4-40k_pascal-context-59-480x480.py b/configs/pspnet/pspnet_r101-d8_4xb4-40k_pascal-context-59-480x480.py new file mode 100644 index 0000000000..0d7c176073 --- /dev/null +++ b/configs/pspnet/pspnet_r101-d8_4xb4-40k_pascal-context-59-480x480.py @@ -0,0 +1,2 @@ +_base_ = './pspnet_r50-d8_4xb4-40k_pascal-context-59-480x480.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_4xb4-40k_voc12aug-512x512.py 
b/configs/pspnet/pspnet_r101-d8_4xb4-40k_voc12aug-512x512.py new file mode 100644 index 0000000000..0599f31f96 --- /dev/null +++ b/configs/pspnet/pspnet_r101-d8_4xb4-40k_voc12aug-512x512.py @@ -0,0 +1,2 @@ +_base_ = './pspnet_r50-d8_4xb4-40k_voc12aug-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_4xb4-80k_ade20k-512x512.py b/configs/pspnet/pspnet_r101-d8_4xb4-80k_ade20k-512x512.py new file mode 100644 index 0000000000..f95560347a --- /dev/null +++ b/configs/pspnet/pspnet_r101-d8_4xb4-80k_ade20k-512x512.py @@ -0,0 +1,2 @@ +_base_ = './pspnet_r50-d8_4xb4-80k_ade20k-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_4xb4-80k_coco-stuff164k-512x512.py b/configs/pspnet/pspnet_r101-d8_4xb4-80k_coco-stuff164k-512x512.py new file mode 100644 index 0000000000..4a34f97485 --- /dev/null +++ b/configs/pspnet/pspnet_r101-d8_4xb4-80k_coco-stuff164k-512x512.py @@ -0,0 +1,2 @@ +_base_ = './pspnet_r50-d8_4xb4-80k_coco-stuff164k-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_4xb4-80k_loveda-512x512.py b/configs/pspnet/pspnet_r101-d8_4xb4-80k_loveda-512x512.py new file mode 100644 index 0000000000..7076877980 --- /dev/null +++ b/configs/pspnet/pspnet_r101-d8_4xb4-80k_loveda-512x512.py @@ -0,0 +1,2 @@ +_base_ = './pspnet_r50-d8_4xb4-80k_loveda-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_4xb4-80k_pascal-context-480x480.py b/configs/pspnet/pspnet_r101-d8_4xb4-80k_pascal-context-480x480.py new file mode 100644 index 0000000000..0ac40dc861 --- /dev/null +++ b/configs/pspnet/pspnet_r101-d8_4xb4-80k_pascal-context-480x480.py @@ -0,0 +1,2 @@ +_base_ = './pspnet_r50-d8_4xb4-80k_pascal-context-480x480.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_4xb4-80k_pascal-context-59-480x480.py b/configs/pspnet/pspnet_r101-d8_4xb4-80k_pascal-context-59-480x480.py new file mode 100644 index 0000000000..307188c783 --- /dev/null +++ b/configs/pspnet/pspnet_r101-d8_4xb4-80k_pascal-context-59-480x480.py @@ -0,0 +1,2 @@ +_base_ = './pspnet_r50-d8_4xb4-80k_pascal-context-59-480x480.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_4xb4-80k_potsdam-512x512.py b/configs/pspnet/pspnet_r101-d8_4xb4-80k_potsdam-512x512.py new file mode 100644 index 0000000000..31ed2f2938 --- /dev/null +++ b/configs/pspnet/pspnet_r101-d8_4xb4-80k_potsdam-512x512.py @@ -0,0 +1,2 @@ +_base_ = './pspnet_r50-d8_4xb4-80k_potsdam-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_4xb4-80k_vaihingen-512x512.py b/configs/pspnet/pspnet_r101-d8_4xb4-80k_vaihingen-512x512.py new file mode 100644 index 0000000000..ac33ed7cda --- /dev/null +++ b/configs/pspnet/pspnet_r101-d8_4xb4-80k_vaihingen-512x512.py @@ -0,0 +1,2 @@ +_base_ = './pspnet_r50-d8_4xb4-80k_vaihingen-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_512x1024_40k_cityscapes.py b/configs/pspnet/pspnet_r101-d8_512x1024_40k_cityscapes.py deleted file mode 100644 index 38fee11bc2..0000000000 --- 
a/configs/pspnet/pspnet_r101-d8_512x1024_40k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './pspnet_r50-d8_512x1024_40k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_512x1024_40k_dark.py b/configs/pspnet/pspnet_r101-d8_512x1024_40k_dark.py deleted file mode 100644 index 1057639148..0000000000 --- a/configs/pspnet/pspnet_r101-d8_512x1024_40k_dark.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './pspnet_r50-d8_512x1024_40k_dark.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_512x1024_40k_night_driving.py b/configs/pspnet/pspnet_r101-d8_512x1024_40k_night_driving.py deleted file mode 100644 index 0ecb9303ab..0000000000 --- a/configs/pspnet/pspnet_r101-d8_512x1024_40k_night_driving.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './pspnet_r50-d8_512x1024_40k_night_driving.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_512x1024_80k_cityscapes.py b/configs/pspnet/pspnet_r101-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index 9931a07bc2..0000000000 --- a/configs/pspnet/pspnet_r101-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './pspnet_r50-d8_512x1024_80k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_512x512_160k_ade20k.py b/configs/pspnet/pspnet_r101-d8_512x512_160k_ade20k.py deleted file mode 100644 index 6107b41544..0000000000 --- a/configs/pspnet/pspnet_r101-d8_512x512_160k_ade20k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './pspnet_r50-d8_512x512_160k_ade20k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_512x512_20k_voc12aug.py b/configs/pspnet/pspnet_r101-d8_512x512_20k_voc12aug.py deleted file mode 100644 index 2221b202d6..0000000000 --- a/configs/pspnet/pspnet_r101-d8_512x512_20k_voc12aug.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './pspnet_r50-d8_512x512_20k_voc12aug.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_512x512_40k_voc12aug.py b/configs/pspnet/pspnet_r101-d8_512x512_40k_voc12aug.py deleted file mode 100644 index 15f578b600..0000000000 --- a/configs/pspnet/pspnet_r101-d8_512x512_40k_voc12aug.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './pspnet_r50-d8_512x512_40k_voc12aug.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_512x512_4x4_160k_coco-stuff164k.py b/configs/pspnet/pspnet_r101-d8_512x512_4x4_160k_coco-stuff164k.py deleted file mode 100644 index 7ae2061c51..0000000000 --- a/configs/pspnet/pspnet_r101-d8_512x512_4x4_160k_coco-stuff164k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './pspnet_r50-d8_512x512_4x4_160k_coco-stuff164k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_512x512_4x4_20k_coco-stuff10k.py b/configs/pspnet/pspnet_r101-d8_512x512_4x4_20k_coco-stuff10k.py deleted file mode 100644 index a448496b13..0000000000 --- a/configs/pspnet/pspnet_r101-d8_512x512_4x4_20k_coco-stuff10k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './pspnet_r50-d8_512x512_4x4_20k_coco-stuff10k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git 
a/configs/pspnet/pspnet_r101-d8_512x512_4x4_320k_coco-stuff164k.py b/configs/pspnet/pspnet_r101-d8_512x512_4x4_320k_coco-stuff164k.py deleted file mode 100644 index 90512b8754..0000000000 --- a/configs/pspnet/pspnet_r101-d8_512x512_4x4_320k_coco-stuff164k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './pspnet_r50-d8_512x512_4x4_320k_coco-stuff164k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_512x512_4x4_40k_coco-stuff10k.py b/configs/pspnet/pspnet_r101-d8_512x512_4x4_40k_coco-stuff10k.py deleted file mode 100644 index 36aa44385f..0000000000 --- a/configs/pspnet/pspnet_r101-d8_512x512_4x4_40k_coco-stuff10k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './pspnet_r50-d8_512x512_4x4_40k_coco-stuff10k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_512x512_4x4_80k_coco-stuff164k.py b/configs/pspnet/pspnet_r101-d8_512x512_4x4_80k_coco-stuff164k.py deleted file mode 100644 index fdddec4658..0000000000 --- a/configs/pspnet/pspnet_r101-d8_512x512_4x4_80k_coco-stuff164k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './pspnet_r50-d8_512x512_4x4_80k_coco-stuff164k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_512x512_80k_ade20k.py b/configs/pspnet/pspnet_r101-d8_512x512_80k_ade20k.py deleted file mode 100644 index fb7c3d55d5..0000000000 --- a/configs/pspnet/pspnet_r101-d8_512x512_80k_ade20k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './pspnet_r50-d8_512x512_80k_ade20k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_512x512_80k_loveda.py b/configs/pspnet/pspnet_r101-d8_512x512_80k_loveda.py deleted file mode 100644 index 03c0251f6c..0000000000 --- a/configs/pspnet/pspnet_r101-d8_512x512_80k_loveda.py +++ /dev/null @@ -1,6 +0,0 @@ -_base_ = './pspnet_r50-d8_512x512_80k_loveda.py' -model = dict( - backbone=dict( - depth=101, - init_cfg=dict( - type='Pretrained', checkpoint='open-mmlab://resnet101_v1c'))) diff --git a/configs/pspnet/pspnet_r101-d8_769x769_40k_cityscapes.py b/configs/pspnet/pspnet_r101-d8_769x769_40k_cityscapes.py deleted file mode 100644 index c6e7e58508..0000000000 --- a/configs/pspnet/pspnet_r101-d8_769x769_40k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './pspnet_r50-d8_769x769_40k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_769x769_80k_cityscapes.py b/configs/pspnet/pspnet_r101-d8_769x769_80k_cityscapes.py deleted file mode 100644 index 59b8c6dd5e..0000000000 --- a/configs/pspnet/pspnet_r101-d8_769x769_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './pspnet_r50-d8_769x769_80k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/pspnet/pspnet_r101-d8_fp16_512x1024_80k_cityscapes.py b/configs/pspnet/pspnet_r101-d8_fp16_512x1024_80k_cityscapes.py deleted file mode 100644 index c84005ab6b..0000000000 --- a/configs/pspnet/pspnet_r101-d8_fp16_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,6 +0,0 @@ -_base_ = './pspnet_r101-d8_512x1024_80k_cityscapes.py' -optim_wrapper = dict( - _delete_=True, - type='AmpOptimWrapper', - optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005), - loss_scale=512.) 
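The renamed PSPNet configs above stay at two lines each because MMEngine-style config inheritance does the heavy lifting: a child file points `_base_` at a complete r50 config and overrides only the keys that differ (here the backbone depth and the pretrained weights; the r18 variants further down also shrink the decode and auxiliary head channels), while a dict carrying `_delete_=True`, as in the removed fp16 config just above, replaces the inherited dict outright instead of merging into it. Below is a minimal sketch of inspecting such a merged config; it assumes `mmengine` is installed and that it runs from the repository root.

```python
from mmengine.config import Config

# Loading a child config resolves `_base_` recursively and deep-merges
# the child's overrides into the inherited dictionaries.
cfg = Config.fromfile(
    'configs/pspnet/pspnet_r101-d8_4xb4-160k_ade20k-512x512.py')

# Only these two values come from the two-line child file; every other
# model key is inherited unchanged from the r50 base config.
print(cfg.model.backbone.depth)  # 101
print(cfg.model.pretrained)      # open-mmlab://resnet101_v1c

# A dict that carries `_delete_=True` (see the removed fp16 config) is
# not merged: the base `optim_wrapper` is discarded and rebuilt from the
# child's keys, which is how the AmpOptimWrapper is swapped in wholesale.
```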
diff --git a/configs/pspnet/pspnet_r101b-d8_4xb2-80k_cityscapes-512x1024.py b/configs/pspnet/pspnet_r101b-d8_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..d2c0f69638 --- /dev/null +++ b/configs/pspnet/pspnet_r101b-d8_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,4 @@ +_base_ = './pspnet_r50-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict( + pretrained='torchvision://resnet101', + backbone=dict(type='ResNet', depth=101)) diff --git a/configs/pspnet/pspnet_r101b-d8_4xb2-80k_cityscapes-512x1024_dark-zurich-1920x1080.py b/configs/pspnet/pspnet_r101b-d8_4xb2-80k_cityscapes-512x1024_dark-zurich-1920x1080.py new file mode 100644 index 0000000000..b181744149 --- /dev/null +++ b/configs/pspnet/pspnet_r101b-d8_4xb2-80k_cityscapes-512x1024_dark-zurich-1920x1080.py @@ -0,0 +1,4 @@ +_base_ = './pspnet_r50-d8_4xb2-80k_cityscapes-512x1024_dark-zurich-1920x1080.py' # noqa +model = dict( + pretrained='torchvision://resnet101', + backbone=dict(type='ResNet', depth=101)) diff --git a/configs/pspnet/pspnet_r101b-d8_4xb2-80k_cityscapes-512x1024_night-driving-1920x1080.py b/configs/pspnet/pspnet_r101b-d8_4xb2-80k_cityscapes-512x1024_night-driving-1920x1080.py new file mode 100644 index 0000000000..6a8994b4c8 --- /dev/null +++ b/configs/pspnet/pspnet_r101b-d8_4xb2-80k_cityscapes-512x1024_night-driving-1920x1080.py @@ -0,0 +1,4 @@ +_base_ = './pspnet_r50-d8_4xb2-80k_cityscapes-512x1024_night-driving-1920x1080.py' # noqa +model = dict( + pretrained='torchvision://resnet101', + backbone=dict(type='ResNet', depth=101)) diff --git a/configs/pspnet/pspnet_r101b-d8_4xb2-80k_cityscapes-769x769.py b/configs/pspnet/pspnet_r101b-d8_4xb2-80k_cityscapes-769x769.py new file mode 100644 index 0000000000..891bfd51ed --- /dev/null +++ b/configs/pspnet/pspnet_r101b-d8_4xb2-80k_cityscapes-769x769.py @@ -0,0 +1,4 @@ +_base_ = './pspnet_r50-d8_4xb2-80k_cityscapes-769x769.py' +model = dict( + pretrained='torchvision://resnet101', + backbone=dict(type='ResNet', depth=101)) diff --git a/configs/pspnet/pspnet_r101b-d8_512x1024_80k_cityscapes.py b/configs/pspnet/pspnet_r101b-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index ab8a3d3e3f..0000000000 --- a/configs/pspnet/pspnet_r101b-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,4 +0,0 @@ -_base_ = './pspnet_r50-d8_512x1024_80k_cityscapes.py' -model = dict( - pretrained='torchvision://resnet101', - backbone=dict(type='ResNet', depth=101)) diff --git a/configs/pspnet/pspnet_r101b-d8_512x1024_80k_dark.py b/configs/pspnet/pspnet_r101b-d8_512x1024_80k_dark.py deleted file mode 100644 index 49231d81bc..0000000000 --- a/configs/pspnet/pspnet_r101b-d8_512x1024_80k_dark.py +++ /dev/null @@ -1,4 +0,0 @@ -_base_ = './pspnet_r50-d8_512x1024_80k_dark.py' -model = dict( - pretrained='torchvision://resnet101', - backbone=dict(type='ResNet', depth=101)) diff --git a/configs/pspnet/pspnet_r101b-d8_512x1024_80k_night_driving.py b/configs/pspnet/pspnet_r101b-d8_512x1024_80k_night_driving.py deleted file mode 100644 index c3ed2f147b..0000000000 --- a/configs/pspnet/pspnet_r101b-d8_512x1024_80k_night_driving.py +++ /dev/null @@ -1,4 +0,0 @@ -_base_ = './pspnet_r50-d8_512x1024_80k_night_driving.py' -model = dict( - pretrained='torchvision://resnet101', - backbone=dict(type='ResNet', depth=101)) diff --git a/configs/pspnet/pspnet_r101b-d8_769x769_80k_cityscapes.py b/configs/pspnet/pspnet_r101b-d8_769x769_80k_cityscapes.py deleted file mode 100644 index 1a7cb708e5..0000000000 --- a/configs/pspnet/pspnet_r101b-d8_769x769_80k_cityscapes.py +++ /dev/null @@ -1,4 +0,0 @@ -_base_ 
= './pspnet_r50-d8_769x769_80k_cityscapes.py' -model = dict( - pretrained='torchvision://resnet101', - backbone=dict(type='ResNet', depth=101)) diff --git a/configs/pspnet/pspnet_r18-d8_4x4_512x512_80k_potsdam.py b/configs/pspnet/pspnet_r18-d8_4x4_512x512_80k_potsdam.py deleted file mode 100644 index be9dc7254b..0000000000 --- a/configs/pspnet/pspnet_r18-d8_4x4_512x512_80k_potsdam.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './pspnet_r50-d8_4x4_512x512_80k_potsdam.py' -model = dict( - pretrained='open-mmlab://resnet18_v1c', - backbone=dict(depth=18), - decode_head=dict( - in_channels=512, - channels=128, - ), - auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/pspnet/pspnet_r18-d8_4x4_512x512_80k_vaihingen.py b/configs/pspnet/pspnet_r18-d8_4x4_512x512_80k_vaihingen.py deleted file mode 100644 index 2cb69228f8..0000000000 --- a/configs/pspnet/pspnet_r18-d8_4x4_512x512_80k_vaihingen.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './pspnet_r50-d8_4x4_512x512_80k_vaihingen.py' -model = dict( - pretrained='open-mmlab://resnet18_v1c', - backbone=dict(depth=18), - decode_head=dict( - in_channels=512, - channels=128, - ), - auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/pspnet/pspnet_r18-d8_4x4_896x896_80k_isaid.py b/configs/pspnet/pspnet_r18-d8_4x4_896x896_80k_isaid.py deleted file mode 100644 index 4f6f9ab253..0000000000 --- a/configs/pspnet/pspnet_r18-d8_4x4_896x896_80k_isaid.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './pspnet_r50-d8_4x4_896x896_80k_isaid.py' -model = dict( - pretrained='open-mmlab://resnet18_v1c', - backbone=dict(depth=18), - decode_head=dict( - in_channels=512, - channels=128, - ), - auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/pspnet/pspnet_r18-d8_4xb2-80k_cityscapes-512x1024.py b/configs/pspnet/pspnet_r18-d8_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..a4b342ef23 --- /dev/null +++ b/configs/pspnet/pspnet_r18-d8_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,9 @@ +_base_ = './pspnet_r50-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict( + pretrained='open-mmlab://resnet18_v1c', + backbone=dict(depth=18), + decode_head=dict( + in_channels=512, + channels=128, + ), + auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/pspnet/pspnet_r18-d8_4xb2-80k_cityscapes-769x769.py b/configs/pspnet/pspnet_r18-d8_4xb2-80k_cityscapes-769x769.py new file mode 100644 index 0000000000..0e7f3e90ac --- /dev/null +++ b/configs/pspnet/pspnet_r18-d8_4xb2-80k_cityscapes-769x769.py @@ -0,0 +1,9 @@ +_base_ = './pspnet_r50-d8_4xb2-80k_cityscapes-769x769.py' +model = dict( + pretrained='open-mmlab://resnet18_v1c', + backbone=dict(depth=18), + decode_head=dict( + in_channels=512, + channels=128, + ), + auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/pspnet/pspnet_r18-d8_4xb4-80k_isaid-896x896.py b/configs/pspnet/pspnet_r18-d8_4xb4-80k_isaid-896x896.py new file mode 100644 index 0000000000..efce7a0e7d --- /dev/null +++ b/configs/pspnet/pspnet_r18-d8_4xb4-80k_isaid-896x896.py @@ -0,0 +1,9 @@ +_base_ = './pspnet_r50-d8_4xb4-80k_isaid-896x896.py' +model = dict( + pretrained='open-mmlab://resnet18_v1c', + backbone=dict(depth=18), + decode_head=dict( + in_channels=512, + channels=128, + ), + auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/pspnet/pspnet_r18-d8_4xb4-80k_loveda-512x512.py b/configs/pspnet/pspnet_r18-d8_4xb4-80k_loveda-512x512.py new file mode 100644 index 0000000000..80e2d20cbe --- /dev/null +++ 
b/configs/pspnet/pspnet_r18-d8_4xb4-80k_loveda-512x512.py @@ -0,0 +1,9 @@ +_base_ = './pspnet_r50-d8_4xb4-80k_loveda-512x512.py' +model = dict( + pretrained='open-mmlab://resnet18_v1c', + backbone=dict(depth=18), + decode_head=dict( + in_channels=512, + channels=128, + ), + auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/pspnet/pspnet_r18-d8_4xb4-80k_potsdam-512x512.py b/configs/pspnet/pspnet_r18-d8_4xb4-80k_potsdam-512x512.py new file mode 100644 index 0000000000..1ef0585e79 --- /dev/null +++ b/configs/pspnet/pspnet_r18-d8_4xb4-80k_potsdam-512x512.py @@ -0,0 +1,9 @@ +_base_ = './pspnet_r50-d8_4xb4-80k_potsdam-512x512.py' +model = dict( + pretrained='open-mmlab://resnet18_v1c', + backbone=dict(depth=18), + decode_head=dict( + in_channels=512, + channels=128, + ), + auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/pspnet/pspnet_r18-d8_4xb4-80k_vaihingen-512x512.py b/configs/pspnet/pspnet_r18-d8_4xb4-80k_vaihingen-512x512.py new file mode 100644 index 0000000000..51e66d2e51 --- /dev/null +++ b/configs/pspnet/pspnet_r18-d8_4xb4-80k_vaihingen-512x512.py @@ -0,0 +1,9 @@ +_base_ = './pspnet_r50-d8_4xb4-80k_vaihingen-512x512.py' +model = dict( + pretrained='open-mmlab://resnet18_v1c', + backbone=dict(depth=18), + decode_head=dict( + in_channels=512, + channels=128, + ), + auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/pspnet/pspnet_r18-d8_512x1024_80k_cityscapes.py b/configs/pspnet/pspnet_r18-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index d914f93c02..0000000000 --- a/configs/pspnet/pspnet_r18-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './pspnet_r50-d8_512x1024_80k_cityscapes.py' -model = dict( - pretrained='open-mmlab://resnet18_v1c', - backbone=dict(depth=18), - decode_head=dict( - in_channels=512, - channels=128, - ), - auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/pspnet/pspnet_r18-d8_512x512_80k_loveda.py b/configs/pspnet/pspnet_r18-d8_512x512_80k_loveda.py deleted file mode 100644 index dbb832b244..0000000000 --- a/configs/pspnet/pspnet_r18-d8_512x512_80k_loveda.py +++ /dev/null @@ -1,11 +0,0 @@ -_base_ = './pspnet_r50-d8_512x512_80k_loveda.py' -model = dict( - backbone=dict( - depth=18, - init_cfg=dict( - type='Pretrained', checkpoint='open-mmlab://resnet18_v1c')), - decode_head=dict( - in_channels=512, - channels=128, - ), - auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/pspnet/pspnet_r18-d8_769x769_80k_cityscapes.py b/configs/pspnet/pspnet_r18-d8_769x769_80k_cityscapes.py deleted file mode 100644 index 5893e66a41..0000000000 --- a/configs/pspnet/pspnet_r18-d8_769x769_80k_cityscapes.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './pspnet_r50-d8_769x769_80k_cityscapes.py' -model = dict( - pretrained='open-mmlab://resnet18_v1c', - backbone=dict(depth=18), - decode_head=dict( - in_channels=512, - channels=128, - ), - auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/pspnet/pspnet_r18b-d8_4xb2-80k_cityscapes-512x1024.py b/configs/pspnet/pspnet_r18b-d8_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..2e356c5c5f --- /dev/null +++ b/configs/pspnet/pspnet_r18b-d8_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,9 @@ +_base_ = './pspnet_r50-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict( + pretrained='torchvision://resnet18', + backbone=dict(type='ResNet', depth=18), + decode_head=dict( + in_channels=512, + channels=128, + ), + auxiliary_head=dict(in_channels=256, channels=64)) diff --git 
a/configs/pspnet/pspnet_r18b-d8_4xb2-80k_cityscapes-769x769.py b/configs/pspnet/pspnet_r18b-d8_4xb2-80k_cityscapes-769x769.py new file mode 100644 index 0000000000..831354d4ce --- /dev/null +++ b/configs/pspnet/pspnet_r18b-d8_4xb2-80k_cityscapes-769x769.py @@ -0,0 +1,9 @@ +_base_ = './pspnet_r50-d8_4xb2-80k_cityscapes-769x769.py' +model = dict( + pretrained='torchvision://resnet18', + backbone=dict(type='ResNet', depth=18), + decode_head=dict( + in_channels=512, + channels=128, + ), + auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/pspnet/pspnet_r18b-d8_512x1024_80k_cityscapes.py b/configs/pspnet/pspnet_r18b-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index abeeedf843..0000000000 --- a/configs/pspnet/pspnet_r18b-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './pspnet_r50-d8_512x1024_80k_cityscapes.py' -model = dict( - pretrained='torchvision://resnet18', - backbone=dict(type='ResNet', depth=18), - decode_head=dict( - in_channels=512, - channels=128, - ), - auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/pspnet/pspnet_r18b-d8_769x769_80k_cityscapes.py b/configs/pspnet/pspnet_r18b-d8_769x769_80k_cityscapes.py deleted file mode 100644 index 284be6d09a..0000000000 --- a/configs/pspnet/pspnet_r18b-d8_769x769_80k_cityscapes.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './pspnet_r50-d8_769x769_80k_cityscapes.py' -model = dict( - pretrained='torchvision://resnet18', - backbone=dict(type='ResNet', depth=18), - decode_head=dict( - in_channels=512, - channels=128, - ), - auxiliary_head=dict(in_channels=256, channels=64)) diff --git a/configs/pspnet/pspnet_r50-d32_512x1024_80k_cityscapes.py b/configs/pspnet/pspnet_r50-d32_4xb2-80k_cityscapes-512x1024.py similarity index 100% rename from configs/pspnet/pspnet_r50-d32_512x1024_80k_cityscapes.py rename to configs/pspnet/pspnet_r50-d32_4xb2-80k_cityscapes-512x1024.py diff --git a/configs/pspnet/pspnet_r50-d32_rsb-pretrain_512x1024_adamw_80k_cityscapes.py b/configs/pspnet/pspnet_r50-d32_rsb_4xb2-adamw-80k_cityscapes-512x1024.py similarity index 100% rename from configs/pspnet/pspnet_r50-d32_rsb-pretrain_512x1024_adamw_80k_cityscapes.py rename to configs/pspnet/pspnet_r50-d32_rsb_4xb2-adamw-80k_cityscapes-512x1024.py diff --git a/configs/pspnet/pspnet_r50-d8_rsb-pretrain_512x1024_adamw_80k_cityscapes.py b/configs/pspnet/pspnet_r50-d8-rsb_4xb2-adamw-80k_cityscapes-512x1024.py similarity index 100% rename from configs/pspnet/pspnet_r50-d8_rsb-pretrain_512x1024_adamw_80k_cityscapes.py rename to configs/pspnet/pspnet_r50-d8-rsb_4xb2-adamw-80k_cityscapes-512x1024.py diff --git a/configs/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes.py b/configs/pspnet/pspnet_r50-d8_4xb2-40k_cityscapes-512x1024.py similarity index 100% rename from configs/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes.py rename to configs/pspnet/pspnet_r50-d8_4xb2-40k_cityscapes-512x1024.py diff --git a/configs/pspnet/pspnet_r50-d8_512x1024_40k_dark.py b/configs/pspnet/pspnet_r50-d8_4xb2-40k_cityscapes-512x1024_dark-zurich-1920x1080.py similarity index 100% rename from configs/pspnet/pspnet_r50-d8_512x1024_40k_dark.py rename to configs/pspnet/pspnet_r50-d8_4xb2-40k_cityscapes-512x1024_dark-zurich-1920x1080.py diff --git a/configs/pspnet/pspnet_r50-d8_512x1024_40k_night_driving.py b/configs/pspnet/pspnet_r50-d8_4xb2-40k_cityscapes-512x1024_night-driving-1920x1080.py similarity index 100% rename from configs/pspnet/pspnet_r50-d8_512x1024_40k_night_driving.py rename to 
configs/pspnet/pspnet_r50-d8_4xb2-40k_cityscapes-512x1024_night-driving-1920x1080.py diff --git a/configs/pspnet/pspnet_r50-d8_769x769_40k_cityscapes.py b/configs/pspnet/pspnet_r50-d8_4xb2-40k_cityscapes-769x769.py similarity index 100% rename from configs/pspnet/pspnet_r50-d8_769x769_40k_cityscapes.py rename to configs/pspnet/pspnet_r50-d8_4xb2-40k_cityscapes-769x769.py diff --git a/configs/pspnet/pspnet_r50-d8_512x1024_80k_cityscapes.py b/configs/pspnet/pspnet_r50-d8_4xb2-80k_cityscapes-512x1024.py similarity index 100% rename from configs/pspnet/pspnet_r50-d8_512x1024_80k_cityscapes.py rename to configs/pspnet/pspnet_r50-d8_4xb2-80k_cityscapes-512x1024.py diff --git a/configs/pspnet/pspnet_r50-d8_512x1024_80k_dark.py b/configs/pspnet/pspnet_r50-d8_4xb2-80k_cityscapes-512x1024_dark-zurich-1920x1080.py similarity index 100% rename from configs/pspnet/pspnet_r50-d8_512x1024_80k_dark.py rename to configs/pspnet/pspnet_r50-d8_4xb2-80k_cityscapes-512x1024_dark-zurich-1920x1080.py diff --git a/configs/pspnet/pspnet_r50-d8_512x1024_80k_night_driving.py b/configs/pspnet/pspnet_r50-d8_4xb2-80k_cityscapes-512x1024_night-driving-1920x1080.py similarity index 100% rename from configs/pspnet/pspnet_r50-d8_512x1024_80k_night_driving.py rename to configs/pspnet/pspnet_r50-d8_4xb2-80k_cityscapes-512x1024_night-driving-1920x1080.py diff --git a/configs/pspnet/pspnet_r50-d8_769x769_80k_cityscapes.py b/configs/pspnet/pspnet_r50-d8_4xb2-80k_cityscapes-769x769.py similarity index 100% rename from configs/pspnet/pspnet_r50-d8_769x769_80k_cityscapes.py rename to configs/pspnet/pspnet_r50-d8_4xb2-80k_cityscapes-769x769.py diff --git a/configs/pspnet/pspnet_r50-d8_512x512_160k_ade20k.py b/configs/pspnet/pspnet_r50-d8_4xb4-160k_ade20k-512x512.py similarity index 100% rename from configs/pspnet/pspnet_r50-d8_512x512_160k_ade20k.py rename to configs/pspnet/pspnet_r50-d8_4xb4-160k_ade20k-512x512.py diff --git a/configs/pspnet/pspnet_r50-d8_512x512_4x4_160k_coco-stuff164k.py b/configs/pspnet/pspnet_r50-d8_4xb4-160k_coco-stuff164k-512x512.py similarity index 100% rename from configs/pspnet/pspnet_r50-d8_512x512_4x4_160k_coco-stuff164k.py rename to configs/pspnet/pspnet_r50-d8_4xb4-160k_coco-stuff164k-512x512.py diff --git a/configs/pspnet/pspnet_r50-d8_512x512_4x4_20k_coco-stuff10k.py b/configs/pspnet/pspnet_r50-d8_4xb4-20k_coco-stuff10k-512x512.py similarity index 100% rename from configs/pspnet/pspnet_r50-d8_512x512_4x4_20k_coco-stuff10k.py rename to configs/pspnet/pspnet_r50-d8_4xb4-20k_coco-stuff10k-512x512.py diff --git a/configs/pspnet/pspnet_r50-d8_512x512_20k_voc12aug.py b/configs/pspnet/pspnet_r50-d8_4xb4-20k_voc12aug-512x512.py similarity index 100% rename from configs/pspnet/pspnet_r50-d8_512x512_20k_voc12aug.py rename to configs/pspnet/pspnet_r50-d8_4xb4-20k_voc12aug-512x512.py diff --git a/configs/pspnet/pspnet_r50-d8_512x512_4x4_320k_coco-stuff164k.py b/configs/pspnet/pspnet_r50-d8_4xb4-320k_coco-stuff164k-512x512.py similarity index 100% rename from configs/pspnet/pspnet_r50-d8_512x512_4x4_320k_coco-stuff164k.py rename to configs/pspnet/pspnet_r50-d8_4xb4-320k_coco-stuff164k-512x512.py diff --git a/configs/pspnet/pspnet_r50-d8_512x512_4x4_40k_coco-stuff10k.py b/configs/pspnet/pspnet_r50-d8_4xb4-40k_coco-stuff10k-512x512.py similarity index 100% rename from configs/pspnet/pspnet_r50-d8_512x512_4x4_40k_coco-stuff10k.py rename to configs/pspnet/pspnet_r50-d8_4xb4-40k_coco-stuff10k-512x512.py diff --git a/configs/pspnet/pspnet_r50-d8_480x480_40k_pascal_context.py 
b/configs/pspnet/pspnet_r50-d8_4xb4-40k_pascal-context-480x480.py similarity index 100% rename from configs/pspnet/pspnet_r50-d8_480x480_40k_pascal_context.py rename to configs/pspnet/pspnet_r50-d8_4xb4-40k_pascal-context-480x480.py diff --git a/configs/pspnet/pspnet_r50-d8_480x480_40k_pascal_context_59.py b/configs/pspnet/pspnet_r50-d8_4xb4-40k_pascal-context-59-480x480.py similarity index 100% rename from configs/pspnet/pspnet_r50-d8_480x480_40k_pascal_context_59.py rename to configs/pspnet/pspnet_r50-d8_4xb4-40k_pascal-context-59-480x480.py diff --git a/configs/pspnet/pspnet_r50-d8_512x512_40k_voc12aug.py b/configs/pspnet/pspnet_r50-d8_4xb4-40k_voc12aug-512x512.py similarity index 100% rename from configs/pspnet/pspnet_r50-d8_512x512_40k_voc12aug.py rename to configs/pspnet/pspnet_r50-d8_4xb4-40k_voc12aug-512x512.py diff --git a/configs/pspnet/pspnet_r50-d8_512x512_80k_ade20k.py b/configs/pspnet/pspnet_r50-d8_4xb4-80k_ade20k-512x512.py similarity index 100% rename from configs/pspnet/pspnet_r50-d8_512x512_80k_ade20k.py rename to configs/pspnet/pspnet_r50-d8_4xb4-80k_ade20k-512x512.py diff --git a/configs/pspnet/pspnet_r50-d8_512x512_4x4_80k_coco-stuff164k.py b/configs/pspnet/pspnet_r50-d8_4xb4-80k_coco-stuff164k-512x512.py similarity index 100% rename from configs/pspnet/pspnet_r50-d8_512x512_4x4_80k_coco-stuff164k.py rename to configs/pspnet/pspnet_r50-d8_4xb4-80k_coco-stuff164k-512x512.py diff --git a/configs/pspnet/pspnet_r50-d8_4x4_896x896_80k_isaid.py b/configs/pspnet/pspnet_r50-d8_4xb4-80k_isaid-896x896.py similarity index 100% rename from configs/pspnet/pspnet_r50-d8_4x4_896x896_80k_isaid.py rename to configs/pspnet/pspnet_r50-d8_4xb4-80k_isaid-896x896.py diff --git a/configs/pspnet/pspnet_r50-d8_512x512_80k_loveda.py b/configs/pspnet/pspnet_r50-d8_4xb4-80k_loveda-512x512.py similarity index 100% rename from configs/pspnet/pspnet_r50-d8_512x512_80k_loveda.py rename to configs/pspnet/pspnet_r50-d8_4xb4-80k_loveda-512x512.py diff --git a/configs/pspnet/pspnet_r50-d8_480x480_80k_pascal_context.py b/configs/pspnet/pspnet_r50-d8_4xb4-80k_pascal-context-480x480.py similarity index 100% rename from configs/pspnet/pspnet_r50-d8_480x480_80k_pascal_context.py rename to configs/pspnet/pspnet_r50-d8_4xb4-80k_pascal-context-480x480.py diff --git a/configs/pspnet/pspnet_r50-d8_480x480_80k_pascal_context_59.py b/configs/pspnet/pspnet_r50-d8_4xb4-80k_pascal-context-59-480x480.py similarity index 100% rename from configs/pspnet/pspnet_r50-d8_480x480_80k_pascal_context_59.py rename to configs/pspnet/pspnet_r50-d8_4xb4-80k_pascal-context-59-480x480.py diff --git a/configs/pspnet/pspnet_r50-d8_4x4_512x512_80k_potsdam.py b/configs/pspnet/pspnet_r50-d8_4xb4-80k_potsdam-512x512.py similarity index 100% rename from configs/pspnet/pspnet_r50-d8_4x4_512x512_80k_potsdam.py rename to configs/pspnet/pspnet_r50-d8_4xb4-80k_potsdam-512x512.py diff --git a/configs/pspnet/pspnet_r50-d8_4x4_512x512_80k_vaihingen.py b/configs/pspnet/pspnet_r50-d8_4xb4-80k_vaihingen-512x512.py similarity index 100% rename from configs/pspnet/pspnet_r50-d8_4x4_512x512_80k_vaihingen.py rename to configs/pspnet/pspnet_r50-d8_4xb4-80k_vaihingen-512x512.py diff --git a/configs/pspnet/pspnet_r50b-d32_512x1024_80k_cityscapes.py b/configs/pspnet/pspnet_r50b-d32_4xb2-80k_cityscapes-512x1024.py similarity index 100% rename from configs/pspnet/pspnet_r50b-d32_512x1024_80k_cityscapes.py rename to configs/pspnet/pspnet_r50b-d32_4xb2-80k_cityscapes-512x1024.py diff --git a/configs/pspnet/pspnet_r50b-d8_4xb2-80k_cityscapes-512x1024.py 
b/configs/pspnet/pspnet_r50b-d8_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..7dd64b332f --- /dev/null +++ b/configs/pspnet/pspnet_r50b-d8_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './pspnet_r50-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict(pretrained='torchvision://resnet50', backbone=dict(type='ResNet')) diff --git a/configs/pspnet/pspnet_r50b-d8_4xb2-80k_cityscapes-769x769.py b/configs/pspnet/pspnet_r50b-d8_4xb2-80k_cityscapes-769x769.py new file mode 100644 index 0000000000..3875c092fe --- /dev/null +++ b/configs/pspnet/pspnet_r50b-d8_4xb2-80k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './pspnet_r50-d8_4xb2-80k_cityscapes-769x769.py' +model = dict(pretrained='torchvision://resnet50', backbone=dict(type='ResNet')) diff --git a/configs/pspnet/pspnet_r50b-d8_512x1024_80k_cityscapes.py b/configs/pspnet/pspnet_r50b-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index 946bf4fc84..0000000000 --- a/configs/pspnet/pspnet_r50b-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './pspnet_r50-d8_512x1024_80k_cityscapes.py' -model = dict(pretrained='torchvision://resnet50', backbone=dict(type='ResNet')) diff --git a/configs/pspnet/pspnet_r50b-d8_769x769_80k_cityscapes.py b/configs/pspnet/pspnet_r50b-d8_769x769_80k_cityscapes.py deleted file mode 100644 index b6087dcf9f..0000000000 --- a/configs/pspnet/pspnet_r50b-d8_769x769_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './pspnet_r50-d8_769x769_80k_cityscapes.py' -model = dict(pretrained='torchvision://resnet50', backbone=dict(type='ResNet')) diff --git a/configs/resnest/README.md b/configs/resnest/README.md index 1b116dc5ea..304791abe9 100644 --- a/configs/resnest/README.md +++ b/configs/resnest/README.md @@ -1,6 +1,6 @@ # ResNeSt -[ResNeSt: Split-Attention Networks](https://arxiv.org/abs/2004.08955) +> [ResNeSt: Split-Attention Networks](https://arxiv.org/abs/2004.08955) ## Introduction @@ -22,6 +22,26 @@ It is well known that featuremap attention and multi-path representation are imp +## Results and models + +### Cityscapes + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ---------- | -------- | --------- | ------: | -------: | -------------- | ------ | ----: | ------------- | ---------------------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| FCN | S-101-D8 | 512x1024 | 80000 | 11.4 | 2.39 | V100 | 77.56 | 78.98 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/resnest/resnest_s101-d8_fcn_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/resnest/fcn_s101-d8_512x1024_80k_cityscapes/fcn_s101-d8_512x1024_80k_cityscapes_20200807_140631-f8d155b3.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/resnest/fcn_s101-d8_512x1024_80k_cityscapes/fcn_s101-d8_512x1024_80k_cityscapes-20200807_140631.log.json) | +| PSPNet | S-101-D8 | 512x1024 | 80000 | 11.8 | 2.52 | V100 | 78.57 | 79.19 | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/resnest/resnest_s101-d8_pspnet_4xb2-80k_cityscapes512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/resnest/pspnet_s101-d8_512x1024_80k_cityscapes/pspnet_s101-d8_512x1024_80k_cityscapes_20200807_140631-c75f3b99.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/resnest/pspnet_s101-d8_512x1024_80k_cityscapes/pspnet_s101-d8_512x1024_80k_cityscapes-20200807_140631.log.json) | +| DeepLabV3 | S-101-D8 | 512x1024 | 80000 | 11.9 | 1.88 | V100 | 79.67 | 80.51 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/resnest/resnest_s101-d8_deeplabv3_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/resnest/deeplabv3_s101-d8_512x1024_80k_cityscapes/deeplabv3_s101-d8_512x1024_80k_cityscapes_20200807_144429-b73c4270.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/resnest/deeplabv3_s101-d8_512x1024_80k_cityscapes/deeplabv3_s101-d8_512x1024_80k_cityscapes-20200807_144429.log.json) | +| DeepLabV3+ | S-101-D8 | 512x1024 | 80000 | 13.2 | 2.36 | V100 | 79.62 | 80.27 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/resnest/resnest_s101-d8_deeplabv3plus_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/resnest/deeplabv3plus_s101-d8_512x1024_80k_cityscapes/deeplabv3plus_s101-d8_512x1024_80k_cityscapes_20200807_144429-1239eb43.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/resnest/deeplabv3plus_s101-d8_512x1024_80k_cityscapes/deeplabv3plus_s101-d8_512x1024_80k_cityscapes-20200807_144429.log.json) | + +### ADE20K + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ---------- | -------- | --------- | ------: | -------: | -------------- | ------ | ----: | ------------- | ------------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| FCN | S-101-D8 | 512x512 | 160000 | 14.2 | 12.86 | V100 | 45.62 | 46.16 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/resnest/resnest_s101-d8_fcn_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/resnest/fcn_s101-d8_512x512_160k_ade20k/fcn_s101-d8_512x512_160k_ade20k_20200807_145416-d3160329.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/resnest/fcn_s101-d8_512x512_160k_ade20k/fcn_s101-d8_512x512_160k_ade20k-20200807_145416.log.json) | +| PSPNet | S-101-D8 | 512x512 | 160000 | 14.2 | 13.02 | V100 | 45.44 | 46.28 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/resnest/resnest_s101-d8_pspnet_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/resnest/pspnet_s101-d8_512x512_160k_ade20k/pspnet_s101-d8_512x512_160k_ade20k_20200807_145416-a6daa92a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/resnest/pspnet_s101-d8_512x512_160k_ade20k/pspnet_s101-d8_512x512_160k_ade20k-20200807_145416.log.json) | +| DeepLabV3 | S-101-D8 | 512x512 | 160000 | 14.6 
| 9.28 | V100 | 45.71 | 46.59 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/resnest/resnest_s101-d8_deeplabv3_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/resnest/deeplabv3_s101-d8_512x512_160k_ade20k/deeplabv3_s101-d8_512x512_160k_ade20k_20200807_144503-17ecabe5.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/resnest/deeplabv3_s101-d8_512x512_160k_ade20k/deeplabv3_s101-d8_512x512_160k_ade20k-20200807_144503.log.json) | +| DeepLabV3+ | S-101-D8 | 512x512 | 160000 | 16.2 | 11.96 | V100 | 46.47 | 47.27 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/resnest/resnest_s101-d8_deeplabv3plus_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/resnest/deeplabv3plus_s101-d8_512x512_160k_ade20k/deeplabv3plus_s101-d8_512x512_160k_ade20k_20200807_144503-27b26226.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/resnest/deeplabv3plus_s101-d8_512x512_160k_ade20k/deeplabv3plus_s101-d8_512x512_160k_ade20k-20200807_144503.log.json) | + ## Citation ```bibtex @@ -32,23 +52,3 @@ journal={arXiv preprint arXiv:2004.08955}, year={2020} } ``` - -## Results and models - -### Cityscapes - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ---------- | -------- | --------- | ------: | -------: | -------------- | ----: | ------------- | ----------------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| FCN | S-101-D8 | 512x1024 | 80000 | 11.4 | 2.39 | 77.56 | 78.98 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/resnest/fcn_s101-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/resnest/fcn_s101-d8_512x1024_80k_cityscapes/fcn_s101-d8_512x1024_80k_cityscapes_20200807_140631-f8d155b3.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/resnest/fcn_s101-d8_512x1024_80k_cityscapes/fcn_s101-d8_512x1024_80k_cityscapes-20200807_140631.log.json) | -| PSPNet | S-101-D8 | 512x1024 | 80000 | 11.8 | 2.52 | 78.57 | 79.19 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/resnest/pspnet_s101-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/resnest/pspnet_s101-d8_512x1024_80k_cityscapes/pspnet_s101-d8_512x1024_80k_cityscapes_20200807_140631-c75f3b99.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/resnest/pspnet_s101-d8_512x1024_80k_cityscapes/pspnet_s101-d8_512x1024_80k_cityscapes-20200807_140631.log.json) | -| DeepLabV3 | S-101-D8 | 512x1024 | 80000 | 11.9 | 1.88 | 79.67 | 80.51 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/resnest/deeplabv3_s101-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/resnest/deeplabv3_s101-d8_512x1024_80k_cityscapes/deeplabv3_s101-d8_512x1024_80k_cityscapes_20200807_144429-b73c4270.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/resnest/deeplabv3_s101-d8_512x1024_80k_cityscapes/deeplabv3_s101-d8_512x1024_80k_cityscapes-20200807_144429.log.json) | -| DeepLabV3+ | S-101-D8 | 512x1024 | 80000 | 13.2 | 2.36 | 79.62 | 80.27 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/resnest/deeplabv3plus_s101-d8_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/resnest/deeplabv3plus_s101-d8_512x1024_80k_cityscapes/deeplabv3plus_s101-d8_512x1024_80k_cityscapes_20200807_144429-1239eb43.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/resnest/deeplabv3plus_s101-d8_512x1024_80k_cityscapes/deeplabv3plus_s101-d8_512x1024_80k_cityscapes-20200807_144429.log.json) | - -### ADE20K - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ---------- | -------- | --------- | ------: | -------: | -------------- | ----: | ------------- | ------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| FCN | S-101-D8 | 512x512 | 160000 | 14.2 | 12.86 | 45.62 | 46.16 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/resnest/fcn_s101-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/resnest/fcn_s101-d8_512x512_160k_ade20k/fcn_s101-d8_512x512_160k_ade20k_20200807_145416-d3160329.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/resnest/fcn_s101-d8_512x512_160k_ade20k/fcn_s101-d8_512x512_160k_ade20k-20200807_145416.log.json) | -| PSPNet | S-101-D8 | 512x512 | 160000 | 14.2 | 13.02 | 45.44 | 46.28 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/resnest/pspnet_s101-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/resnest/pspnet_s101-d8_512x512_160k_ade20k/pspnet_s101-d8_512x512_160k_ade20k_20200807_145416-a6daa92a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/resnest/pspnet_s101-d8_512x512_160k_ade20k/pspnet_s101-d8_512x512_160k_ade20k-20200807_145416.log.json) | -| DeepLabV3 | S-101-D8 | 512x512 | 160000 | 14.6 | 9.28 | 45.71 | 46.59 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/resnest/deeplabv3_s101-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/resnest/deeplabv3_s101-d8_512x512_160k_ade20k/deeplabv3_s101-d8_512x512_160k_ade20k_20200807_144503-17ecabe5.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/resnest/deeplabv3_s101-d8_512x512_160k_ade20k/deeplabv3_s101-d8_512x512_160k_ade20k-20200807_144503.log.json) | -| DeepLabV3+ | S-101-D8 | 512x512 | 160000 | 16.2 | 11.96 | 46.47 | 47.27 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/resnest/deeplabv3plus_s101-d8_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/resnest/deeplabv3plus_s101-d8_512x512_160k_ade20k/deeplabv3plus_s101-d8_512x512_160k_ade20k_20200807_144503-27b26226.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/resnest/deeplabv3plus_s101-d8_512x512_160k_ade20k/deeplabv3plus_s101-d8_512x512_160k_ade20k-20200807_144503.log.json) | diff --git a/configs/resnest/deeplabv3_s101-d8_512x1024_80k_cityscapes.py b/configs/resnest/deeplabv3_s101-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index f98398690e..0000000000 --- a/configs/resnest/deeplabv3_s101-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = '../deeplabv3/deeplabv3_r101-d8_512x1024_80k_cityscapes.py' -model = dict( - pretrained='open-mmlab://resnest101', - backbone=dict( - type='ResNeSt', - stem_channels=128, - radix=2, - reduction_factor=4, - avg_down_stride=True)) diff --git a/configs/resnest/deeplabv3_s101-d8_512x512_160k_ade20k.py b/configs/resnest/deeplabv3_s101-d8_512x512_160k_ade20k.py deleted file mode 100644 index e3924ad679..0000000000 --- a/configs/resnest/deeplabv3_s101-d8_512x512_160k_ade20k.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = '../deeplabv3/deeplabv3_r101-d8_512x512_160k_ade20k.py' -model = dict( - pretrained='open-mmlab://resnest101', - backbone=dict( - type='ResNeSt', - stem_channels=128, - radix=2, - reduction_factor=4, - avg_down_stride=True)) diff --git a/configs/resnest/deeplabv3plus_s101-d8_512x1024_80k_cityscapes.py b/configs/resnest/deeplabv3plus_s101-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index 69bef72383..0000000000 --- a/configs/resnest/deeplabv3plus_s101-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = '../deeplabv3plus/deeplabv3plus_r101-d8_512x1024_80k_cityscapes.py' -model = dict( - pretrained='open-mmlab://resnest101', - backbone=dict( - type='ResNeSt', - stem_channels=128, - radix=2, - reduction_factor=4, - avg_down_stride=True)) diff --git a/configs/resnest/deeplabv3plus_s101-d8_512x512_160k_ade20k.py b/configs/resnest/deeplabv3plus_s101-d8_512x512_160k_ade20k.py deleted file mode 100644 index d51bccb965..0000000000 --- a/configs/resnest/deeplabv3plus_s101-d8_512x512_160k_ade20k.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = '../deeplabv3plus/deeplabv3plus_r101-d8_512x512_160k_ade20k.py' -model = dict( - pretrained='open-mmlab://resnest101', - backbone=dict( - type='ResNeSt', - stem_channels=128, - radix=2, - reduction_factor=4, - avg_down_stride=True)) diff --git a/configs/resnest/fcn_s101-d8_512x1024_80k_cityscapes.py b/configs/resnest/fcn_s101-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index 33fa0252d8..0000000000 --- a/configs/resnest/fcn_s101-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = '../fcn/fcn_r101-d8_512x1024_80k_cityscapes.py' -model = dict( - pretrained='open-mmlab://resnest101', - backbone=dict( - type='ResNeSt', - stem_channels=128, - radix=2, - reduction_factor=4, - avg_down_stride=True)) diff --git a/configs/resnest/fcn_s101-d8_512x512_160k_ade20k.py b/configs/resnest/fcn_s101-d8_512x512_160k_ade20k.py deleted file mode 100644 index dcee8c280e..0000000000 --- a/configs/resnest/fcn_s101-d8_512x512_160k_ade20k.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = '../fcn/fcn_r101-d8_512x512_160k_ade20k.py' -model = dict( - pretrained='open-mmlab://resnest101', - backbone=dict( - type='ResNeSt', - stem_channels=128, - radix=2, - reduction_factor=4, - avg_down_stride=True)) diff --git a/configs/resnest/metafile.yaml b/configs/resnest/metafile.yaml new file mode 100644 index 0000000000..0b8d41ebfd --- /dev/null +++ b/configs/resnest/metafile.yaml @@ -0,0 +1,193 @@ +Models: +- Name: resnest_s101-d8_fcn_4xb2-80k_cityscapes-512x1024 + In Collection: FCN + 
Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 77.56 + mIoU(ms+flip): 78.98 + Config: configs/resnest/resnest_s101-d8_fcn_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - S-101-D8 + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 11.4 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/resnest/fcn_s101-d8_512x1024_80k_cityscapes/fcn_s101-d8_512x1024_80k_cityscapes_20200807_140631-f8d155b3.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/resnest/fcn_s101-d8_512x1024_80k_cityscapes/fcn_s101-d8_512x1024_80k_cityscapes-20200807_140631.log.json + Paper: + Title: 'ResNeSt: Split-Attention Networks' + URL: https://arxiv.org/abs/2004.08955 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/resnest.py#L271 + Framework: PyTorch +- Name: resnest_s101-d8_pspnet_4xb2-80k_cityscapes512x1024 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.57 + mIoU(ms+flip): 79.19 + Config: configs/resnest/resnest_s101-d8_pspnet_4xb2-80k_cityscapes512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - S-101-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 11.8 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/resnest/pspnet_s101-d8_512x1024_80k_cityscapes/pspnet_s101-d8_512x1024_80k_cityscapes_20200807_140631-c75f3b99.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/resnest/pspnet_s101-d8_512x1024_80k_cityscapes/pspnet_s101-d8_512x1024_80k_cityscapes-20200807_140631.log.json + Paper: + Title: 'ResNeSt: Split-Attention Networks' + URL: https://arxiv.org/abs/2004.08955 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/resnest.py#L271 + Framework: PyTorch +- Name: resnest_s101-d8_deeplabv3_4xb2-80k_cityscapes-512x1024 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.67 + mIoU(ms+flip): 80.51 + Config: configs/resnest/resnest_s101-d8_deeplabv3_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - S-101-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 11.9 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/resnest/deeplabv3_s101-d8_512x1024_80k_cityscapes/deeplabv3_s101-d8_512x1024_80k_cityscapes_20200807_144429-b73c4270.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/resnest/deeplabv3_s101-d8_512x1024_80k_cityscapes/deeplabv3_s101-d8_512x1024_80k_cityscapes-20200807_144429.log.json + Paper: + Title: 'ResNeSt: Split-Attention Networks' + URL: https://arxiv.org/abs/2004.08955 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/resnest.py#L271 + Framework: PyTorch +- Name: resnest_s101-d8_deeplabv3plus_4xb2-80k_cityscapes-512x1024 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.62 + mIoU(ms+flip): 80.27 + Config: configs/resnest/resnest_s101-d8_deeplabv3plus_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - S-101-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Memory (GB): 13.2 + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/resnest/deeplabv3plus_s101-d8_512x1024_80k_cityscapes/deeplabv3plus_s101-d8_512x1024_80k_cityscapes_20200807_144429-1239eb43.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/resnest/deeplabv3plus_s101-d8_512x1024_80k_cityscapes/deeplabv3plus_s101-d8_512x1024_80k_cityscapes-20200807_144429.log.json + Paper: + Title: 'ResNeSt: Split-Attention Networks' + URL: https://arxiv.org/abs/2004.08955 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/resnest.py#L271 + Framework: PyTorch +- Name: resnest_s101-d8_fcn_4xb4-160k_ade20k-512x512 + In Collection: FCN + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 45.62 + mIoU(ms+flip): 46.16 + Config: configs/resnest/resnest_s101-d8_fcn_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - S-101-D8 + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 14.2 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/resnest/fcn_s101-d8_512x512_160k_ade20k/fcn_s101-d8_512x512_160k_ade20k_20200807_145416-d3160329.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/resnest/fcn_s101-d8_512x512_160k_ade20k/fcn_s101-d8_512x512_160k_ade20k-20200807_145416.log.json + Paper: + Title: 'ResNeSt: Split-Attention Networks' + URL: https://arxiv.org/abs/2004.08955 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/resnest.py#L271 + Framework: PyTorch +- Name: resnest_s101-d8_pspnet_4xb4-160k_ade20k-512x512 + In Collection: PSPNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 45.44 + mIoU(ms+flip): 46.28 + Config: configs/resnest/resnest_s101-d8_pspnet_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - S-101-D8 + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 14.2 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/resnest/pspnet_s101-d8_512x512_160k_ade20k/pspnet_s101-d8_512x512_160k_ade20k_20200807_145416-a6daa92a.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/resnest/pspnet_s101-d8_512x512_160k_ade20k/pspnet_s101-d8_512x512_160k_ade20k-20200807_145416.log.json + Paper: + Title: 'ResNeSt: Split-Attention Networks' + URL: https://arxiv.org/abs/2004.08955 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/resnest.py#L271 + Framework: PyTorch +- Name: resnest_s101-d8_deeplabv3_4xb4-160k_ade20k-512x512 + In Collection: DeepLabV3 + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 45.71 + mIoU(ms+flip): 46.59 + Config: configs/resnest/resnest_s101-d8_deeplabv3_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - S-101-D8 + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 14.6 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/resnest/deeplabv3_s101-d8_512x512_160k_ade20k/deeplabv3_s101-d8_512x512_160k_ade20k_20200807_144503-17ecabe5.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/resnest/deeplabv3_s101-d8_512x512_160k_ade20k/deeplabv3_s101-d8_512x512_160k_ade20k-20200807_144503.log.json + Paper: + Title: 'ResNeSt: Split-Attention Networks' + URL: https://arxiv.org/abs/2004.08955 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/resnest.py#L271 + Framework: PyTorch +- Name: 
resnest_s101-d8_deeplabv3plus_4xb4-160k_ade20k-512x512 + In Collection: DeepLabV3+ + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 46.47 + mIoU(ms+flip): 47.27 + Config: configs/resnest/resnest_s101-d8_deeplabv3plus_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - S-101-D8 + - DeepLabV3+ + Training Resources: 4x V100 GPUS + Memory (GB): 16.2 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/resnest/deeplabv3plus_s101-d8_512x512_160k_ade20k/deeplabv3plus_s101-d8_512x512_160k_ade20k_20200807_144503-27b26226.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/resnest/deeplabv3plus_s101-d8_512x512_160k_ade20k/deeplabv3plus_s101-d8_512x512_160k_ade20k-20200807_144503.log.json + Paper: + Title: 'ResNeSt: Split-Attention Networks' + URL: https://arxiv.org/abs/2004.08955 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/resnest.py#L271 + Framework: PyTorch diff --git a/configs/resnest/pspnet_s101-d8_512x1024_80k_cityscapes.py b/configs/resnest/pspnet_s101-d8_512x1024_80k_cityscapes.py deleted file mode 100644 index 9737849cbd..0000000000 --- a/configs/resnest/pspnet_s101-d8_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = '../pspnet/pspnet_r101-d8_512x1024_80k_cityscapes.py' -model = dict( - pretrained='open-mmlab://resnest101', - backbone=dict( - type='ResNeSt', - stem_channels=128, - radix=2, - reduction_factor=4, - avg_down_stride=True)) diff --git a/configs/resnest/pspnet_s101-d8_512x512_160k_ade20k.py b/configs/resnest/pspnet_s101-d8_512x512_160k_ade20k.py deleted file mode 100644 index 6a622eae96..0000000000 --- a/configs/resnest/pspnet_s101-d8_512x512_160k_ade20k.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = '../pspnet/pspnet_r101-d8_512x512_160k_ade20k.py' -model = dict( - pretrained='open-mmlab://resnest101', - backbone=dict( - type='ResNeSt', - stem_channels=128, - radix=2, - reduction_factor=4, - avg_down_stride=True)) diff --git a/configs/resnest/resnest.yml b/configs/resnest/resnest.yml deleted file mode 100644 index b2ca2590b8..0000000000 --- a/configs/resnest/resnest.yml +++ /dev/null @@ -1,177 +0,0 @@ -Models: -- Name: fcn_s101-d8_512x1024_80k_cityscapes - In Collection: FCN - Metadata: - backbone: S-101-D8 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 418.41 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 11.4 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 77.56 - mIoU(ms+flip): 78.98 - Config: configs/resnest/fcn_s101-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/resnest/fcn_s101-d8_512x1024_80k_cityscapes/fcn_s101-d8_512x1024_80k_cityscapes_20200807_140631-f8d155b3.pth -- Name: pspnet_s101-d8_512x1024_80k_cityscapes - In Collection: PSPNet - Metadata: - backbone: S-101-D8 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 396.83 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 11.8 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.57 - mIoU(ms+flip): 79.19 - Config: configs/resnest/pspnet_s101-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/resnest/pspnet_s101-d8_512x1024_80k_cityscapes/pspnet_s101-d8_512x1024_80k_cityscapes_20200807_140631-c75f3b99.pth -- Name: 
deeplabv3_s101-d8_512x1024_80k_cityscapes - In Collection: DeepLabV3 - Metadata: - backbone: S-101-D8 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 531.91 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 11.9 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.67 - mIoU(ms+flip): 80.51 - Config: configs/resnest/deeplabv3_s101-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/resnest/deeplabv3_s101-d8_512x1024_80k_cityscapes/deeplabv3_s101-d8_512x1024_80k_cityscapes_20200807_144429-b73c4270.pth -- Name: deeplabv3plus_s101-d8_512x1024_80k_cityscapes - In Collection: DeepLabV3+ - Metadata: - backbone: S-101-D8 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 423.73 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 13.2 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.62 - mIoU(ms+flip): 80.27 - Config: configs/resnest/deeplabv3plus_s101-d8_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/resnest/deeplabv3plus_s101-d8_512x1024_80k_cityscapes/deeplabv3plus_s101-d8_512x1024_80k_cityscapes_20200807_144429-1239eb43.pth -- Name: fcn_s101-d8_512x512_160k_ade20k - In Collection: FCN - Metadata: - backbone: S-101-D8 - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 77.76 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 14.2 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 45.62 - mIoU(ms+flip): 46.16 - Config: configs/resnest/fcn_s101-d8_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/resnest/fcn_s101-d8_512x512_160k_ade20k/fcn_s101-d8_512x512_160k_ade20k_20200807_145416-d3160329.pth -- Name: pspnet_s101-d8_512x512_160k_ade20k - In Collection: PSPNet - Metadata: - backbone: S-101-D8 - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 76.8 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 14.2 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 45.44 - mIoU(ms+flip): 46.28 - Config: configs/resnest/pspnet_s101-d8_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/resnest/pspnet_s101-d8_512x512_160k_ade20k/pspnet_s101-d8_512x512_160k_ade20k_20200807_145416-a6daa92a.pth -- Name: deeplabv3_s101-d8_512x512_160k_ade20k - In Collection: DeepLabV3 - Metadata: - backbone: S-101-D8 - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 107.76 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 14.6 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 45.71 - mIoU(ms+flip): 46.59 - Config: configs/resnest/deeplabv3_s101-d8_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/resnest/deeplabv3_s101-d8_512x512_160k_ade20k/deeplabv3_s101-d8_512x512_160k_ade20k_20200807_144503-17ecabe5.pth -- Name: deeplabv3plus_s101-d8_512x512_160k_ade20k - In Collection: DeepLabV3+ - Metadata: - backbone: S-101-D8 - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 83.61 - hardware: V100 - backend: PyTorch - batch size: 1 
- mode: FP32 - resolution: (512,512) - Training Memory (GB): 16.2 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 46.47 - mIoU(ms+flip): 47.27 - Config: configs/resnest/deeplabv3plus_s101-d8_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/resnest/deeplabv3plus_s101-d8_512x512_160k_ade20k/deeplabv3plus_s101-d8_512x512_160k_ade20k_20200807_144503-27b26226.pth diff --git a/configs/resnest/resnest_s101-d8_deeplabv3_4xb2-80k_cityscapes-512x1024.py b/configs/resnest/resnest_s101-d8_deeplabv3_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..7ece894b56 --- /dev/null +++ b/configs/resnest/resnest_s101-d8_deeplabv3_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,9 @@ +_base_ = '../deeplabv3/deeplabv3_r101-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict( + pretrained='open-mmlab://resnest101', + backbone=dict( + type='ResNeSt', + stem_channels=128, + radix=2, + reduction_factor=4, + avg_down_stride=True)) diff --git a/configs/resnest/resnest_s101-d8_deeplabv3_4xb4-160k_ade20k-512x512.py b/configs/resnest/resnest_s101-d8_deeplabv3_4xb4-160k_ade20k-512x512.py new file mode 100644 index 0000000000..c2852301fc --- /dev/null +++ b/configs/resnest/resnest_s101-d8_deeplabv3_4xb4-160k_ade20k-512x512.py @@ -0,0 +1,9 @@ +_base_ = '../deeplabv3/deeplabv3_r101-d8_4xb4-160k_ade20k-512x512.py' +model = dict( + pretrained='open-mmlab://resnest101', + backbone=dict( + type='ResNeSt', + stem_channels=128, + radix=2, + reduction_factor=4, + avg_down_stride=True)) diff --git a/configs/resnest/resnest_s101-d8_deeplabv3plus_4xb2-80k_cityscapes-512x1024.py b/configs/resnest/resnest_s101-d8_deeplabv3plus_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..5c43a9547d --- /dev/null +++ b/configs/resnest/resnest_s101-d8_deeplabv3plus_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,9 @@ +_base_ = '../deeplabv3plus/deeplabv3plus_r101-d8_4xb2-80k_cityscapes-512x1024.py' # noqa +model = dict( + pretrained='open-mmlab://resnest101', + backbone=dict( + type='ResNeSt', + stem_channels=128, + radix=2, + reduction_factor=4, + avg_down_stride=True)) diff --git a/configs/resnest/resnest_s101-d8_deeplabv3plus_4xb4-160k_ade20k-512x512.py b/configs/resnest/resnest_s101-d8_deeplabv3plus_4xb4-160k_ade20k-512x512.py new file mode 100644 index 0000000000..ce39d3709f --- /dev/null +++ b/configs/resnest/resnest_s101-d8_deeplabv3plus_4xb4-160k_ade20k-512x512.py @@ -0,0 +1,9 @@ +_base_ = '../deeplabv3plus/deeplabv3plus_r101-d8_4xb4-160k_ade20k-512x512.py' +model = dict( + pretrained='open-mmlab://resnest101', + backbone=dict( + type='ResNeSt', + stem_channels=128, + radix=2, + reduction_factor=4, + avg_down_stride=True)) diff --git a/configs/resnest/resnest_s101-d8_fcn_4xb2-80k_cityscapes-512x1024.py b/configs/resnest/resnest_s101-d8_fcn_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..fc333e4ff0 --- /dev/null +++ b/configs/resnest/resnest_s101-d8_fcn_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,9 @@ +_base_ = '../fcn/fcn_r101-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict( + pretrained='open-mmlab://resnest101', + backbone=dict( + type='ResNeSt', + stem_channels=128, + radix=2, + reduction_factor=4, + avg_down_stride=True)) diff --git a/configs/resnest/resnest_s101-d8_fcn_4xb4-160k_ade20k-512x512.py b/configs/resnest/resnest_s101-d8_fcn_4xb4-160k_ade20k-512x512.py new file mode 100644 index 0000000000..af12733444 --- /dev/null +++ b/configs/resnest/resnest_s101-d8_fcn_4xb4-160k_ade20k-512x512.py @@ -0,0 +1,9 @@ +_base_ = 
'../fcn/fcn_r101-d8_4xb4-160k_ade20k-512x512.py' +model = dict( + pretrained='open-mmlab://resnest101', + backbone=dict( + type='ResNeSt', + stem_channels=128, + radix=2, + reduction_factor=4, + avg_down_stride=True)) diff --git a/configs/resnest/resnest_s101-d8_pspnet_4xb2-80k_cityscapes-512x1024.py b/configs/resnest/resnest_s101-d8_pspnet_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..3aab524449 --- /dev/null +++ b/configs/resnest/resnest_s101-d8_pspnet_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,9 @@ +_base_ = '../pspnet/pspnet_r101-d8_4xb2-80k_cityscapes-512x1024.py' +model = dict( + pretrained='open-mmlab://resnest101', + backbone=dict( + type='ResNeSt', + stem_channels=128, + radix=2, + reduction_factor=4, + avg_down_stride=True)) diff --git a/configs/resnest/resnest_s101-d8_pspnet_4xb4-160k_ade20k-512x512.py b/configs/resnest/resnest_s101-d8_pspnet_4xb4-160k_ade20k-512x512.py new file mode 100644 index 0000000000..66e6639c18 --- /dev/null +++ b/configs/resnest/resnest_s101-d8_pspnet_4xb4-160k_ade20k-512x512.py @@ -0,0 +1,9 @@ +_base_ = '../pspnet/pspnet_r101-d8_4xb4-160k_ade20k-512x512.py' +model = dict( + pretrained='open-mmlab://resnest101', + backbone=dict( + type='ResNeSt', + stem_channels=128, + radix=2, + reduction_factor=4, + avg_down_stride=True)) diff --git a/configs/san/README.md b/configs/san/README.md new file mode 100644 index 0000000000..23e72aa65f --- /dev/null +++ b/configs/san/README.md @@ -0,0 +1,47 @@ +# SAN + +> [Side Adapter Network for Open-Vocabulary Semantic Segmentation](https://arxiv.org/abs/2302.12242) + +## Introduction + + + +Official Repo + +## Abstract + + + +This paper presents a new framework for open-vocabulary semantic segmentation with the pre-trained vision-language model, named Side Adapter Network (SAN). Our approach models the semantic segmentation task as a region recognition problem. A side network is attached to a frozen CLIP model with two branches: one for predicting mask proposals, and the other for predicting attention bias which is applied in the CLIP model to recognize the class of masks. This decoupled design has the benefit of helping CLIP recognize the class of mask proposals. Since the attached side network can reuse CLIP features, it can be very light. In addition, the entire network can be trained end-to-end, allowing the side network to be adapted to the frozen CLIP model, which makes the predicted mask proposals CLIP-aware. Our approach is fast, accurate, and only adds a few additional trainable parameters. We evaluate our approach on multiple semantic segmentation benchmarks. Our method significantly outperforms other counterparts, with up to 18 times fewer trainable parameters and 19 times faster inference speed. We hope our approach will serve as a solid baseline and help ease future research in open-vocabulary semantic segmentation. + + + +
+ +
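+As a quick check of the new configs, they can be loaded through the standard
+MMSegmentation 1.x inference APIs. A minimal sketch, assuming the repo root as
+the working directory and the ViT-B/16 checkpoint from the table below (the
+demo image path is illustrative):
+
+```python
+from mmseg.apis import inference_model, init_model
+
+config = 'configs/san/san-vit-b16_coco-stuff164k-640x640.py'
+checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/san/san-vit-b16_20230906-fd0a7684.pth'
+
+# Build SAN and segment one image; the COCO-Stuff vocabulary (171 classes)
+# is resolved by the CLIP text encoder configured in the config file.
+model = init_model(config, checkpoint, device='cuda:0')
+result = inference_model(model, 'demo/demo.png')
+```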
+ +## Results and models + +### COCO-Stuff164k + +| Method | Backbone | Pretrained | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | ------------ | --------- | ------- | -------- | -------------- | ------ | ----- | ------------- | ------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| SAN | ViT-B_16 | CLIP_ViT-B16 | 640x640 | 60000 | 12.61 | - | V100 | 41.93 | 41.77 | - | [model](https://download.openmmlab.com/mmsegmentation/v0.5/san/san-vit-b16_20230906-fd0a7684.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/san/san-vit-b16_20230906.log) | +| SAN | ViT-L_14 | CLIP_ViT-L14 | 640x640 | 60000 | 22.84 | - | V100 | 45.78 | 43.99 | - | [model](https://download.openmmlab.com/mmsegmentation/v0.5/san/san-vit-l14_20230907-a11e098f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/san/san-vit-l14_20230907.log) | + +## Notes + +The pretrained weights in config files are converted from open_clip models using tools/model_converters/clip2mmseg.py. + +## Citation + +```bibtex +@inproceedings{xu2023side, + title={Side adapter network for open-vocabulary semantic segmentation}, + author={Xu, Mengde and Zhang, Zheng and Wei, Fangyun and Hu, Han and Bai, Xiang}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={2945--2954}, + year={2023} +} +``` diff --git a/configs/san/metafile.yaml b/configs/san/metafile.yaml new file mode 100644 index 0000000000..117d088af0 --- /dev/null +++ b/configs/san/metafile.yaml @@ -0,0 +1,61 @@ +Collections: +- Name: SAN + License: Apache License 2.0 + Metadata: + Training Data: + - COCO-Stuff 164k + Paper: + Title: 'Side Adapter Network for Open-Vocabulary Semantic Segmentation' + URL: https://arxiv.org/abs/2302.12242 + README: configs/san/README.md + Frameworks: + - PyTorch +Models: +- Name: san-vit-b16_coco-stuff164k-640x640 + In Collection: SAN + Results: + Task: Semantic Segmentation + Dataset: COCO-Stuff 164k + Metrics: + mIoU: 41.93 + mIoU(ms+flip): 41.77 + Config: configs/san/san-vit-b16_coco-stuff164k-640x640.py + Metadata: + Training Data: COCO-Stuff 164k + Batch Size: 16 + Architecture: + - SAN + - ViT + Training Resources: 8x V100 GPUS + Memory (GB): 12.61 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/san/san-vit-b16_20230906-fd0a7684.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/san/san-vit-b16_20230906.log + Paper: + Title: 'Side Adapter Network for Open-Vocabulary Semantic Segmentation' + URL: https://arxiv.org/abs/2302.12242 + Code: https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/mmseg/models/decode_heads/san_head.py#L470 + Framework: PyTorch +- Name: san-vit-l14_coco-stuff164k-640x640 + In Collection: SAN + Results: + Task: Semantic Segmentation + Dataset: COCO-Stuff 164k + Metrics: + mIoU: 45.78 + mIoU(ms+flip): 43.99 + Config: configs/san/san-vit-l14_coco-stuff164k-640x640.py + Metadata: + Training Data: COCO-Stuff 164k + Batch Size: 16 + Architecture: + - SAN + - ViT + Training Resources: 8x V100 GPUS + Memory (GB): 22.84 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/san/san-vit-l14_20230907-a11e098f.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/san/san-vit-l14_20230907.log + Paper: + Title: 'Side Adapter Network for Open-Vocabulary Semantic
Segmentation' + URL: https://arxiv.org/abs/2302.12242 + Code: https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/mmseg/models/decode_heads/san_head.py#L470 + Framework: PyTorch diff --git a/configs/san/san-vit-b16_coco-stuff164k-640x640.py b/configs/san/san-vit-b16_coco-stuff164k-640x640.py new file mode 100644 index 0000000000..40592486d1 --- /dev/null +++ b/configs/san/san-vit-b16_coco-stuff164k-640x640.py @@ -0,0 +1,82 @@ +_base_ = [ + '../_base_/models/san_vit-b16.py', '../_base_/datasets/coco-stuff164k.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' +] +crop_size = (640, 640) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict( + type='RandomChoiceResize', + scales=[int(640 * x * 0.1) for x in range(5, 16)], + resize_type='ResizeShortestEdge', + max_size=2560), + dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=1.0), + dict(type='PhotoMetricDistortion'), + dict(type='RandomFlip', prob=0.5), + dict(type='PackSegInputs') +] + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='ResizeShortestEdge', scale=crop_size, max_size=2560), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] + +# By default, models are trained on 4 GPUs with 8 images per GPU +train_dataloader = dict(batch_size=8, dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(batch_size=1, dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader + +pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/san/clip_vit-base-patch16-224_3rdparty-d08f8887.pth' # noqa +data_preprocessor = dict( + mean=[122.7709, 116.7460, 104.0937], + std=[68.5005, 66.6322, 70.3232], + size_divisor=640, + test_cfg=dict(size_divisor=32)) +model = dict( + pretrained=pretrained, + text_encoder=dict(dataset_name='coco-stuff164k'), + decode_head=dict(num_classes=171)) + +# training schedule for 60k +train_cfg = dict( + type='IterBasedTrainLoop', + max_iters=60000, + val_interval=500, + val_begin=55000) +default_hooks = dict( + checkpoint=dict( + type='CheckpointHook', + by_epoch=False, + interval=10000, + save_best='mIoU')) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict( + type='AdamW', lr=0.0001, betas=(0.9, 0.999), weight_decay=0.0001), + paramwise_cfg=dict( + custom_keys={ + 'img_encoder': dict(lr_mult=0.1, decay_mult=1.0), + 'pos_embed': dict(decay_mult=0.), + 'cls_token': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.) 
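+            # lr_mult=0.1 tunes the CLIP image encoder with a 10x smaller
+            # learning rate than the side network; decay_mult=0. exempts
+            # position embeddings, the class token and norm layers from
+            # weight decay, as is common practice for ViT training.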
+ }), + loss_scale='dynamic', + clip_grad=dict(max_norm=0.01, norm_type=2)) + +param_scheduler = [ + dict( + type='PolyLR', + eta_min=0.0, + power=1.0, + begin=0, + end=60000, + by_epoch=False, + ) +] diff --git a/configs/san/san-vit-b16_pascal_context-640x640.py b/configs/san/san-vit-b16_pascal_context-640x640.py new file mode 100644 index 0000000000..b164fe41fd --- /dev/null +++ b/configs/san/san-vit-b16_pascal_context-640x640.py @@ -0,0 +1,56 @@ +_base_ = [ + '../_base_/models/san_vit-b16.py', + '../_base_/datasets/pascal_context_59.py', '../_base_/default_runtime.py', + '../_base_/schedules/schedule_160k.py' +] +crop_size = (640, 640) + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='ResizeShortestEdge', scale=crop_size, max_size=2560), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] + +# By default, models are trained on 8 GPUs with 2 images per GPU +train_dataloader = dict(batch_size=2) +val_dataloader = dict(batch_size=1, dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader + +data_preprocessor = dict( + mean=[122.7709, 116.7460, 104.0937], + std=[68.5005, 66.6322, 70.3232], + size_divisor=640, + test_cfg=dict(size_divisor=32)) +model = dict( + data_preprocessor=data_preprocessor, + pretrained='pretrain/vit_base_patch16_224.pth', + text_encoder=dict(dataset_name='pascal_context'), + decode_head=dict(num_classes=59)) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +optim_wrapper = dict( + _delete_=True, + type='OptimWrapper', + optimizer=dict( + type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.01), + paramwise_cfg=dict( + custom_keys={ + 'pos_embed': dict(decay_mult=0.), + 'cls_token': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.) + })) + +param_scheduler = [ + dict( + type='LinearLR', start_factor=1e-6, by_epoch=False, begin=0, end=1500), + dict( + type='PolyLR', + eta_min=0.0, + power=1.0, + begin=1500, + end=160000, + by_epoch=False, + ) +] diff --git a/configs/san/san-vit-b16_voc12aug-640x640.py b/configs/san/san-vit-b16_voc12aug-640x640.py new file mode 100644 index 0000000000..62e9b26f0a --- /dev/null +++ b/configs/san/san-vit-b16_voc12aug-640x640.py @@ -0,0 +1,65 @@ +_base_ = [ + '../_base_/models/san_vit-b16.py', + '../_base_/datasets/pascal_voc12_aug.py', '../_base_/default_runtime.py', + '../_base_/schedules/schedule_160k.py' +] +crop_size = (640, 640) + +metainfo = dict( + classes=('aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', + 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', + 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor'), + palette=[[128, 0, 0], [0, 128, 0], [128, 128, 0], [0, 0, 128], + [128, 0, 128], [0, 128, 128], [128, 128, 128], [64, 0, 0], + [192, 0, 0], [64, 128, 0], [192, 128, 0], [64, 0, 128], + [192, 0, 128], [64, 128, 128], [192, 128, 128], [0, 64, 0], + [128, 64, 0], [0, 192, 0], [128, 192, 0], [0, 64, 128]]) +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='ResizeShortestEdge', scale=crop_size, max_size=2560), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +# By default, models are trained on 8 GPUs with 2 images per GPU +train_dataloader = dict(batch_size=2) +val_dataloader = dict( + batch_size=1, dataset=dict(metainfo=metainfo, pipeline=test_pipeline)) +test_dataloader = val_dataloader + +data_preprocessor = dict( + mean=[122.7709, 116.7460, 104.0937], + std=[68.5005, 66.6322, 70.3232], + size_divisor=640, + test_cfg=dict(size_divisor=32)) +model = dict( + 
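+    # The mean/std above are the OpenAI CLIP normalization statistics scaled
+    # to the 0-255 range, passed explicitly so inputs match what the frozen
+    # CLIP encoder expects.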
data_preprocessor=data_preprocessor, + pretrained='pretrain/vit_base_patch16_224.pth', + text_encoder=dict(dataset_name='voc'), + decode_head=dict(num_classes=20)) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +optim_wrapper = dict( + _delete_=True, + type='OptimWrapper', + optimizer=dict( + type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.01), + paramwise_cfg=dict( + custom_keys={ + 'pos_embed': dict(decay_mult=0.), + 'cls_token': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.) + })) + +param_scheduler = [ + dict( + type='LinearLR', start_factor=1e-6, by_epoch=False, begin=0, end=1500), + dict( + type='PolyLR', + eta_min=0.0, + power=1.0, + begin=1500, + end=160000, + by_epoch=False, + ) +] diff --git a/configs/san/san-vit-l14_coco-stuff164k-640x640.py b/configs/san/san-vit-l14_coco-stuff164k-640x640.py new file mode 100644 index 0000000000..c34328db3f --- /dev/null +++ b/configs/san/san-vit-l14_coco-stuff164k-640x640.py @@ -0,0 +1,36 @@ +_base_ = ['./san-vit-b16_coco-stuff164k-640x640.py'] + +pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/san/clip_vit-large-patch14-336_3rdparty-0b5df9cb.pth' # noqa +model = dict( + type='MultimodalEncoderDecoder', + pretrained=pretrained, + encoder_resolution=0.7, + image_encoder=dict( + type='VisionTransformer', + img_size=(336, 336), + patch_size=14, + patch_pad=0, + embed_dims=1024, + num_layers=18, + num_heads=16, + out_indices=(5, 11, 17), + ), + text_encoder=dict( + type='CLIPTextEncoder', + embed_dims=768, + num_layers=12, + num_heads=12, + output_dims=768, + ), + decode_head=dict( + type='SideAdapterCLIPHead', + san_cfg=dict(clip_channels=1024, cfg_decoder=dict(num_heads=16)), + maskgen_cfg=dict( + num_layers=6, + embed_dims=1024, + num_heads=16, + out_dims=768, + ))) + +# By default, models are trained on 8 GPUs with 4 images per GPU +train_dataloader = dict(batch_size=4) diff --git a/configs/san/san-vit-l14_pascal_context-640x640.py b/configs/san/san-vit-l14_pascal_context-640x640.py new file mode 100644 index 0000000000..a9545fac8e --- /dev/null +++ b/configs/san/san-vit-l14_pascal_context-640x640.py @@ -0,0 +1,32 @@ +_base_ = ['./san-vit-b16_pascal_context-640x640.py'] + +model = dict( + type='MultimodalEncoderDecoder', + pretrained='pretrain/jx_vit_base_p16_224-80ecf9dd.pth', + encoder_resolution=0.7, + image_encoder=dict( + type='VisionTransformer', + img_size=(336, 336), + patch_size=14, + patch_pad=0, + embed_dims=1024, + num_layers=18, + num_heads=16, + out_indices=(5, 11, 17), + ), + text_encoder=dict( + type='CLIPTextEncoder', + embed_dims=768, + num_layers=12, + num_heads=12, + output_dims=768, + ), + decode_head=dict( + type='SideAdapterCLIPHead', + san_cfg=dict(clip_channels=1024, cfg_decoder=dict(num_heads=16)), + maskgen_cfg=dict( + num_layers=6, + embed_dims=1024, + num_heads=16, + out_dims=768, + ))) diff --git a/configs/san/san-vit-l14_voc12aug-640x640.py b/configs/san/san-vit-l14_voc12aug-640x640.py new file mode 100644 index 0000000000..2f37715039 --- /dev/null +++ b/configs/san/san-vit-l14_voc12aug-640x640.py @@ -0,0 +1,32 @@ +_base_ = ['./san-vit-b16_voc12aug-640x640.py'] + +model = dict( + type='MultimodalEncoderDecoder', + pretrained='pretrain/jx_vit_base_p16_224-80ecf9dd.pth', + encoder_resolution=0.7, + image_encoder=dict( + type='VisionTransformer', + img_size=(336, 336), + patch_size=14, + patch_pad=0, + embed_dims=1024, + num_layers=18, + num_heads=16, + out_indices=(5, 11, 17), + ), + text_encoder=dict( + type='CLIPTextEncoder', + 
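+        # CLIP ViT-L/14 uses a wider text transformer than ViT-B/16:
+        # width 768 with 12 layers and 12 heads, output dim 768.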
embed_dims=768, + num_layers=12, + num_heads=12, + output_dims=768, + ), + decode_head=dict( + type='SideAdapterCLIPHead', + san_cfg=dict(clip_channels=1024, cfg_decoder=dict(num_heads=16)), + maskgen_cfg=dict( + num_layers=6, + embed_dims=1024, + num_heads=16, + out_dims=768, + ))) diff --git a/configs/segformer/README.md b/configs/segformer/README.md index 5ac6f36968..f8999b0efa 100644 --- a/configs/segformer/README.md +++ b/configs/segformer/README.md @@ -1,6 +1,6 @@ # SegFormer -[SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) +> [SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers](https://arxiv.org/abs/2105.15203) ## Introduction @@ -22,17 +22,6 @@ We present SegFormer, a simple, efficient yet powerful semantic segmentation fra -## Citation - -```bibtex -@article{xie2021segformer, - title={SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers}, - author={Xie, Enze and Wang, Wenhai and Yu, Zhiding and Anandkumar, Anima and Alvarez, Jose M and Luo, Ping}, - journal={arXiv preprint arXiv:2105.15203}, - year={2021} -} -``` - ## Usage To use other repositories' pre-trained models, it is necessary to convert keys. @@ -49,15 +38,15 @@ This script convert model from `PRETRAIN_PATH` and store the converted model in ### ADE20K -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| --------- | -------- | --------- | ------: | -------: | -------------- | ----: | ------------- | ---------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| Segformer | MIT-B0 | 512x512 | 160000 | 2.1 | 51.32 | 37.41 | 38.34 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segformer/segformer_mit-b0_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b0_512x512_160k_ade20k/segformer_mit-b0_512x512_160k_ade20k_20210726_101530-8ffa8fda.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b0_512x512_160k_ade20k/segformer_mit-b0_512x512_160k_ade20k_20210726_101530.log.json) | -| Segformer | MIT-B1 | 512x512 | 160000 | 2.6 | 47.66 | 40.97 | 42.54 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segformer/segformer_mit-b1_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b1_512x512_160k_ade20k/segformer_mit-b1_512x512_160k_ade20k_20210726_112106-d70e859d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b1_512x512_160k_ade20k/segformer_mit-b1_512x512_160k_ade20k_20210726_112106.log.json) | -| Segformer | MIT-B2 | 512x512 | 160000 | 3.6 | 30.88 | 45.58 | 47.03 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segformer/segformer_mit-b2_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b2_512x512_160k_ade20k/segformer_mit-b2_512x512_160k_ade20k_20210726_112103-cbd414ac.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b2_512x512_160k_ade20k/segformer_mit-b2_512x512_160k_ade20k_20210726_112103.log.json) | -| Segformer | MIT-B3 | 512x512 | 160000 | 4.8 | 22.11 | 47.82 | 48.81 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segformer/segformer_mit-b3_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b3_512x512_160k_ade20k/segformer_mit-b3_512x512_160k_ade20k_20210726_081410-962b98d2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b3_512x512_160k_ade20k/segformer_mit-b3_512x512_160k_ade20k_20210726_081410.log.json) | -| Segformer | MIT-B4 | 512x512 | 160000 | 6.1 | 15.45 | 48.46 | 49.76 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b4_512x512_160k_ade20k/segformer_mit-b4_512x512_160k_ade20k_20210728_183055-7f509d7d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b4_512x512_160k_ade20k/segformer_mit-b4_512x512_160k_ade20k_20210728_183055.log.json) | -| Segformer | MIT-B5 | 512x512 | 160000 | 7.2 | 11.89 | 49.13 | 50.22 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segformer/segformer_mit-b5_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b5_512x512_160k_ade20k/segformer_mit-b5_512x512_160k_ade20k_20210726_145235-94cedf59.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b5_512x512_160k_ade20k/segformer_mit-b5_512x512_160k_ade20k_20210726_145235.log.json) | -| Segformer | MIT-B5 | 640x640 | 160000 | 11.5 | 11.30 | 49.62 | 50.36 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segformer/segformer_mit-b5_640x640_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b5_640x640_160k_ade20k/segformer_mit-b5_640x640_160k_ade20k_20210801_121243-41d2845b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b5_640x640_160k_ade20k/segformer_mit-b5_640x640_160k_ade20k_20210801_121243.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| --------- | -------- | --------- | ------: | -------: | -------------- | -------- | ----: | ------------- | ------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| Segformer | MIT-B0 | 512x512 | 160000 | 2.1 | 51.32 | 1080 Ti | 37.41 | 38.34 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/segformer/segformer_mit-b0_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b0_512x512_160k_ade20k/segformer_mit-b0_512x512_160k_ade20k_20210726_101530-8ffa8fda.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b0_512x512_160k_ade20k/segformer_mit-b0_512x512_160k_ade20k_20210726_101530.log.json) | +| Segformer | MIT-B1 | 512x512 | 160000 | 2.6 | 47.66 | TITAN Xp | 40.97 | 42.54 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/segformer/segformer_mit-b1_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b1_512x512_160k_ade20k/segformer_mit-b1_512x512_160k_ade20k_20210726_112106-d70e859d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b1_512x512_160k_ade20k/segformer_mit-b1_512x512_160k_ade20k_20210726_112106.log.json) | +| Segformer | MIT-B2 | 512x512 | 160000 | 3.6 | 30.88 | TITAN Xp | 45.58 | 47.03 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/segformer/segformer_mit-b2_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b2_512x512_160k_ade20k/segformer_mit-b2_512x512_160k_ade20k_20210726_112103-cbd414ac.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b2_512x512_160k_ade20k/segformer_mit-b2_512x512_160k_ade20k_20210726_112103.log.json) | +| Segformer | MIT-B3 | 512x512 | 160000 | 4.8 | 22.11 | V100 | 47.82 | 48.81 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/segformer/segformer_mit-b3_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b3_512x512_160k_ade20k/segformer_mit-b3_512x512_160k_ade20k_20210726_081410-962b98d2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b3_512x512_160k_ade20k/segformer_mit-b3_512x512_160k_ade20k_20210726_081410.log.json) | +| Segformer | MIT-B4 | 512x512 | 160000 | 6.1 | 15.45 | V100 | 48.46 | 49.76 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/segformer/segformer_mit-b4_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b4_512x512_160k_ade20k/segformer_mit-b4_512x512_160k_ade20k_20210728_183055-7f509d7d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b4_512x512_160k_ade20k/segformer_mit-b4_512x512_160k_ade20k_20210728_183055.log.json) | +| Segformer | MIT-B5 | 512x512 | 160000 | 7.2 | 11.89 | V100 | 49.13 | 50.22 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/segformer/segformer_mit-b5_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b5_512x512_160k_ade20k/segformer_mit-b5_512x512_160k_ade20k_20210726_145235-94cedf59.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b5_512x512_160k_ade20k/segformer_mit-b5_512x512_160k_ade20k_20210726_145235.log.json) | +| Segformer | MIT-B5 | 640x640 | 160000 | 11.5 | 11.30 | V100 | 49.62 | 50.36 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/segformer/segformer_mit-b5_8xb2-160k_ade20k-640x640.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b5_640x640_160k_ade20k/segformer_mit-b5_640x640_160k_ade20k_20210801_121243-41d2845b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b5_640x640_160k_ade20k/segformer_mit-b5_640x640_160k_ade20k_20210801_121243.log.json) | Evaluation with AlignedResize: @@ -77,20 
+66,13 @@ using `AlignedResize`, you can change the dataset pipeline like this: ```python test_pipeline = [ dict(type='LoadImageFromFile'), - dict( - type='MultiScaleFlipAug', - img_scale=(2048, 512), - # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - # resize image to multiple of 32, improve SegFormer by 0.5-1.0 mIoU. - dict(type='ResizeToMultiple', size_divisor=32), - dict(type='RandomFlip'), - dict(type='Normalize', **img_norm_cfg), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img']), - ]) + dict(type='Resize', scale=(2048, 512), keep_ratio=True), + # resize image to multiple of 32, improve SegFormer by 0.5-1.0 mIoU. + dict(type='ResizeToMultiple', size_divisor=32), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type='LoadAnnotations', reduce_zero_label=True), + dict(type='PackSegInputs') ] ``` @@ -98,11 +80,22 @@ test_pipeline = [ The lower fps result is caused by the sliding window inference scheme (window size:1024x1024). -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| --------- | -------- | --------- | ------: | -------: | -------------- | ----: | ------------- | -------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| Segformer | MIT-B0 | 1024x1024 | 160000 | 3.64 | 4.74 | 76.54 | 78.22 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segformer/segformer_mit-b0_8x1_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b0_8x1_1024x1024_160k_cityscapes/segformer_mit-b0_8x1_1024x1024_160k_cityscapes_20211208_101857-e7f88502.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b0_8x1_1024x1024_160k_cityscapes/segformer_mit-b0_8x1_1024x1024_160k_cityscapes_20211208_101857.log.json) | -| Segformer | MIT-B1 | 1024x1024 | 160000 | 4.49 | 4.3 | 78.56 | 79.73 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segformer/segformer_mit-b1_8x1_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b1_8x1_1024x1024_160k_cityscapes/segformer_mit-b1_8x1_1024x1024_160k_cityscapes_20211208_064213-655c7b3f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b1_8x1_1024x1024_160k_cityscapes/segformer_mit-b1_8x1_1024x1024_160k_cityscapes_20211208_064213.log.json) | -| Segformer | MIT-B2 | 1024x1024 | 160000 | 7.42 | 3.36 | 81.08 | 82.18 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segformer/segformer_mit-b2_8x1_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b2_8x1_1024x1024_160k_cityscapes/segformer_mit-b2_8x1_1024x1024_160k_cityscapes_20211207_134205-6096669a.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b2_8x1_1024x1024_160k_cityscapes/segformer_mit-b2_8x1_1024x1024_160k_cityscapes_20211207_134205.log.json) | -| Segformer | MIT-B3 | 1024x1024 | 160000 | 10.86 | 2.53 | 81.94 | 83.14 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segformer/segformer_mit-b3_8x1_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b3_8x1_1024x1024_160k_cityscapes/segformer_mit-b3_8x1_1024x1024_160k_cityscapes_20211206_224823-a8f8a177.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b3_8x1_1024x1024_160k_cityscapes/segformer_mit-b3_8x1_1024x1024_160k_cityscapes_20211206_224823.log.json) | -| Segformer | MIT-B4 | 1024x1024 | 160000 | 15.07 | 1.88 | 81.89 | 83.38 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segformer/segformer_mit-b4_8x1_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b4_8x1_1024x1024_160k_cityscapes/segformer_mit-b4_8x1_1024x1024_160k_cityscapes_20211207_080709-07f6c333.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b4_8x1_1024x1024_160k_cityscapes/segformer_mit-b4_8x1_1024x1024_160k_cityscapes_20211207_080709.log.json) | -| Segformer | MIT-B5 | 1024x1024 | 160000 | 18.00 | 1.39 | 82.25 | 83.48 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segformer/segformer_mit-b5_8x1_1024x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b5_8x1_1024x1024_160k_cityscapes/segformer_mit-b5_8x1_1024x1024_160k_cityscapes_20211206_072934-87a052ec.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b5_8x1_1024x1024_160k_cityscapes/segformer_mit-b5_8x1_1024x1024_160k_cityscapes_20211206_072934.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| --------- | -------- | --------- | ------: | -------: | -------------- | ------ | ----: | ------------- | ------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Segformer | MIT-B0 | 1024x1024 | 160000 | 3.64 | 4.74 | V100 | 76.54 | 78.22 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/segformer/segformer_mit-b0_8xb1-160k_cityscapes-1024x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b0_8x1_1024x1024_160k_cityscapes/segformer_mit-b0_8x1_1024x1024_160k_cityscapes_20211208_101857-e7f88502.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b0_8x1_1024x1024_160k_cityscapes/segformer_mit-b0_8x1_1024x1024_160k_cityscapes_20211208_101857.log.json) | +| Segformer | MIT-B1 | 1024x1024 | 160000 | 4.49 | 4.3 | V100 | 78.56 | 79.73 | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/segformer/segformer_mit-b1_8xb1-160k_cityscapes-1024x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b1_8x1_1024x1024_160k_cityscapes/segformer_mit-b1_8x1_1024x1024_160k_cityscapes_20211208_064213-655c7b3f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b1_8x1_1024x1024_160k_cityscapes/segformer_mit-b1_8x1_1024x1024_160k_cityscapes_20211208_064213.log.json) | +| Segformer | MIT-B2 | 1024x1024 | 160000 | 7.42 | 3.36 | V100 | 81.08 | 82.18 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/segformer/segformer_mit-b2_8xb1-160k_cityscapes-1024x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b2_8x1_1024x1024_160k_cityscapes/segformer_mit-b2_8x1_1024x1024_160k_cityscapes_20211207_134205-6096669a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b2_8x1_1024x1024_160k_cityscapes/segformer_mit-b2_8x1_1024x1024_160k_cityscapes_20211207_134205.log.json) | +| Segformer | MIT-B3 | 1024x1024 | 160000 | 10.86 | 2.53 | V100 | 81.94 | 83.14 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/segformer/segformer_mit-b3_8xb1-160k_cityscapes-1024x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b3_8x1_1024x1024_160k_cityscapes/segformer_mit-b3_8x1_1024x1024_160k_cityscapes_20211206_224823-a8f8a177.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b3_8x1_1024x1024_160k_cityscapes/segformer_mit-b3_8x1_1024x1024_160k_cityscapes_20211206_224823.log.json) | +| Segformer | MIT-B4 | 1024x1024 | 160000 | 15.07 | 1.88 | V100 | 81.89 | 83.38 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/segformer/segformer_mit-b4_8xb1-160k_cityscapes-1024x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b4_8x1_1024x1024_160k_cityscapes/segformer_mit-b4_8x1_1024x1024_160k_cityscapes_20211207_080709-07f6c333.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b4_8x1_1024x1024_160k_cityscapes/segformer_mit-b4_8x1_1024x1024_160k_cityscapes_20211207_080709.log.json) | +| Segformer | MIT-B5 | 1024x1024 | 160000 | 18.00 | 1.39 | V100 | 82.25 | 83.48 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/segformer/segformer_mit-b5_8xb1-160k_cityscapes-1024x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b5_8x1_1024x1024_160k_cityscapes/segformer_mit-b5_8x1_1024x1024_160k_cityscapes_20211206_072934-87a052ec.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b5_8x1_1024x1024_160k_cityscapes/segformer_mit-b5_8x1_1024x1024_160k_cityscapes_20211206_072934.log.json) | + +## Citation + +```bibtex +@article{xie2021segformer, + title={SegFormer: Simple and Efficient Design for Semantic Segmentation with Transformers}, + author={Xie, Enze and Wang, Wenhai and Yu, Zhiding and Anandkumar, Anima and Alvarez, Jose M and Luo, Ping}, + journal={arXiv preprint arXiv:2105.15203}, + year={2021} +} +``` diff --git a/configs/segformer/metafile.yaml b/configs/segformer/metafile.yaml new file mode 100644 index 0000000000..7fb38d745b --- /dev/null +++ b/configs/segformer/metafile.yaml @@ -0,0 +1,340 @@ +Collections: +- Name: Segformer + License: Apache License 2.0 + Metadata: + Training Data: 
+ - ADE20K + - Cityscapes + Paper: + Title: 'SegFormer: Simple and Efficient Design for Semantic Segmentation with + Transformers' + URL: https://arxiv.org/abs/2105.15203 + README: configs/segformer/README.md + Frameworks: + - PyTorch +Models: +- Name: segformer_mit-b0_8xb2-160k_ade20k-512x512 + In Collection: Segformer + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 37.41 + mIoU(ms+flip): 38.34 + Config: configs/segformer/segformer_mit-b0_8xb2-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - MIT-B0 + - Segformer + Training Resources: 8x 1080 Ti GPUS + Memory (GB): 2.1 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b0_512x512_160k_ade20k/segformer_mit-b0_512x512_160k_ade20k_20210726_101530-8ffa8fda.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b0_512x512_160k_ade20k/segformer_mit-b0_512x512_160k_ade20k_20210726_101530.log.json + Paper: + Title: 'SegFormer: Simple and Efficient Design for Semantic Segmentation with + Transformers' + URL: https://arxiv.org/abs/2105.15203 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/mit.py#L246 + Framework: PyTorch +- Name: segformer_mit-b1_8xb2-160k_ade20k-512x512 + In Collection: Segformer + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 40.97 + mIoU(ms+flip): 42.54 + Config: configs/segformer/segformer_mit-b1_8xb2-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - MIT-B1 + - Segformer + Training Resources: 8x TITAN Xp GPUS + Memory (GB): 2.6 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b1_512x512_160k_ade20k/segformer_mit-b1_512x512_160k_ade20k_20210726_112106-d70e859d.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b1_512x512_160k_ade20k/segformer_mit-b1_512x512_160k_ade20k_20210726_112106.log.json + Paper: + Title: 'SegFormer: Simple and Efficient Design for Semantic Segmentation with + Transformers' + URL: https://arxiv.org/abs/2105.15203 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/mit.py#L246 + Framework: PyTorch +- Name: segformer_mit-b2_8xb2-160k_ade20k-512x512 + In Collection: Segformer + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 45.58 + mIoU(ms+flip): 47.03 + Config: configs/segformer/segformer_mit-b2_8xb2-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - MIT-B2 + - Segformer + Training Resources: 8x TITAN Xp GPUS + Memory (GB): 3.6 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b2_512x512_160k_ade20k/segformer_mit-b2_512x512_160k_ade20k_20210726_112103-cbd414ac.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b2_512x512_160k_ade20k/segformer_mit-b2_512x512_160k_ade20k_20210726_112103.log.json + Paper: + Title: 'SegFormer: Simple and Efficient Design for Semantic Segmentation with + Transformers' + URL: https://arxiv.org/abs/2105.15203 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/mit.py#L246 + Framework: PyTorch +- Name: segformer_mit-b3_8xb2-160k_ade20k-512x512 + In Collection: Segformer + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 47.82 + mIoU(ms+flip): 48.81 + Config: 
configs/segformer/segformer_mit-b3_8xb2-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - MIT-B3 + - Segformer + Training Resources: 8x V100 GPUS + Memory (GB): 4.8 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b3_512x512_160k_ade20k/segformer_mit-b3_512x512_160k_ade20k_20210726_081410-962b98d2.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b3_512x512_160k_ade20k/segformer_mit-b3_512x512_160k_ade20k_20210726_081410.log.json + Paper: + Title: 'SegFormer: Simple and Efficient Design for Semantic Segmentation with + Transformers' + URL: https://arxiv.org/abs/2105.15203 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/mit.py#L246 + Framework: PyTorch +- Name: segformer_mit-b4_8xb2-160k_ade20k-512x512 + In Collection: Segformer + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 48.46 + mIoU(ms+flip): 49.76 + Config: configs/segformer/segformer_mit-b4_8xb2-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - MIT-B4 + - Segformer + Training Resources: 8x V100 GPUS + Memory (GB): 6.1 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b4_512x512_160k_ade20k/segformer_mit-b4_512x512_160k_ade20k_20210728_183055-7f509d7d.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b4_512x512_160k_ade20k/segformer_mit-b4_512x512_160k_ade20k_20210728_183055.log.json + Paper: + Title: 'SegFormer: Simple and Efficient Design for Semantic Segmentation with + Transformers' + URL: https://arxiv.org/abs/2105.15203 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/mit.py#L246 + Framework: PyTorch +- Name: segformer_mit-b5_8xb2-160k_ade20k-512x512 + In Collection: Segformer + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 49.13 + mIoU(ms+flip): 50.22 + Config: configs/segformer/segformer_mit-b5_8xb2-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - MIT-B5 + - Segformer + Training Resources: 8x V100 GPUS + Memory (GB): 7.2 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b5_512x512_160k_ade20k/segformer_mit-b5_512x512_160k_ade20k_20210726_145235-94cedf59.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b5_512x512_160k_ade20k/segformer_mit-b5_512x512_160k_ade20k_20210726_145235.log.json + Paper: + Title: 'SegFormer: Simple and Efficient Design for Semantic Segmentation with + Transformers' + URL: https://arxiv.org/abs/2105.15203 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/mit.py#L246 + Framework: PyTorch +- Name: segformer_mit-b5_8xb2-160k_ade20k-640x640 + In Collection: Segformer + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 49.62 + mIoU(ms+flip): 50.36 + Config: configs/segformer/segformer_mit-b5_8xb2-160k_ade20k-640x640.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - MIT-B5 + - Segformer + Training Resources: 8x V100 GPUS + Memory (GB): 11.5 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b5_640x640_160k_ade20k/segformer_mit-b5_640x640_160k_ade20k_20210801_121243-41d2845b.pth + Training log: 
https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b5_640x640_160k_ade20k/segformer_mit-b5_640x640_160k_ade20k_20210801_121243.log.json + Paper: + Title: 'SegFormer: Simple and Efficient Design for Semantic Segmentation with + Transformers' + URL: https://arxiv.org/abs/2105.15203 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/mit.py#L246 + Framework: PyTorch +- Name: segformer_mit-b0_8xb1-160k_cityscapes-1024x1024 + In Collection: Segformer + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 76.54 + mIoU(ms+flip): 78.22 + Config: configs/segformer/segformer_mit-b0_8xb1-160k_cityscapes-1024x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - MIT-B0 + - Segformer + Training Resources: 8x V100 GPUS + Memory (GB): 3.64 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b0_8x1_1024x1024_160k_cityscapes/segformer_mit-b0_8x1_1024x1024_160k_cityscapes_20211208_101857-e7f88502.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b0_8x1_1024x1024_160k_cityscapes/segformer_mit-b0_8x1_1024x1024_160k_cityscapes_20211208_101857.log.json + Paper: + Title: 'SegFormer: Simple and Efficient Design for Semantic Segmentation with + Transformers' + URL: https://arxiv.org/abs/2105.15203 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/mit.py#L246 + Framework: PyTorch +- Name: segformer_mit-b1_8xb1-160k_cityscapes-1024x1024 + In Collection: Segformer + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.56 + mIoU(ms+flip): 79.73 + Config: configs/segformer/segformer_mit-b1_8xb1-160k_cityscapes-1024x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - MIT-B1 + - Segformer + Training Resources: 8x V100 GPUS + Memory (GB): 4.49 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b1_8x1_1024x1024_160k_cityscapes/segformer_mit-b1_8x1_1024x1024_160k_cityscapes_20211208_064213-655c7b3f.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b1_8x1_1024x1024_160k_cityscapes/segformer_mit-b1_8x1_1024x1024_160k_cityscapes_20211208_064213.log.json + Paper: + Title: 'SegFormer: Simple and Efficient Design for Semantic Segmentation with + Transformers' + URL: https://arxiv.org/abs/2105.15203 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/mit.py#L246 + Framework: PyTorch +- Name: segformer_mit-b2_8xb1-160k_cityscapes-1024x1024 + In Collection: Segformer + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 81.08 + mIoU(ms+flip): 82.18 + Config: configs/segformer/segformer_mit-b2_8xb1-160k_cityscapes-1024x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - MIT-B2 + - Segformer + Training Resources: 8x V100 GPUS + Memory (GB): 7.42 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b2_8x1_1024x1024_160k_cityscapes/segformer_mit-b2_8x1_1024x1024_160k_cityscapes_20211207_134205-6096669a.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b2_8x1_1024x1024_160k_cityscapes/segformer_mit-b2_8x1_1024x1024_160k_cityscapes_20211207_134205.log.json + Paper: + Title: 'SegFormer: Simple and Efficient Design for Semantic Segmentation with + Transformers' + URL: 
https://arxiv.org/abs/2105.15203 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/mit.py#L246 + Framework: PyTorch +- Name: segformer_mit-b3_8xb1-160k_cityscapes-1024x1024 + In Collection: Segformer + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 81.94 + mIoU(ms+flip): 83.14 + Config: configs/segformer/segformer_mit-b3_8xb1-160k_cityscapes-1024x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - MIT-B3 + - Segformer + Training Resources: 8x V100 GPUS + Memory (GB): 10.86 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b3_8x1_1024x1024_160k_cityscapes/segformer_mit-b3_8x1_1024x1024_160k_cityscapes_20211206_224823-a8f8a177.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b3_8x1_1024x1024_160k_cityscapes/segformer_mit-b3_8x1_1024x1024_160k_cityscapes_20211206_224823.log.json + Paper: + Title: 'SegFormer: Simple and Efficient Design for Semantic Segmentation with + Transformers' + URL: https://arxiv.org/abs/2105.15203 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/mit.py#L246 + Framework: PyTorch +- Name: segformer_mit-b4_8xb1-160k_cityscapes-1024x1024 + In Collection: Segformer + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 81.89 + mIoU(ms+flip): 83.38 + Config: configs/segformer/segformer_mit-b4_8xb1-160k_cityscapes-1024x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - MIT-B4 + - Segformer + Training Resources: 8x V100 GPUS + Memory (GB): 15.07 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b4_8x1_1024x1024_160k_cityscapes/segformer_mit-b4_8x1_1024x1024_160k_cityscapes_20211207_080709-07f6c333.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b4_8x1_1024x1024_160k_cityscapes/segformer_mit-b4_8x1_1024x1024_160k_cityscapes_20211207_080709.log.json + Paper: + Title: 'SegFormer: Simple and Efficient Design for Semantic Segmentation with + Transformers' + URL: https://arxiv.org/abs/2105.15203 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/mit.py#L246 + Framework: PyTorch +- Name: segformer_mit-b5_8xb1-160k_cityscapes-1024x1024 + In Collection: Segformer + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 82.25 + mIoU(ms+flip): 83.48 + Config: configs/segformer/segformer_mit-b5_8xb1-160k_cityscapes-1024x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - MIT-B5 + - Segformer + Training Resources: 8x V100 GPUS + Memory (GB): 18.0 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b5_8x1_1024x1024_160k_cityscapes/segformer_mit-b5_8x1_1024x1024_160k_cityscapes_20211206_072934-87a052ec.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b5_8x1_1024x1024_160k_cityscapes/segformer_mit-b5_8x1_1024x1024_160k_cityscapes_20211206_072934.log.json + Paper: + Title: 'SegFormer: Simple and Efficient Design for Semantic Segmentation with + Transformers' + URL: https://arxiv.org/abs/2105.15203 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/mit.py#L246 + Framework: PyTorch diff --git a/configs/segformer/segformer.yml b/configs/segformer/segformer.yml deleted file mode 100644 index d28cb16265..0000000000 --- 
a/configs/segformer/segformer.yml +++ /dev/null @@ -1,303 +0,0 @@ -Collections: -- Name: Segformer - Metadata: - Training Data: - - ADE20K - - Cityscapes - Paper: - URL: https://arxiv.org/abs/2105.15203 - Title: 'SegFormer: Simple and Efficient Design for Semantic Segmentation with - Transformers' - README: configs/segformer/README.md - Code: - URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/mit.py#L246 - Version: v0.17.0 - Converted From: - Code: https://github.com/NVlabs/SegFormer -Models: -- Name: segformer_mit-b0_512x512_160k_ade20k - In Collection: Segformer - Metadata: - backbone: MIT-B0 - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 19.49 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 2.1 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 37.41 - mIoU(ms+flip): 38.34 - Config: configs/segformer/segformer_mit-b0_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b0_512x512_160k_ade20k/segformer_mit-b0_512x512_160k_ade20k_20210726_101530-8ffa8fda.pth -- Name: segformer_mit-b1_512x512_160k_ade20k - In Collection: Segformer - Metadata: - backbone: MIT-B1 - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 20.98 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 2.6 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 40.97 - mIoU(ms+flip): 42.54 - Config: configs/segformer/segformer_mit-b1_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b1_512x512_160k_ade20k/segformer_mit-b1_512x512_160k_ade20k_20210726_112106-d70e859d.pth -- Name: segformer_mit-b2_512x512_160k_ade20k - In Collection: Segformer - Metadata: - backbone: MIT-B2 - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 32.38 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 3.6 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 45.58 - mIoU(ms+flip): 47.03 - Config: configs/segformer/segformer_mit-b2_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b2_512x512_160k_ade20k/segformer_mit-b2_512x512_160k_ade20k_20210726_112103-cbd414ac.pth -- Name: segformer_mit-b3_512x512_160k_ade20k - In Collection: Segformer - Metadata: - backbone: MIT-B3 - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 45.23 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 4.8 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 47.82 - mIoU(ms+flip): 48.81 - Config: configs/segformer/segformer_mit-b3_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b3_512x512_160k_ade20k/segformer_mit-b3_512x512_160k_ade20k_20210726_081410-962b98d2.pth -- Name: segformer_mit-b4_512x512_160k_ade20k - In Collection: Segformer - Metadata: - backbone: MIT-B4 - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 64.72 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 6.1 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 48.46 
- mIoU(ms+flip): 49.76 - Config: configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b4_512x512_160k_ade20k/segformer_mit-b4_512x512_160k_ade20k_20210728_183055-7f509d7d.pth -- Name: segformer_mit-b5_512x512_160k_ade20k - In Collection: Segformer - Metadata: - backbone: MIT-B5 - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 84.1 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 7.2 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 49.13 - mIoU(ms+flip): 50.22 - Config: configs/segformer/segformer_mit-b5_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b5_512x512_160k_ade20k/segformer_mit-b5_512x512_160k_ade20k_20210726_145235-94cedf59.pth -- Name: segformer_mit-b5_640x640_160k_ade20k - In Collection: Segformer - Metadata: - backbone: MIT-B5 - crop size: (640,640) - lr schd: 160000 - inference time (ms/im): - - value: 88.5 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (640,640) - Training Memory (GB): 11.5 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 49.62 - mIoU(ms+flip): 50.36 - Config: configs/segformer/segformer_mit-b5_640x640_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b5_640x640_160k_ade20k/segformer_mit-b5_640x640_160k_ade20k_20210801_121243-41d2845b.pth -- Name: segformer_mit-b0_8x1_1024x1024_160k_cityscapes - In Collection: Segformer - Metadata: - backbone: MIT-B0 - crop size: (1024,1024) - lr schd: 160000 - inference time (ms/im): - - value: 210.97 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (1024,1024) - Training Memory (GB): 3.64 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 76.54 - mIoU(ms+flip): 78.22 - Config: configs/segformer/segformer_mit-b0_8x1_1024x1024_160k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b0_8x1_1024x1024_160k_cityscapes/segformer_mit-b0_8x1_1024x1024_160k_cityscapes_20211208_101857-e7f88502.pth -- Name: segformer_mit-b1_8x1_1024x1024_160k_cityscapes - In Collection: Segformer - Metadata: - backbone: MIT-B1 - crop size: (1024,1024) - lr schd: 160000 - inference time (ms/im): - - value: 232.56 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (1024,1024) - Training Memory (GB): 4.49 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.56 - mIoU(ms+flip): 79.73 - Config: configs/segformer/segformer_mit-b1_8x1_1024x1024_160k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b1_8x1_1024x1024_160k_cityscapes/segformer_mit-b1_8x1_1024x1024_160k_cityscapes_20211208_064213-655c7b3f.pth -- Name: segformer_mit-b2_8x1_1024x1024_160k_cityscapes - In Collection: Segformer - Metadata: - backbone: MIT-B2 - crop size: (1024,1024) - lr schd: 160000 - inference time (ms/im): - - value: 297.62 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (1024,1024) - Training Memory (GB): 7.42 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 81.08 - mIoU(ms+flip): 82.18 - Config: configs/segformer/segformer_mit-b2_8x1_1024x1024_160k_cityscapes.py - Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b2_8x1_1024x1024_160k_cityscapes/segformer_mit-b2_8x1_1024x1024_160k_cityscapes_20211207_134205-6096669a.pth -- Name: segformer_mit-b3_8x1_1024x1024_160k_cityscapes - In Collection: Segformer - Metadata: - backbone: MIT-B3 - crop size: (1024,1024) - lr schd: 160000 - inference time (ms/im): - - value: 395.26 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (1024,1024) - Training Memory (GB): 10.86 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 81.94 - mIoU(ms+flip): 83.14 - Config: configs/segformer/segformer_mit-b3_8x1_1024x1024_160k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b3_8x1_1024x1024_160k_cityscapes/segformer_mit-b3_8x1_1024x1024_160k_cityscapes_20211206_224823-a8f8a177.pth -- Name: segformer_mit-b4_8x1_1024x1024_160k_cityscapes - In Collection: Segformer - Metadata: - backbone: MIT-B4 - crop size: (1024,1024) - lr schd: 160000 - inference time (ms/im): - - value: 531.91 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (1024,1024) - Training Memory (GB): 15.07 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 81.89 - mIoU(ms+flip): 83.38 - Config: configs/segformer/segformer_mit-b4_8x1_1024x1024_160k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b4_8x1_1024x1024_160k_cityscapes/segformer_mit-b4_8x1_1024x1024_160k_cityscapes_20211207_080709-07f6c333.pth -- Name: segformer_mit-b5_8x1_1024x1024_160k_cityscapes - In Collection: Segformer - Metadata: - backbone: MIT-B5 - crop size: (1024,1024) - lr schd: 160000 - inference time (ms/im): - - value: 719.42 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (1024,1024) - Training Memory (GB): 18.0 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 82.25 - mIoU(ms+flip): 83.48 - Config: configs/segformer/segformer_mit-b5_8x1_1024x1024_160k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segformer/segformer_mit-b5_8x1_1024x1024_160k_cityscapes/segformer_mit-b5_8x1_1024x1024_160k_cityscapes_20211206_072934-87a052ec.pth diff --git a/configs/segformer/segformer_mit-b0_512x512_160k_ade20k.py b/configs/segformer/segformer_mit-b0_512x512_160k_ade20k.py deleted file mode 100644 index 607e2848ee..0000000000 --- a/configs/segformer/segformer_mit-b0_512x512_160k_ade20k.py +++ /dev/null @@ -1,38 +0,0 @@ -_base_ = [ - '../_base_/models/segformer_mit-b0.py', '../_base_/datasets/ade20k.py', - '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' -] -crop_size = (512, 512) -data_preprocessor = dict(size=crop_size) -model = dict( - data_preprocessor=data_preprocessor, - pretrained='pretrain/mit_b0.pth', - decode_head=dict(num_classes=150)) - -optim_wrapper = dict( - _delete_=True, - type='OptimWrapper', - optimizer=dict( - type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.01), - paramwise_cfg=dict( - custom_keys={ - 'pos_block': dict(decay_mult=0.), - 'norm': dict(decay_mult=0.), - 'head': dict(lr_mult=10.) 
- })) - -param_scheduler = [ - dict( - type='LinearLR', start_factor=1e-6, by_epoch=False, begin=0, end=1500), - dict( - type='PolyLR', - eta_min=0.0, - power=1.0, - begin=1500, - end=160000, - by_epoch=False, - ) -] -train_dataloader = dict(batch_size=2, num_workers=2) -val_dataloader = dict(batch_size=1, num_workers=4) -test_dataloader = val_dataloader diff --git a/configs/segformer/segformer_mit-b0_8x1_1024x1024_160k_cityscapes.py b/configs/segformer/segformer_mit-b0_8x1_1024x1024_160k_cityscapes.py deleted file mode 100644 index 9f6bd1e81d..0000000000 --- a/configs/segformer/segformer_mit-b0_8x1_1024x1024_160k_cityscapes.py +++ /dev/null @@ -1,41 +0,0 @@ -_base_ = [ - '../_base_/models/segformer_mit-b0.py', - '../_base_/datasets/cityscapes_1024x1024.py', - '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' -] -crop_size = (1024, 1024) -data_preprocessor = dict(size=crop_size) -model = dict( - data_preprocessor=data_preprocessor, - backbone=dict( - init_cfg=dict(type='Pretrained', checkpoint='pretrain/mit_b0.pth')), - test_cfg=dict(mode='slide', crop_size=(1024, 1024), stride=(768, 768))) - -optim_wrapper = dict( - _delete_=True, - type='OptimWrapper', - optimizer=dict( - type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.01), - paramwise_cfg=dict( - custom_keys={ - 'pos_block': dict(decay_mult=0.), - 'norm': dict(decay_mult=0.), - 'head': dict(lr_mult=10.) - })) - -param_scheduler = [ - dict( - type='LinearLR', start_factor=1e-6, by_epoch=False, begin=0, end=1500), - dict( - type='PolyLR', - eta_min=0.0, - power=1.0, - begin=1500, - end=160000, - by_epoch=False, - ) -] - -train_dataloader = dict(batch_size=1, num_workers=4) -val_dataloader = dict(batch_size=1, num_workers=4) -test_dataloader = val_dataloader diff --git a/configs/segformer/segformer_mit-b0_8xb1-160k_cityscapes-1024x1024.py b/configs/segformer/segformer_mit-b0_8xb1-160k_cityscapes-1024x1024.py new file mode 100644 index 0000000000..1280047c94 --- /dev/null +++ b/configs/segformer/segformer_mit-b0_8xb1-160k_cityscapes-1024x1024.py @@ -0,0 +1,41 @@ +_base_ = [ + '../_base_/models/segformer_mit-b0.py', + '../_base_/datasets/cityscapes_1024x1024.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' +] +crop_size = (1024, 1024) +data_preprocessor = dict(size=crop_size) +checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b0_20220624-7e0fe6dd.pth' # noqa +model = dict( + data_preprocessor=data_preprocessor, + backbone=dict(init_cfg=dict(type='Pretrained', checkpoint=checkpoint)), + test_cfg=dict(mode='slide', crop_size=(1024, 1024), stride=(768, 768))) + +optim_wrapper = dict( + _delete_=True, + type='OptimWrapper', + optimizer=dict( + type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.01), + paramwise_cfg=dict( + custom_keys={ + 'pos_block': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.), + 'head': dict(lr_mult=10.) 
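            # The keys above follow the SegFormer training recipe: positional
            # blocks and norm layers take no weight decay, and the decode head
            # uses a 10x learning rate relative to the backbone.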
+ })) + +param_scheduler = [ + dict( + type='LinearLR', start_factor=1e-6, by_epoch=False, begin=0, end=1500), + dict( + type='PolyLR', + eta_min=0.0, + power=1.0, + begin=1500, + end=160000, + by_epoch=False, + ) +] + +train_dataloader = dict(batch_size=1, num_workers=4) +val_dataloader = dict(batch_size=1, num_workers=4) +test_dataloader = val_dataloader diff --git a/configs/segformer/segformer_mit-b0_8xb2-160k_ade20k-512x512.py b/configs/segformer/segformer_mit-b0_8xb2-160k_ade20k-512x512.py new file mode 100644 index 0000000000..4a9476df68 --- /dev/null +++ b/configs/segformer/segformer_mit-b0_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,39 @@ +_base_ = [ + '../_base_/models/segformer_mit-b0.py', '../_base_/datasets/ade20k.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' +] +crop_size = (512, 512) +data_preprocessor = dict(size=crop_size) +checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b0_20220624-7e0fe6dd.pth' # noqa +model = dict( + data_preprocessor=data_preprocessor, + backbone=dict(init_cfg=dict(type='Pretrained', checkpoint=checkpoint)), + decode_head=dict(num_classes=150)) + +optim_wrapper = dict( + _delete_=True, + type='OptimWrapper', + optimizer=dict( + type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.01), + paramwise_cfg=dict( + custom_keys={ + 'pos_block': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.), + 'head': dict(lr_mult=10.) + })) + +param_scheduler = [ + dict( + type='LinearLR', start_factor=1e-6, by_epoch=False, begin=0, end=1500), + dict( + type='PolyLR', + eta_min=0.0, + power=1.0, + begin=1500, + end=160000, + by_epoch=False, + ) +] +train_dataloader = dict(batch_size=2, num_workers=2) +val_dataloader = dict(batch_size=1, num_workers=4) +test_dataloader = val_dataloader diff --git a/configs/segformer/segformer_mit-b1_512x512_160k_ade20k.py b/configs/segformer/segformer_mit-b1_512x512_160k_ade20k.py deleted file mode 100644 index 5fce602144..0000000000 --- a/configs/segformer/segformer_mit-b1_512x512_160k_ade20k.py +++ /dev/null @@ -1,8 +0,0 @@ -_base_ = ['./segformer_mit-b0_512x512_160k_ade20k.py'] - -# model settings -model = dict( - pretrained='pretrain/mit_b1.pth', - backbone=dict( - embed_dims=64, num_heads=[1, 2, 5, 8], num_layers=[2, 2, 2, 2]), - decode_head=dict(in_channels=[64, 128, 320, 512])) diff --git a/configs/segformer/segformer_mit-b1_8x1_1024x1024_160k_cityscapes.py b/configs/segformer/segformer_mit-b1_8x1_1024x1024_160k_cityscapes.py deleted file mode 100644 index a93e33bd88..0000000000 --- a/configs/segformer/segformer_mit-b1_8x1_1024x1024_160k_cityscapes.py +++ /dev/null @@ -1,7 +0,0 @@ -_base_ = ['./segformer_mit-b0_8x1_1024x1024_160k_cityscapes.py'] - -model = dict( - backbone=dict( - init_cfg=dict(type='Pretrained', checkpoint='pretrain/mit_b1.pth'), - embed_dims=64), - decode_head=dict(in_channels=[64, 128, 320, 512])) diff --git a/configs/segformer/segformer_mit-b1_8xb1-160k_cityscapes-1024x1024.py b/configs/segformer/segformer_mit-b1_8xb1-160k_cityscapes-1024x1024.py new file mode 100644 index 0000000000..85c126ead4 --- /dev/null +++ b/configs/segformer/segformer_mit-b1_8xb1-160k_cityscapes-1024x1024.py @@ -0,0 +1,9 @@ +_base_ = ['./segformer_mit-b0_8xb1-160k_cityscapes-1024x1024.py'] + +checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b1_20220624-02e5a6a1.pth' # noqa + +model = dict( + backbone=dict( + init_cfg=dict(type='Pretrained', checkpoint=checkpoint), + embed_dims=64), + decode_head=dict(in_channels=[64, 128, 320, 
512])) diff --git a/configs/segformer/segformer_mit-b1_8xb2-160k_ade20k-512x512.py b/configs/segformer/segformer_mit-b1_8xb2-160k_ade20k-512x512.py new file mode 100644 index 0000000000..1ff21b8bec --- /dev/null +++ b/configs/segformer/segformer_mit-b1_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,12 @@ +_base_ = ['./segformer_mit-b0_8xb2-160k_ade20k-512x512.py'] + +checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b1_20220624-02e5a6a1.pth' # noqa + +# model settings +model = dict( + backbone=dict( + init_cfg=dict(type='Pretrained', checkpoint=checkpoint), + embed_dims=64, + num_heads=[1, 2, 5, 8], + num_layers=[2, 2, 2, 2]), + decode_head=dict(in_channels=[64, 128, 320, 512])) diff --git a/configs/segformer/segformer_mit-b2_512x512_160k_ade20k.py b/configs/segformer/segformer_mit-b2_512x512_160k_ade20k.py deleted file mode 100644 index afb24b0170..0000000000 --- a/configs/segformer/segformer_mit-b2_512x512_160k_ade20k.py +++ /dev/null @@ -1,8 +0,0 @@ -_base_ = ['./segformer_mit-b0_512x512_160k_ade20k.py'] - -# model settings -model = dict( - pretrained='pretrain/mit_b2.pth', - backbone=dict( - embed_dims=64, num_heads=[1, 2, 5, 8], num_layers=[3, 4, 6, 3]), - decode_head=dict(in_channels=[64, 128, 320, 512])) diff --git a/configs/segformer/segformer_mit-b2_8x1_1024x1024_160k_cityscapes.py b/configs/segformer/segformer_mit-b2_8x1_1024x1024_160k_cityscapes.py deleted file mode 100644 index fab6be2945..0000000000 --- a/configs/segformer/segformer_mit-b2_8x1_1024x1024_160k_cityscapes.py +++ /dev/null @@ -1,8 +0,0 @@ -_base_ = ['./segformer_mit-b0_8x1_1024x1024_160k_cityscapes.py'] - -model = dict( - backbone=dict( - init_cfg=dict(type='Pretrained', checkpoint='pretrain/mit_b2.pth'), - embed_dims=64, - num_layers=[3, 4, 6, 3]), - decode_head=dict(in_channels=[64, 128, 320, 512])) diff --git a/configs/segformer/segformer_mit-b2_8xb1-160k_cityscapes-1024x1024.py b/configs/segformer/segformer_mit-b2_8xb1-160k_cityscapes-1024x1024.py new file mode 100644 index 0000000000..c802f275b5 --- /dev/null +++ b/configs/segformer/segformer_mit-b2_8xb1-160k_cityscapes-1024x1024.py @@ -0,0 +1,10 @@ +_base_ = ['./segformer_mit-b0_8xb1-160k_cityscapes-1024x1024.py'] + +checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b2_20220624-66e8bf70.pth' # noqa + +model = dict( + backbone=dict( + init_cfg=dict(type='Pretrained', checkpoint=checkpoint), + embed_dims=64, + num_layers=[3, 4, 6, 3]), + decode_head=dict(in_channels=[64, 128, 320, 512])) diff --git a/configs/segformer/segformer_mit-b2_8xb2-160k_ade20k-512x512.py b/configs/segformer/segformer_mit-b2_8xb2-160k_ade20k-512x512.py new file mode 100644 index 0000000000..0f4c1af061 --- /dev/null +++ b/configs/segformer/segformer_mit-b2_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,12 @@ +_base_ = ['./segformer_mit-b0_8xb2-160k_ade20k-512x512.py'] + +checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b2_20220624-66e8bf70.pth' # noqa + +# model settings +model = dict( + backbone=dict( + init_cfg=dict(type='Pretrained', checkpoint=checkpoint), + embed_dims=64, + num_heads=[1, 2, 5, 8], + num_layers=[3, 4, 6, 3]), + decode_head=dict(in_channels=[64, 128, 320, 512])) diff --git a/configs/segformer/segformer_mit-b3_512x512_160k_ade20k.py b/configs/segformer/segformer_mit-b3_512x512_160k_ade20k.py deleted file mode 100644 index 52348f6fcc..0000000000 --- a/configs/segformer/segformer_mit-b3_512x512_160k_ade20k.py +++ /dev/null @@ -1,8 +0,0 @@ -_base_ = 
['./segformer_mit-b0_512x512_160k_ade20k.py'] - -# model settings -model = dict( - pretrained='pretrain/mit_b3.pth', - backbone=dict( - embed_dims=64, num_heads=[1, 2, 5, 8], num_layers=[3, 4, 18, 3]), - decode_head=dict(in_channels=[64, 128, 320, 512])) diff --git a/configs/segformer/segformer_mit-b3_8x1_1024x1024_160k_cityscapes.py b/configs/segformer/segformer_mit-b3_8x1_1024x1024_160k_cityscapes.py deleted file mode 100644 index 479ce04ea1..0000000000 --- a/configs/segformer/segformer_mit-b3_8x1_1024x1024_160k_cityscapes.py +++ /dev/null @@ -1,8 +0,0 @@ -_base_ = ['./segformer_mit-b0_8x1_1024x1024_160k_cityscapes.py'] - -model = dict( - backbone=dict( - init_cfg=dict(type='Pretrained', checkpoint='pretrain/mit_b3.pth'), - embed_dims=64, - num_layers=[3, 4, 18, 3]), - decode_head=dict(in_channels=[64, 128, 320, 512])) diff --git a/configs/segformer/segformer_mit-b3_8xb1-160k_cityscapes-1024x1024.py b/configs/segformer/segformer_mit-b3_8xb1-160k_cityscapes-1024x1024.py new file mode 100644 index 0000000000..9b41ad0b39 --- /dev/null +++ b/configs/segformer/segformer_mit-b3_8xb1-160k_cityscapes-1024x1024.py @@ -0,0 +1,10 @@ +_base_ = ['./segformer_mit-b0_8xb1-160k_cityscapes-1024x1024.py'] + +checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b3_20220624-13b1141c.pth' # noqa + +model = dict( + backbone=dict( + init_cfg=dict(type='Pretrained', checkpoint=checkpoint), + embed_dims=64, + num_layers=[3, 4, 18, 3]), + decode_head=dict(in_channels=[64, 128, 320, 512])) diff --git a/configs/segformer/segformer_mit-b3_8xb2-160k_ade20k-512x512.py b/configs/segformer/segformer_mit-b3_8xb2-160k_ade20k-512x512.py new file mode 100644 index 0000000000..a2cc13d847 --- /dev/null +++ b/configs/segformer/segformer_mit-b3_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,12 @@ +_base_ = ['./segformer_mit-b0_8xb2-160k_ade20k-512x512.py'] + +checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b3_20220624-13b1141c.pth' # noqa + +# model settings +model = dict( + backbone=dict( + init_cfg=dict(type='Pretrained', checkpoint=checkpoint), + embed_dims=64, + num_heads=[1, 2, 5, 8], + num_layers=[3, 4, 18, 3]), + decode_head=dict(in_channels=[64, 128, 320, 512])) diff --git a/configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py b/configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py deleted file mode 100644 index 7b50b75608..0000000000 --- a/configs/segformer/segformer_mit-b4_512x512_160k_ade20k.py +++ /dev/null @@ -1,8 +0,0 @@ -_base_ = ['./segformer_mit-b0_512x512_160k_ade20k.py'] - -# model settings -model = dict( - pretrained='pretrain/mit_b4.pth', - backbone=dict( - embed_dims=64, num_heads=[1, 2, 5, 8], num_layers=[3, 8, 27, 3]), - decode_head=dict(in_channels=[64, 128, 320, 512])) diff --git a/configs/segformer/segformer_mit-b4_8x1_1024x1024_160k_cityscapes.py b/configs/segformer/segformer_mit-b4_8x1_1024x1024_160k_cityscapes.py deleted file mode 100644 index 808a1eb41b..0000000000 --- a/configs/segformer/segformer_mit-b4_8x1_1024x1024_160k_cityscapes.py +++ /dev/null @@ -1,8 +0,0 @@ -_base_ = ['./segformer_mit-b0_8x1_1024x1024_160k_cityscapes.py'] - -model = dict( - backbone=dict( - init_cfg=dict(type='Pretrained', checkpoint='pretrain/mit_b4.pth'), - embed_dims=64, - num_layers=[3, 8, 27, 3]), - decode_head=dict(in_channels=[64, 128, 320, 512])) diff --git a/configs/segformer/segformer_mit-b4_8xb1-160k_cityscapes-1024x1024.py b/configs/segformer/segformer_mit-b4_8xb1-160k_cityscapes-1024x1024.py new file mode 100644 index 
0000000000..5fb16080dd --- /dev/null +++ b/configs/segformer/segformer_mit-b4_8xb1-160k_cityscapes-1024x1024.py @@ -0,0 +1,10 @@ +_base_ = ['./segformer_mit-b0_8xb1-160k_cityscapes-1024x1024.py'] + +checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b4_20220624-d588d980.pth' # noqa + +model = dict( + backbone=dict( + init_cfg=dict(type='Pretrained', checkpoint=checkpoint), + embed_dims=64, + num_layers=[3, 8, 27, 3]), + decode_head=dict(in_channels=[64, 128, 320, 512])) diff --git a/configs/segformer/segformer_mit-b4_8xb2-160k_ade20k-512x512.py b/configs/segformer/segformer_mit-b4_8xb2-160k_ade20k-512x512.py new file mode 100644 index 0000000000..5f39c30108 --- /dev/null +++ b/configs/segformer/segformer_mit-b4_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,12 @@ +_base_ = ['./segformer_mit-b0_8xb2-160k_ade20k-512x512.py'] + +checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b4_20220624-d588d980.pth' # noqa + +# model settings +model = dict( + backbone=dict( + init_cfg=dict(type='Pretrained', checkpoint=checkpoint), + embed_dims=64, + num_heads=[1, 2, 5, 8], + num_layers=[3, 8, 27, 3]), + decode_head=dict(in_channels=[64, 128, 320, 512])) diff --git a/configs/segformer/segformer_mit-b5_512x512_160k_ade20k.py b/configs/segformer/segformer_mit-b5_512x512_160k_ade20k.py deleted file mode 100644 index 5212fb1f6a..0000000000 --- a/configs/segformer/segformer_mit-b5_512x512_160k_ade20k.py +++ /dev/null @@ -1,8 +0,0 @@ -_base_ = ['./segformer_mit-b0_512x512_160k_ade20k.py'] - -# model settings -model = dict( - pretrained='pretrain/mit_b5.pth', - backbone=dict( - embed_dims=64, num_heads=[1, 2, 5, 8], num_layers=[3, 6, 40, 3]), - decode_head=dict(in_channels=[64, 128, 320, 512])) diff --git a/configs/segformer/segformer_mit-b5_640x640_160k_ade20k.py b/configs/segformer/segformer_mit-b5_640x640_160k_ade20k.py deleted file mode 100644 index 0d13707ec2..0000000000 --- a/configs/segformer/segformer_mit-b5_640x640_160k_ade20k.py +++ /dev/null @@ -1,37 +0,0 @@ -_base_ = ['./segformer_mit-b0_512x512_160k_ade20k.py'] - -# dataset settings -crop_size = (640, 640) -data_preprocessor = dict(size=crop_size) -train_pipeline = [ - dict(type='LoadImageFromFile'), - dict(type='LoadAnnotations', reduce_zero_label=True), - dict( - type='RandomResize', - scale=(2048, 640), - ratio_range=(0.5, 2.0), - keep_ratio=True), - dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), - dict(type='RandomFlip', prob=0.5), - dict(type='PhotoMetricDistortion'), - dict(type='PackSegInputs') -] -test_pipeline = [ - dict(type='LoadImageFromFile'), - dict(type='Resize', scale=(2048, 640), keep_ratio=True), - # add loading annotation after ``Resize`` because ground truth - # does not need to do resize data transform - dict(type='LoadAnnotations', reduce_zero_label=True), - dict(type='PackSegInputs') -] -train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) -val_dataloader = dict(batch_size=1, dataset=dict(pipeline=test_pipeline)) -test_dataloader = val_dataloader - -# model settings -model = dict( - data_preprocessor=data_preprocessor, - pretrained='pretrain/mit_b5.pth', - backbone=dict( - embed_dims=64, num_heads=[1, 2, 5, 8], num_layers=[3, 6, 40, 3]), - decode_head=dict(in_channels=[64, 128, 320, 512])) diff --git a/configs/segformer/segformer_mit-b5_8x1_1024x1024_160k_cityscapes.py b/configs/segformer/segformer_mit-b5_8x1_1024x1024_160k_cityscapes.py deleted file mode 100644 index 1c9422d37c..0000000000 --- 
a/configs/segformer/segformer_mit-b5_8x1_1024x1024_160k_cityscapes.py +++ /dev/null @@ -1,8 +0,0 @@ -_base_ = ['./segformer_mit-b0_8x1_1024x1024_160k_cityscapes.py'] - -model = dict( - backbone=dict( - init_cfg=dict(type='Pretrained', checkpoint='pretrain/mit_b5.pth'), - embed_dims=64, - num_layers=[3, 6, 40, 3]), - decode_head=dict(in_channels=[64, 128, 320, 512])) diff --git a/configs/segformer/segformer_mit-b5_8xb1-160k_cityscapes-1024x1024.py b/configs/segformer/segformer_mit-b5_8xb1-160k_cityscapes-1024x1024.py new file mode 100644 index 0000000000..18c3c16258 --- /dev/null +++ b/configs/segformer/segformer_mit-b5_8xb1-160k_cityscapes-1024x1024.py @@ -0,0 +1,10 @@ +_base_ = ['./segformer_mit-b0_8xb1-160k_cityscapes-1024x1024.py'] + +checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b5_20220624-658746d9.pth' # noqa + +model = dict( + backbone=dict( + init_cfg=dict(type='Pretrained', checkpoint=checkpoint), + embed_dims=64, + num_layers=[3, 6, 40, 3]), + decode_head=dict(in_channels=[64, 128, 320, 512])) diff --git a/configs/segformer/segformer_mit-b5_8xb2-160k_ade20k-512x512.py b/configs/segformer/segformer_mit-b5_8xb2-160k_ade20k-512x512.py new file mode 100644 index 0000000000..1e9a209ebe --- /dev/null +++ b/configs/segformer/segformer_mit-b5_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,12 @@ +_base_ = ['./segformer_mit-b0_8xb2-160k_ade20k-512x512.py'] + +checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b5_20220624-658746d9.pth' # noqa + +# model settings +model = dict( + backbone=dict( + init_cfg=dict(type='Pretrained', checkpoint=checkpoint), + embed_dims=64, + num_heads=[1, 2, 5, 8], + num_layers=[3, 6, 40, 3]), + decode_head=dict(in_channels=[64, 128, 320, 512])) diff --git a/configs/segformer/segformer_mit-b5_8xb2-160k_ade20k-640x640.py b/configs/segformer/segformer_mit-b5_8xb2-160k_ade20k-640x640.py new file mode 100644 index 0000000000..a32eb7c1e1 --- /dev/null +++ b/configs/segformer/segformer_mit-b5_8xb2-160k_ade20k-640x640.py @@ -0,0 +1,41 @@ +_base_ = ['./segformer_mit-b0_8xb2-160k_ade20k-512x512.py'] + +checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segformer/mit_b5_20220624-658746d9.pth' # noqa + +# dataset settings +crop_size = (640, 640) +data_preprocessor = dict(size=crop_size) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', reduce_zero_label=True), + dict( + type='RandomResize', + scale=(2048, 640), + ratio_range=(0.5, 2.0), + keep_ratio=True), + dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(2048, 640), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type='LoadAnnotations', reduce_zero_label=True), + dict(type='PackSegInputs') +] +train_dataloader = dict(dataset=dict(pipeline=train_pipeline)) +val_dataloader = dict(batch_size=1, dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader + +# model settings +model = dict( + data_preprocessor=data_preprocessor, + backbone=dict( + init_cfg=dict(type='Pretrained', checkpoint=checkpoint), + embed_dims=64, + num_heads=[1, 2, 5, 8], + num_layers=[3, 6, 40, 3]), + decode_head=dict(in_channels=[64, 128, 320, 512])) diff --git a/configs/segmenter/README.md b/configs/segmenter/README.md 
index caefe996e2..103b125472 100644 --- a/configs/segmenter/README.md +++ b/configs/segmenter/README.md @@ -1,6 +1,6 @@ # Segmenter -[Segmenter: Transformer for Semantic Segmentation](https://arxiv.org/abs/2105.05633) +> [Segmenter: Transformer for Semantic Segmentation](https://arxiv.org/abs/2105.05633) ## Introduction @@ -22,16 +22,6 @@ Image segmentation is often ambiguous at the level of individual image patches a -```bibtex -@inproceedings{strudel2021segmenter, - title={Segmenter: Transformer for semantic segmentation}, - author={Strudel, Robin and Garcia, Ricardo and Laptev, Ivan and Schmid, Cordelia}, - booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision}, - pages={7262--7272}, - year={2021} -} -``` - ## Usage We have provided pretrained models converted from [ViT-AugReg](https://github.com/rwightman/pytorch-image-models/blob/f55c22bebf9d8afc449d317a723231ef72e0d662/timm/models/vision_transformer.py#L54-L106). @@ -54,21 +44,33 @@ This script convert model from `PRETRAIN_PATH` and store the converted model in In our default setting, pretrained models and their corresponding [ViT-AugReg](https://github.com/rwightman/pytorch-image-models/blob/f55c22bebf9d8afc449d317a723231ef72e0d662/timm/models/vision_transformer.py#L54-L106) models could be defined below: -| pretrained models | original models | -| --------------------- | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| vit_tiny_p16_384.pth | ['vit_tiny_patch16_384'](https://storage.googleapis.com/vit_models/augreg/Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz) | -| vit_small_p16_384.pth | ['vit_small_patch16_384'](https://storage.googleapis.com/vit_models/augreg/S_16-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz) | -| vit_base_p16_384.pth | ['vit_base_patch16_384'](https://storage.googleapis.com/vit_models/augreg/B_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_384.npz) | -| vit_large_p16_384.pth | ['vit_large_patch16_384'](https://storage.googleapis.com/vit_models/augreg/L_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.01-res_384.npz) | +| pretrained models | original models | +| --------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| vit_tiny_p16_384.pth | [vit_tiny_patch16_384](https://storage.googleapis.com/vit_models/augreg/Ti_16-i21k-300ep-lr_0.001-aug_none-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz) | +| vit_small_p16_384.pth | [vit_small_patch16_384](https://storage.googleapis.com/vit_models/augreg/S_16-i21k-300ep-lr_0.001-aug_light1-wd_0.03-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.03-res_384.npz) | +| vit_base_p16_384.pth | [vit_base_patch16_384](https://storage.googleapis.com/vit_models/augreg/B_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.0-sd_0.0--imagenet2012-steps_20k-lr_0.01-res_384.npz) | +| vit_large_p16_384.pth | [vit_large_patch16_384](https://storage.googleapis.com/vit_models/augreg/L_16-i21k-300ep-lr_0.001-aug_medium1-wd_0.1-do_0.1-sd_0.1--imagenet2012-steps_20k-lr_0.01-res_384.npz) | ## Results and models ### ADE20K -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU 
| mIoU(ms+flip) | config | download | -| ---------------- | -------- | --------- | ------- | -------- | -------------- | ----- | ------------- | -------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| Segmenter Mask | ViT-T_16 | 512x512 | 160000 | 1.21 | 27.98 | 39.99 | 40.83 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segmenter/segmenter_vit-t_mask_8x1_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segmenter/segmenter_vit-t_mask_8x1_512x512_160k_ade20k/segmenter_vit-t_mask_8x1_512x512_160k_ade20k_20220105_151706-ffcf7509.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segmenter/segmenter_vit-t_mask_8x1_512x512_160k_ade20k/segmenter_vit-t_mask_8x1_512x512_160k_ade20k_20220105_151706.log.json) | -| Segmenter Linear | ViT-S_16 | 512x512 | 160000 | 1.78 | 28.07 | 45.75 | 46.82 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segmenter/segmenter_vit-s_linear_8x1_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segmenter/segmenter_vit-s_linear_8x1_512x512_160k_ade20k/segmenter_vit-s_linear_8x1_512x512_160k_ade20k_20220105_151713-39658c46.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segmenter/segmenter_vit-s_linear_8x1_512x512_160k_ade20k/segmenter_vit-s_linear_8x1_512x512_160k_ade20k_20220105_151713.log.json) | -| Segmenter Mask | ViT-S_16 | 512x512 | 160000 | 2.03 | 24.80 | 46.19 | 47.85 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segmenter/segmenter_vit-s_mask_8x1_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segmenter/segmenter_vit-s_mask_8x1_512x512_160k_ade20k/segmenter_vit-s_mask_8x1_512x512_160k_ade20k_20220105_151706-511bb103.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segmenter/segmenter_vit-s_mask_8x1_512x512_160k_ade20k/segmenter_vit-s_mask_8x1_512x512_160k_ade20k_20220105_151706.log.json) | -| Segmenter Mask | ViT-B_16 | 512x512 | 160000 | 4.20 | 13.20 | 49.60 | 51.07 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segmenter/segmenter_vit-b_mask_8x1_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segmenter/segmenter_vit-b_mask_8x1_512x512_160k_ade20k/segmenter_vit-b_mask_8x1_512x512_160k_ade20k_20220105_151706-bc533b08.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segmenter/segmenter_vit-b_mask_8x1_512x512_160k_ade20k/segmenter_vit-b_mask_8x1_512x512_160k_ade20k_20220105_151706.log.json) | -| Segmenter Mask | ViT-L_16 | 640x640 | 160000 | 16.56 | 2.62 | 52.16 | 53.65 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segmenter/segmenter_vit-l_mask_8x1_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segmenter/segmenter_vit-l_mask_8x1_512x512_160k_ade20k/segmenter_vit-l_mask_8x1_512x512_160k_ade20k_20220105_162750-7ef345be.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/segmenter/segmenter_vit-l_mask_8x1_512x512_160k_ade20k/segmenter_vit-l_mask_8x1_512x512_160k_ade20k_20220105_162750.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ---------------- | -------- | --------- | ------- | -------- | -------------- | ------ | ----- | ------------- | ----------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Segmenter Mask | ViT-T_16 | 512x512 | 160000 | 1.21 | 27.98 | V100 | 39.99 | 40.83 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/segmenter/segmenter_vit-t_mask_8xb1-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segmenter/segmenter_vit-t_mask_8x1_512x512_160k_ade20k/segmenter_vit-t_mask_8x1_512x512_160k_ade20k_20220105_151706-ffcf7509.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segmenter/segmenter_vit-t_mask_8x1_512x512_160k_ade20k/segmenter_vit-t_mask_8x1_512x512_160k_ade20k_20220105_151706.log.json) | +| Segmenter Linear | ViT-S_16 | 512x512 | 160000 | 1.78 | 28.07 | V100 | 45.75 | 46.82 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/segmenter/segmenter_vit-s_fcn_8xb1-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segmenter/segmenter_vit-s_linear_8x1_512x512_160k_ade20k/segmenter_vit-s_linear_8x1_512x512_160k_ade20k_20220105_151713-39658c46.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segmenter/segmenter_vit-s_linear_8x1_512x512_160k_ade20k/segmenter_vit-s_linear_8x1_512x512_160k_ade20k_20220105_151713.log.json) | +| Segmenter Mask | ViT-S_16 | 512x512 | 160000 | 2.03 | 24.80 | V100 | 46.19 | 47.85 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/segmenter/segmenter_vit-s_mask_8xb1-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segmenter/segmenter_vit-s_mask_8x1_512x512_160k_ade20k/segmenter_vit-s_mask_8x1_512x512_160k_ade20k_20220105_151706-511bb103.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segmenter/segmenter_vit-s_mask_8x1_512x512_160k_ade20k/segmenter_vit-s_mask_8x1_512x512_160k_ade20k_20220105_151706.log.json) | +| Segmenter Mask | ViT-B_16 | 512x512 | 160000 | 4.20 | 13.20 | V100 | 49.60 | 51.07 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/segmenter/segmenter_vit-b_mask_8xb1-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segmenter/segmenter_vit-b_mask_8x1_512x512_160k_ade20k/segmenter_vit-b_mask_8x1_512x512_160k_ade20k_20220105_151706-bc533b08.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segmenter/segmenter_vit-b_mask_8x1_512x512_160k_ade20k/segmenter_vit-b_mask_8x1_512x512_160k_ade20k_20220105_151706.log.json) | +| Segmenter Mask | ViT-L_16 | 640x640 | 160000 | 16.56 | 2.62 | V100 | 52.16 | 53.65 | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/segmenter/segmenter_vit-l_mask_8xb1-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segmenter/segmenter_vit-l_mask_8x1_512x512_160k_ade20k/segmenter_vit-l_mask_8x1_512x512_160k_ade20k_20220105_162750-7ef345be.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segmenter/segmenter_vit-l_mask_8x1_512x512_160k_ade20k/segmenter_vit-l_mask_8x1_512x512_160k_ade20k_20220105_162750.log.json) | + +## Citation + +```bibtex +@inproceedings{strudel2021segmenter, + title={Segmenter: Transformer for semantic segmentation}, + author={Strudel, Robin and Garcia, Ricardo and Laptev, Ivan and Schmid, Cordelia}, + booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision}, + pages={7262--7272}, + year={2021} +} +``` diff --git a/configs/segmenter/metafile.yaml b/configs/segmenter/metafile.yaml new file mode 100644 index 0000000000..ff2aa448bb --- /dev/null +++ b/configs/segmenter/metafile.yaml @@ -0,0 +1,138 @@ +Collections: +- Name: Segmenter + License: Apache License 2.0 + Metadata: + Training Data: + - ADE20K + Paper: + Title: 'Segmenter: Transformer for Semantic Segmentation' + URL: https://arxiv.org/abs/2105.05633 + README: configs/segmenter/README.md + Frameworks: + - PyTorch +Models: +- Name: segmenter_vit-t_mask_8xb1-160k_ade20k-512x512 + In Collection: Segmenter + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 39.99 + mIoU(ms+flip): 40.83 + Config: configs/segmenter/segmenter_vit-t_mask_8xb1-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 8 + Architecture: + - ViT-T_16 + - Segmenter + - Mask + Training Resources: 8x V100 GPUS + Memory (GB): 1.21 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segmenter/segmenter_vit-t_mask_8x1_512x512_160k_ade20k/segmenter_vit-t_mask_8x1_512x512_160k_ade20k_20220105_151706-ffcf7509.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/segmenter/segmenter_vit-t_mask_8x1_512x512_160k_ade20k/segmenter_vit-t_mask_8x1_512x512_160k_ade20k_20220105_151706.log.json + Paper: + Title: 'Segmenter: Transformer for Semantic Segmentation' + URL: https://arxiv.org/abs/2105.05633 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.21.0/mmseg/models/decode_heads/segmenter_mask_head.py#L15 + Framework: PyTorch +- Name: segmenter_vit-s_fcn_8xb1-160k_ade20k-512x512 + In Collection: Segmenter + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 45.75 + mIoU(ms+flip): 46.82 + Config: configs/segmenter/segmenter_vit-s_fcn_8xb1-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 8 + Architecture: + - ViT-S_16 + - Segmenter + - Linear + Training Resources: 8x V100 GPUS + Memory (GB): 1.78 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segmenter/segmenter_vit-s_linear_8x1_512x512_160k_ade20k/segmenter_vit-s_linear_8x1_512x512_160k_ade20k_20220105_151713-39658c46.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/segmenter/segmenter_vit-s_linear_8x1_512x512_160k_ade20k/segmenter_vit-s_linear_8x1_512x512_160k_ade20k_20220105_151713.log.json + Paper: + Title: 'Segmenter: Transformer for Semantic Segmentation' + URL: https://arxiv.org/abs/2105.05633 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.21.0/mmseg/models/decode_heads/segmenter_mask_head.py#L15 + Framework: PyTorch +- Name: segmenter_vit-s_mask_8xb1-160k_ade20k-512x512 + In Collection: Segmenter + Results: + 
Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 46.19 + mIoU(ms+flip): 47.85 + Config: configs/segmenter/segmenter_vit-s_mask_8xb1-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 8 + Architecture: + - ViT-S_16 + - Segmenter + - Mask + Training Resources: 8x V100 GPUS + Memory (GB): 2.03 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segmenter/segmenter_vit-s_mask_8x1_512x512_160k_ade20k/segmenter_vit-s_mask_8x1_512x512_160k_ade20k_20220105_151706-511bb103.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/segmenter/segmenter_vit-s_mask_8x1_512x512_160k_ade20k/segmenter_vit-s_mask_8x1_512x512_160k_ade20k_20220105_151706.log.json + Paper: + Title: 'Segmenter: Transformer for Semantic Segmentation' + URL: https://arxiv.org/abs/2105.05633 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.21.0/mmseg/models/decode_heads/segmenter_mask_head.py#L15 + Framework: PyTorch +- Name: segmenter_vit-b_mask_8xb1-160k_ade20k-512x512 + In Collection: Segmenter + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 49.6 + mIoU(ms+flip): 51.07 + Config: configs/segmenter/segmenter_vit-b_mask_8xb1-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 8 + Architecture: + - ViT-B_16 + - Segmenter + - Mask + Training Resources: 8x V100 GPUS + Memory (GB): 4.2 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segmenter/segmenter_vit-b_mask_8x1_512x512_160k_ade20k/segmenter_vit-b_mask_8x1_512x512_160k_ade20k_20220105_151706-bc533b08.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/segmenter/segmenter_vit-b_mask_8x1_512x512_160k_ade20k/segmenter_vit-b_mask_8x1_512x512_160k_ade20k_20220105_151706.log.json + Paper: + Title: 'Segmenter: Transformer for Semantic Segmentation' + URL: https://arxiv.org/abs/2105.05633 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.21.0/mmseg/models/decode_heads/segmenter_mask_head.py#L15 + Framework: PyTorch +- Name: segmenter_vit-l_mask_8xb1-160k_ade20k-512x512 + In Collection: Segmenter + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 52.16 + mIoU(ms+flip): 53.65 + Config: configs/segmenter/segmenter_vit-l_mask_8xb1-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 8 + Architecture: + - ViT-L_16 + - Segmenter + - Mask + Training Resources: 8x V100 GPUS + Memory (GB): 16.56 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segmenter/segmenter_vit-l_mask_8x1_512x512_160k_ade20k/segmenter_vit-l_mask_8x1_512x512_160k_ade20k_20220105_162750-7ef345be.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/segmenter/segmenter_vit-l_mask_8x1_512x512_160k_ade20k/segmenter_vit-l_mask_8x1_512x512_160k_ade20k_20220105_162750.log.json + Paper: + Title: 'Segmenter: Transformer for Semantic Segmentation' + URL: https://arxiv.org/abs/2105.05633 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.21.0/mmseg/models/decode_heads/segmenter_mask_head.py#L15 + Framework: PyTorch diff --git a/configs/segmenter/segmenter.yml b/configs/segmenter/segmenter.yml deleted file mode 100644 index dc6e68d3dd..0000000000 --- a/configs/segmenter/segmenter.yml +++ /dev/null @@ -1,125 +0,0 @@ -Collections: -- Name: Segmenter - Metadata: - Training Data: - - ADE20K - Paper: - URL: https://arxiv.org/abs/2105.05633 - Title: 'Segmenter: Transformer for Semantic Segmentation' - README: configs/segmenter/README.md - Code: - URL: 
https://github.com/open-mmlab/mmsegmentation/blob/v0.21.0/mmseg/models/decode_heads/segmenter_mask_head.py#L15 - Version: v0.21.0 - Converted From: - Code: https://github.com/rstrudel/segmenter -Models: -- Name: segmenter_vit-t_mask_8x1_512x512_160k_ade20k - In Collection: Segmenter - Metadata: - backbone: ViT-T_16 - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 35.74 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 1.21 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 39.99 - mIoU(ms+flip): 40.83 - Config: configs/segmenter/segmenter_vit-t_mask_8x1_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segmenter/segmenter_vit-t_mask_8x1_512x512_160k_ade20k/segmenter_vit-t_mask_8x1_512x512_160k_ade20k_20220105_151706-ffcf7509.pth -- Name: segmenter_vit-s_linear_8x1_512x512_160k_ade20k - In Collection: Segmenter - Metadata: - backbone: ViT-S_16 - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 35.63 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 1.78 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 45.75 - mIoU(ms+flip): 46.82 - Config: configs/segmenter/segmenter_vit-s_linear_8x1_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segmenter/segmenter_vit-s_linear_8x1_512x512_160k_ade20k/segmenter_vit-s_linear_8x1_512x512_160k_ade20k_20220105_151713-39658c46.pth -- Name: segmenter_vit-s_mask_8x1_512x512_160k_ade20k - In Collection: Segmenter - Metadata: - backbone: ViT-S_16 - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 40.32 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 2.03 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 46.19 - mIoU(ms+flip): 47.85 - Config: configs/segmenter/segmenter_vit-s_mask_8x1_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segmenter/segmenter_vit-s_mask_8x1_512x512_160k_ade20k/segmenter_vit-s_mask_8x1_512x512_160k_ade20k_20220105_151706-511bb103.pth -- Name: segmenter_vit-b_mask_8x1_512x512_160k_ade20k - In Collection: Segmenter - Metadata: - backbone: ViT-B_16 - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 75.76 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 4.2 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 49.6 - mIoU(ms+flip): 51.07 - Config: configs/segmenter/segmenter_vit-b_mask_8x1_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segmenter/segmenter_vit-b_mask_8x1_512x512_160k_ade20k/segmenter_vit-b_mask_8x1_512x512_160k_ade20k_20220105_151706-bc533b08.pth -- Name: segmenter_vit-l_mask_8x1_512x512_160k_ade20k - In Collection: Segmenter - Metadata: - backbone: ViT-L_16 - crop size: (640,640) - lr schd: 160000 - inference time (ms/im): - - value: 381.68 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (640,640) - Training Memory (GB): 16.56 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 52.16 - mIoU(ms+flip): 53.65 - Config: configs/segmenter/segmenter_vit-l_mask_8x1_512x512_160k_ade20k.py - Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/segmenter/segmenter_vit-l_mask_8x1_512x512_160k_ade20k/segmenter_vit-l_mask_8x1_512x512_160k_ade20k_20220105_162750-7ef345be.pth diff --git a/configs/segmenter/segmenter_vit-b_mask_8x1_512x512_160k_ade20k.py b/configs/segmenter/segmenter_vit-b_mask_8xb1-160k_ade20k-512x512.py similarity index 100% rename from configs/segmenter/segmenter_vit-b_mask_8x1_512x512_160k_ade20k.py rename to configs/segmenter/segmenter_vit-b_mask_8xb1-160k_ade20k-512x512.py diff --git a/configs/segmenter/segmenter_vit-l_mask_8x1_512x512_160k_ade20k.py b/configs/segmenter/segmenter_vit-l_mask_8xb1-160k_ade20k-512x512.py similarity index 100% rename from configs/segmenter/segmenter_vit-l_mask_8x1_512x512_160k_ade20k.py rename to configs/segmenter/segmenter_vit-l_mask_8xb1-160k_ade20k-512x512.py diff --git a/configs/segmenter/segmenter_vit-s_fcn_8xb1-160k_ade20k-512x512.py b/configs/segmenter/segmenter_vit-s_fcn_8xb1-160k_ade20k-512x512.py new file mode 100644 index 0000000000..dc1e4c8985 --- /dev/null +++ b/configs/segmenter/segmenter_vit-s_fcn_8xb1-160k_ade20k-512x512.py @@ -0,0 +1,14 @@ +_base_ = './segmenter_vit-s_mask_8xb1-160k_ade20k-512x512.py' + +model = dict( + decode_head=dict( + _delete_=True, + type='FCNHead', + in_channels=384, + channels=384, + num_convs=0, + dropout_ratio=0.0, + concat_input=False, + num_classes=150, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0))) diff --git a/configs/segmenter/segmenter_vit-s_linear_8x1_512x512_160k_ade20k.py b/configs/segmenter/segmenter_vit-s_linear_8x1_512x512_160k_ade20k.py deleted file mode 100644 index adc8c1b283..0000000000 --- a/configs/segmenter/segmenter_vit-s_linear_8x1_512x512_160k_ade20k.py +++ /dev/null @@ -1,14 +0,0 @@ -_base_ = './segmenter_vit-s_mask_8x1_512x512_160k_ade20k.py' - -model = dict( - decode_head=dict( - _delete_=True, - type='FCNHead', - in_channels=384, - channels=384, - num_convs=0, - dropout_ratio=0.0, - concat_input=False, - num_classes=150, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0))) diff --git a/configs/segmenter/segmenter_vit-s_mask_8x1_512x512_160k_ade20k.py b/configs/segmenter/segmenter_vit-s_mask_8xb1-160k_ade20k-512x512.py similarity index 100% rename from configs/segmenter/segmenter_vit-s_mask_8x1_512x512_160k_ade20k.py rename to configs/segmenter/segmenter_vit-s_mask_8xb1-160k_ade20k-512x512.py diff --git a/configs/segmenter/segmenter_vit-t_mask_8x1_512x512_160k_ade20k.py b/configs/segmenter/segmenter_vit-t_mask_8xb1-160k_ade20k-512x512.py similarity index 100% rename from configs/segmenter/segmenter_vit-t_mask_8x1_512x512_160k_ade20k.py rename to configs/segmenter/segmenter_vit-t_mask_8xb1-160k_ade20k-512x512.py diff --git a/configs/segnext/README.md b/configs/segnext/README.md new file mode 100644 index 0000000000..d7434a0621 --- /dev/null +++ b/configs/segnext/README.md @@ -0,0 +1,63 @@ +# SegNeXt + +> [SegNeXt: Rethinking Convolutional Attention Design for Semantic Segmentation](https://arxiv.org/abs/2209.08575) + +## Introduction + + + +Official Repo + +Code Snippet + +## Abstract + + + +We present SegNeXt, a simple convolutional network architecture for semantic segmentation. Recent transformer-based models have dominated the field of semantic segmentation due to the efficiency of self-attention in encoding spatial information. 
In this paper, we show that convolutional attention is a more efficient and effective way to encode contextual information than the self-attention mechanism in transformers. By re-examining the characteristics owned by successful segmentation models, we discover several key components leading to the performance improvement of segmentation models. This motivates us to design a novel convolutional attention network that uses cheap convolutional operations. Without bells and whistles, our SegNeXt significantly improves the performance of previous state-of-the-art methods on popular benchmarks, including ADE20K, Cityscapes, COCO-Stuff, Pascal VOC, Pascal Context, and iSAID. Notably, SegNeXt outperforms EfficientNet-L2 w/ NAS-FPN and achieves 90.6% mIoU on the Pascal VOC 2012 test leaderboard using only 1/10 parameters of it. On average, SegNeXt achieves about 2.0% mIoU improvements compared to the state-of-the-art methods on the ADE20K datasets with the same or fewer computations. Code is available at [this https URL](https://github.com/uyzhang/JSeg) (Jittor) and [this https URL](https://github.com/Visual-Attention-Network/SegNeXt) (Pytorch). + + + +
+ +
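As a quick way to try SegNeXt in MMSegmentation, the minimal sketch below runs single-image inference through the high-level `mmseg.apis` functions `init_model` and `inference_model` (MMSegmentation 1.x); the config path and checkpoint URL are the MSCAN-T entries from the results table below, and `demo/demo.png` is the sample image shipped with the repository.

```python
from mmseg.apis import inference_model, init_model

# MSCAN-T config and checkpoint, taken from the results table in this README.
config_file = 'configs/segnext/segnext_mscan-t_1xb16-adamw-160k_ade20k-512x512.py'
checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/segnext/segnext_mscan-t_1x16_512x512_adamw_160k_ade20k/segnext_mscan-t_1x16_512x512_adamw_160k_ade20k_20230210_140244-05bd8466.pth'  # noqa

# Build the model and load the ADE20K-trained weights.
model = init_model(config_file, checkpoint_file, device='cuda:0')

# Run inference on one image; the result is a `SegDataSample` whose
# `pred_sem_seg` field holds the predicted label map.
result = inference_model(model, 'demo/demo.png')
print(result.pred_sem_seg.data.shape)
```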
+ +## Results and models + +### ADE20K + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------- | -------- | --------- | ------- | -------- | -------------- | ------ | ----- | ------------- | ----------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| SegNeXt | MSCAN-T | 512x512 | 160000 | 17.88 | 52.38 | A100 | 41.50 | 42.59 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/segnext/segnext_mscan-t_1xb16-adamw-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segnext/segnext_mscan-t_1x16_512x512_adamw_160k_ade20k/segnext_mscan-t_1x16_512x512_adamw_160k_ade20k_20230210_140244-05bd8466.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segnext/segnext_mscan-t_1x16_512x512_adamw_160k_ade20k/segnext_mscan-t_1x16_512x512_adamw_160k_ade20k_20230210_140244.log.json) | +| SegNeXt | MSCAN-S | 512x512 | 160000 | 21.47 | 42.27 | A100 | 44.16 | 45.81 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/segnext/segnext_mscan-s_1xb16-adamw-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segnext/segnext_mscan-s_1x16_512x512_adamw_160k_ade20k/segnext_mscan-s_1x16_512x512_adamw_160k_ade20k_20230214_113014-43013668.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segnext/segnext_mscan-s_1x16_512x512_adamw_160k_ade20k/segnext_mscan-s_1x16_512x512_adamw_160k_ade20k_20230214_113014.log.json) | +| SegNeXt | MSCAN-B | 512x512 | 160000 | 31.03 | 35.15 | A100 | 48.03 | 49.68 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/segnext/segnext_mscan-b_1xb16-adamw-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segnext/segnext_mscan-b_1x16_512x512_adamw_160k_ade20k/segnext_mscan-b_1x16_512x512_adamw_160k_ade20k_20230209_172053-b6f6c70c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segnext/segnext_mscan-b_1x16_512x512_adamw_160k_ade20k/segnext_mscan-b_1x16_512x512_adamw_160k_ade20k_20230209_172053.log.json) | +| SegNeXt | MSCAN-L | 512x512 | 160000 | 43.32 | 22.91 | A100 | 50.99 | 52.10 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/segnext/segnext_mscan-l_1xb16-adamw-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/segnext/segnext_mscan-l_1x16_512x512_adamw_160k_ade20k/segnext_mscan-l_1x16_512x512_adamw_160k_ade20k_20230209_172055-19b14b63.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/segnext/segnext_mscan-l_1x16_512x512_adamw_160k_ade20k/segnext_mscan-l_1x16_512x512_adamw_160k_ade20k_20230209_172055.log.json) | + +Note: + +- When we integrated SegNeXt into MMSegmentation, we modified some layers' names to make them more precise and concise without changing the model architecture. Therefore, the keys of pre-trained weights are different from the [original weights](https://cloud.tsinghua.edu.cn/d/c15b25a6745946618462/), but don't worry about these changes. 
We have converted them and uploaded the checkpoints; you can find the URLs of the pre-trained checkpoints in the config files and use them directly for training. + +- The total batch size is 16. We trained SegNeXt on a single GPU because performance degrades significantly with the `SyncBN` of PyTorch 1.9 (mainly in the `OverlapPatchEmbed` modules of `MSCAN`). + +- There will be subtle differences during model testing because the Non-negative Matrix Factorization (NMF) in `LightHamHead` is initialized randomly. To control this randomness, please set the random seed before testing. You can modify [`./tools/test.py`](https://github.com/open-mmlab/mmsegmentation/blob/main/tools/test.py) like this: + +```python +def main(): + from mmengine.runner import set_random_seed + random_seed = xxx # set random seed recorded in training log + set_random_seed(random_seed, deterministic=False) + ... +``` + +- Model performance is sensitive to the seed value used; please refer to the log file for the specific seed setting. If you choose a different seed, the results may differ from those in the table. Taking SegNeXt-L as an example, its results range from 49.60 to 51.00. + +## Citation + +```bibtex +@article{guo2022segnext, + title={SegNeXt: Rethinking Convolutional Attention Design for Semantic Segmentation}, + author={Guo, Meng-Hao and Lu, Cheng-Ze and Hou, Qibin and Liu, Zhengning and Cheng, Ming-Ming and Hu, Shi-Min}, + journal={arXiv preprint arXiv:2209.08575}, + year={2022} +} +``` diff --git a/configs/segnext/metafile.yaml b/configs/segnext/metafile.yaml new file mode 100644 index 0000000000..3c8ff5bb92 --- /dev/null +++ b/configs/segnext/metafile.yaml @@ -0,0 +1,109 @@ +Collections: +- Name: SegNeXt + License: Apache License 2.0 + Metadata: + Training Data: + - ADE20K + Paper: + Title: 'SegNeXt: Rethinking Convolutional Attention Design for Semantic Segmentation' + URL: https://arxiv.org/abs/2209.08575 + README: configs/segnext/README.md + Frameworks: + - PyTorch +Models: +- Name: segnext_mscan-t_1xb16-adamw-160k_ade20k-512x512 + In Collection: SegNeXt + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 41.5 + mIoU(ms+flip): 42.59 + Config: configs/segnext/segnext_mscan-t_1xb16-adamw-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - MSCAN-T + - SegNeXt + Training Resources: 1x A100 GPUS + Memory (GB): 17.88 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segnext/segnext_mscan-t_1x16_512x512_adamw_160k_ade20k/segnext_mscan-t_1x16_512x512_adamw_160k_ade20k_20230210_140244-05bd8466.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/segnext/segnext_mscan-t_1x16_512x512_adamw_160k_ade20k/segnext_mscan-t_1x16_512x512_adamw_160k_ade20k_20230210_140244.log.json + Paper: + Title: 'SegNeXt: Rethinking Convolutional Attention Design for Semantic Segmentation' + URL: https://arxiv.org/abs/2209.08575 + Code: https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/models/backbones/mscan.py#L328 + Framework: PyTorch +- Name: segnext_mscan-s_1xb16-adamw-160k_ade20k-512x512 + In Collection: SegNeXt + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 44.16 + mIoU(ms+flip): 45.81 + Config: configs/segnext/segnext_mscan-s_1xb16-adamw-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - MSCAN-S + - SegNeXt + Training Resources: 1x A100 GPUS + Memory (GB): 21.47 + Weights:
https://download.openmmlab.com/mmsegmentation/v0.5/segnext/segnext_mscan-s_1x16_512x512_adamw_160k_ade20k/segnext_mscan-s_1x16_512x512_adamw_160k_ade20k_20230214_113014-43013668.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/segnext/segnext_mscan-s_1x16_512x512_adamw_160k_ade20k/segnext_mscan-s_1x16_512x512_adamw_160k_ade20k_20230214_113014.log.json + Paper: + Title: 'SegNeXt: Rethinking Convolutional Attention Design for Semantic Segmentation' + URL: https://arxiv.org/abs/2209.08575 + Code: https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/models/backbones/mscan.py#L328 + Framework: PyTorch +- Name: segnext_mscan-b_1xb16-adamw-160k_ade20k-512x512 + In Collection: SegNeXt + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 48.03 + mIoU(ms+flip): 49.68 + Config: configs/segnext/segnext_mscan-b_1xb16-adamw-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - MSCAN-B + - SegNeXt + Training Resources: 1x A100 GPUS + Memory (GB): 31.03 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segnext/segnext_mscan-b_1x16_512x512_adamw_160k_ade20k/segnext_mscan-b_1x16_512x512_adamw_160k_ade20k_20230209_172053-b6f6c70c.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/segnext/segnext_mscan-b_1x16_512x512_adamw_160k_ade20k/segnext_mscan-b_1x16_512x512_adamw_160k_ade20k_20230209_172053.log.json + Paper: + Title: 'SegNeXt: Rethinking Convolutional Attention Design for Semantic Segmentation' + URL: https://arxiv.org/abs/2209.08575 + Code: https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/models/backbones/mscan.py#L328 + Framework: PyTorch +- Name: segnext_mscan-l_1xb16-adamw-160k_ade20k-512x512 + In Collection: SegNeXt + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 50.99 + mIoU(ms+flip): 52.1 + Config: configs/segnext/segnext_mscan-l_1xb16-adamw-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - MSCAN-L + - SegNeXt + Training Resources: 1x A100 GPUS + Memory (GB): 43.32 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/segnext/segnext_mscan-l_1x16_512x512_adamw_160k_ade20k/segnext_mscan-l_1x16_512x512_adamw_160k_ade20k_20230209_172055-19b14b63.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/segnext/segnext_mscan-l_1x16_512x512_adamw_160k_ade20k/segnext_mscan-l_1x16_512x512_adamw_160k_ade20k_20230209_172055.log.json + Paper: + Title: 'SegNeXt: Rethinking Convolutional Attention Design for Semantic Segmentation' + URL: https://arxiv.org/abs/2209.08575 + Code: https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/models/backbones/mscan.py#L328 + Framework: PyTorch diff --git a/configs/segnext/segnext_mscan-b_1xb16-adamw-160k_ade20k-512x512.py b/configs/segnext/segnext_mscan-b_1xb16-adamw-160k_ade20k-512x512.py new file mode 100644 index 0000000000..000f448483 --- /dev/null +++ b/configs/segnext/segnext_mscan-b_1xb16-adamw-160k_ade20k-512x512.py @@ -0,0 +1,28 @@ +_base_ = './segnext_mscan-t_1xb16-adamw-160k_ade20k-512x512.py' + +# model settings +checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segnext/mscan_b_20230227-3ab7d230.pth' # noqa +ham_norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) +model = dict( + type='EncoderDecoder', + backbone=dict( + embed_dims=[64, 128, 320, 512], + depths=[3, 3, 12, 3], + init_cfg=dict(type='Pretrained', checkpoint=checkpoint_file), + drop_path_rate=0.1, + 
norm_cfg=dict(type='BN', requires_grad=True)), + decode_head=dict( + type='LightHamHead', + in_channels=[128, 320, 512], + in_index=[1, 2, 3], + channels=512, + ham_channels=512, + dropout_ratio=0.1, + num_classes=150, + norm_cfg=ham_norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/configs/segnext/segnext_mscan-l_1xb16-adamw-160k_ade20k-512x512.py b/configs/segnext/segnext_mscan-l_1xb16-adamw-160k_ade20k-512x512.py new file mode 100644 index 0000000000..212d0a8557 --- /dev/null +++ b/configs/segnext/segnext_mscan-l_1xb16-adamw-160k_ade20k-512x512.py @@ -0,0 +1,27 @@ +_base_ = './segnext_mscan-t_1xb16-adamw-160k_ade20k-512x512.py' +# model settings +checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segnext/mscan_l_20230227-cef260d4.pth' # noqa +ham_norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) +model = dict( + type='EncoderDecoder', + backbone=dict( + embed_dims=[64, 128, 320, 512], + depths=[3, 5, 27, 3], + init_cfg=dict(type='Pretrained', checkpoint=checkpoint_file), + drop_path_rate=0.3, + norm_cfg=dict(type='BN', requires_grad=True)), + decode_head=dict( + type='LightHamHead', + in_channels=[128, 320, 512], + in_index=[1, 2, 3], + channels=1024, + ham_channels=1024, + dropout_ratio=0.1, + num_classes=150, + norm_cfg=ham_norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/configs/segnext/segnext_mscan-s_1xb16-adamw-160k_ade20k-512x512.py b/configs/segnext/segnext_mscan-s_1xb16-adamw-160k_ade20k-512x512.py new file mode 100644 index 0000000000..9a90779a60 --- /dev/null +++ b/configs/segnext/segnext_mscan-s_1xb16-adamw-160k_ade20k-512x512.py @@ -0,0 +1,27 @@ +_base_ = './segnext_mscan-t_1xb16-adamw-160k_ade20k-512x512.py' +# model settings +checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segnext/mscan_s_20230227-f33ccdf2.pth' # noqa +ham_norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) +model = dict( + type='EncoderDecoder', + backbone=dict( + embed_dims=[64, 128, 320, 512], + depths=[2, 2, 4, 2], + init_cfg=dict(type='Pretrained', checkpoint=checkpoint_file), + norm_cfg=dict(type='BN', requires_grad=True)), + decode_head=dict( + type='LightHamHead', + in_channels=[128, 320, 512], + in_index=[1, 2, 3], + channels=256, + ham_channels=256, + ham_kwargs=dict(MD_R=16), + dropout_ratio=0.1, + num_classes=150, + norm_cfg=ham_norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/configs/segnext/segnext_mscan-t_1xb16-adamw-160k_ade20k-512x512.py b/configs/segnext/segnext_mscan-t_1xb16-adamw-160k_ade20k-512x512.py new file mode 100644 index 0000000000..c8d6da85ff --- /dev/null +++ b/configs/segnext/segnext_mscan-t_1xb16-adamw-160k_ade20k-512x512.py @@ -0,0 +1,84 @@ +_base_ = [ + '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py', + '../_base_/datasets/ade20k.py' +] +# model settings +checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/segnext/mscan_t_20230227-119e8c9f.pth' # noqa +ham_norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) +crop_size = (512, 
512) +data_preprocessor = dict( + type='SegDataPreProcessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + size=(512, 512), + test_cfg=dict(size_divisor=32)) +model = dict( + type='EncoderDecoder', + data_preprocessor=data_preprocessor, + pretrained=None, + backbone=dict( + type='MSCAN', + init_cfg=dict(type='Pretrained', checkpoint=checkpoint_file), + embed_dims=[32, 64, 160, 256], + mlp_ratios=[8, 8, 4, 4], + drop_rate=0.0, + drop_path_rate=0.1, + depths=[3, 3, 5, 2], + attention_kernel_sizes=[5, [1, 7], [1, 11], [1, 21]], + attention_kernel_paddings=[2, [0, 3], [0, 5], [0, 10]], + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='BN', requires_grad=True)), + decode_head=dict( + type='LightHamHead', + in_channels=[64, 160, 256], + in_index=[1, 2, 3], + channels=256, + ham_channels=256, + dropout_ratio=0.1, + num_classes=150, + norm_cfg=ham_norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + ham_kwargs=dict( + MD_S=1, + MD_R=16, + train_steps=6, + eval_steps=7, + inv_t=100, + rand_init=True)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) + +# dataset settings +train_dataloader = dict(batch_size=16) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='OptimWrapper', + optimizer=dict( + type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.01), + paramwise_cfg=dict( + custom_keys={ + 'pos_block': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.), + 'head': dict(lr_mult=10.) + })) + +param_scheduler = [ + dict( + type='LinearLR', start_factor=1e-6, by_epoch=False, begin=0, end=1500), + dict( + type='PolyLR', + power=1.0, + begin=1500, + end=160000, + eta_min=0.0, + by_epoch=False, + ) +] diff --git a/configs/sem_fpn/README.md b/configs/sem_fpn/README.md index 054d5db4ac..697cf506e2 100644 --- a/configs/sem_fpn/README.md +++ b/configs/sem_fpn/README.md @@ -1,6 +1,6 @@ # Semantic FPN -[Panoptic Feature Pyramid Networks](https://arxiv.org/abs/1901.02446) +> [Panoptic Feature Pyramid Networks](https://arxiv.org/abs/1901.02446) ## Introduction @@ -22,6 +22,22 @@ The recently introduced panoptic segmentation task has renewed our community's i +## Results and models + +### Cityscapes + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------: | -------------- | ------ | ----: | ------------- | ------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| FPN | R-50 | 512x1024 | 80000 | 2.8 | 13.54 | V100 | 74.52 | 76.08 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/sem_fpn/fpn_r50_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/sem_fpn/fpn_r50_512x1024_80k_cityscapes/fpn_r50_512x1024_80k_cityscapes_20200717_021437-94018a0d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/sem_fpn/fpn_r50_512x1024_80k_cityscapes/fpn_r50_512x1024_80k_cityscapes-20200717_021437.log.json) | +| FPN | R-101 | 512x1024 | 80000 | 3.9 | 
10.29 | V100 | 75.80 | 77.40 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/sem_fpn/fpn_r101_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/sem_fpn/fpn_r101_512x1024_80k_cityscapes/fpn_r101_512x1024_80k_cityscapes_20200717_012416-c5800d4c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/sem_fpn/fpn_r101_512x1024_80k_cityscapes/fpn_r101_512x1024_80k_cityscapes-20200717_012416.log.json) | + +### ADE20K + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------: | -------------- | ------ | ----: | ------------- | --------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| FPN | R-50 | 512x512 | 160000 | 4.9 | 55.77 | V100 | 37.49 | 39.09 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/sem_fpn/fpn_r50_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/sem_fpn/fpn_r50_512x512_160k_ade20k/fpn_r50_512x512_160k_ade20k_20200718_131734-5b5a6ab9.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/sem_fpn/fpn_r50_512x512_160k_ade20k/fpn_r50_512x512_160k_ade20k-20200718_131734.log.json) | +| FPN | R-101 | 512x512 | 160000 | 5.9 | 40.58 | V100 | 39.35 | 40.72 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/sem_fpn/fpn_r101_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/sem_fpn/fpn_r101_512x512_160k_ade20k/fpn_r101_512x512_160k_ade20k_20200718_131734-306b5004.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/sem_fpn/fpn_r101_512x512_160k_ade20k/fpn_r101_512x512_160k_ade20k-20200718_131734.log.json) | + ## Citation ```bibtex @@ -33,19 +49,3 @@ The recently introduced panoptic segmentation task has renewed our community's i year={2019} } ``` - -## Results and models - -### Cityscapes - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------: | -------------- | ----: | ------------- | ---------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| FPN | R-50 | 512x1024 | 80000 | 2.8 | 13.54 | 74.52 | 76.08 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/sem_fpn/fpn_r50_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/sem_fpn/fpn_r50_512x1024_80k_cityscapes/fpn_r50_512x1024_80k_cityscapes_20200717_021437-94018a0d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/sem_fpn/fpn_r50_512x1024_80k_cityscapes/fpn_r50_512x1024_80k_cityscapes-20200717_021437.log.json) | -| FPN | R-101 | 512x1024 | 80000 | 3.9 | 10.29 
| 75.80 | 77.40 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/sem_fpn/fpn_r101_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/sem_fpn/fpn_r101_512x1024_80k_cityscapes/fpn_r101_512x1024_80k_cityscapes_20200717_012416-c5800d4c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/sem_fpn/fpn_r101_512x1024_80k_cityscapes/fpn_r101_512x1024_80k_cityscapes-20200717_012416.log.json) | - -### ADE20K - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------ | -------- | --------- | ------: | -------: | -------------- | ----: | ------------- | ------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| FPN | R-50 | 512x512 | 160000 | 4.9 | 55.77 | 37.49 | 39.09 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/sem_fpn/fpn_r50_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/sem_fpn/fpn_r50_512x512_160k_ade20k/fpn_r50_512x512_160k_ade20k_20200718_131734-5b5a6ab9.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/sem_fpn/fpn_r50_512x512_160k_ade20k/fpn_r50_512x512_160k_ade20k-20200718_131734.log.json) | -| FPN | R-101 | 512x512 | 160000 | 5.9 | 40.58 | 39.35 | 40.72 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/sem_fpn/fpn_r101_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/sem_fpn/fpn_r101_512x512_160k_ade20k/fpn_r101_512x512_160k_ade20k_20200718_131734-306b5004.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/sem_fpn/fpn_r101_512x512_160k_ade20k/fpn_r101_512x512_160k_ade20k-20200718_131734.log.json) | diff --git a/configs/sem_fpn/fpn_r101_4xb2-80k_cityscapes-512x1024.py b/configs/sem_fpn/fpn_r101_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..1e9bcfbb59 --- /dev/null +++ b/configs/sem_fpn/fpn_r101_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './fpn_r50_4xb2-80k_cityscapes-512x1024.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/sem_fpn/fpn_r101_4xb4-160k_ade20k-512x512.py b/configs/sem_fpn/fpn_r101_4xb4-160k_ade20k-512x512.py new file mode 100644 index 0000000000..adad1a4f38 --- /dev/null +++ b/configs/sem_fpn/fpn_r101_4xb4-160k_ade20k-512x512.py @@ -0,0 +1,5 @@ +_base_ = './fpn_r50_4xb4-160k_ade20k-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) +crop_size = (512, 512) +data_preprocessor = dict(size=crop_size) +model = dict(data_preprocessor=data_preprocessor) diff --git a/configs/sem_fpn/fpn_r101_512x1024_80k_cityscapes.py b/configs/sem_fpn/fpn_r101_512x1024_80k_cityscapes.py deleted file mode 100644 index 7f8710d4be..0000000000 --- a/configs/sem_fpn/fpn_r101_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './fpn_r50_512x1024_80k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/sem_fpn/fpn_r101_512x512_160k_ade20k.py b/configs/sem_fpn/fpn_r101_512x512_160k_ade20k.py deleted file mode 100644 index 
a8b51eb108..0000000000 --- a/configs/sem_fpn/fpn_r101_512x512_160k_ade20k.py +++ /dev/null @@ -1,5 +0,0 @@ -_base_ = './fpn_r50_512x512_160k_ade20k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) -crop_size = (512, 512) -data_preprocessor = dict(size=crop_size) -model = dict(data_preprocessor=data_preprocessor) diff --git a/configs/sem_fpn/fpn_r50_512x1024_80k_cityscapes.py b/configs/sem_fpn/fpn_r50_4xb2-80k_cityscapes-512x1024.py similarity index 100% rename from configs/sem_fpn/fpn_r50_512x1024_80k_cityscapes.py rename to configs/sem_fpn/fpn_r50_4xb2-80k_cityscapes-512x1024.py diff --git a/configs/sem_fpn/fpn_r50_512x512_160k_ade20k.py b/configs/sem_fpn/fpn_r50_4xb4-160k_ade20k-512x512.py similarity index 100% rename from configs/sem_fpn/fpn_r50_512x512_160k_ade20k.py rename to configs/sem_fpn/fpn_r50_4xb4-160k_ade20k-512x512.py diff --git a/configs/sem_fpn/metafile.yaml b/configs/sem_fpn/metafile.yaml new file mode 100644 index 0000000000..e734897245 --- /dev/null +++ b/configs/sem_fpn/metafile.yaml @@ -0,0 +1,110 @@ +Collections: +- Name: FPN + License: Apache License 2.0 + Metadata: + Training Data: + - Cityscapes + - ADE20K + Paper: + Title: Panoptic Feature Pyramid Networks + URL: https://arxiv.org/abs/1901.02446 + README: configs/sem_fpn/README.md + Frameworks: + - PyTorch +Models: +- Name: fpn_r50_4xb2-80k_cityscapes-512x1024 + In Collection: FPN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 74.52 + mIoU(ms+flip): 76.08 + Config: configs/sem_fpn/fpn_r50_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50 + - FPN + Training Resources: 4x V100 GPUS + Memory (GB): 2.8 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/sem_fpn/fpn_r50_512x1024_80k_cityscapes/fpn_r50_512x1024_80k_cityscapes_20200717_021437-94018a0d.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/sem_fpn/fpn_r50_512x1024_80k_cityscapes/fpn_r50_512x1024_80k_cityscapes-20200717_021437.log.json + Paper: + Title: Panoptic Feature Pyramid Networks + URL: https://arxiv.org/abs/1901.02446 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fpn_head.py#L12 + Framework: PyTorch +- Name: fpn_r101_4xb2-80k_cityscapes-512x1024 + In Collection: FPN + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 75.8 + mIoU(ms+flip): 77.4 + Config: configs/sem_fpn/fpn_r101_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101 + - FPN + Training Resources: 4x V100 GPUS + Memory (GB): 3.9 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/sem_fpn/fpn_r101_512x1024_80k_cityscapes/fpn_r101_512x1024_80k_cityscapes_20200717_012416-c5800d4c.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/sem_fpn/fpn_r101_512x1024_80k_cityscapes/fpn_r101_512x1024_80k_cityscapes-20200717_012416.log.json + Paper: + Title: Panoptic Feature Pyramid Networks + URL: https://arxiv.org/abs/1901.02446 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fpn_head.py#L12 + Framework: PyTorch +- Name: fpn_r50_4xb4-160k_ade20k-512x512 + In Collection: FPN + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 37.49 + mIoU(ms+flip): 39.09 + Config: configs/sem_fpn/fpn_r50_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50 + - FPN + Training 
Resources: 4x V100 GPUS + Memory (GB): 4.9 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/sem_fpn/fpn_r50_512x512_160k_ade20k/fpn_r50_512x512_160k_ade20k_20200718_131734-5b5a6ab9.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/sem_fpn/fpn_r50_512x512_160k_ade20k/fpn_r50_512x512_160k_ade20k-20200718_131734.log.json + Paper: + Title: Panoptic Feature Pyramid Networks + URL: https://arxiv.org/abs/1901.02446 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fpn_head.py#L12 + Framework: PyTorch +- Name: fpn_r101_4xb4-160k_ade20k-512x512 + In Collection: FPN + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 39.35 + mIoU(ms+flip): 40.72 + Config: configs/sem_fpn/fpn_r101_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-101 + - FPN + Training Resources: 4x V100 GPUS + Memory (GB): 5.9 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/sem_fpn/fpn_r101_512x512_160k_ade20k/fpn_r101_512x512_160k_ade20k_20200718_131734-306b5004.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/sem_fpn/fpn_r101_512x512_160k_ade20k/fpn_r101_512x512_160k_ade20k-20200718_131734.log.json + Paper: + Title: Panoptic Feature Pyramid Networks + URL: https://arxiv.org/abs/1901.02446 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fpn_head.py#L12 + Framework: PyTorch diff --git a/configs/sem_fpn/sem_fpn.yml b/configs/sem_fpn/sem_fpn.yml deleted file mode 100644 index d7ebdfe6fe..0000000000 --- a/configs/sem_fpn/sem_fpn.yml +++ /dev/null @@ -1,104 +0,0 @@ -Collections: -- Name: FPN - Metadata: - Training Data: - - Cityscapes - - ADE20K - Paper: - URL: https://arxiv.org/abs/1901.02446 - Title: Panoptic Feature Pyramid Networks - README: configs/sem_fpn/README.md - Code: - URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/fpn_head.py#L12 - Version: v0.17.0 - Converted From: - Code: https://github.com/facebookresearch/detectron2 -Models: -- Name: fpn_r50_512x1024_80k_cityscapes - In Collection: FPN - Metadata: - backbone: R-50 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 73.86 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 2.8 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 74.52 - mIoU(ms+flip): 76.08 - Config: configs/sem_fpn/fpn_r50_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/sem_fpn/fpn_r50_512x1024_80k_cityscapes/fpn_r50_512x1024_80k_cityscapes_20200717_021437-94018a0d.pth -- Name: fpn_r101_512x1024_80k_cityscapes - In Collection: FPN - Metadata: - backbone: R-101 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 97.18 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 3.9 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 75.8 - mIoU(ms+flip): 77.4 - Config: configs/sem_fpn/fpn_r101_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/sem_fpn/fpn_r101_512x1024_80k_cityscapes/fpn_r101_512x1024_80k_cityscapes_20200717_012416-c5800d4c.pth -- Name: fpn_r50_512x512_160k_ade20k - In Collection: FPN - Metadata: - backbone: R-50 - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 17.93 - hardware: V100 
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (512,512)
-    Training Memory (GB): 4.9
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: ADE20K
-    Metrics:
-      mIoU: 37.49
-      mIoU(ms+flip): 39.09
-    Config: configs/sem_fpn/fpn_r50_512x512_160k_ade20k.py
-    Weights: https://download.openmmlab.com/mmsegmentation/v0.5/sem_fpn/fpn_r50_512x512_160k_ade20k/fpn_r50_512x512_160k_ade20k_20200718_131734-5b5a6ab9.pth
-- Name: fpn_r101_512x512_160k_ade20k
-  In Collection: FPN
-  Metadata:
-    backbone: R-101
-    crop size: (512,512)
-    lr schd: 160000
-    inference time (ms/im):
-    - value: 24.64
-      hardware: V100
-      backend: PyTorch
-      batch size: 1
-      mode: FP32
-      resolution: (512,512)
-    Training Memory (GB): 5.9
-  Results:
-  - Task: Semantic Segmentation
-    Dataset: ADE20K
-    Metrics:
-      mIoU: 39.35
-      mIoU(ms+flip): 40.72
-    Config: configs/sem_fpn/fpn_r101_512x512_160k_ade20k.py
-    Weights: https://download.openmmlab.com/mmsegmentation/v0.5/sem_fpn/fpn_r101_512x512_160k_ade20k/fpn_r101_512x512_160k_ade20k_20200718_131734-306b5004.pth
diff --git a/configs/setr/README.md b/configs/setr/README.md
index 5afd2740a0..15be6ec099 100644
--- a/configs/setr/README.md
+++ b/configs/setr/README.md
@@ -1,6 +1,6 @@
 # SETR
 
-[Rethinking Semantic Segmentation from a Sequence-to-Sequence Perspective with Transformers](https://arxiv.org/abs/2012.15840)
+> [Rethinking Semantic Segmentation from a Sequence-to-Sequence Perspective with Transformers](https://arxiv.org/abs/2012.15840)
 
 ## Introduction
 
@@ -26,17 +26,6 @@ Most recent semantic segmentation methods adopt a fully-convolutional network (F
 This head has two versions.
 ```
 
-## Citation
-
-```bibtex
-@article{zheng2020rethinking,
-  title={Rethinking Semantic Segmentation from a Sequence-to-Sequence Perspective with Transformers},
-  author={Zheng, Sixiao and Lu, Jiachen and Zhao, Hengshuang and Zhu, Xiatian and Luo, Zekun and Wang, Yabiao and Fu, Yanwei and Feng, Jianfeng and Xiang, Tao and Torr, Philip HS and others},
-  journal={arXiv preprint arXiv:2012.15840},
-  year={2020}
-}
-```
-
 ## Usage
 
 You can download the pretrained ViT model from [here](https://github.com/rwightman/pytorch-image-models/releases/download/v0.1-vitjx/jx_vit_large_p16_384-b3be5167.pth). Then you can convert its keys with the script `vit2mmseg.py` in the tools directory.
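For reference, the conversion step mentioned above is usually a single command. A minimal sketch, assuming the script lives at `tools/model_converters/vit2mmseg.py` and takes the source checkpoint and the output path as positional arguments:

```shell
# convert the timm ViT checkpoint keys to the MMSegmentation naming scheme;
# PRETRAIN_PATH is the downloaded jx_vit_large_p16_384 checkpoint and
# STORE_PATH is where the converted weights will be written
python tools/model_converters/vit2mmseg.py ${PRETRAIN_PATH} ${STORE_PATH}
```
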
@@ -58,17 +47,28 @@ This script convert the model from `PRETRAIN_PATH` and store the converted model ### ADE20K -| Method | Backbone | Crop Size | Batch Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ---------- | -------- | --------- | ---------- | ------- | -------- | -------------- | ----- | ------------: | --------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| SETR Naive | ViT-L | 512x512 | 16 | 160000 | 18.40 | 4.72 | 48.28 | 49.56 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/setr/setr_naive_512x512_160k_b16_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_naive_512x512_160k_b16_ade20k/setr_naive_512x512_160k_b16_ade20k_20210619_191258-061f24f5.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_naive_512x512_160k_b16_ade20k/setr_naive_512x512_160k_b16_ade20k_20210619_191258.log.json) | -| SETR PUP | ViT-L | 512x512 | 16 | 160000 | 19.54 | 4.50 | 48.24 | 49.99 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/setr/setr_pup_512x512_160k_b16_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_pup_512x512_160k_b16_ade20k/setr_pup_512x512_160k_b16_ade20k_20210619_191343-7e0ce826.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_pup_512x512_160k_b16_ade20k/setr_pup_512x512_160k_b16_ade20k_20210619_191343.log.json) | -| SETR MLA | ViT-L | 512x512 | 8 | 160000 | 10.96 | - | 47.34 | 49.05 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/setr/setr_mla_512x512_160k_b8_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_512x512_160k_b8_ade20k/setr_mla_512x512_160k_b8_ade20k_20210619_191118-c6d21df0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_512x512_160k_b8_ade20k/setr_mla_512x512_160k_b8_ade20k_20210619_191118.log.json) | -| SETR MLA | ViT-L | 512x512 | 16 | 160000 | 17.30 | 5.25 | 47.54 | 49.37 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/setr/setr_mla_512x512_160k_b16_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_512x512_160k_b16_ade20k/setr_mla_512x512_160k_b16_ade20k_20210619_191057-f9741de7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_512x512_160k_b16_ade20k/setr_mla_512x512_160k_b16_ade20k_20210619_191057.log.json) | +| Method | Backbone | Crop Size | Batch Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ---------- | -------- | --------- | ---------- | ------- | -------- | -------------- | ------ | ----- | ------------: | -------------------------------------------------------------------------------------------------------------------------- | 
------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| SETR Naive | ViT-L | 512x512 | 16 | 160000 | 18.40 | 4.72 | V100 | 48.28 | 49.56 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/setr/setr_vit-l_naive_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_naive_512x512_160k_b16_ade20k/setr_naive_512x512_160k_b16_ade20k_20210619_191258-061f24f5.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_naive_512x512_160k_b16_ade20k/setr_naive_512x512_160k_b16_ade20k_20210619_191258.log.json) | +| SETR PUP | ViT-L | 512x512 | 16 | 160000 | 19.54 | 4.50 | V100 | 48.24 | 49.99 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/setr/setr_vit-l_pup_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_pup_512x512_160k_b16_ade20k/setr_pup_512x512_160k_b16_ade20k_20210619_191343-7e0ce826.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_pup_512x512_160k_b16_ade20k/setr_pup_512x512_160k_b16_ade20k_20210619_191343.log.json) | +| SETR MLA | ViT-L | 512x512 | 8 | 160000 | 10.96 | - | V100 | 47.34 | 49.05 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/setr/setr_vit-l-mla_8xb1-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_512x512_160k_b8_ade20k/setr_mla_512x512_160k_b8_ade20k_20210619_191118-c6d21df0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_512x512_160k_b8_ade20k/setr_mla_512x512_160k_b8_ade20k_20210619_191118.log.json) | +| SETR MLA | ViT-L | 512x512 | 16 | 160000 | 17.30 | 5.25 | V100 | 47.39 | 49.37 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/setr/setr_vit-l_mla_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_512x512_160k_b16_ade20k/setr_mla_512x512_160k_b16_ade20k_20210619_191057-f9741de7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_512x512_160k_b16_ade20k/setr_mla_512x512_160k_b16_ade20k_20210619_191057.log.json) | ### Cityscapes -| Method | Backbone | Crop Size | Batch Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ---------- | -------- | --------- | ---------- | ------- | -------- | -------------- | ----- | ------------: | ---------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| SETR Naive | ViT-L | 768x768 | 8 | 80000 | 24.06 | 0.39 | 78.10 | 80.22 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/setr/setr_vit-large_naive_8x1_768x768_80k_cityscapes.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_naive_vit-large_8x1_768x768_80k_cityscapes/setr_naive_vit-large_8x1_768x768_80k_cityscapes_20211123_000505-20728e80.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_naive_vit-large_8x1_768x768_80k_cityscapes/setr_naive_vit-large_8x1_768x768_80k_cityscapes_20211123_000505.log.json) | -| SETR PUP | ViT-L | 768x768 | 8 | 80000 | 27.96 | 0.37 | 79.21 | 81.02 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/setr/setr_vit-large_pup_8x1_768x768_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_pup_vit-large_8x1_768x768_80k_cityscapes/setr_pup_vit-large_8x1_768x768_80k_cityscapes_20211122_155115-f6f37b8f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_pup_vit-large_8x1_768x768_80k_cityscapes/setr_pup_vit-large_8x1_768x768_80k_cityscapes_20211122_155115.log.json) | -| SETR MLA | ViT-L | 768x768 | 8 | 80000 | 24.10 | 0.41 | 77.00 | 79.59 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/setr/setr_vit-large_mla_8x1_768x768_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_vit-large_8x1_768x768_80k_cityscapes/setr_mla_vit-large_8x1_768x768_80k_cityscapes_20211119_101003-7f8dccbe.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_vit-large_8x1_768x768_80k_cityscapes/setr_mla_vit-large_8x1_768x768_80k_cityscapes_20211119_101003.log.json) | +| Method | Backbone | Crop Size | Batch Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ---------- | -------- | --------- | ---------- | ------- | -------- | -------------- | ------ | ----- | ------------: | ----------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| SETR Naive | ViT-L | 768x768 | 8 | 80000 | 24.06 | 0.39 | V100 | 78.10 | 80.22 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/setr/setr_vit-l_naive_8xb1-80k_cityscapes-768x768.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_naive_vit-large_8x1_768x768_80k_cityscapes/setr_naive_vit-large_8x1_768x768_80k_cityscapes_20211123_000505-20728e80.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_naive_vit-large_8x1_768x768_80k_cityscapes/setr_naive_vit-large_8x1_768x768_80k_cityscapes_20211123_000505.log.json) | +| SETR PUP | ViT-L | 768x768 | 8 | 80000 | 27.96 | 0.37 | V100 | 79.21 | 81.02 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/setr/setr_vit-l_pup_8xb1-80k_cityscapes-768x768.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_pup_vit-large_8x1_768x768_80k_cityscapes/setr_pup_vit-large_8x1_768x768_80k_cityscapes_20211122_155115-f6f37b8f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_pup_vit-large_8x1_768x768_80k_cityscapes/setr_pup_vit-large_8x1_768x768_80k_cityscapes_20211122_155115.log.json) | +| SETR MLA | ViT-L | 768x768 | 8 | 80000 | 24.10 | 0.41 | V100 | 77.00 | 79.59 | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/setr/setr_vit-l_mla_8xb1-80k_cityscapes-768x768.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_vit-large_8x1_768x768_80k_cityscapes/setr_mla_vit-large_8x1_768x768_80k_cityscapes_20211119_101003-7f8dccbe.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_vit-large_8x1_768x768_80k_cityscapes/setr_mla_vit-large_8x1_768x768_80k_cityscapes_20211119_101003.log.json) | + +## Citation + +```bibtex +@article{zheng2020rethinking, + title={Rethinking Semantic Segmentation from a Sequence-to-Sequence Perspective with Transformers}, + author={Zheng, Sixiao and Lu, Jiachen and Zhao, Hengshuang and Zhu, Xiatian and Luo, Zekun and Wang, Yabiao and Fu, Yanwei and Feng, Jianfeng and Xiang, Tao and Torr, Philip HS and others}, + journal={arXiv preprint arXiv:2012.15840}, + year={2020} +} +``` diff --git a/configs/setr/metafile.yaml b/configs/setr/metafile.yaml new file mode 100644 index 0000000000..8e6bc087dd --- /dev/null +++ b/configs/setr/metafile.yaml @@ -0,0 +1,197 @@ +Collections: +- Name: SETR + License: Apache License 2.0 + Metadata: + Training Data: + - ADE20K + - Cityscapes + Paper: + Title: Rethinking Semantic Segmentation from a Sequence-to-Sequence Perspective + with Transformers + URL: https://arxiv.org/abs/2012.15840 + README: configs/setr/README.md + Frameworks: + - PyTorch +Models: +- Name: setr_vit-l_naive_8xb2-160k_ade20k-512x512 + In Collection: SETR + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 48.28 + mIoU(ms+flip): 49.56 + Config: configs/setr/setr_vit-l_naive_8xb2-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - ViT-L + - SETR + - Naive + Training Resources: 8x V100 GPUS + Memory (GB): 18.4 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_naive_512x512_160k_b16_ade20k/setr_naive_512x512_160k_b16_ade20k_20210619_191258-061f24f5.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_naive_512x512_160k_b16_ade20k/setr_naive_512x512_160k_b16_ade20k_20210619_191258.log.json + Paper: + Title: Rethinking Semantic Segmentation from a Sequence-to-Sequence Perspective + with Transformers + URL: https://arxiv.org/abs/2012.15840 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/setr_up_head.py#L11 + Framework: PyTorch +- Name: setr_vit-l_pup_8xb2-160k_ade20k-512x512 + In Collection: SETR + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 48.24 + mIoU(ms+flip): 49.99 + Config: configs/setr/setr_vit-l_pup_8xb2-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - ViT-L + - SETR + - PUP + Training Resources: 8x V100 GPUS + Memory (GB): 19.54 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_pup_512x512_160k_b16_ade20k/setr_pup_512x512_160k_b16_ade20k_20210619_191343-7e0ce826.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_pup_512x512_160k_b16_ade20k/setr_pup_512x512_160k_b16_ade20k_20210619_191343.log.json + Paper: + Title: Rethinking Semantic Segmentation from a Sequence-to-Sequence Perspective + with Transformers + URL: https://arxiv.org/abs/2012.15840 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/setr_up_head.py#L11 + Framework: PyTorch +- Name: setr_vit-l-mla_8xb1-160k_ade20k-512x512 + In Collection: SETR + Results: + Task: Semantic 
Segmentation + Dataset: ADE20K + Metrics: + mIoU: 47.34 + mIoU(ms+flip): 49.05 + Config: configs/setr/setr_vit-l-mla_8xb1-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 8 + Architecture: + - ViT-L + - SETR + - MLA + Training Resources: 8x V100 GPUS + Memory (GB): 10.96 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_512x512_160k_b8_ade20k/setr_mla_512x512_160k_b8_ade20k_20210619_191118-c6d21df0.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_512x512_160k_b8_ade20k/setr_mla_512x512_160k_b8_ade20k_20210619_191118.log.json + Paper: + Title: Rethinking Semantic Segmentation from a Sequence-to-Sequence Perspective + with Transformers + URL: https://arxiv.org/abs/2012.15840 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/setr_up_head.py#L11 + Framework: PyTorch +- Name: setr_vit-l_mla_8xb2-160k_ade20k-512x512 + In Collection: SETR + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 47.39 + mIoU(ms+flip): 49.37 + Config: configs/setr/setr_vit-l_mla_8xb2-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - ViT-L + - SETR + - MLA + Training Resources: 8x V100 GPUS + Memory (GB): 17.3 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_512x512_160k_b16_ade20k/setr_mla_512x512_160k_b16_ade20k_20210619_191057-f9741de7.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_512x512_160k_b16_ade20k/setr_mla_512x512_160k_b16_ade20k_20210619_191057.log.json + Paper: + Title: Rethinking Semantic Segmentation from a Sequence-to-Sequence Perspective + with Transformers + URL: https://arxiv.org/abs/2012.15840 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/setr_up_head.py#L11 + Framework: PyTorch +- Name: setr_vit-l_naive_8xb1-80k_cityscapes-768x768 + In Collection: SETR + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.1 + mIoU(ms+flip): 80.22 + Config: configs/setr/setr_vit-l_naive_8xb1-80k_cityscapes-768x768.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - ViT-L + - SETR + - Naive + Training Resources: 8x V100 GPUS + Memory (GB): 24.06 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_naive_vit-large_8x1_768x768_80k_cityscapes/setr_naive_vit-large_8x1_768x768_80k_cityscapes_20211123_000505-20728e80.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_naive_vit-large_8x1_768x768_80k_cityscapes/setr_naive_vit-large_8x1_768x768_80k_cityscapes_20211123_000505.log.json + Paper: + Title: Rethinking Semantic Segmentation from a Sequence-to-Sequence Perspective + with Transformers + URL: https://arxiv.org/abs/2012.15840 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/setr_up_head.py#L11 + Framework: PyTorch +- Name: setr_vit-l_pup_8xb1-80k_cityscapes-768x768 + In Collection: SETR + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.21 + mIoU(ms+flip): 81.02 + Config: configs/setr/setr_vit-l_pup_8xb1-80k_cityscapes-768x768.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - ViT-L + - SETR + - PUP + Training Resources: 8x V100 GPUS + Memory (GB): 27.96 + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_pup_vit-large_8x1_768x768_80k_cityscapes/setr_pup_vit-large_8x1_768x768_80k_cityscapes_20211122_155115-f6f37b8f.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_pup_vit-large_8x1_768x768_80k_cityscapes/setr_pup_vit-large_8x1_768x768_80k_cityscapes_20211122_155115.log.json + Paper: + Title: Rethinking Semantic Segmentation from a Sequence-to-Sequence Perspective + with Transformers + URL: https://arxiv.org/abs/2012.15840 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/setr_up_head.py#L11 + Framework: PyTorch +- Name: setr_vit-l_mla_8xb1-80k_cityscapes-768x768 + In Collection: SETR + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 77.0 + mIoU(ms+flip): 79.59 + Config: configs/setr/setr_vit-l_mla_8xb1-80k_cityscapes-768x768.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - ViT-L + - SETR + - MLA + Training Resources: 8x V100 GPUS + Memory (GB): 24.1 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_vit-large_8x1_768x768_80k_cityscapes/setr_mla_vit-large_8x1_768x768_80k_cityscapes_20211119_101003-7f8dccbe.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_vit-large_8x1_768x768_80k_cityscapes/setr_mla_vit-large_8x1_768x768_80k_cityscapes_20211119_101003.log.json + Paper: + Title: Rethinking Semantic Segmentation from a Sequence-to-Sequence Perspective + with Transformers + URL: https://arxiv.org/abs/2012.15840 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/setr_up_head.py#L11 + Framework: PyTorch diff --git a/configs/setr/setr.yml b/configs/setr/setr.yml deleted file mode 100644 index 27f58e48b0..0000000000 --- a/configs/setr/setr.yml +++ /dev/null @@ -1,164 +0,0 @@ -Collections: -- Name: SETR - Metadata: - Training Data: - - ADE20K - - Cityscapes - Paper: - URL: https://arxiv.org/abs/2012.15840 - Title: Rethinking Semantic Segmentation from a Sequence-to-Sequence Perspective - with Transformers - README: configs/setr/README.md - Code: - URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/setr_up_head.py#L11 - Version: v0.17.0 - Converted From: - Code: https://github.com/fudan-zvg/SETR -Models: -- Name: setr_naive_512x512_160k_b16_ade20k - In Collection: SETR - Metadata: - backbone: ViT-L - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 211.86 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 18.4 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 48.28 - mIoU(ms+flip): 49.56 - Config: configs/setr/setr_naive_512x512_160k_b16_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_naive_512x512_160k_b16_ade20k/setr_naive_512x512_160k_b16_ade20k_20210619_191258-061f24f5.pth -- Name: setr_pup_512x512_160k_b16_ade20k - In Collection: SETR - Metadata: - backbone: ViT-L - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 222.22 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 19.54 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 48.24 - mIoU(ms+flip): 49.99 - Config: configs/setr/setr_pup_512x512_160k_b16_ade20k.py - Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_pup_512x512_160k_b16_ade20k/setr_pup_512x512_160k_b16_ade20k_20210619_191343-7e0ce826.pth -- Name: setr_mla_512x512_160k_b8_ade20k - In Collection: SETR - Metadata: - backbone: ViT-L - crop size: (512,512) - lr schd: 160000 - Training Memory (GB): 10.96 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 47.34 - mIoU(ms+flip): 49.05 - Config: configs/setr/setr_mla_512x512_160k_b8_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_512x512_160k_b8_ade20k/setr_mla_512x512_160k_b8_ade20k_20210619_191118-c6d21df0.pth -- Name: setr_mla_512x512_160k_b16_ade20k - In Collection: SETR - Metadata: - backbone: ViT-L - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 190.48 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 17.3 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 47.54 - mIoU(ms+flip): 49.37 - Config: configs/setr/setr_mla_512x512_160k_b16_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_512x512_160k_b16_ade20k/setr_mla_512x512_160k_b16_ade20k_20210619_191057-f9741de7.pth -- Name: setr_vit-large_naive_8x1_768x768_80k_cityscapes - In Collection: SETR - Metadata: - backbone: ViT-L - crop size: (768,768) - lr schd: 80000 - inference time (ms/im): - - value: 2564.1 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (768,768) - Training Memory (GB): 24.06 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.1 - mIoU(ms+flip): 80.22 - Config: configs/setr/setr_vit-large_naive_8x1_768x768_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_naive_vit-large_8x1_768x768_80k_cityscapes/setr_naive_vit-large_8x1_768x768_80k_cityscapes_20211123_000505-20728e80.pth -- Name: setr_vit-large_pup_8x1_768x768_80k_cityscapes - In Collection: SETR - Metadata: - backbone: ViT-L - crop size: (768,768) - lr schd: 80000 - inference time (ms/im): - - value: 2702.7 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (768,768) - Training Memory (GB): 27.96 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.21 - mIoU(ms+flip): 81.02 - Config: configs/setr/setr_vit-large_pup_8x1_768x768_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_pup_vit-large_8x1_768x768_80k_cityscapes/setr_pup_vit-large_8x1_768x768_80k_cityscapes_20211122_155115-f6f37b8f.pth -- Name: setr_vit-large_mla_8x1_768x768_80k_cityscapes - In Collection: SETR - Metadata: - backbone: ViT-L - crop size: (768,768) - lr schd: 80000 - inference time (ms/im): - - value: 2439.02 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (768,768) - Training Memory (GB): 24.1 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 77.0 - mIoU(ms+flip): 79.59 - Config: configs/setr/setr_vit-large_mla_8x1_768x768_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/setr/setr_mla_vit-large_8x1_768x768_80k_cityscapes/setr_mla_vit-large_8x1_768x768_80k_cityscapes_20211119_101003-7f8dccbe.pth diff --git a/configs/setr/setr_mla_512x512_160k_b16_ade20k.py b/configs/setr/setr_mla_512x512_160k_b16_ade20k.py deleted file mode 100644 index 710e1ec364..0000000000 --- a/configs/setr/setr_mla_512x512_160k_b16_ade20k.py +++ 
/dev/null @@ -1,6 +0,0 @@ -_base_ = ['./setr_mla_512x512_160k_b8_ade20k.py'] - -# num_gpus: 8 -> batch_size: 16 -train_dataloader = dict(batch_size=2) -val_dataloader = dict(batch_size=1) -test_dataloader = val_dataloader diff --git a/configs/setr/setr_mla_512x512_160k_b8_ade20k.py b/configs/setr/setr_vit-l-mla_8xb1-160k_ade20k-512x512.py similarity index 100% rename from configs/setr/setr_mla_512x512_160k_b8_ade20k.py rename to configs/setr/setr_vit-l-mla_8xb1-160k_ade20k-512x512.py diff --git a/configs/setr/setr_vit-large_mla_8x1_768x768_80k_cityscapes.py b/configs/setr/setr_vit-l_mla_8xb1-80k_cityscapes-768x768.py similarity index 100% rename from configs/setr/setr_vit-large_mla_8x1_768x768_80k_cityscapes.py rename to configs/setr/setr_vit-l_mla_8xb1-80k_cityscapes-768x768.py diff --git a/configs/setr/setr_vit-l_mla_8xb2-160k_ade20k-512x512.py b/configs/setr/setr_vit-l_mla_8xb2-160k_ade20k-512x512.py new file mode 100644 index 0000000000..4d3fb7d4e1 --- /dev/null +++ b/configs/setr/setr_vit-l_mla_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,6 @@ +_base_ = ['./setr_vit-l-mla_8xb1-160k_ade20k-512x512.py'] + +# num_gpus: 8 -> batch_size: 16 +train_dataloader = dict(batch_size=2) +val_dataloader = dict(batch_size=1) +test_dataloader = val_dataloader diff --git a/configs/setr/setr_vit-large_naive_8x1_768x768_80k_cityscapes.py b/configs/setr/setr_vit-l_naive_8xb1-80k_cityscapes-768x768.py similarity index 100% rename from configs/setr/setr_vit-large_naive_8x1_768x768_80k_cityscapes.py rename to configs/setr/setr_vit-l_naive_8xb1-80k_cityscapes-768x768.py diff --git a/configs/setr/setr_naive_512x512_160k_b16_ade20k.py b/configs/setr/setr_vit-l_naive_8xb2-160k_ade20k-512x512.py similarity index 100% rename from configs/setr/setr_naive_512x512_160k_b16_ade20k.py rename to configs/setr/setr_vit-l_naive_8xb2-160k_ade20k-512x512.py diff --git a/configs/setr/setr_vit-large_pup_8x1_768x768_80k_cityscapes.py b/configs/setr/setr_vit-l_pup_8xb1-80k_cityscapes-768x768.py similarity index 100% rename from configs/setr/setr_vit-large_pup_8x1_768x768_80k_cityscapes.py rename to configs/setr/setr_vit-l_pup_8xb1-80k_cityscapes-768x768.py diff --git a/configs/setr/setr_pup_512x512_160k_b16_ade20k.py b/configs/setr/setr_vit-l_pup_8xb2-160k_ade20k-512x512.py similarity index 100% rename from configs/setr/setr_pup_512x512_160k_b16_ade20k.py rename to configs/setr/setr_vit-l_pup_8xb2-160k_ade20k-512x512.py diff --git a/configs/stdc/README.md b/configs/stdc/README.md index 1c6d70a252..3e8bf60688 100644 --- a/configs/stdc/README.md +++ b/configs/stdc/README.md @@ -1,6 +1,6 @@ # STDC -[Rethinking BiSeNet For Real-time Semantic Segmentation](https://arxiv.org/abs/2104.13188) +> [Rethinking BiSeNet For Real-time Semantic Segmentation](https://arxiv.org/abs/2104.13188) ## Introduction @@ -22,18 +22,6 @@ BiSeNet has been proved to be a popular two-stream network for real-time segment -## Citation - -```bibtex -@inproceedings{fan2021rethinking, - title={Rethinking BiSeNet For Real-time Semantic Segmentation}, - author={Fan, Mingyuan and Lai, Shenqi and Huang, Junshi and Wei, Xiaoming and Chai, Zhenhua and Luo, Junfeng and Wei, Xiaolin}, - booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, - pages={9716--9725}, - year={2021} -} -``` - ## Usage We have provided [ImageNet Pretrained STDCNet Weights](https://drive.google.com/drive/folders/1wROFwRt8qWHD4jSo8Zu1gp1d6oYJ3ns1) models converted from [official repo](https://github.com/MichaelFan01/STDC-Seg). 
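As a usage sketch (assuming the converter script is `tools/model_converters/stdc2mmseg.py` and that it takes the STDC variant as a third argument):

```shell
# convert the official STDC ImageNet-pretrained weights to the MMSegmentation
# format; STDC_TYPE would be 'STDC1' or 'STDC2', matching the backbone in use
python tools/model_converters/stdc2mmseg.py ${PRETRAIN_PATH} ${STORE_PATH} ${STDC_TYPE}
```
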
@@ -58,12 +46,12 @@ This script convert model from `PRETRAIN_PATH` and store the converted model in ### Cityscapes -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| -------------------- | -------- | --------- | ------: | -------- | -------------- | ----: | ------------- | ------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| STDC 1 (No Pretrain) | STDC1 | 512x1024 | 80000 | 7.15 | 23.06 | 71.82 | 73.89 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/stdc/stdc1_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/stdc/stdc1_512x1024_80k_cityscapes/stdc1_512x1024_80k_cityscapes_20220224_073048-74e6920a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/stdc/stdc1_512x1024_80k_cityscapes/stdc1_512x1024_80k_cityscapes_20220224_073048.log.json) | -| STDC 1 | STDC1 | 512x1024 | 80000 | - | - | 74.94 | 76.97 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/stdc/stdc1_in1k-pre_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/stdc/stdc1_in1k-pre_512x1024_80k_cityscapes/stdc1_in1k-pre_512x1024_80k_cityscapes_20220224_141648-3d4c2981.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/stdc/stdc1_in1k-pre_512x1024_80k_cityscapes/stdc1_in1k-pre_512x1024_80k_cityscapes_20220224_141648.log.json) | -| STDC 2 (No Pretrain) | STDC2 | 512x1024 | 80000 | 8.27 | 23.71 | 73.15 | 76.13 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/stdc/stdc2_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/stdc/stdc2_512x1024_80k_cityscapes/stdc2_512x1024_80k_cityscapes_20220222_132015-fb1e3a1a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/stdc/stdc2_512x1024_80k_cityscapes/stdc2_512x1024_80k_cityscapes_20220222_132015.log.json) | -| STDC 2 | STDC2 | 512x1024 | 80000 | - | - | 76.67 | 78.67 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/stdc/stdc2_in1k-pre_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/stdc/stdc2_in1k-pre_512x1024_80k_cityscapes/stdc2_in1k-pre_512x1024_80k_cityscapes_20220224_073048-1f8f0f6c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/stdc/stdc2_in1k-pre_512x1024_80k_cityscapes/stdc2_in1k-pre_512x1024_80k_cityscapes_20220224_073048.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------------------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------- | ----------------------------------------------------------------------------------------------------------------------------- | 
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| STDC | STDC1 (No Pretrain) | 512x1024 | 80000 | 7.15 | 23.06 | V100 | 71.82 | 73.89 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/stdc/stdc1_4xb12-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/stdc/stdc1_512x1024_80k_cityscapes/stdc1_512x1024_80k_cityscapes_20220224_073048-74e6920a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/stdc/stdc1_512x1024_80k_cityscapes/stdc1_512x1024_80k_cityscapes_20220224_073048.log.json) | +| STDC | STDC1 | 512x1024 | 80000 | - | - | V100 | 74.94 | 76.97 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/stdc/stdc1_in1k-pre_4xb12-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/stdc/stdc1_in1k-pre_512x1024_80k_cityscapes/stdc1_in1k-pre_512x1024_80k_cityscapes_20220224_141648-3d4c2981.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/stdc/stdc1_in1k-pre_512x1024_80k_cityscapes/stdc1_in1k-pre_512x1024_80k_cityscapes_20220224_141648.log.json) | +| STDC | STDC2 (No Pretrain) | 512x1024 | 80000 | 8.27 | 23.71 | V100 | 73.15 | 76.13 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/stdc/stdc2_4xb12-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/stdc/stdc2_512x1024_80k_cityscapes/stdc2_512x1024_80k_cityscapes_20220222_132015-fb1e3a1a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/stdc/stdc2_512x1024_80k_cityscapes/stdc2_512x1024_80k_cityscapes_20220222_132015.log.json) | +| STDC | STDC2 | 512x1024 | 80000 | - | - | V100 | 76.67 | 78.67 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/stdc/stdc2_in1k-pre_4xb12-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/stdc/stdc2_in1k-pre_512x1024_80k_cityscapes/stdc2_in1k-pre_512x1024_80k_cityscapes_20220224_073048-1f8f0f6c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/stdc/stdc2_in1k-pre_512x1024_80k_cityscapes/stdc2_in1k-pre_512x1024_80k_cityscapes_20220224_073048.log.json) | Note: @@ -71,3 +59,15 @@ Note: - `No Pretrain` means the model is trained from scratch. - The FPS is for reference only. The environment is also different from paper setting, whose input size is `512x1024` and `768x1536`, i.e., 50% and 75% of our input size, respectively and using TensorRT. - The parameter `fusion_kernel` in `STDCHead` is not learnable. In official repo, `find_unused_parameters=True` is set [here](https://github.com/MichaelFan01/STDC-Seg/blob/59ff37fbd693b99972c76fcefe97caa14aeb619f/train.py#L220). You may check it by printing model parameters of original repo on your own. 
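The last note above can be checked programmatically; a hedged sketch, assuming mmsegmentation 1.x is installed and exposes `register_all_modules` in `mmseg.utils` as other OpenMMLab 1.x codebases do:

```python
from mmengine.config import Config
from mmseg.registry import MODELS
from mmseg.utils import register_all_modules

# Sketch: build the STDC model from the renamed config and list any
# `fusion_kernel` entries among its named parameters together with their
# requires_grad flag; per the note above, none should be trainable.
register_all_modules()
cfg = Config.fromfile('configs/stdc/stdc1_4xb12-80k_cityscapes-512x1024.py')
model = MODELS.build(cfg.model)
for name, param in model.named_parameters():
    if 'fusion_kernel' in name:
        print(name, param.requires_grad)  # expected: False
```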
+ +## Citation + +```bibtex +@inproceedings{fan2021rethinking, + title={Rethinking BiSeNet For Real-time Semantic Segmentation}, + author={Fan, Mingyuan and Lai, Shenqi and Huang, Junshi and Wei, Xiaoming and Chai, Zhenhua and Luo, Junfeng and Wei, Xiaolin}, + booktitle={Proceedings of the IEEE/CVF Conference on Computer Vision and Pattern Recognition}, + pages={9716--9725}, + year={2021} +} +``` diff --git a/configs/stdc/metafile.yaml b/configs/stdc/metafile.yaml new file mode 100644 index 0000000000..93cb14f50b --- /dev/null +++ b/configs/stdc/metafile.yaml @@ -0,0 +1,107 @@ +Collections: +- Name: STDC + License: Apache License 2.0 + Metadata: + Training Data: + - Cityscapes + Paper: + Title: Rethinking BiSeNet For Real-time Semantic Segmentation + URL: https://arxiv.org/abs/2104.13188 + README: configs/stdc/README.md + Frameworks: + - PyTorch +Models: +- Name: stdc1_4xb12-80k_cityscapes-512x1024 + In Collection: STDC + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 71.82 + mIoU(ms+flip): 73.89 + Config: configs/stdc/stdc1_4xb12-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 48 + Architecture: + - STDC1 + - STDC + Training Resources: 4x V100 GPUS + Memory (GB): 7.15 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/stdc/stdc1_512x1024_80k_cityscapes/stdc1_512x1024_80k_cityscapes_20220224_073048-74e6920a.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/stdc/stdc1_512x1024_80k_cityscapes/stdc1_512x1024_80k_cityscapes_20220224_073048.log.json + Paper: + Title: Rethinking BiSeNet For Real-time Semantic Segmentation + URL: https://arxiv.org/abs/2104.13188 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.20.0/mmseg/models/backbones/stdc.py#L394 + Framework: PyTorch +- Name: stdc1_in1k-pre_4xb12-80k_cityscapes-512x1024 + In Collection: STDC + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 74.94 + mIoU(ms+flip): 76.97 + Config: configs/stdc/stdc1_in1k-pre_4xb12-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 48 + Architecture: + - STDC1 + - STDC + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/stdc/stdc1_in1k-pre_512x1024_80k_cityscapes/stdc1_in1k-pre_512x1024_80k_cityscapes_20220224_141648-3d4c2981.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/stdc/stdc1_in1k-pre_512x1024_80k_cityscapes/stdc1_in1k-pre_512x1024_80k_cityscapes_20220224_141648.log.json + Paper: + Title: Rethinking BiSeNet For Real-time Semantic Segmentation + URL: https://arxiv.org/abs/2104.13188 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.20.0/mmseg/models/backbones/stdc.py#L394 + Framework: PyTorch +- Name: stdc2_4xb12-80k_cityscapes-512x1024 + In Collection: STDC + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 73.15 + mIoU(ms+flip): 76.13 + Config: configs/stdc/stdc2_4xb12-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 48 + Architecture: + - STDC2 + - STDC + Training Resources: 4x V100 GPUS + Memory (GB): 8.27 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/stdc/stdc2_512x1024_80k_cityscapes/stdc2_512x1024_80k_cityscapes_20220222_132015-fb1e3a1a.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/stdc/stdc2_512x1024_80k_cityscapes/stdc2_512x1024_80k_cityscapes_20220222_132015.log.json + Paper: + Title: Rethinking BiSeNet For Real-time Semantic Segmentation + URL: 
https://arxiv.org/abs/2104.13188 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.20.0/mmseg/models/backbones/stdc.py#L394 + Framework: PyTorch +- Name: stdc2_in1k-pre_4xb12-80k_cityscapes-512x1024 + In Collection: STDC + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 76.67 + mIoU(ms+flip): 78.67 + Config: configs/stdc/stdc2_in1k-pre_4xb12-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 48 + Architecture: + - STDC2 + - STDC + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/stdc/stdc2_in1k-pre_512x1024_80k_cityscapes/stdc2_in1k-pre_512x1024_80k_cityscapes_20220224_073048-1f8f0f6c.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/stdc/stdc2_in1k-pre_512x1024_80k_cityscapes/stdc2_in1k-pre_512x1024_80k_cityscapes_20220224_073048.log.json + Paper: + Title: Rethinking BiSeNet For Real-time Semantic Segmentation + URL: https://arxiv.org/abs/2104.13188 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.20.0/mmseg/models/backbones/stdc.py#L394 + Framework: PyTorch diff --git a/configs/stdc/stdc.yml b/configs/stdc/stdc.yml deleted file mode 100644 index f584b74bca..0000000000 --- a/configs/stdc/stdc.yml +++ /dev/null @@ -1,87 +0,0 @@ -Collections: -- Name: STDC - Metadata: - Training Data: - - Cityscapes - Paper: - URL: https://arxiv.org/abs/2104.13188 - Title: Rethinking BiSeNet For Real-time Semantic Segmentation - README: configs/stdc/README.md - Code: - URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.20.0/mmseg/models/backbones/stdc.py#L394 - Version: v0.20.0 - Converted From: - Code: https://github.com/MichaelFan01/STDC-Seg -Models: -- Name: stdc1_512x1024_80k_cityscapes - In Collection: STDC - Metadata: - backbone: STDC1 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 43.37 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 7.15 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 71.82 - mIoU(ms+flip): 73.89 - Config: configs/stdc/stdc1_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/stdc/stdc1_512x1024_80k_cityscapes/stdc1_512x1024_80k_cityscapes_20220224_073048-74e6920a.pth -- Name: stdc1_in1k-pre_512x1024_80k_cityscapes - In Collection: STDC - Metadata: - backbone: STDC1 - crop size: (512,1024) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 74.94 - mIoU(ms+flip): 76.97 - Config: configs/stdc/stdc1_in1k-pre_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/stdc/stdc1_in1k-pre_512x1024_80k_cityscapes/stdc1_in1k-pre_512x1024_80k_cityscapes_20220224_141648-3d4c2981.pth -- Name: stdc2_512x1024_80k_cityscapes - In Collection: STDC - Metadata: - backbone: STDC2 - crop size: (512,1024) - lr schd: 80000 - inference time (ms/im): - - value: 42.18 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 8.27 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 73.15 - mIoU(ms+flip): 76.13 - Config: configs/stdc/stdc2_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/stdc/stdc2_512x1024_80k_cityscapes/stdc2_512x1024_80k_cityscapes_20220222_132015-fb1e3a1a.pth -- Name: stdc2_in1k-pre_512x1024_80k_cityscapes - In Collection: STDC - Metadata: - backbone: STDC2 - 
crop size: (512,1024) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 76.67 - mIoU(ms+flip): 78.67 - Config: configs/stdc/stdc2_in1k-pre_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/stdc/stdc2_in1k-pre_512x1024_80k_cityscapes/stdc2_in1k-pre_512x1024_80k_cityscapes_20220224_073048-1f8f0f6c.pth diff --git a/configs/stdc/stdc1_512x1024_80k_cityscapes.py b/configs/stdc/stdc1_4xb12-80k_cityscapes-512x1024.py similarity index 100% rename from configs/stdc/stdc1_512x1024_80k_cityscapes.py rename to configs/stdc/stdc1_4xb12-80k_cityscapes-512x1024.py diff --git a/configs/stdc/stdc1_in1k-pre_4xb12-80k_cityscapes-512x1024.py b/configs/stdc/stdc1_in1k-pre_4xb12-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..15e807f9ed --- /dev/null +++ b/configs/stdc/stdc1_in1k-pre_4xb12-80k_cityscapes-512x1024.py @@ -0,0 +1,6 @@ +checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/stdc/stdc1_20220308-5368626c.pth' # noqa +_base_ = './stdc1_4xb12-80k_cityscapes-512x1024.py' +model = dict( + backbone=dict( + backbone_cfg=dict( + init_cfg=dict(type='Pretrained', checkpoint=checkpoint)))) diff --git a/configs/stdc/stdc1_in1k-pre_512x1024_80k_cityscapes.py b/configs/stdc/stdc1_in1k-pre_512x1024_80k_cityscapes.py deleted file mode 100644 index f295bf494e..0000000000 --- a/configs/stdc/stdc1_in1k-pre_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,6 +0,0 @@ -checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/stdc/stdc1_20220308-5368626c.pth' # noqa -_base_ = './stdc1_512x1024_80k_cityscapes.py' -model = dict( - backbone=dict( - backbone_cfg=dict( - init_cfg=dict(type='Pretrained', checkpoint=checkpoint)))) diff --git a/configs/stdc/stdc2_4xb12-80k_cityscapes-512x1024.py b/configs/stdc/stdc2_4xb12-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..5657351698 --- /dev/null +++ b/configs/stdc/stdc2_4xb12-80k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './stdc1_4xb12-80k_cityscapes-512x1024.py' +model = dict(backbone=dict(backbone_cfg=dict(stdc_type='STDCNet2'))) diff --git a/configs/stdc/stdc2_512x1024_80k_cityscapes.py b/configs/stdc/stdc2_512x1024_80k_cityscapes.py deleted file mode 100644 index f7afb506a0..0000000000 --- a/configs/stdc/stdc2_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './stdc1_512x1024_80k_cityscapes.py' -model = dict(backbone=dict(backbone_cfg=dict(stdc_type='STDCNet2'))) diff --git a/configs/stdc/stdc2_in1k-pre_4xb12-80k_cityscapes-512x1024.py b/configs/stdc/stdc2_in1k-pre_4xb12-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..05a202b74c --- /dev/null +++ b/configs/stdc/stdc2_in1k-pre_4xb12-80k_cityscapes-512x1024.py @@ -0,0 +1,6 @@ +checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/stdc/stdc2_20220308-7dbd9127.pth' # noqa +_base_ = './stdc2_4xb12-80k_cityscapes-512x1024.py' +model = dict( + backbone=dict( + backbone_cfg=dict( + init_cfg=dict(type='Pretrained', checkpoint=checkpoint)))) diff --git a/configs/stdc/stdc2_in1k-pre_512x1024_80k_cityscapes.py b/configs/stdc/stdc2_in1k-pre_512x1024_80k_cityscapes.py deleted file mode 100644 index 4148ac4fd0..0000000000 --- a/configs/stdc/stdc2_in1k-pre_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,6 +0,0 @@ -checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/stdc/stdc2_20220308-7dbd9127.pth' # noqa -_base_ = './stdc2_512x1024_80k_cityscapes.py' -model = dict( - backbone=dict( - backbone_cfg=dict( - 
init_cfg=dict(type='Pretrained', checkpoint=checkpoint)))) diff --git a/configs/swin/README.md b/configs/swin/README.md index 6b21b6d1bc..18fcbae8bc 100644 --- a/configs/swin/README.md +++ b/configs/swin/README.md @@ -1,6 +1,6 @@ # Swin Transformer -[Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) +> [Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) ## Introduction @@ -22,17 +22,6 @@ This paper presents a new vision Transformer, called Swin Transformer, that capa -## Citation - -```bibtex -@article{liu2021Swin, - title={Swin Transformer: Hierarchical Vision Transformer using Shifted Windows}, - author={Liu, Ze and Lin, Yutong and Cao, Yue and Hu, Han and Wei, Yixuan and Zhang, Zheng and Lin, Stephen and Guo, Baining}, - journal={arXiv preprint arXiv:2103.14030}, - year={2021} -} -``` - ## Usage We have provided pretrained models converted from [official repo](https://github.com/microsoft/Swin-Transformer). @@ -66,11 +55,22 @@ In our default setting, pretrained models and their corresponding [original mode ### ADE20K -| Method | Backbone | Crop Size | pretrain | pretrain img size | Batch Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------- | -------- | --------- | ------------ | ----------------- | ---------- | ------- | -------- | -------------- | ----- | ------------: | ------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| UPerNet | Swin-T | 512x512 | ImageNet-1K | 224x224 | 16 | 160000 | 5.02 | 21.06 | 44.41 | 45.79 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/swin/upernet_swin_tiny_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_tiny_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K/upernet_swin_tiny_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K_20210531_112542-e380ad3e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_tiny_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K/upernet_swin_tiny_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K_20210531_112542.log.json) | -| UPerNet | Swin-S | 512x512 | ImageNet-1K | 224x224 | 16 | 160000 | 6.17 | 14.72 | 47.72 | 49.24 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/swin/upernet_swin_small_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_small_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K/upernet_swin_small_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K_20210526_192015-ee2fff1c.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_small_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K/upernet_swin_small_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K_20210526_192015.log.json) | -| UPerNet | Swin-B | 512x512 | ImageNet-1K | 224x224 | 16 | 160000 | 7.61 | 12.65 | 47.99 | 49.57 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/swin/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K_20210526_192340-593b0e13.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K_20210526_192340.log.json) | -| UPerNet | Swin-B | 512x512 | ImageNet-22K | 224x224 | 16 | 160000 | - | - | 50.31 | 51.9 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/swin/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_22K.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_22K/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_22K_20210526_211650-762e2178.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_22K/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_22K_20210526_211650.log.json) | -| UPerNet | Swin-B | 512x512 | ImageNet-1K | 384x384 | 16 | 160000 | 8.52 | 12.10 | 48.35 | 49.65 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/swin/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_1K.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_1K/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_1K_20210531_132020-05b22ea4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_1K/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_1K_20210531_132020.log.json) | -| UPerNet | Swin-B | 512x512 | ImageNet-22K | 384x384 | 16 | 160000 | - | - | 50.76 | 52.4 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/swin/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_22K.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_22K/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_22K_20210531_125459-429057bf.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_22K/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_22K_20210531_125459.log.json) | +| Method | Backbone | Crop Size | pretrain | pretrain img size | Batch Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------- | -------- | --------- | ------------ | ----------------- | ---------- | ------- | -------- | -------------- | ------ | ----- | ------------: | 
--------------------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| UPerNet | Swin-T | 512x512 | ImageNet-1K | 224x224 | 16 | 160000 | 5.02 | 21.06 | V100 | 44.41 | 45.79 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/swin/swin-tiny-patch4-window7-in1k-pre_upernet_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_tiny_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K/upernet_swin_tiny_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K_20210531_112542-e380ad3e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_tiny_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K/upernet_swin_tiny_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K_20210531_112542.log.json) | +| UPerNet | Swin-S | 512x512 | ImageNet-1K | 224x224 | 16 | 160000 | 6.17 | 14.72 | V100 | 47.72 | 49.24 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/swin/swin-small-patch4-window7-in1k-pre_upernet_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_small_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K/upernet_swin_small_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K_20210526_192015-ee2fff1c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_small_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K/upernet_swin_small_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K_20210526_192015.log.json) | +| UPerNet | Swin-B | 512x512 | ImageNet-1K | 224x224 | 16 | 160000 | 7.61 | 12.65 | V100 | 47.99 | 49.57 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/swin/swin-base-patch4-window7-in1k-pre_upernet_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K_20210526_192340-593b0e13.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K_20210526_192340.log.json) | +| UPerNet | Swin-B | 512x512 | ImageNet-22K | 224x224 | 16 | 160000 | - | - | V100 | 50.13 | 51.9 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/swin/swin-base-patch4-window7-in22k-pre_upernet_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_22K/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_22K_20210526_211650-762e2178.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_22K/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_22K_20210526_211650.log.json) | +| UPerNet | Swin-B | 512x512 | ImageNet-1K | 384x384 | 16 | 160000 | 8.52 | 12.10 | V100 | 48.35 | 49.65 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/swin/swin-base-patch4-window12-in1k-384x384-pre_upernet_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_1K/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_1K_20210531_132020-05b22ea4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_1K/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_1K_20210531_132020.log.json) | +| UPerNet | Swin-B | 512x512 | ImageNet-22K | 384x384 | 16 | 160000 | - | - | V100 | 50.76 | 52.4 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/swin/swin-base-patch4-window12-in22k-384x384-pre_upernet_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_22K/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_22K_20210531_125459-429057bf.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_22K/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_22K_20210531_125459.log.json) | + +## Citation + +```bibtex +@article{liu2021Swin, + title={Swin Transformer: Hierarchical Vision Transformer using Shifted Windows}, + author={Liu, Ze and Lin, Yutong and Cao, Yue and Hu, Han and Wei, Yixuan and Zhang, Zheng and Lin, Stephen and Guo, Baining}, + journal={arXiv preprint arXiv:2103.14030}, + year={2021} +} +``` diff --git a/configs/swin/metafile.yaml b/configs/swin/metafile.yaml new file mode 100644 index 0000000000..67a4e07551 --- /dev/null +++ b/configs/swin/metafile.yaml @@ -0,0 +1,143 @@ +Models: +- Name: swin-tiny-patch4-window7-in1k-pre_upernet_8xb2-160k_ade20k-512x512 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 44.41 + mIoU(ms+flip): 45.79 + Config: configs/swin/swin-tiny-patch4-window7-in1k-pre_upernet_8xb2-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - Swin-T + - UPerNet + Training Resources: 8x V100 GPUS + Memory (GB): 5.02 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_tiny_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K/upernet_swin_tiny_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K_20210531_112542-e380ad3e.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_tiny_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K/upernet_swin_tiny_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K_20210531_112542.log.json + Paper: + Title: 'Swin Transformer: Hierarchical Vision Transformer using Shifted Windows' + URL: https://arxiv.org/abs/2103.14030 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/swin.py#L524 + Framework: PyTorch +- Name: swin-small-patch4-window7-in1k-pre_upernet_8xb2-160k_ade20k-512x512 + In Collection: UPerNet + 
Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 47.72 + mIoU(ms+flip): 49.24 + Config: configs/swin/swin-small-patch4-window7-in1k-pre_upernet_8xb2-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - Swin-S + - UPerNet + Training Resources: 8x V100 GPUS + Memory (GB): 6.17 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_small_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K/upernet_swin_small_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K_20210526_192015-ee2fff1c.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_small_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K/upernet_swin_small_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K_20210526_192015.log.json + Paper: + Title: 'Swin Transformer: Hierarchical Vision Transformer using Shifted Windows' + URL: https://arxiv.org/abs/2103.14030 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/swin.py#L524 + Framework: PyTorch +- Name: swin-base-patch4-window7-in1k-pre_upernet_8xb2-160k_ade20k-512x512 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 47.99 + mIoU(ms+flip): 49.57 + Config: configs/swin/swin-base-patch4-window7-in1k-pre_upernet_8xb2-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - Swin-B + - UPerNet + Training Resources: 8x V100 GPUS + Memory (GB): 7.61 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K_20210526_192340-593b0e13.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K_20210526_192340.log.json + Paper: + Title: 'Swin Transformer: Hierarchical Vision Transformer using Shifted Windows' + URL: https://arxiv.org/abs/2103.14030 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/swin.py#L524 + Framework: PyTorch +- Name: swin-base-patch4-window7-in22k-pre_upernet_8xb2-160k_ade20k-512x512 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 50.13 + mIoU(ms+flip): 51.9 + Config: configs/swin/swin-base-patch4-window7-in22k-pre_upernet_8xb2-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - Swin-B + - UPerNet + Training Resources: 8x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_22K/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_22K_20210526_211650-762e2178.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_22K/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_22K_20210526_211650.log.json + Paper: + Title: 'Swin Transformer: Hierarchical Vision Transformer using Shifted Windows' + URL: https://arxiv.org/abs/2103.14030 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/swin.py#L524 + Framework: PyTorch +- Name: swin-base-patch4-window12-in1k-384x384-pre_upernet_8xb2-160k_ade20k-512x512 + In Collection: 
UPerNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 48.35 + mIoU(ms+flip): 49.65 + Config: configs/swin/swin-base-patch4-window12-in1k-384x384-pre_upernet_8xb2-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - Swin-B + - UPerNet + Training Resources: 8x V100 GPUS + Memory (GB): 8.52 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_1K/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_1K_20210531_132020-05b22ea4.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_1K/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_1K_20210531_132020.log.json + Paper: + Title: 'Swin Transformer: Hierarchical Vision Transformer using Shifted Windows' + URL: https://arxiv.org/abs/2103.14030 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/swin.py#L524 + Framework: PyTorch +- Name: swin-base-patch4-window12-in22k-384x384-pre_upernet_8xb2-160k_ade20k-512x512 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 50.76 + mIoU(ms+flip): 52.4 + Config: configs/swin/swin-base-patch4-window12-in22k-384x384-pre_upernet_8xb2-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - Swin-B + - UPerNet + Training Resources: 8x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_22K/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_22K_20210531_125459-429057bf.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_22K/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_22K_20210531_125459.log.json + Paper: + Title: 'Swin Transformer: Hierarchical Vision Transformer using Shifted Windows' + URL: https://arxiv.org/abs/2103.14030 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/swin.py#L524 + Framework: PyTorch diff --git a/configs/swin/swin-base-patch4-window12-in1k-384x384-pre_upernet_8xb2-160k_ade20k-512x512.py b/configs/swin/swin-base-patch4-window12-in1k-384x384-pre_upernet_8xb2-160k_ade20k-512x512.py new file mode 100644 index 0000000000..11cea36703 --- /dev/null +++ b/configs/swin/swin-base-patch4-window12-in1k-384x384-pre_upernet_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,14 @@ +_base_ = [ + 'swin-tiny-patch4-window7-in1k-pre_upernet_8xb2-160k_ade20k-512x512.py' +] +checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_base_patch4_window12_384_20220317-55b0104a.pth' # noqa +model = dict( + backbone=dict( + init_cfg=dict(type='Pretrained', checkpoint=checkpoint_file), + pretrain_img_size=384, + embed_dims=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32], + window_size=12), + decode_head=dict(in_channels=[128, 256, 512, 1024], num_classes=150), + auxiliary_head=dict(in_channels=512, num_classes=150)) diff --git a/configs/swin/swin-base-patch4-window12-in22k-384x384-pre_upernet_8xb2-160k_ade20k-512x512.py b/configs/swin/swin-base-patch4-window12-in22k-384x384-pre_upernet_8xb2-160k_ade20k-512x512.py new file mode 100644 index 0000000000..5c1171646e --- /dev/null +++ 
b/configs/swin/swin-base-patch4-window12-in22k-384x384-pre_upernet_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,7 @@ +_base_ = [ + './swin-base-patch4-window12-in1k-384x384-pre_upernet_8xb2-160k_ade20k-512x512.py' # noqa +] +checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_base_patch4_window12_384_22k_20220317-e5c09f74.pth' # noqa +model = dict( + backbone=dict( + init_cfg=dict(type='Pretrained', checkpoint=checkpoint_file))) diff --git a/configs/swin/swin-base-patch4-window7-in1k-pre_upernet_8xb2-160k_ade20k-512x512.py b/configs/swin/swin-base-patch4-window7-in1k-pre_upernet_8xb2-160k_ade20k-512x512.py new file mode 100644 index 0000000000..73bf6166ef --- /dev/null +++ b/configs/swin/swin-base-patch4-window7-in1k-pre_upernet_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,12 @@ +_base_ = [ + './swin-tiny-patch4-window7-in1k-pre_upernet_8xb2-160k_ade20k-512x512.py' +] +checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_base_patch4_window7_224_20220317-e9b98025.pth' # noqa +model = dict( + backbone=dict( + init_cfg=dict(type='Pretrained', checkpoint=checkpoint_file), + embed_dims=128, + depths=[2, 2, 18, 2], + num_heads=[4, 8, 16, 32]), + decode_head=dict(in_channels=[128, 256, 512, 1024], num_classes=150), + auxiliary_head=dict(in_channels=512, num_classes=150)) diff --git a/configs/swin/swin-base-patch4-window7-in22k-pre_upernet_8xb2-160k_ade20k-512x512.py b/configs/swin/swin-base-patch4-window7-in22k-pre_upernet_8xb2-160k_ade20k-512x512.py new file mode 100644 index 0000000000..96148cd71d --- /dev/null +++ b/configs/swin/swin-base-patch4-window7-in22k-pre_upernet_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,7 @@ +_base_ = [ + './swin-base-patch4-window7-in1k-pre_upernet_8xb2-160k_ade20k-512x512.py' +] +checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_base_patch4_window7_224_22k_20220317-4f79f7c0.pth' # noqa +model = dict( + backbone=dict( + init_cfg=dict(type='Pretrained', checkpoint=checkpoint_file))) diff --git a/configs/swin/swin-large-patch4-window12-in22k-384x384-pre_upernet_8xb2-160k_ade20k-512x512.py b/configs/swin/swin-large-patch4-window12-in22k-384x384-pre_upernet_8xb2-160k_ade20k-512x512.py new file mode 100644 index 0000000000..a0a654e026 --- /dev/null +++ b/configs/swin/swin-large-patch4-window12-in22k-384x384-pre_upernet_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,10 @@ +_base_ = [ + 'swin-large-patch4-window7-in22k-pre_upernet_' + '8xb2-160k_ade20k-512x512.py' +] +checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_large_patch4_window12_384_22k_20220412-6580f57d.pth' # noqa +model = dict( + backbone=dict( + init_cfg=dict(type='Pretrained', checkpoint=checkpoint_file), + pretrain_img_size=384, + window_size=12)) diff --git a/configs/swin/swin-large-patch4-window7-in22k-pre_upernet_8xb2-160k_ade20k-512x512.py b/configs/swin/swin-large-patch4-window7-in22k-pre_upernet_8xb2-160k_ade20k-512x512.py new file mode 100644 index 0000000000..c93cdfeaae --- /dev/null +++ b/configs/swin/swin-large-patch4-window7-in22k-pre_upernet_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,15 @@ +_base_ = [ + 'swin-tiny-patch4-window7-in1k-pre_upernet_8xb2-160k_' + 'ade20k-512x512.py' +] +checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_large_patch4_window7_224_22k_20220412-aeecf2aa.pth' # noqa +model = dict( + backbone=dict( + init_cfg=dict(type='Pretrained', checkpoint=checkpoint_file), + pretrain_img_size=224, + embed_dims=192, + 
depths=[2, 2, 18, 2], + num_heads=[6, 12, 24, 48], + window_size=7), + decode_head=dict(in_channels=[192, 384, 768, 1536], num_classes=150), + auxiliary_head=dict(in_channels=768, num_classes=150)) diff --git a/configs/swin/swin-small-patch4-window7-in1k-pre_upernet_8xb2-160k_ade20k-512x512.py b/configs/swin/swin-small-patch4-window7-in1k-pre_upernet_8xb2-160k_ade20k-512x512.py new file mode 100644 index 0000000000..19863dfc82 --- /dev/null +++ b/configs/swin/swin-small-patch4-window7-in1k-pre_upernet_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,10 @@ +_base_ = [ + './swin-tiny-patch4-window7-in1k-pre_upernet_8xb2-160k_ade20k-512x512.py' +] +checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_small_patch4_window7_224_20220317-7ba6d6dd.pth' # noqa +model = dict( + backbone=dict( + init_cfg=dict(type='Pretrained', checkpoint=checkpoint_file), + depths=[2, 2, 18, 2]), + decode_head=dict(in_channels=[96, 192, 384, 768], num_classes=150), + auxiliary_head=dict(in_channels=384, num_classes=150)) diff --git a/configs/swin/upernet_swin_tiny_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K.py b/configs/swin/swin-tiny-patch4-window7-in1k-pre_upernet_8xb2-160k_ade20k-512x512.py similarity index 100% rename from configs/swin/upernet_swin_tiny_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K.py rename to configs/swin/swin-tiny-patch4-window7-in1k-pre_upernet_8xb2-160k_ade20k-512x512.py diff --git a/configs/swin/swin-tiny-patch4-window7_upernet_1xb8-20k_levir-256x256.py b/configs/swin/swin-tiny-patch4-window7_upernet_1xb8-20k_levir-256x256.py new file mode 100644 index 0000000000..663f769d73 --- /dev/null +++ b/configs/swin/swin-tiny-patch4-window7_upernet_1xb8-20k_levir-256x256.py @@ -0,0 +1,56 @@ +_base_ = [ + '../_base_/models/upernet_swin.py', '../_base_/datasets/levir_256x256.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_20k.py' +] +crop_size = (256, 256) +norm_cfg = dict(type='BN', requires_grad=True) +data_preprocessor = dict( + size=crop_size, + type='SegDataPreProcessor', + mean=[123.675, 116.28, 103.53, 123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375, 58.395, 57.12, 57.375]) + +model = dict( + data_preprocessor=data_preprocessor, + backbone=dict( + in_channels=6, + embed_dims=96, + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_size=7, + use_abs_pos_embed=False, + drop_path_rate=0.3, + patch_norm=True), + decode_head=dict(in_channels=[96, 192, 384, 768], num_classes=2), + auxiliary_head=dict(in_channels=384, num_classes=2)) + +# AdamW optimizer, no weight decay for position embedding & layer norm +# in backbone +optim_wrapper = dict( + _delete_=True, + type='OptimWrapper', + optimizer=dict( + type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.01), + paramwise_cfg=dict( + custom_keys={ + 'absolute_pos_embed': dict(decay_mult=0.), + 'relative_position_bias_table': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.) 
+ })) + +param_scheduler = [ + dict( + type='LinearLR', start_factor=1e-6, by_epoch=False, begin=0, end=1500), + dict( + type='PolyLR', + eta_min=0.0, + power=1.0, + begin=1500, + end=20000, + by_epoch=False, + ) +] + +train_dataloader = dict(batch_size=4) +val_dataloader = dict(batch_size=1) +test_dataloader = val_dataloader diff --git a/configs/swin/swin.yml b/configs/swin/swin.yml deleted file mode 100644 index ef21d2165e..0000000000 --- a/configs/swin/swin.yml +++ /dev/null @@ -1,117 +0,0 @@ -Models: -- Name: upernet_swin_tiny_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K - In Collection: UPerNet - Metadata: - backbone: Swin-T - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 47.48 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 5.02 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 44.41 - mIoU(ms+flip): 45.79 - Config: configs/swin/upernet_swin_tiny_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_tiny_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K/upernet_swin_tiny_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K_20210531_112542-e380ad3e.pth -- Name: upernet_swin_small_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K - In Collection: UPerNet - Metadata: - backbone: Swin-S - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 67.93 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 6.17 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 47.72 - mIoU(ms+flip): 49.24 - Config: configs/swin/upernet_swin_small_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_small_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K/upernet_swin_small_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K_20210526_192015-ee2fff1c.pth -- Name: upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K - In Collection: UPerNet - Metadata: - backbone: Swin-B - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 79.05 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 7.61 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 47.99 - mIoU(ms+flip): 49.57 - Config: configs/swin/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K_20210526_192340-593b0e13.pth -- Name: upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_22K - In Collection: UPerNet - Metadata: - backbone: Swin-B - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 50.31 - mIoU(ms+flip): 51.9 - Config: configs/swin/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_22K.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_22K/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_22K_20210526_211650-762e2178.pth -- Name: 
upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_1K - In Collection: UPerNet - Metadata: - backbone: Swin-B - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 82.64 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 8.52 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 48.35 - mIoU(ms+flip): 49.65 - Config: configs/swin/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_1K.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_1K/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_1K_20210531_132020-05b22ea4.pth -- Name: upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_22K - In Collection: UPerNet - Metadata: - backbone: Swin-B - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 50.76 - mIoU(ms+flip): 52.4 - Config: configs/swin/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_22K.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/swin/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_22K/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_22K_20210531_125459-429057bf.pth diff --git a/configs/swin/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_1K.py b/configs/swin/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_1K.py deleted file mode 100644 index 027bd6f8bc..0000000000 --- a/configs/swin/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_1K.py +++ /dev/null @@ -1,15 +0,0 @@ -_base_ = [ - 'upernet_swin_tiny_patch4_window7_512x512_160k_ade20k_' - 'pretrain_224x224_1K.py' -] -checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_base_patch4_window12_384_20220317-55b0104a.pth' # noqa -model = dict( - backbone=dict( - init_cfg=dict(type='Pretrained', checkpoint=checkpoint_file), - pretrain_img_size=384, - embed_dims=128, - depths=[2, 2, 18, 2], - num_heads=[4, 8, 16, 32], - window_size=12), - decode_head=dict(in_channels=[128, 256, 512, 1024], num_classes=150), - auxiliary_head=dict(in_channels=512, num_classes=150)) diff --git a/configs/swin/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_22K.py b/configs/swin/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_22K.py deleted file mode 100644 index e662d4f03a..0000000000 --- a/configs/swin/upernet_swin_base_patch4_window12_512x512_160k_ade20k_pretrain_384x384_22K.py +++ /dev/null @@ -1,8 +0,0 @@ -_base_ = [ - './upernet_swin_base_patch4_window12_512x512_160k_ade20k_' - 'pretrain_384x384_1K.py' -] -checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_base_patch4_window12_384_22k_20220317-e5c09f74.pth' # noqa -model = dict( - backbone=dict( - init_cfg=dict(type='Pretrained', checkpoint=checkpoint_file))) diff --git a/configs/swin/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K.py b/configs/swin/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K.py deleted file mode 100644 index 6e05677d89..0000000000 --- a/configs/swin/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K.py +++ /dev/null @@ -1,13 +0,0 @@ -_base_ = [ - './upernet_swin_tiny_patch4_window7_512x512_160k_ade20k_' - 
'pretrain_224x224_1K.py' -] -checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_base_patch4_window7_224_20220317-e9b98025.pth' # noqa -model = dict( - backbone=dict( - init_cfg=dict(type='Pretrained', checkpoint=checkpoint_file), - embed_dims=128, - depths=[2, 2, 18, 2], - num_heads=[4, 8, 16, 32]), - decode_head=dict(in_channels=[128, 256, 512, 1024], num_classes=150), - auxiliary_head=dict(in_channels=512, num_classes=150)) diff --git a/configs/swin/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_22K.py b/configs/swin/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_22K.py deleted file mode 100644 index 7a9c50624f..0000000000 --- a/configs/swin/upernet_swin_base_patch4_window7_512x512_160k_ade20k_pretrain_224x224_22K.py +++ /dev/null @@ -1,8 +0,0 @@ -_base_ = [ - './upernet_swin_base_patch4_window7_512x512_160k_ade20k_' - 'pretrain_224x224_1K.py' -] -checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_base_patch4_window7_224_22k_20220317-4f79f7c0.pth' # noqa -model = dict( - backbone=dict( - init_cfg=dict(type='Pretrained', checkpoint=checkpoint_file))) diff --git a/configs/swin/upernet_swin_small_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K.py b/configs/swin/upernet_swin_small_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K.py deleted file mode 100644 index 1958e0e750..0000000000 --- a/configs/swin/upernet_swin_small_patch4_window7_512x512_160k_ade20k_pretrain_224x224_1K.py +++ /dev/null @@ -1,11 +0,0 @@ -_base_ = [ - './upernet_swin_tiny_patch4_window7_512x512_160k_ade20k_' - 'pretrain_224x224_1K.py' -] -checkpoint_file = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_small_patch4_window7_224_20220317-7ba6d6dd.pth' # noqa -model = dict( - backbone=dict( - init_cfg=dict(type='Pretrained', checkpoint=checkpoint_file), - depths=[2, 2, 18, 2]), - decode_head=dict(in_channels=[96, 192, 384, 768], num_classes=150), - auxiliary_head=dict(in_channels=384, num_classes=150)) diff --git a/configs/twins/README.md b/configs/twins/README.md index 639d074d32..e4b3735b00 100644 --- a/configs/twins/README.md +++ b/configs/twins/README.md @@ -1,6 +1,6 @@ # Twins -[Twins: Revisiting the Design of Spatial Attention in Vision Transformers](https://arxiv.org/pdf/2104.13840.pdf) +> [Twins: Revisiting the Design of Spatial Attention in Vision Transformers](https://arxiv.org/pdf/2104.13840.pdf) ## Introduction @@ -22,17 +22,6 @@ Very recently, a variety of vision transformer architectures for dense predictio -## Citation - -```bibtex -@article{chu2021twins, - title={Twins: Revisiting spatial attention design in vision transformers}, - author={Chu, Xiangxiang and Tian, Zhi and Wang, Yuqing and Zhang, Bo and Ren, Haibing and Wei, Xiaolin and Xia, Huaxia and Shen, Chunhua}, - journal={arXiv preprint arXiv:2104.13840}, - year={2021}altgvt -} -``` - ## Usage We have provided pretrained models converted from [official repo](https://github.com/Meituan-AutoML/Twins). 
@@ -55,22 +44,33 @@ python tools/model_converters/twins2mmseg.py ./alt_gvt_base.pth ./pretrained/alt ### ADE20K -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------------------- | -------- | --------- | ------- | -------- | -------------- | ----- | ------------- | ------------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| Twins-FPN | PCPVT-S | 512x512 | 80000 | 6.60 | 27.15 | 43.26 | 44.11 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/twins/twins_pcpvt-s_fpn_fpnhead_8x4_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-s_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_pcpvt-s_fpn_fpnhead_8x4_512x512_80k_ade20k_20211201_204132-41acd132.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-s_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_pcpvt-s_fpn_fpnhead_8x4_512x512_80k_ade20k_20211201_204132.log.json) | -| Twins-UPerNet | PCPVT-S | 512x512 | 160000 | 9.67 | 14.24 | 46.04 | 46.92 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/twins/twins_pcpvt-s_uperhead_8x4_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-s_uperhead_8x4_512x512_160k_ade20k/twins_pcpvt-s_uperhead_8x4_512x512_160k_ade20k_20211201_233537-8e99c07a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-s_uperhead_8x4_512x512_160k_ade20k/twins_pcpvt-s_uperhead_8x4_512x512_160k_ade20k_20211201_233537.log.json) | -| Twins-FPN | PCPVT-B | 512x512 | 80000 | 8.41 | 19.67 | 45.66 | 46.48 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/twins/twins_pcpvt-b_fpn_fpnhead_8x4_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-b_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_pcpvt-b_fpn_fpnhead_8x4_512x512_80k_ade20k_20211130_141019-d396db72.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-b_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_pcpvt-b_fpn_fpnhead_8x4_512x512_80k_ade20k_20211130_141019.log.json) | -| Twins-UPerNet (8x2) | PCPVT-B | 512x512 | 160000 | 6.46 | 12.04 | 47.91 | 48.64 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/twins/twins_pcpvt-b_uperhead_8x2_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-b_uperhead_8x2_512x512_160k_ade20k/twins_pcpvt-b_uperhead_8x2_512x512_160k_ade20k_20211130_141020-02094ea5.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-b_uperhead_8x2_512x512_160k_ade20k/twins_pcpvt-b_uperhead_8x2_512x512_160k_ade20k_20211130_141020.log.json) | -| Twins-FPN | PCPVT-L | 512x512 | 80000 | 10.78 | 14.32 | 45.94 | 46.70 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/twins/twins_pcpvt-l_fpn_fpnhead_8x4_512x512_80k_ade20k.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-l_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_pcpvt-l_fpn_fpnhead_8x4_512x512_80k_ade20k_20211201_105226-bc6d61dc.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-l_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_pcpvt-l_fpn_fpnhead_8x4_512x512_80k_ade20k_20211201_105226.log.json) | -| Twins-UPerNet (8x2) | PCPVT-L | 512x512 | 160000 | 7.82 | 10.70 | 49.35 | 50.08 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/twins/twins_pcpvt-l_uperhead_8x2_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-l_uperhead_8x2_512x512_160k_ade20k/twins_pcpvt-l_uperhead_8x2_512x512_160k_ade20k_20211201_075053-c6095c07.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-l_uperhead_8x2_512x512_160k_ade20k/twins_pcpvt-l_uperhead_8x2_512x512_160k_ade20k_20211201_075053.log.json) | -| Twins-FPN | SVT-S | 512x512 | 80000 | 5.80 | 29.79 | 44.47 | 45.42 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/twins/twins_svt-s_fpn_fpnhead_8x4_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-s_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_svt-s_fpn_fpnhead_8x4_512x512_80k_ade20k_20211130_141006-0a0d3317.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-s_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_svt-s_fpn_fpnhead_8x4_512x512_80k_ade20k_20211130_141006.log.json) | -| Twins-UPerNet (8x2) | SVT-S | 512x512 | 160000 | 4.93 | 15.09 | 46.08 | 46.96 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/twins/twins_svt-s_uperhead_8x2_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-s_uperhead_8x2_512x512_160k_ade20k/twins_svt-s_uperhead_8x2_512x512_160k_ade20k_20211130_141005-e48a2d94.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-s_uperhead_8x2_512x512_160k_ade20k/twins_svt-s_uperhead_8x2_512x512_160k_ade20k_20211130_141005.log.json) | -| Twins-FPN | SVT-B | 512x512 | 80000 | 8.75 | 21.10 | 46.77 | 47.47 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/twins/twins_svt-b_fpn_fpnhead_8x4_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-b_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_svt-b_fpn_fpnhead_8x4_512x512_80k_ade20k_20211201_113849-88b2907c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-b_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_svt-b_fpn_fpnhead_8x4_512x512_80k_ade20k_20211201_113849.log.json) | -| Twins-UPerNet (8x2) | SVT-B | 512x512 | 160000 | 6.77 | 12.66 | 48.04 | 48.87 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/twins/twins_svt-b_uperhead_8x2_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-b_uperhead_8x2_512x512_160k_ade20k/twins_svt-b_uperhead_8x2_512x512_160k_ade20k_20211202_040826-0943a1f1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-b_uperhead_8x2_512x512_160k_ade20k/twins_svt-b_uperhead_8x2_512x512_160k_ade20k_20211202_040826.log.json) | -| Twins-FPN | SVT-L | 512x512 | 80000 | 11.20 | 17.80 | 46.55 | 47.74 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/twins/twins_svt-l_fpn_fpnhead_8x4_512x512_80k_ade20k.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-l_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_svt-l_fpn_fpnhead_8x4_512x512_80k_ade20k_20211130_141005-1d59bee2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-l_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_svt-l_fpn_fpnhead_8x4_512x512_80k_ade20k_20211130_141005.log.json) | -| Twins-UPerNet (8x2) | SVT-L | 512x512 | 160000 | 8.41 | 10.73 | 49.65 | 50.63 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/twins/twins_svt-l_uperhead_8x2_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-l_uperhead_8x2_512x512_160k_ade20k/twins_svt-l_uperhead_8x2_512x512_160k_ade20k_20211130_141005-3e2cae61.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-l_uperhead_8x2_512x512_160k_ade20k/twins_svt-l_uperhead_8x2_512x512_160k_ade20k_20211130_141005.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------- | ------------------- | --------- | ------- | -------- | -------------- | ------ | ----- | ------------- | ----------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| FPN | Twins-PCPVT-S | 512x512 | 80000 | 6.60 | 27.15 | V100 | 43.26 | 44.11 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/twins/twins_pcpvt-s_fpn_fpnhead_8xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-s_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_pcpvt-s_fpn_fpnhead_8x4_512x512_80k_ade20k_20211201_204132-41acd132.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-s_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_pcpvt-s_fpn_fpnhead_8x4_512x512_80k_ade20k_20211201_204132.log.json) | +| UPerNet | Twins-PCPVT-S | 512x512 | 160000 | 9.67 | 14.24 | V100 | 46.04 | 46.92 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/twins/twins_pcpvt-s_uperhead_8xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-s_uperhead_8x4_512x512_160k_ade20k/twins_pcpvt-s_uperhead_8x4_512x512_160k_ade20k_20211201_233537-8e99c07a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-s_uperhead_8x4_512x512_160k_ade20k/twins_pcpvt-s_uperhead_8x4_512x512_160k_ade20k_20211201_233537.log.json) | +| FPN | Twins-PCPVT-B | 512x512 | 80000 | 8.41 | 19.67 | V100 | 45.66 | 46.48 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/twins/twins_pcpvt-b_fpn_fpnhead_8xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-b_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_pcpvt-b_fpn_fpnhead_8x4_512x512_80k_ade20k_20211130_141019-d396db72.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-b_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_pcpvt-b_fpn_fpnhead_8x4_512x512_80k_ade20k_20211130_141019.log.json) | +| UPerNet | Twins-PCPVT-B (8x2) | 
512x512 | 160000 | 6.46 | 12.04 | V100 | 47.91 | 48.64 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/twins/twins_pcpvt-b_uperhead_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-b_uperhead_8x2_512x512_160k_ade20k/twins_pcpvt-b_uperhead_8x2_512x512_160k_ade20k_20211130_141020-02094ea5.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-b_uperhead_8x2_512x512_160k_ade20k/twins_pcpvt-b_uperhead_8x2_512x512_160k_ade20k_20211130_141020.log.json) | +| FPN | Twins-PCPVT-L | 512x512 | 80000 | 10.78 | 14.32 | V100 | 45.94 | 46.70 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/twins/twins_pcpvt-l_fpn_fpnhead_8xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-l_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_pcpvt-l_fpn_fpnhead_8x4_512x512_80k_ade20k_20211201_105226-bc6d61dc.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-l_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_pcpvt-l_fpn_fpnhead_8x4_512x512_80k_ade20k_20211201_105226.log.json) | +| UPerNet | Twins-PCPVT-L (8x2) | 512x512 | 160000 | 7.82 | 10.70 | V100 | 49.35 | 50.08 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/twins/twins_pcpvt-l_uperhead_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-l_uperhead_8x2_512x512_160k_ade20k/twins_pcpvt-l_uperhead_8x2_512x512_160k_ade20k_20211201_075053-c6095c07.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-l_uperhead_8x2_512x512_160k_ade20k/twins_pcpvt-l_uperhead_8x2_512x512_160k_ade20k_20211201_075053.log.json) | +| FPN | Twins-SVT-S | 512x512 | 80000 | 5.80 | 29.79 | V100 | 44.47 | 45.42 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/twins/twins_svt-s_fpn_fpnhead_8xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-s_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_svt-s_fpn_fpnhead_8x4_512x512_80k_ade20k_20211130_141006-0a0d3317.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-s_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_svt-s_fpn_fpnhead_8x4_512x512_80k_ade20k_20211130_141006.log.json) | +| UPerNet | Twins-SVT-S (8x2) | 512x512 | 160000 | 4.93 | 15.09 | V100 | 46.08 | 46.96 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/twins/twins_svt-s_uperhead_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-s_uperhead_8x2_512x512_160k_ade20k/twins_svt-s_uperhead_8x2_512x512_160k_ade20k_20211130_141005-e48a2d94.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-s_uperhead_8x2_512x512_160k_ade20k/twins_svt-s_uperhead_8x2_512x512_160k_ade20k_20211130_141005.log.json) | +| FPN | Twins-SVT-B | 512x512 | 80000 | 8.75 | 21.10 | V100 | 46.77 | 47.47 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/twins/twins_svt-b_fpn_fpnhead_8xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-b_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_svt-b_fpn_fpnhead_8x4_512x512_80k_ade20k_20211201_113849-88b2907c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-b_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_svt-b_fpn_fpnhead_8x4_512x512_80k_ade20k_20211201_113849.log.json) | +| UPerNet | Twins-SVT-B 
(8x2) | 512x512 | 160000 | 6.77 | 12.66 | V100 | 48.04 | 48.87 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/twins/twins_svt-b_uperhead_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-b_uperhead_8x2_512x512_160k_ade20k/twins_svt-b_uperhead_8x2_512x512_160k_ade20k_20211202_040826-0943a1f1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-b_uperhead_8x2_512x512_160k_ade20k/twins_svt-b_uperhead_8x2_512x512_160k_ade20k_20211202_040826.log.json) | +| FPN | Twins-SVT-L | 512x512 | 80000 | 11.20 | 17.80 | V100 | 46.55 | 47.74 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/twins/twins_svt-l_fpn_fpnhead_8xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-l_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_svt-l_fpn_fpnhead_8x4_512x512_80k_ade20k_20211130_141005-1d59bee2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-l_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_svt-l_fpn_fpnhead_8x4_512x512_80k_ade20k_20211130_141005.log.json) | +| UPerNet | Twins-SVT-L (8x2) | 512x512 | 160000 | 8.41 | 10.73 | V100 | 49.65 | 50.63 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/twins/twins_svt-l_uperhead_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-l_uperhead_8x2_512x512_160k_ade20k/twins_svt-l_uperhead_8x2_512x512_160k_ade20k_20211130_141005-3e2cae61.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-l_uperhead_8x2_512x512_160k_ade20k/twins_svt-l_uperhead_8x2_512x512_160k_ade20k_20211130_141005.log.json) | Note: - `8x2` means 8 GPUs with 2 samples per GPU in training; the default setting of Twins on ADE20K is 8 GPUs with 4 samples per GPU (see the config sketch below). - `UPerNet` and `FPN` refer to the decode heads used in the corresponding Twins models, namely `UPerHead` and `FPNHead`, respectively. Models in the [official repo](https://github.com/Meituan-AutoML/Twins) all use `UPerHead`. 
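For reference, the `8x2`/`8x4` distinction above corresponds to the `8xb2`/`8xb4` infix in the new-style config names (GPUs x samples per GPU), and the per-GPU sample count is what the configs override through `train_dataloader`. Below is a minimal sketch of such an override in MMSegmentation's Python config style, following the pattern of the `*_uperhead_8xb2-*` configs added later in this diff; the derived file name is illustrative only:

```python
# Illustrative 8x2 override: inherit the default 8 GPUs x 4 samples-per-GPU
# (8xb4) schedule and halve the per-GPU batch size.
_base_ = ['./twins_pcpvt-s_uperhead_8xb4-160k_ade20k-512x512.py']

# 2 samples per GPU; with 8 GPUs the effective batch size is 16, matching
# the `Batch Size: 16` entries in metafile.yaml. The GPU count itself is
# chosen at launch time (e.g. `bash tools/dist_train.sh <config> 8`), not
# in the config file.
train_dataloader = dict(batch_size=2, num_workers=2)
val_dataloader = dict(batch_size=1, num_workers=4)
test_dataloader = val_dataloader
```

This is the same shape as the `twins_pcpvt-b_uperhead_8xb2-160k_ade20k-512x512.py` config further down in this diff, minus its checkpoint and backbone overrides.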
+ +## Citation + +```bibtex +@article{chu2021twins, + title={Twins: Revisiting spatial attention design in vision transformers}, + author={Chu, Xiangxiang and Tian, Zhi and Wang, Yuqing and Zhang, Bo and Ren, Haibing and Wei, Xiaolin and Xia, Huaxia and Shen, Chunhua}, + journal={arXiv preprint arXiv:2104.13840}, + year={2021} +} +``` diff --git a/configs/twins/metafile.yaml b/configs/twins/metafile.yaml new file mode 100644 index 0000000000..0de78d9d2e --- /dev/null +++ b/configs/twins/metafile.yaml @@ -0,0 +1,289 @@ +Models: +- Name: twins_pcpvt-s_fpn_fpnhead_8xb4-80k_ade20k-512x512 + In Collection: FPN + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 43.26 + mIoU(ms+flip): 44.11 + Config: configs/twins/twins_pcpvt-s_fpn_fpnhead_8xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 32 + Architecture: + - Twins-PCPVT-S + - FPN + Training Resources: 8x V100 GPUS + Memory (GB): 6.6 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-s_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_pcpvt-s_fpn_fpnhead_8x4_512x512_80k_ade20k_20211201_204132-41acd132.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-s_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_pcpvt-s_fpn_fpnhead_8x4_512x512_80k_ade20k_20211201_204132.log.json + Paper: + Title: 'Twins: Revisiting the Design of Spatial Attention in Vision Transformers' + URL: https://arxiv.org/pdf/2104.13840.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.20.0/mmseg/models/backbones/twins.py#L352 + Framework: PyTorch +- Name: twins_pcpvt-s_uperhead_8xb4-160k_ade20k-512x512 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 46.04 + mIoU(ms+flip): 46.92 + Config: configs/twins/twins_pcpvt-s_uperhead_8xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 32 + Architecture: + - Twins-PCPVT-S + - UPerNet + Training Resources: 8x V100 GPUS + Memory (GB): 9.67 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-s_uperhead_8x4_512x512_160k_ade20k/twins_pcpvt-s_uperhead_8x4_512x512_160k_ade20k_20211201_233537-8e99c07a.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-s_uperhead_8x4_512x512_160k_ade20k/twins_pcpvt-s_uperhead_8x4_512x512_160k_ade20k_20211201_233537.log.json + Paper: + Title: 'Twins: Revisiting the Design of Spatial Attention in Vision Transformers' + URL: https://arxiv.org/pdf/2104.13840.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.20.0/mmseg/models/backbones/twins.py#L352 + Framework: PyTorch +- Name: twins_pcpvt-b_fpn_fpnhead_8xb4-80k_ade20k-512x512 + In Collection: FPN + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 45.66 + mIoU(ms+flip): 46.48 + Config: configs/twins/twins_pcpvt-b_fpn_fpnhead_8xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 32 + Architecture: + - Twins-PCPVT-B + - FPN + Training Resources: 8x V100 GPUS + Memory (GB): 8.41 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-b_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_pcpvt-b_fpn_fpnhead_8x4_512x512_80k_ade20k_20211130_141019-d396db72.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-b_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_pcpvt-b_fpn_fpnhead_8x4_512x512_80k_ade20k_20211130_141019.log.json + Paper: + Title: 'Twins: Revisiting the Design of Spatial Attention in Vision 
Transformers' + URL: https://arxiv.org/pdf/2104.13840.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.20.0/mmseg/models/backbones/twins.py#L352 + Framework: PyTorch +- Name: twins_pcpvt-b_uperhead_8xb2-160k_ade20k-512x512 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 47.91 + mIoU(ms+flip): 48.64 + Config: configs/twins/twins_pcpvt-b_uperhead_8xb2-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - Twins-PCPVT-B + - UPerNet + Training Resources: 8x V100 GPUS + Memory (GB): 6.46 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-b_uperhead_8x2_512x512_160k_ade20k/twins_pcpvt-b_uperhead_8x2_512x512_160k_ade20k_20211130_141020-02094ea5.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-b_uperhead_8x2_512x512_160k_ade20k/twins_pcpvt-b_uperhead_8x2_512x512_160k_ade20k_20211130_141020.log.json + Paper: + Title: 'Twins: Revisiting the Design of Spatial Attention in Vision Transformers' + URL: https://arxiv.org/pdf/2104.13840.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.20.0/mmseg/models/backbones/twins.py#L352 + Framework: PyTorch +- Name: twins_pcpvt-l_fpn_fpnhead_8xb4-80k_ade20k-512x512 + In Collection: FPN + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 45.94 + mIoU(ms+flip): 46.7 + Config: configs/twins/twins_pcpvt-l_fpn_fpnhead_8xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 32 + Architecture: + - Twins-PCPVT-L + - FPN + Training Resources: 8x V100 GPUS + Memory (GB): 10.78 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-l_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_pcpvt-l_fpn_fpnhead_8x4_512x512_80k_ade20k_20211201_105226-bc6d61dc.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-l_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_pcpvt-l_fpn_fpnhead_8x4_512x512_80k_ade20k_20211201_105226.log.json + Paper: + Title: 'Twins: Revisiting the Design of Spatial Attention in Vision Transformers' + URL: https://arxiv.org/pdf/2104.13840.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.20.0/mmseg/models/backbones/twins.py#L352 + Framework: PyTorch +- Name: twins_pcpvt-l_uperhead_8xb2-160k_ade20k-512x512 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 49.35 + mIoU(ms+flip): 50.08 + Config: configs/twins/twins_pcpvt-l_uperhead_8xb2-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - Twins-PCPVT-L + - UPerNet + Training Resources: 8x V100 GPUS + Memory (GB): 7.82 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-l_uperhead_8x2_512x512_160k_ade20k/twins_pcpvt-l_uperhead_8x2_512x512_160k_ade20k_20211201_075053-c6095c07.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-l_uperhead_8x2_512x512_160k_ade20k/twins_pcpvt-l_uperhead_8x2_512x512_160k_ade20k_20211201_075053.log.json + Paper: + Title: 'Twins: Revisiting the Design of Spatial Attention in Vision Transformers' + URL: https://arxiv.org/pdf/2104.13840.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.20.0/mmseg/models/backbones/twins.py#L352 + Framework: PyTorch +- Name: twins_svt-s_fpn_fpnhead_8xb4-80k_ade20k-512x512 + In Collection: FPN + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 44.47 + mIoU(ms+flip): 
45.42 + Config: configs/twins/twins_svt-s_fpn_fpnhead_8xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 32 + Architecture: + - Twins-SVT-S + - FPN + Training Resources: 8x V100 GPUS + Memory (GB): 5.8 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-s_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_svt-s_fpn_fpnhead_8x4_512x512_80k_ade20k_20211130_141006-0a0d3317.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-s_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_svt-s_fpn_fpnhead_8x4_512x512_80k_ade20k_20211130_141006.log.json + Paper: + Title: 'Twins: Revisiting the Design of Spatial Attention in Vision Transformers' + URL: https://arxiv.org/pdf/2104.13840.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.20.0/mmseg/models/backbones/twins.py#L352 + Framework: PyTorch +- Name: twins_svt-s_uperhead_8xb2-160k_ade20k-512x512 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 46.08 + mIoU(ms+flip): 46.96 + Config: configs/twins/twins_svt-s_uperhead_8xb2-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - Twins-SVT-S + - UPerNet + Training Resources: 8x V100 GPUS + Memory (GB): 4.93 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-s_uperhead_8x2_512x512_160k_ade20k/twins_svt-s_uperhead_8x2_512x512_160k_ade20k_20211130_141005-e48a2d94.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-s_uperhead_8x2_512x512_160k_ade20k/twins_svt-s_uperhead_8x2_512x512_160k_ade20k_20211130_141005.log.json + Paper: + Title: 'Twins: Revisiting the Design of Spatial Attention in Vision Transformers' + URL: https://arxiv.org/pdf/2104.13840.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.20.0/mmseg/models/backbones/twins.py#L352 + Framework: PyTorch +- Name: twins_svt-b_fpn_fpnhead_8xb4-80k_ade20k-512x512 + In Collection: FPN + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 46.77 + mIoU(ms+flip): 47.47 + Config: configs/twins/twins_svt-b_fpn_fpnhead_8xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 32 + Architecture: + - Twins-SVT-B + - FPN + Training Resources: 8x V100 GPUS + Memory (GB): 8.75 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-b_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_svt-b_fpn_fpnhead_8x4_512x512_80k_ade20k_20211201_113849-88b2907c.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-b_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_svt-b_fpn_fpnhead_8x4_512x512_80k_ade20k_20211201_113849.log.json + Paper: + Title: 'Twins: Revisiting the Design of Spatial Attention in Vision Transformers' + URL: https://arxiv.org/pdf/2104.13840.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.20.0/mmseg/models/backbones/twins.py#L352 + Framework: PyTorch +- Name: twins_svt-b_uperhead_8xb2-160k_ade20k-512x512 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 48.04 + mIoU(ms+flip): 48.87 + Config: configs/twins/twins_svt-b_uperhead_8xb2-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - Twins-SVT-B + - UPerNet + Training Resources: 8x V100 GPUS + Memory (GB): 6.77 + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-b_uperhead_8x2_512x512_160k_ade20k/twins_svt-b_uperhead_8x2_512x512_160k_ade20k_20211202_040826-0943a1f1.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-b_uperhead_8x2_512x512_160k_ade20k/twins_svt-b_uperhead_8x2_512x512_160k_ade20k_20211202_040826.log.json + Paper: + Title: 'Twins: Revisiting the Design of Spatial Attention in Vision Transformers' + URL: https://arxiv.org/pdf/2104.13840.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.20.0/mmseg/models/backbones/twins.py#L352 + Framework: PyTorch +- Name: twins_svt-l_fpn_fpnhead_8xb4-80k_ade20k-512x512 + In Collection: FPN + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 46.55 + mIoU(ms+flip): 47.74 + Config: configs/twins/twins_svt-l_fpn_fpnhead_8xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 32 + Architecture: + - Twins-SVT-L + - FPN + Training Resources: 8x V100 GPUS + Memory (GB): 11.2 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-l_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_svt-l_fpn_fpnhead_8x4_512x512_80k_ade20k_20211130_141005-1d59bee2.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-l_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_svt-l_fpn_fpnhead_8x4_512x512_80k_ade20k_20211130_141005.log.json + Paper: + Title: 'Twins: Revisiting the Design of Spatial Attention in Vision Transformers' + URL: https://arxiv.org/pdf/2104.13840.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.20.0/mmseg/models/backbones/twins.py#L352 + Framework: PyTorch +- Name: twins_svt-l_uperhead_8xb2-160k_ade20k-512x512 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 49.65 + mIoU(ms+flip): 50.63 + Config: configs/twins/twins_svt-l_uperhead_8xb2-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - Twins-SVT-L + - UPerNet + Training Resources: 8x V100 GPUS + Memory (GB): 8.41 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-l_uperhead_8x2_512x512_160k_ade20k/twins_svt-l_uperhead_8x2_512x512_160k_ade20k_20211130_141005-3e2cae61.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-l_uperhead_8x2_512x512_160k_ade20k/twins_svt-l_uperhead_8x2_512x512_160k_ade20k_20211130_141005.log.json + Paper: + Title: 'Twins: Revisiting the Design of Spatial Attention in Vision Transformers' + URL: https://arxiv.org/pdf/2104.13840.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.20.0/mmseg/models/backbones/twins.py#L352 + Framework: PyTorch diff --git a/configs/twins/twins.yml b/configs/twins/twins.yml deleted file mode 100644 index 6b5f5c181b..0000000000 --- a/configs/twins/twins.yml +++ /dev/null @@ -1,265 +0,0 @@ -Models: -- Name: twins_pcpvt-s_fpn_fpnhead_8x4_512x512_80k_ade20k - In Collection: FPN - Metadata: - backbone: PCPVT-S - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 36.83 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 6.6 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 43.26 - mIoU(ms+flip): 44.11 - Config: configs/twins/twins_pcpvt-s_fpn_fpnhead_8x4_512x512_80k_ade20k.py - Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-s_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_pcpvt-s_fpn_fpnhead_8x4_512x512_80k_ade20k_20211201_204132-41acd132.pth -- Name: twins_pcpvt-s_uperhead_8x4_512x512_160k_ade20k - In Collection: UPerNet - Metadata: - backbone: PCPVT-S - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 70.22 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 9.67 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 46.04 - mIoU(ms+flip): 46.92 - Config: configs/twins/twins_pcpvt-s_uperhead_8x4_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-s_uperhead_8x4_512x512_160k_ade20k/twins_pcpvt-s_uperhead_8x4_512x512_160k_ade20k_20211201_233537-8e99c07a.pth -- Name: twins_pcpvt-b_fpn_fpnhead_8x4_512x512_80k_ade20k - In Collection: FPN - Metadata: - backbone: PCPVT-B - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 50.84 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 8.41 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 45.66 - mIoU(ms+flip): 46.48 - Config: configs/twins/twins_pcpvt-b_fpn_fpnhead_8x4_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-b_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_pcpvt-b_fpn_fpnhead_8x4_512x512_80k_ade20k_20211130_141019-d396db72.pth -- Name: twins_pcpvt-b_uperhead_8x2_512x512_160k_ade20k - In Collection: UPerNet - Metadata: - backbone: PCPVT-B - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 83.06 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 6.46 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 47.91 - mIoU(ms+flip): 48.64 - Config: configs/twins/twins_pcpvt-b_uperhead_8x2_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-b_uperhead_8x2_512x512_160k_ade20k/twins_pcpvt-b_uperhead_8x2_512x512_160k_ade20k_20211130_141020-02094ea5.pth -- Name: twins_pcpvt-l_fpn_fpnhead_8x4_512x512_80k_ade20k - In Collection: FPN - Metadata: - backbone: PCPVT-L - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 69.83 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 10.78 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 45.94 - mIoU(ms+flip): 46.7 - Config: configs/twins/twins_pcpvt-l_fpn_fpnhead_8x4_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-l_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_pcpvt-l_fpn_fpnhead_8x4_512x512_80k_ade20k_20211201_105226-bc6d61dc.pth -- Name: twins_pcpvt-l_uperhead_8x2_512x512_160k_ade20k - In Collection: UPerNet - Metadata: - backbone: PCPVT-L - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 93.46 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 7.82 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 49.35 - mIoU(ms+flip): 50.08 - Config: configs/twins/twins_pcpvt-l_uperhead_8x2_512x512_160k_ade20k.py - Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_pcpvt-l_uperhead_8x2_512x512_160k_ade20k/twins_pcpvt-l_uperhead_8x2_512x512_160k_ade20k_20211201_075053-c6095c07.pth -- Name: twins_svt-s_fpn_fpnhead_8x4_512x512_80k_ade20k - In Collection: FPN - Metadata: - backbone: SVT-S - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 33.57 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 5.8 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 44.47 - mIoU(ms+flip): 45.42 - Config: configs/twins/twins_svt-s_fpn_fpnhead_8x4_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-s_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_svt-s_fpn_fpnhead_8x4_512x512_80k_ade20k_20211130_141006-0a0d3317.pth -- Name: twins_svt-s_uperhead_8x2_512x512_160k_ade20k - In Collection: UPerNet - Metadata: - backbone: SVT-S - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 66.27 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 4.93 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 46.08 - mIoU(ms+flip): 46.96 - Config: configs/twins/twins_svt-s_uperhead_8x2_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-s_uperhead_8x2_512x512_160k_ade20k/twins_svt-s_uperhead_8x2_512x512_160k_ade20k_20211130_141005-e48a2d94.pth -- Name: twins_svt-b_fpn_fpnhead_8x4_512x512_80k_ade20k - In Collection: FPN - Metadata: - backbone: SVT-B - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 47.39 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 8.75 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 46.77 - mIoU(ms+flip): 47.47 - Config: configs/twins/twins_svt-b_fpn_fpnhead_8x4_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-b_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_svt-b_fpn_fpnhead_8x4_512x512_80k_ade20k_20211201_113849-88b2907c.pth -- Name: twins_svt-b_uperhead_8x2_512x512_160k_ade20k - In Collection: UPerNet - Metadata: - backbone: SVT-B - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 78.99 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 6.77 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 48.04 - mIoU(ms+flip): 48.87 - Config: configs/twins/twins_svt-b_uperhead_8x2_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-b_uperhead_8x2_512x512_160k_ade20k/twins_svt-b_uperhead_8x2_512x512_160k_ade20k_20211202_040826-0943a1f1.pth -- Name: twins_svt-l_fpn_fpnhead_8x4_512x512_80k_ade20k - In Collection: FPN - Metadata: - backbone: SVT-L - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 56.18 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 11.2 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 46.55 - mIoU(ms+flip): 47.74 - Config: configs/twins/twins_svt-l_fpn_fpnhead_8x4_512x512_80k_ade20k.py - Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-l_fpn_fpnhead_8x4_512x512_80k_ade20k/twins_svt-l_fpn_fpnhead_8x4_512x512_80k_ade20k_20211130_141005-1d59bee2.pth -- Name: twins_svt-l_uperhead_8x2_512x512_160k_ade20k - In Collection: UPerNet - Metadata: - backbone: SVT-L - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 93.2 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 8.41 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 49.65 - mIoU(ms+flip): 50.63 - Config: configs/twins/twins_svt-l_uperhead_8x2_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/twins/twins_svt-l_uperhead_8x2_512x512_160k_ade20k/twins_svt-l_uperhead_8x2_512x512_160k_ade20k_20211130_141005-3e2cae61.pth diff --git a/configs/twins/twins_pcpvt-b_fpn_fpnhead_8x4_512x512_80k_ade20k.py b/configs/twins/twins_pcpvt-b_fpn_fpnhead_8x4_512x512_80k_ade20k.py deleted file mode 100644 index b79fefd4a5..0000000000 --- a/configs/twins/twins_pcpvt-b_fpn_fpnhead_8x4_512x512_80k_ade20k.py +++ /dev/null @@ -1,8 +0,0 @@ -_base_ = ['./twins_pcpvt-s_fpn_fpnhead_8x4_512x512_80k_ade20k.py'] - -checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/twins/pcpvt_base_20220308-0621964c.pth' # noqa - -model = dict( - backbone=dict( - init_cfg=dict(type='Pretrained', checkpoint=checkpoint), - depths=[3, 4, 18, 3]), ) diff --git a/configs/twins/twins_pcpvt-b_fpn_fpnhead_8xb4-80k_ade20k-512x512.py b/configs/twins/twins_pcpvt-b_fpn_fpnhead_8xb4-80k_ade20k-512x512.py new file mode 100644 index 0000000000..4739ad4b0a --- /dev/null +++ b/configs/twins/twins_pcpvt-b_fpn_fpnhead_8xb4-80k_ade20k-512x512.py @@ -0,0 +1,8 @@ +_base_ = ['./twins_pcpvt-s_fpn_fpnhead_8xb4-80k_ade20k-512x512.py'] + +checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/twins/pcpvt_base_20220308-0621964c.pth' # noqa + +model = dict( + backbone=dict( + init_cfg=dict(type='Pretrained', checkpoint=checkpoint), + depths=[3, 4, 18, 3]), ) diff --git a/configs/twins/twins_pcpvt-b_uperhead_8x2_512x512_160k_ade20k.py b/configs/twins/twins_pcpvt-b_uperhead_8x2_512x512_160k_ade20k.py deleted file mode 100644 index b9a3d0681c..0000000000 --- a/configs/twins/twins_pcpvt-b_uperhead_8x2_512x512_160k_ade20k.py +++ /dev/null @@ -1,13 +0,0 @@ -_base_ = ['./twins_pcpvt-s_uperhead_8x4_512x512_160k_ade20k.py'] - -checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/twins/pcpvt_base_20220308-0621964c.pth' # noqa - -model = dict( - backbone=dict( - init_cfg=dict(type='Pretrained', checkpoint=checkpoint), - depths=[3, 4, 18, 3], - drop_path_rate=0.3)) - -train_dataloader = dict(batch_size=2, num_workers=2) -val_dataloader = dict(batch_size=1, num_workers=4) -test_dataloader = val_dataloader diff --git a/configs/twins/twins_pcpvt-b_uperhead_8xb2-160k_ade20k-512x512.py b/configs/twins/twins_pcpvt-b_uperhead_8xb2-160k_ade20k-512x512.py new file mode 100644 index 0000000000..ba9748547d --- /dev/null +++ b/configs/twins/twins_pcpvt-b_uperhead_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,13 @@ +_base_ = ['./twins_pcpvt-s_uperhead_8xb4-160k_ade20k-512x512.py'] + +checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/twins/pcpvt_base_20220308-0621964c.pth' # noqa + +model = dict( + backbone=dict( + init_cfg=dict(type='Pretrained', checkpoint=checkpoint), + depths=[3, 4, 18, 3], + drop_path_rate=0.3)) + +train_dataloader = dict(batch_size=2, num_workers=2) 
+val_dataloader = dict(batch_size=1, num_workers=4) +test_dataloader = val_dataloader diff --git a/configs/twins/twins_pcpvt-l_fpn_fpnhead_8x4_512x512_80k_ade20k.py b/configs/twins/twins_pcpvt-l_fpn_fpnhead_8x4_512x512_80k_ade20k.py deleted file mode 100644 index abb652e8e0..0000000000 --- a/configs/twins/twins_pcpvt-l_fpn_fpnhead_8x4_512x512_80k_ade20k.py +++ /dev/null @@ -1,8 +0,0 @@ -_base_ = ['./twins_pcpvt-s_fpn_fpnhead_8x4_512x512_80k_ade20k.py'] - -checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/twins/pcpvt_large_20220308-37579dc6.pth' # noqa - -model = dict( - backbone=dict( - init_cfg=dict(type='Pretrained', checkpoint=checkpoint), - depths=[3, 8, 27, 3])) diff --git a/configs/twins/twins_pcpvt-l_fpn_fpnhead_8xb4-80k_ade20k-512x512.py b/configs/twins/twins_pcpvt-l_fpn_fpnhead_8xb4-80k_ade20k-512x512.py new file mode 100644 index 0000000000..bff7c41946 --- /dev/null +++ b/configs/twins/twins_pcpvt-l_fpn_fpnhead_8xb4-80k_ade20k-512x512.py @@ -0,0 +1,8 @@ +_base_ = ['./twins_pcpvt-s_fpn_fpnhead_8xb4-80k_ade20k-512x512.py'] + +checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/twins/pcpvt_large_20220308-37579dc6.pth' # noqa + +model = dict( + backbone=dict( + init_cfg=dict(type='Pretrained', checkpoint=checkpoint), + depths=[3, 8, 27, 3])) diff --git a/configs/twins/twins_pcpvt-l_uperhead_8x2_512x512_160k_ade20k.py b/configs/twins/twins_pcpvt-l_uperhead_8x2_512x512_160k_ade20k.py deleted file mode 100644 index a3e37ef2ae..0000000000 --- a/configs/twins/twins_pcpvt-l_uperhead_8x2_512x512_160k_ade20k.py +++ /dev/null @@ -1,13 +0,0 @@ -_base_ = ['./twins_pcpvt-s_uperhead_8x4_512x512_160k_ade20k.py'] - -checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/twins/pcpvt_large_20220308-37579dc6.pth' # noqa - -model = dict( - backbone=dict( - init_cfg=dict(type='Pretrained', checkpoint=checkpoint), - depths=[3, 8, 27, 3], - drop_path_rate=0.3)) - -train_dataloader = dict(batch_size=2, num_workers=2) -val_dataloader = dict(batch_size=1, num_workers=4) -test_dataloader = val_dataloader diff --git a/configs/twins/twins_pcpvt-l_uperhead_8xb2-160k_ade20k-512x512.py b/configs/twins/twins_pcpvt-l_uperhead_8xb2-160k_ade20k-512x512.py new file mode 100644 index 0000000000..666ff5b69c --- /dev/null +++ b/configs/twins/twins_pcpvt-l_uperhead_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,13 @@ +_base_ = ['./twins_pcpvt-s_uperhead_8xb4-160k_ade20k-512x512.py'] + +checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/twins/pcpvt_large_20220308-37579dc6.pth' # noqa + +model = dict( + backbone=dict( + init_cfg=dict(type='Pretrained', checkpoint=checkpoint), + depths=[3, 8, 27, 3], + drop_path_rate=0.3)) + +train_dataloader = dict(batch_size=2, num_workers=2) +val_dataloader = dict(batch_size=1, num_workers=4) +test_dataloader = val_dataloader diff --git a/configs/twins/twins_pcpvt-s_fpn_fpnhead_8x4_512x512_80k_ade20k.py b/configs/twins/twins_pcpvt-s_fpn_fpnhead_8xb4-80k_ade20k-512x512.py similarity index 100% rename from configs/twins/twins_pcpvt-s_fpn_fpnhead_8x4_512x512_80k_ade20k.py rename to configs/twins/twins_pcpvt-s_fpn_fpnhead_8xb4-80k_ade20k-512x512.py diff --git a/configs/twins/twins_pcpvt-s_uperhead_8x4_512x512_160k_ade20k.py b/configs/twins/twins_pcpvt-s_uperhead_8xb4-160k_ade20k-512x512.py similarity index 100% rename from configs/twins/twins_pcpvt-s_uperhead_8x4_512x512_160k_ade20k.py rename to configs/twins/twins_pcpvt-s_uperhead_8xb4-160k_ade20k-512x512.py diff --git 
a/configs/twins/twins_svt-b_fpn_fpnhead_8x4_512x512_80k_ade20k.py b/configs/twins/twins_svt-b_fpn_fpnhead_8x4_512x512_80k_ade20k.py deleted file mode 100644 index 00d89572c6..0000000000 --- a/configs/twins/twins_svt-b_fpn_fpnhead_8x4_512x512_80k_ade20k.py +++ /dev/null @@ -1,12 +0,0 @@ -_base_ = ['./twins_svt-s_fpn_fpnhead_8x4_512x512_80k_ade20k.py'] - -checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/twins/alt_gvt_base_20220308-1b7eb711.pth' # noqa - -model = dict( - backbone=dict( - init_cfg=dict(type='Pretrained', checkpoint=checkpoint), - embed_dims=[96, 192, 384, 768], - num_heads=[3, 6, 12, 24], - depths=[2, 2, 18, 2]), - neck=dict(in_channels=[96, 192, 384, 768]), -) diff --git a/configs/twins/twins_svt-b_fpn_fpnhead_8xb4-80k_ade20k-512x512.py b/configs/twins/twins_svt-b_fpn_fpnhead_8xb4-80k_ade20k-512x512.py new file mode 100644 index 0000000000..5e9fa00f88 --- /dev/null +++ b/configs/twins/twins_svt-b_fpn_fpnhead_8xb4-80k_ade20k-512x512.py @@ -0,0 +1,12 @@ +_base_ = ['./twins_svt-s_fpn_fpnhead_8xb4-80k_ade20k-512x512.py'] + +checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/twins/alt_gvt_base_20220308-1b7eb711.pth' # noqa + +model = dict( + backbone=dict( + init_cfg=dict(type='Pretrained', checkpoint=checkpoint), + embed_dims=[96, 192, 384, 768], + num_heads=[3, 6, 12, 24], + depths=[2, 2, 18, 2]), + neck=dict(in_channels=[96, 192, 384, 768]), +) diff --git a/configs/twins/twins_svt-b_uperhead_8x2_512x512_160k_ade20k.py b/configs/twins/twins_svt-b_uperhead_8x2_512x512_160k_ade20k.py deleted file mode 100644 index a969fedfed..0000000000 --- a/configs/twins/twins_svt-b_uperhead_8x2_512x512_160k_ade20k.py +++ /dev/null @@ -1,12 +0,0 @@ -_base_ = ['./twins_svt-s_uperhead_8x2_512x512_160k_ade20k.py'] - -checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/twins/alt_gvt_base_20220308-1b7eb711.pth' # noqa - -model = dict( - backbone=dict( - init_cfg=dict(type='Pretrained', checkpoint=checkpoint), - embed_dims=[96, 192, 384, 768], - num_heads=[3, 6, 12, 24], - depths=[2, 2, 18, 2]), - decode_head=dict(in_channels=[96, 192, 384, 768]), - auxiliary_head=dict(in_channels=384)) diff --git a/configs/twins/twins_svt-b_uperhead_8xb2-160k_ade20k-512x512.py b/configs/twins/twins_svt-b_uperhead_8xb2-160k_ade20k-512x512.py new file mode 100644 index 0000000000..6ce2361f5f --- /dev/null +++ b/configs/twins/twins_svt-b_uperhead_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,12 @@ +_base_ = ['./twins_svt-s_uperhead_8xb2-160k_ade20k-512x512.py'] + +checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/twins/alt_gvt_base_20220308-1b7eb711.pth' # noqa + +model = dict( + backbone=dict( + init_cfg=dict(type='Pretrained', checkpoint=checkpoint), + embed_dims=[96, 192, 384, 768], + num_heads=[3, 6, 12, 24], + depths=[2, 2, 18, 2]), + decode_head=dict(in_channels=[96, 192, 384, 768]), + auxiliary_head=dict(in_channels=384)) diff --git a/configs/twins/twins_svt-l_fpn_fpnhead_8x4_512x512_80k_ade20k.py b/configs/twins/twins_svt-l_fpn_fpnhead_8x4_512x512_80k_ade20k.py deleted file mode 100644 index c68bfd4a17..0000000000 --- a/configs/twins/twins_svt-l_fpn_fpnhead_8x4_512x512_80k_ade20k.py +++ /dev/null @@ -1,13 +0,0 @@ -_base_ = ['./twins_svt-s_fpn_fpnhead_8x4_512x512_80k_ade20k.py'] - -checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/twins/alt_gvt_large_20220308-fb5936f3.pth' # noqa - -model = dict( - backbone=dict( - init_cfg=dict(type='Pretrained', checkpoint=checkpoint), - embed_dims=[128, 256, 512, 
1024], - num_heads=[4, 8, 16, 32], - depths=[2, 2, 18, 2], - drop_path_rate=0.3), - neck=dict(in_channels=[128, 256, 512, 1024]), -) diff --git a/configs/twins/twins_svt-l_fpn_fpnhead_8xb4-80k_ade20k-512x512.py b/configs/twins/twins_svt-l_fpn_fpnhead_8xb4-80k_ade20k-512x512.py new file mode 100644 index 0000000000..b7e5f9cdb8 --- /dev/null +++ b/configs/twins/twins_svt-l_fpn_fpnhead_8xb4-80k_ade20k-512x512.py @@ -0,0 +1,13 @@ +_base_ = ['./twins_svt-s_fpn_fpnhead_8xb4-80k_ade20k-512x512.py'] + +checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/twins/alt_gvt_large_20220308-fb5936f3.pth' # noqa + +model = dict( + backbone=dict( + init_cfg=dict(type='Pretrained', checkpoint=checkpoint), + embed_dims=[128, 256, 512, 1024], + num_heads=[4, 8, 16, 32], + depths=[2, 2, 18, 2], + drop_path_rate=0.3), + neck=dict(in_channels=[128, 256, 512, 1024]), +) diff --git a/configs/twins/twins_svt-l_uperhead_8x2_512x512_160k_ade20k.py b/configs/twins/twins_svt-l_uperhead_8x2_512x512_160k_ade20k.py deleted file mode 100644 index f98c070b2d..0000000000 --- a/configs/twins/twins_svt-l_uperhead_8x2_512x512_160k_ade20k.py +++ /dev/null @@ -1,13 +0,0 @@ -_base_ = ['./twins_svt-s_uperhead_8x2_512x512_160k_ade20k.py'] - -checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/twins/alt_gvt_large_20220308-fb5936f3.pth' # noqa - -model = dict( - backbone=dict( - init_cfg=dict(type='Pretrained', checkpoint=checkpoint), - embed_dims=[128, 256, 512, 1024], - num_heads=[4, 8, 16, 32], - depths=[2, 2, 18, 2], - drop_path_rate=0.3), - decode_head=dict(in_channels=[128, 256, 512, 1024]), - auxiliary_head=dict(in_channels=512)) diff --git a/configs/twins/twins_svt-l_uperhead_8xb2-160k_ade20k-512x512.py b/configs/twins/twins_svt-l_uperhead_8xb2-160k_ade20k-512x512.py new file mode 100644 index 0000000000..69c69df3b5 --- /dev/null +++ b/configs/twins/twins_svt-l_uperhead_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,13 @@ +_base_ = ['./twins_svt-s_uperhead_8xb2-160k_ade20k-512x512.py'] + +checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/twins/alt_gvt_large_20220308-fb5936f3.pth' # noqa + +model = dict( + backbone=dict( + init_cfg=dict(type='Pretrained', checkpoint=checkpoint), + embed_dims=[128, 256, 512, 1024], + num_heads=[4, 8, 16, 32], + depths=[2, 2, 18, 2], + drop_path_rate=0.3), + decode_head=dict(in_channels=[128, 256, 512, 1024]), + auxiliary_head=dict(in_channels=512)) diff --git a/configs/twins/twins_svt-s_fpn_fpnhead_8x4_512x512_80k_ade20k.py b/configs/twins/twins_svt-s_fpn_fpnhead_8xb4-80k_ade20k-512x512.py similarity index 100% rename from configs/twins/twins_svt-s_fpn_fpnhead_8x4_512x512_80k_ade20k.py rename to configs/twins/twins_svt-s_fpn_fpnhead_8xb4-80k_ade20k-512x512.py diff --git a/configs/twins/twins_svt-s_uperhead_8x2_512x512_160k_ade20k.py b/configs/twins/twins_svt-s_uperhead_8xb2-160k_ade20k-512x512.py similarity index 100% rename from configs/twins/twins_svt-s_uperhead_8x2_512x512_160k_ade20k.py rename to configs/twins/twins_svt-s_uperhead_8xb2-160k_ade20k-512x512.py diff --git a/configs/unet/README.md b/configs/unet/README.md index f17e1747d7..7225fbbf68 100644 --- a/configs/unet/README.md +++ b/configs/unet/README.md @@ -1,6 +1,6 @@ # UNet -[U-Net: Convolutional Networks for Biomedical Image Segmentation](https://arxiv.org/abs/1505.04597) +> [U-Net: Convolutional Networks for Biomedical Image Segmentation](https://arxiv.org/abs/1505.04597) ## Introduction @@ -22,71 +22,71 @@ There is large consent that successful training of deep networks 
requires many t -## Citation - -```bibtex -@inproceedings{ronneberger2015u, - title={U-net: Convolutional networks for biomedical image segmentation}, - author={Ronneberger, Olaf and Fischer, Philipp and Brox, Thomas}, - booktitle={International Conference on Medical image computing and computer-assisted intervention}, - pages={234--241}, - year={2015}, - organization={Springer} -} -``` - ## Results and models ### Cityscapes -| Method | Backbone | Loss | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ---------- | ----------- | ------------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| UNet + FCN | UNet-S5-D16 | Cross Entropy | 512x1024 | 160000 | 17.91 | 3.05 | 69.10 | 71.05 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/unet/fcn_unet_s5-d16_4x4_512x1024_160k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_4x4_512x1024_160k_cityscapes/fcn_unet_s5-d16_4x4_512x1024_160k_cityscapes_20211210_145204-6860854e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_4x4_512x1024_160k_cityscapes/fcn_unet_s5-d16_4x4_512x1024_160k_cityscapes_20211210_145204.log.json) | +| Method | Backbone | Loss | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ---------- | ----------- | ------------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------------ | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| UNet + FCN | UNet-S5-D16 | Cross Entropy | 512x1024 | 160000 | 17.91 | 3.05 | V100 | 69.10 | 71.05 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/unet/unet-s5-d16_fcn_4xb4-160k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_4x4_512x1024_160k_cityscapes/fcn_unet_s5-d16_4x4_512x1024_160k_cityscapes_20211210_145204-6860854e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_4x4_512x1024_160k_cityscapes/fcn_unet_s5-d16_4x4_512x1024_160k_cityscapes_20211210_145204.log.json) | ### DRIVE -| Method | Backbone | Loss | Image Size | Crop Size | Stride | Lr schd | Mem (GB) | Inf time (fps) | mDice | Dice | config | download | -| ---------------- | ----------- | -------------------- | ---------- | --------- | -----: | ------- | -------- | -------------: | ----: | ----: | ---------------------------------------------------------------------------------------------------------------------------------------- | 
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| UNet + FCN | UNet-S5-D16 | Cross Entropy | 584x565 | 64x64 | 42x42 | 40000 | 0.680 | - | 88.38 | 78.67 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/unet/fcn_unet_s5-d16_64x64_40k_drive.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_64x64_40k_drive/fcn_unet_s5-d16_64x64_40k_drive_20201223_191051-5daf6d3b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/unet_s5-d16_64x64_40k_drive/unet_s5-d16_64x64_40k_drive-20201223_191051.log.json) | -| UNet + FCN | UNet-S5-D16 | Cross Entropy + Dice | 584x565 | 64x64 | 42x42 | 40000 | 0.582 | - | 88.71 | 79.32 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive/fcn_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive_20211210_201820-785de5c2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive/fcn_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive_20211210_201820.log.json) | -| UNet + PSPNet | UNet-S5-D16 | Cross Entropy | 584x565 | 64x64 | 42x42 | 40000 | 0.599 | - | 88.35 | 78.62 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/unet/pspnet_unet_s5-d16_64x64_40k_drive.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_64x64_40k_drive/pspnet_unet_s5-d16_64x64_40k_drive_20201227_181818-aac73387.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_64x64_40k_drive/pspnet_unet_s5-d16_64x64_40k_drive-20201227_181818.log.json) | -| UNet + PSPNet | UNet-S5-D16 | Cross Entropy + Dice | 584x565 | 64x64 | 42x42 | 40000 | 0.585 | - | 88.76 | 79.42 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive/pspnet_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive_20211210_201821-22b3e3ba.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive/pspnet_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive_20211210_201821.log.json) | -| UNet + DeepLabV3 | UNet-S5-D16 | Cross Entropy | 584x565 | 64x64 | 42x42 | 40000 | 0.596 | - | 88.38 | 78.69 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/unet/deeplabv3_unet_s5-d16_64x64_40k_drive.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_64x64_40k_drive/deeplabv3_unet_s5-d16_64x64_40k_drive_20201226_094047-0671ff20.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_64x64_40k_drive/deeplabv3_unet_s5-d16_64x64_40k_drive-20201226_094047.log.json) | -| UNet + DeepLabV3 | UNet-S5-D16 | Cross Entropy + Dice | 584x565 | 64x64 | 42x42 | 40000 | 0.582 | - | 88.84 | 79.56 | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive_20211210_201825-6bf0efd7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive_20211210_201825.log.json) | +| Method | Backbone | Loss | Image Size | Crop Size | Stride | Lr schd | Mem (GB) | Inf time (fps) | Device | mDice | Dice | config | download | +| ---------------- | ----------- | -------------------- | ---------- | --------- | -----: | ------- | -------- | -------------: | ------ | ----: | ----: | ------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| UNet + FCN | UNet-S5-D16 | Cross Entropy | 584x565 | 64x64 | 42x42 | 40000 | 0.680 | - | V100 | 88.38 | 78.67 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/unet/unet-s5-d16_fcn_4xb4-40k_drive-64x64.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_64x64_40k_drive/fcn_unet_s5-d16_64x64_40k_drive_20201223_191051-5daf6d3b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/unet_s5-d16_64x64_40k_drive/unet_s5-d16_64x64_40k_drive-20201223_191051.log.json) | +| UNet + FCN | UNet-S5-D16 | Cross Entropy + Dice | 584x565 | 64x64 | 42x42 | 40000 | 0.582 | - | V100 | 88.71 | 79.32 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/unet/unet-s5-d16_fcn_4xb4-ce-1.0-dice-3.0-40k_drive-64x64.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive/fcn_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive_20211210_201820-785de5c2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive/fcn_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive_20211210_201820.log.json) | +| UNet + PSPNet | UNet-S5-D16 | Cross Entropy | 584x565 | 64x64 | 42x42 | 40000 | 0.599 | - | V100 | 88.35 | 78.62 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/unet/unet-s5-d16_pspnet_4xb4-40k_drive-64x64.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_64x64_40k_drive/pspnet_unet_s5-d16_64x64_40k_drive_20201227_181818-aac73387.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_64x64_40k_drive/pspnet_unet_s5-d16_64x64_40k_drive-20201227_181818.log.json) | +| UNet + PSPNet | UNet-S5-D16 | Cross Entropy + Dice | 584x565 | 64x64 | 42x42 | 40000 | 0.585 | - | V100 | 88.76 | 79.42 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/unet/unet-s5-d16_pspnet_4xb4-ce-1.0-dice-3.0-40k_drive-64x64.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive/pspnet_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive_20211210_201821-22b3e3ba.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive/pspnet_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive_20211210_201821.log.json) | +| UNet + DeepLabV3 | UNet-S5-D16 | Cross Entropy | 584x565 | 64x64 | 42x42 | 40000 | 0.596 | - | V100 | 88.38 | 78.69 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/unet/unet-s5-d16_deeplabv3_4xb4-40k_drive-64x64.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_64x64_40k_drive/deeplabv3_unet_s5-d16_64x64_40k_drive_20201226_094047-0671ff20.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_64x64_40k_drive/deeplabv3_unet_s5-d16_64x64_40k_drive-20201226_094047.log.json) | +| UNet + DeepLabV3 | UNet-S5-D16 | Cross Entropy + Dice | 584x565 | 64x64 | 42x42 | 40000 | 0.582 | - | V100 | 88.84 | 79.56 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/unet/unet-s5-d16_deeplabv3_4xb4-ce-1.0-dice-3.0-40k_drive-64x64.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive_20211210_201825-6bf0efd7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive_20211210_201825.log.json) | ### STARE -| Method | Backbone | Loss | Image Size | Crop Size | Stride | Lr schd | Mem (GB) | Inf time (fps) | mDice | Dice | config | download | -| ---------------- | ----------- | -------------------- | ---------- | --------- | -----: | ------- | -------- | -------------: | ----: | ----: | ------------------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | -| UNet + FCN | UNet-S5-D16 | Cross Entropy | 605x700 | 128x128 | 85x85 | 40000 | 0.968 | - | 89.78 | 81.02 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/unet/fcn_unet_s5-d16_128x128_40k_stare.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_128x128_40k_stare/fcn_unet_s5-d16_128x128_40k_stare_20201223_191051-7d77e78b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/unet_s5-d16_128x128_40k_stare/unet_s5-d16_128x128_40k_stare-20201223_191051.log.json) | -| UNet + FCN | UNet-S5-D16 | Cross Entropy + Dice | 605x700 | 128x128 | 85x85 | 40000 | 0.986 | - | 90.65 | 82.70 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare_20211210_201821-f75705a9.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare_20211210_201821.log.json) | -| UNet + PSPNet | UNet-S5-D16 | Cross Entropy | 605x700 | 128x128 | 85x85 | 40000 | 0.982 | - | 89.89 | 81.22 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/unet/pspnet_unet_s5-d16_128x128_40k_stare.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_128x128_40k_stare/pspnet_unet_s5-d16_128x128_40k_stare_20201227_181818-3c2923c4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_128x128_40k_stare/pspnet_unet_s5-d16_128x128_40k_stare-20201227_181818.log.json) | -| UNet + PSPNet | UNet-S5-D16 | Cross Entropy + Dice | 605x700 | 128x128 | 85x85 | 40000 | 1.028 | - | 90.72 | 82.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare_20211210_201823-f1063ef7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare_20211210_201823.log.json) | -| UNet + DeepLabV3 | UNet-S5-D16 | Cross Entropy | 605x700 | 128x128 | 85x85 | 40000 | 0.999 | - | 89.73 | 80.93 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/unet/deeplabv3_unet_s5-d16_128x128_40k_stare.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_128x128_40k_stare/deeplabv3_unet_s5-d16_128x128_40k_stare_20201226_094047-93dcb93c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_128x128_40k_stare/deeplabv3_unet_s5-d16_128x128_40k_stare-20201226_094047.log.json) | -| UNet + DeepLabV3 | UNet-S5-D16 | Cross Entropy + Dice | 605x700 | 128x128 | 85x85 | 40000 | 1.010 | - | 90.65 | 82.71 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare_20211210_201825-21db614c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare_20211210_201825.log.json) | +| Method | Backbone | Loss | Image Size | Crop Size | Stride | Lr schd | Mem (GB) | Inf time (fps) | Device | mDice | Dice | config | download | +| ---------------- | ----------- | -------------------- | ---------- | --------- | -----: | ------- | -------- | -------------: | ------ | ----: | ----: | --------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| UNet + FCN | 
UNet-S5-D16 | Cross Entropy | 605x700 | 128x128 | 85x85 | 40000 | 0.968 | - | V100 | 89.78 | 81.02 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/unet/unet-s5-d16_fcn_4xb4-40k_stare-128x128.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_128x128_40k_stare/fcn_unet_s5-d16_128x128_40k_stare_20201223_191051-7d77e78b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/unet_s5-d16_128x128_40k_stare/unet_s5-d16_128x128_40k_stare-20201223_191051.log.json) | +| UNet + FCN | UNet-S5-D16 | Cross Entropy + Dice | 605x700 | 128x128 | 85x85 | 40000 | 0.986 | - | V100 | 90.65 | 82.70 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/unet/unet-s5-d16_fcn_4xb4-ce-1.0-dice-3.0-40k_stare-128x128.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare_20211210_201821-f75705a9.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare_20211210_201821.log.json) | +| UNet + PSPNet | UNet-S5-D16 | Cross Entropy | 605x700 | 128x128 | 85x85 | 40000 | 0.982 | - | V100 | 89.89 | 81.22 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/unet/unet-s5-d16_pspnet_4xb4-40k_stare-128x128.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_128x128_40k_stare/pspnet_unet_s5-d16_128x128_40k_stare_20201227_181818-3c2923c4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_128x128_40k_stare/pspnet_unet_s5-d16_128x128_40k_stare-20201227_181818.log.json) | +| UNet + PSPNet | UNet-S5-D16 | Cross Entropy + Dice | 605x700 | 128x128 | 85x85 | 40000 | 1.028 | - | V100 | 90.72 | 82.84 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/unet/unet-s5-d16_pspnet_4xb4-ce-1.0-dice-3.0-40k_stare-128x128.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare_20211210_201823-f1063ef7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare_20211210_201823.log.json) | +| UNet + DeepLabV3 | UNet-S5-D16 | Cross Entropy | 605x700 | 128x128 | 85x85 | 40000 | 0.999 | - | V100 | 89.73 | 80.93 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/unet/unet-s5-d16_deeplabv3_4xb4-40k_stare-128x128.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_128x128_40k_stare/deeplabv3_unet_s5-d16_128x128_40k_stare_20201226_094047-93dcb93c.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_128x128_40k_stare/deeplabv3_unet_s5-d16_128x128_40k_stare-20201226_094047.log.json) | +| UNet + DeepLabV3 | UNet-S5-D16 | Cross Entropy + Dice | 605x700 | 128x128 | 85x85 | 40000 | 1.010 | - | V100 | 90.65 | 82.71 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/unet/unet-s5-d16_deeplabv3_4xb4-ce-1.0-dice-3.0-40k_stare-128x128.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare_20211210_201825-21db614c.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare_20211210_201825.log.json) | ### CHASE_DB1 -| Method | Backbone | Loss | Image Size | Crop Size | Stride | Lr schd | Mem (GB) | Inf time (fps) | mDice | Dice | config | download | -| ---------------- | ----------- | -------------------- | ---------- | --------- | -----: | ------- | -------- | -------------: | ----: | ----: | ---------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| UNet + FCN | UNet-S5-D16 | Cross Entropy | 960x999 | 128x128 | 85x85 | 40000 | 0.968 | - | 89.46 | 80.24 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/unet/fcn_unet_s5-d16_128x128_40k_chase_db1.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_128x128_40k_chase_db1/fcn_unet_s5-d16_128x128_40k_chase_db1_20201223_191051-11543527.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/unet_s5-d16_128x128_40k_chase_db1/unet_s5-d16_128x128_40k_chase_db1-20201223_191051.log.json) | -| UNet + FCN | UNet-S5-D16 | Cross Entropy + Dice | 960x999 | 128x128 | 85x85 | 40000 | 0.986 | - | 89.52 | 80.40 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1_20211210_201821-1c4eb7cf.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1_20211210_201821.log.json) | -| UNet + PSPNet | UNet-S5-D16 | Cross Entropy | 960x999 | 128x128 | 85x85 | 40000 | 0.982 | - | 89.52 | 80.36 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/unet/pspnet_unet_s5-d16_128x128_40k_chase_db1.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_128x128_40k_chase_db1/pspnet_unet_s5-d16_128x128_40k_chase_db1_20201227_181818-68d4e609.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_128x128_40k_chase_db1/pspnet_unet_s5-d16_128x128_40k_chase_db1-20201227_181818.log.json) | -| UNet + PSPNet | UNet-S5-D16 | Cross Entropy + Dice | 960x999 | 128x128 | 85x85 | 40000 | 1.028 | - | 89.45 | 80.28 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1_20211210_201823-c0802c4d.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1_20211210_201823.log.json) | -| UNet + DeepLabV3 | UNet-S5-D16 | Cross Entropy | 960x999 | 128x128 | 85x85 | 40000 | 0.999 | - | 89.57 | 80.47 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/unet/deeplabv3_unet_s5-d16_128x128_40k_chase_db1.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_128x128_40k_chase_db1/deeplabv3_unet_s5-d16_128x128_40k_chase_db1_20201226_094047-4c5aefa3.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_128x128_40k_chase_db1/deeplabv3_unet_s5-d16_128x128_40k_chase_db1-20201226_094047.log.json) | -| UNet + DeepLabV3 | UNet-S5-D16 | Cross Entropy + Dice | 960x999 | 128x128 | 85x85 | 40000 | 1.010 | - | 89.49 | 80.37 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1_20211210_201825-4ef29df5.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1_20211210_201825.log.json) | +| Method | Backbone | Loss | Image Size | Crop Size | Stride | Lr schd | Mem (GB) | Inf time (fps) | Device | mDice | Dice | config | download | +| ---------------- | ----------- | -------------------- | ---------- | --------- | -----: | ------- | -------- | -------------: | ------ | ----: | ----: | ------------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| UNet + FCN | UNet-S5-D16 | Cross Entropy | 960x999 | 128x128 | 85x85 | 40000 | 0.968 | - | V100 | 89.46 | 80.24 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/unet/unet-s5-d16_fcn_4xb4-40k_chase-db1-128x128.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_128x128_40k_chase_db1/fcn_unet_s5-d16_128x128_40k_chase_db1_20201223_191051-11543527.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/unet_s5-d16_128x128_40k_chase_db1/unet_s5-d16_128x128_40k_chase_db1-20201223_191051.log.json) | +| UNet + FCN | UNet-S5-D16 | Cross Entropy + Dice | 960x999 | 128x128 | 85x85 | 40000 | 0.986 | - | V100 | 89.52 | 80.40 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/unet/unet-s5-d16_fcn_4xb4-ce-1.0-dice-3.0-40k_chase-db1-128x128.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1_20211210_201821-1c4eb7cf.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1_20211210_201821.log.json) | +| UNet + PSPNet | UNet-S5-D16 | Cross Entropy | 960x999 | 128x128 | 85x85 | 40000 | 0.982 | - | V100 | 89.52 | 80.36 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/unet/unet-s5-d16_pspnet_4xb4-40k_chase-db1-128x128.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_128x128_40k_chase_db1/pspnet_unet_s5-d16_128x128_40k_chase_db1_20201227_181818-68d4e609.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_128x128_40k_chase_db1/pspnet_unet_s5-d16_128x128_40k_chase_db1-20201227_181818.log.json) | +| UNet + PSPNet | UNet-S5-D16 | Cross Entropy + Dice | 960x999 | 128x128 | 85x85 | 40000 | 1.028 | - | V100 | 89.45 | 80.28 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/unet/unet-s5-d16_pspnet_4xb4-ce-1.0-dice-3.0-40k_chase-db1-128x128.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1_20211210_201823-c0802c4d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1_20211210_201823.log.json) | +| UNet + DeepLabV3 | UNet-S5-D16 | Cross Entropy | 960x999 | 128x128 | 85x85 | 40000 | 0.999 | - | V100 | 89.57 | 80.47 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/unet/unet_s5-d16_deeplabv3_4xb4-40k_chase-db1-128x128.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_128x128_40k_chase_db1/deeplabv3_unet_s5-d16_128x128_40k_chase_db1_20201226_094047-4c5aefa3.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_128x128_40k_chase_db1/deeplabv3_unet_s5-d16_128x128_40k_chase_db1-20201226_094047.log.json) | +| UNet + DeepLabV3 | UNet-S5-D16 | Cross Entropy + Dice | 960x999 | 128x128 | 85x85 | 40000 | 1.010 | - | V100 | 89.49 | 80.37 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/unet/unet-s5-d16_deeplabv3_4xb4-ce-1.0-dice-3.0-40k_chase-db1-128x128.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1_20211210_201825-4ef29df5.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1_20211210_201825.log.json) | ### HRF -| Method | Backbone | Loss | Image Size | Crop Size | Stride | Lr schd | Mem (GB) | Inf time (fps) | mDice | Dice | config | download | -| ---------------- | ----------- | -------------------- | ---------- | --------- | ------: | ------- | -------- | -------------: | ----: | ----: | ---------------------------------------------------------------------------------------------------------------------------------------- | 
---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| UNet + FCN | UNet-S5-D16 | Cross Entropy | 2336x3504 | 256x256 | 170x170 | 40000 | 2.525 | - | 88.92 | 79.45 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/unet/fcn_unet_s5-d16_256x256_40k_hrf.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_256x256_40k_hrf/fcn_unet_s5-d16_256x256_40k_hrf_20201223_173724-d89cf1ed.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/unet_s5-d16_256x256_40k_hrf/unet_s5-d16_256x256_40k_hrf-20201223_173724.log.json) | -| UNet + FCN | UNet-S5-D16 | Cross Entropy + Dice | 2336x3504 | 256x256 | 170x170 | 40000 | 2.623 | - | 89.64 | 80.87 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf/fcn_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf_20211210_201821-c314da8a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf/fcn_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf_20211210_201821.log.json) | -| UNet + PSPNet | UNet-S5-D16 | Cross Entropy | 2336x3504 | 256x256 | 170x170 | 40000 | 2.588 | - | 89.24 | 80.07 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/unet/pspnet_unet_s5-d16_256x256_40k_hrf.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_256x256_40k_hrf/pspnet_unet_s5-d16_256x256_40k_hrf_20201227_181818-fdb7e29b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_256x256_40k_hrf/pspnet_unet_s5-d16_256x256_40k_hrf-20201227_181818.log.json) | -| UNet + PSPNet | UNet-S5-D16 | Cross Entropy + Dice | 2336x3504 | 256x256 | 170x170 | 40000 | 2.798 | - | 89.69 | 80.96 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf/pspnet_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf_20211210_201823-53d492fa.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf/pspnet_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf_20211210_201823.log.json) | -| UNet + DeepLabV3 | UNet-S5-D16 | Cross Entropy | 2336x3504 | 256x256 | 170x170 | 40000 | 2.604 | - | 89.32 | 80.21 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/unet/deeplabv3_unet_s5-d16_256x256_40k_hrf.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_256x256_40k_hrf/deeplabv3_unet_s5-d16_256x256_40k_hrf_20201226_094047-3a1fdf85.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_256x256_40k_hrf/deeplabv3_unet_s5-d16_256x256_40k_hrf-20201226_094047.log.json) | -| UNet + DeepLabV3 | UNet-S5-D16 | Cross Entropy + Dice | 2336x3504 | 256x256 | 170x170 | 40000 | 2.607 | - | 89.56 | 80.71 | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf_20211210_202032-59daf7a4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf_20211210_202032.log.json) | +| Method | Backbone | Loss | Image Size | Crop Size | Stride | Lr schd | Mem (GB) | Inf time (fps) | Device | mDice | Dice | config | download | +| ---------------- | ----------- | -------------------- | ---------- | --------- | ------: | ------- | -------- | -------------: | ------ | ----: | ----: | ------------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| UNet + FCN | UNet-S5-D16 | Cross Entropy | 2336x3504 | 256x256 | 170x170 | 40000 | 2.525 | - | V100 | 88.92 | 79.45 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/unet/unet-s5-d16_fcn_4xb4-40k_hrf-256x256.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_256x256_40k_hrf/fcn_unet_s5-d16_256x256_40k_hrf_20201223_173724-d89cf1ed.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/unet_s5-d16_256x256_40k_hrf/unet_s5-d16_256x256_40k_hrf-20201223_173724.log.json) | +| UNet + FCN | UNet-S5-D16 | Cross Entropy + Dice | 2336x3504 | 256x256 | 170x170 | 40000 | 2.623 | - | V100 | 89.64 | 80.87 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/unet/unet-s5-d16_fcn_4xb4-ce-1.0-dice-3.0-40k_hrf-256x256.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf/fcn_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf_20211210_201821-c314da8a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf/fcn_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf_20211210_201821.log.json) | +| UNet + PSPNet | UNet-S5-D16 | Cross Entropy | 2336x3504 | 256x256 | 170x170 | 40000 | 2.588 | - | V100 | 89.24 | 80.07 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/unet/unet-s5-d16_pspnet_4xb4-40k_hrf-256x256.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_256x256_40k_hrf/pspnet_unet_s5-d16_256x256_40k_hrf_20201227_181818-fdb7e29b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_256x256_40k_hrf/pspnet_unet_s5-d16_256x256_40k_hrf-20201227_181818.log.json) | +| UNet + PSPNet | UNet-S5-D16 | Cross Entropy + Dice | 2336x3504 | 256x256 | 170x170 | 40000 | 2.798 | - | V100 | 89.69 | 80.96 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/unet/unet-s5-d16_pspnet_4xb4-ce-1.0-dice-3.0-40k_hrf-256x256.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf/pspnet_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf_20211210_201823-53d492fa.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf/pspnet_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf_20211210_201823.log.json) | +| UNet + DeepLabV3 | UNet-S5-D16 | Cross Entropy | 2336x3504 | 256x256 | 170x170 | 40000 | 2.604 | - | V100 | 89.32 | 80.21 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/unet/unet-s5-d16_deeplabv3_4xb4-40k_hrf-256x256.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_256x256_40k_hrf/deeplabv3_unet_s5-d16_256x256_40k_hrf_20201226_094047-3a1fdf85.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_256x256_40k_hrf/deeplabv3_unet_s5-d16_256x256_40k_hrf-20201226_094047.log.json) | +| UNet + DeepLabV3 | UNet-S5-D16 | Cross Entropy + Dice | 2336x3504 | 256x256 | 170x170 | 40000 | 2.607 | - | V100 | 89.56 | 80.71 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/unet/unet-s5-d16_deeplabv3_4xb4-ce-1.0-dice-3.0-40k_hrf-256x256.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf_20211210_202032-59daf7a4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf_20211210_202032.log.json) | Note: - In the `DRIVE`, `STARE`, `CHASE_DB1`, and `HRF` datasets, `mDice` is the mean Dice over the background and vessel classes, while `Dice` is the Dice score of the vessel (foreground) class only.
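To make the two columns concrete, the snippet below is a minimal NumPy sketch of how `Dice` and `mDice` relate on a toy binary vessel mask. It is illustrative only and is not the evaluation code used by mmseg; the masks and values are made up for the example.

```python
import numpy as np

def dice(pred: np.ndarray, gt: np.ndarray) -> float:
    """Dice coefficient of two binary masks: 2*|A & B| / (|A| + |B|)."""
    inter = np.logical_and(pred, gt).sum()
    return 2.0 * inter / (pred.sum() + gt.sum())

# Toy 3x3 vessel segmentation: True = vessel (foreground), False = background.
gt = np.array([[0, 1, 1],
               [0, 1, 0],
               [0, 0, 0]], dtype=bool)
pred = np.array([[0, 1, 0],
                 [0, 1, 0],
                 [0, 0, 0]], dtype=bool)

fg_dice = dice(pred, gt)            # the `Dice` column: vessel class only
bg_dice = dice(~pred, ~gt)          # Dice of the background class
m_dice = (fg_dice + bg_dice) / 2.0  # the `mDice` column: mean over both classes
print(fg_dice, bg_dice, m_dice)     # 0.8  0.923...  0.861...
```

Because background pixels dominate these fundus images, the background Dice is typically high, which is why `mDice` sits well above the vessel-only `Dice` in every table above.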
+ +## Citation + +```bibtex +@inproceedings{ronneberger2015u, + title={U-net: Convolutional networks for biomedical image segmentation}, + author={Ronneberger, Olaf and Fischer, Philipp and Brox, Thomas}, + booktitle={International Conference on Medical image computing and computer-assisted intervention}, + pages={234--241}, + year={2015}, + organization={Springer} +} +``` diff --git a/configs/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1.py b/configs/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1.py deleted file mode 100644 index 1c48cbc22c..0000000000 --- a/configs/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1.py +++ /dev/null @@ -1,6 +0,0 @@ -_base_ = './deeplabv3_unet_s5-d16_128x128_40k_chase_db1.py' -model = dict( - decode_head=dict(loss_decode=[ - dict(type='CrossEntropyLoss', loss_name='loss_ce', loss_weight=1.0), - dict(type='DiceLoss', loss_name='loss_dice', loss_weight=3.0) - ])) diff --git a/configs/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare.py b/configs/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare.py deleted file mode 100644 index 1022edee36..0000000000 --- a/configs/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare.py +++ /dev/null @@ -1,6 +0,0 @@ -_base_ = './deeplabv3_unet_s5-d16_128x128_40k_stare.py' -model = dict( - decode_head=dict(loss_decode=[ - dict(type='CrossEntropyLoss', loss_name='loss_ce', loss_weight=1.0), - dict(type='DiceLoss', loss_name='loss_dice', loss_weight=3.0) - ])) diff --git a/configs/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf.py b/configs/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf.py deleted file mode 100644 index fc17da71ed..0000000000 --- a/configs/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf.py +++ /dev/null @@ -1,6 +0,0 @@ -_base_ = './deeplabv3_unet_s5-d16_256x256_40k_hrf.py' -model = dict( - decode_head=dict(loss_decode=[ - dict(type='CrossEntropyLoss', loss_name='loss_ce', loss_weight=1.0), - dict(type='DiceLoss', loss_name='loss_dice', loss_weight=3.0) - ])) diff --git a/configs/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive.py b/configs/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive.py deleted file mode 100644 index 3f1f12e61e..0000000000 --- a/configs/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive.py +++ /dev/null @@ -1,6 +0,0 @@ -_base_ = './deeplabv3_unet_s5-d16_64x64_40k_drive.py' -model = dict( - decode_head=dict(loss_decode=[ - dict(type='CrossEntropyLoss', loss_name='loss_ce', loss_weight=1.0), - dict(type='DiceLoss', loss_name='loss_dice', loss_weight=3.0) - ])) diff --git a/configs/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1.py b/configs/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1.py deleted file mode 100644 index 5264866291..0000000000 --- a/configs/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1.py +++ /dev/null @@ -1,6 +0,0 @@ -_base_ = './fcn_unet_s5-d16_128x128_40k_chase_db1.py' -model = dict( - decode_head=dict(loss_decode=[ - dict(type='CrossEntropyLoss', loss_name='loss_ce', loss_weight=1.0), - dict(type='DiceLoss', loss_name='loss_dice', loss_weight=3.0) - ])) diff --git a/configs/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare.py b/configs/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare.py deleted file mode 100644 index cf5fa1f0de..0000000000 --- a/configs/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare.py +++ /dev/null @@ -1,6 +0,0 @@ -_base_ = 
'./fcn_unet_s5-d16_128x128_40k_stare.py' -model = dict( - decode_head=dict(loss_decode=[ - dict(type='CrossEntropyLoss', loss_name='loss_ce', loss_weight=1.0), - dict(type='DiceLoss', loss_name='loss_dice', loss_weight=3.0) - ])) diff --git a/configs/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf.py b/configs/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf.py deleted file mode 100644 index a154d7e689..0000000000 --- a/configs/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf.py +++ /dev/null @@ -1,6 +0,0 @@ -_base_ = './fcn_unet_s5-d16_256x256_40k_hrf.py' -model = dict( - decode_head=dict(loss_decode=[ - dict(type='CrossEntropyLoss', loss_name='loss_ce', loss_weight=1.0), - dict(type='DiceLoss', loss_name='loss_dice', loss_weight=3.0) - ])) diff --git a/configs/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive.py b/configs/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive.py deleted file mode 100644 index 1b8f860bff..0000000000 --- a/configs/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive.py +++ /dev/null @@ -1,6 +0,0 @@ -_base_ = './fcn_unet_s5-d16_64x64_40k_drive.py' -model = dict( - decode_head=dict(loss_decode=[ - dict(type='CrossEntropyLoss', loss_name='loss_ce', loss_weight=1.0), - dict(type='DiceLoss', loss_name='loss_dice', loss_weight=3.0) - ])) diff --git a/configs/unet/metafile.yaml b/configs/unet/metafile.yaml new file mode 100644 index 0000000000..1eafbc6d08 --- /dev/null +++ b/configs/unet/metafile.yaml @@ -0,0 +1,642 @@ +Collections: +- Name: UNet + License: Apache License 2.0 + Metadata: + Training Data: + - Cityscapes + - DRIVE + - STARE + - CHASE_DB1 + - HRF + Paper: + Title: 'U-Net: Convolutional Networks for Biomedical Image Segmentation' + URL: https://arxiv.org/abs/1505.04597 + README: configs/unet/README.md + Frameworks: + - PyTorch +Models: +- Name: unet-s5-d16_fcn_4xb4-160k_cityscapes-512x1024 + In Collection: UNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 69.1 + mIoU(ms+flip): 71.05 + Config: configs/unet/unet-s5-d16_fcn_4xb4-160k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 16 + Architecture: + - UNet-S5-D16 + - UNet + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 17.91 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_4x4_512x1024_160k_cityscapes/fcn_unet_s5-d16_4x4_512x1024_160k_cityscapes_20211210_145204-6860854e.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_4x4_512x1024_160k_cityscapes/fcn_unet_s5-d16_4x4_512x1024_160k_cityscapes_20211210_145204.log.json + Paper: + Title: 'U-Net: Convolutional Networks for Biomedical Image Segmentation' + URL: https://arxiv.org/abs/1505.04597 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/unet.py#L225 + Framework: PyTorch +- Name: unet-s5-d16_fcn_4xb4-40k_drive-64x64 + In Collection: UNet + Results: + Task: Semantic Segmentation + Dataset: DRIVE + Metrics: + mDice: 88.38 + Dice: 78.67 + Config: configs/unet/unet-s5-d16_fcn_4xb4-40k_drive-64x64.py + Metadata: + Training Data: DRIVE + Batch Size: 16 + Architecture: + - UNet-S5-D16 + - UNet + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 0.68 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_64x64_40k_drive/fcn_unet_s5-d16_64x64_40k_drive_20201223_191051-5daf6d3b.pth + Training log: 
https://download.openmmlab.com/mmsegmentation/v0.5/unet/unet_s5-d16_64x64_40k_drive/unet_s5-d16_64x64_40k_drive-20201223_191051.log.json + Paper: + Title: 'U-Net: Convolutional Networks for Biomedical Image Segmentation' + URL: https://arxiv.org/abs/1505.04597 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/unet.py#L225 + Framework: PyTorch +- Name: unet-s5-d16_fcn_4xb4-ce-1.0-dice-3.0-40k_drive-64x64 + In Collection: UNet + Results: + Task: Semantic Segmentation + Dataset: DRIVE + Metrics: + mDice: 88.71 + Dice: 79.32 + Config: configs/unet/unet-s5-d16_fcn_4xb4-ce-1.0-dice-3.0-40k_drive-64x64.py + Metadata: + Training Data: DRIVE + Batch Size: 16 + Architecture: + - UNet-S5-D16 + - UNet + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 0.582 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive/fcn_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive_20211210_201820-785de5c2.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive/fcn_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive_20211210_201820.log.json + Paper: + Title: 'U-Net: Convolutional Networks for Biomedical Image Segmentation' + URL: https://arxiv.org/abs/1505.04597 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/unet.py#L225 + Framework: PyTorch +- Name: unet-s5-d16_pspnet_4xb4-40k_drive-64x64 + In Collection: UNet + Results: + Task: Semantic Segmentation + Dataset: DRIVE + Metrics: + mDice: 88.35 + Dice: 78.62 + Config: configs/unet/unet-s5-d16_pspnet_4xb4-40k_drive-64x64.py + Metadata: + Training Data: DRIVE + Batch Size: 16 + Architecture: + - UNet-S5-D16 + - UNet + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 0.599 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_64x64_40k_drive/pspnet_unet_s5-d16_64x64_40k_drive_20201227_181818-aac73387.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_64x64_40k_drive/pspnet_unet_s5-d16_64x64_40k_drive-20201227_181818.log.json + Paper: + Title: 'U-Net: Convolutional Networks for Biomedical Image Segmentation' + URL: https://arxiv.org/abs/1505.04597 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/unet.py#L225 + Framework: PyTorch +- Name: unet-s5-d16_pspnet_4xb4-ce-1.0-dice-3.0-40k_drive-64x64 + In Collection: UNet + Results: + Task: Semantic Segmentation + Dataset: DRIVE + Metrics: + mDice: 88.76 + Dice: 79.42 + Config: configs/unet/unet-s5-d16_pspnet_4xb4-ce-1.0-dice-3.0-40k_drive-64x64.py + Metadata: + Training Data: DRIVE + Batch Size: 16 + Architecture: + - UNet-S5-D16 + - UNet + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 0.585 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive/pspnet_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive_20211210_201821-22b3e3ba.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive/pspnet_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive_20211210_201821.log.json + Paper: + Title: 'U-Net: Convolutional Networks for Biomedical Image Segmentation' + URL: https://arxiv.org/abs/1505.04597 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/unet.py#L225 + Framework: PyTorch +- Name: unet-s5-d16_deeplabv3_4xb4-40k_drive-64x64 + In Collection: UNet 
+ Results: + Task: Semantic Segmentation + Dataset: DRIVE + Metrics: + mDice: 88.38 + Dice: 78.69 + Config: configs/unet/unet-s5-d16_deeplabv3_4xb4-40k_drive-64x64.py + Metadata: + Training Data: DRIVE + Batch Size: 16 + Architecture: + - UNet-S5-D16 + - UNet + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 0.596 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_64x64_40k_drive/deeplabv3_unet_s5-d16_64x64_40k_drive_20201226_094047-0671ff20.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_64x64_40k_drive/deeplabv3_unet_s5-d16_64x64_40k_drive-20201226_094047.log.json + Paper: + Title: 'U-Net: Convolutional Networks for Biomedical Image Segmentation' + URL: https://arxiv.org/abs/1505.04597 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/unet.py#L225 + Framework: PyTorch +- Name: unet-s5-d16_deeplabv3_4xb4-ce-1.0-dice-3.0-40k_drive-64x64 + In Collection: UNet + Results: + Task: Semantic Segmentation + Dataset: DRIVE + Metrics: + mDice: 88.84 + Dice: 79.56 + Config: configs/unet/unet-s5-d16_deeplabv3_4xb4-ce-1.0-dice-3.0-40k_drive-64x64.py + Metadata: + Training Data: DRIVE + Batch Size: 16 + Architecture: + - UNet-S5-D16 + - UNet + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 0.582 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive_20211210_201825-6bf0efd7.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive_20211210_201825.log.json + Paper: + Title: 'U-Net: Convolutional Networks for Biomedical Image Segmentation' + URL: https://arxiv.org/abs/1505.04597 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/unet.py#L225 + Framework: PyTorch +- Name: unet-s5-d16_fcn_4xb4-40k_stare-128x128 + In Collection: UNet + Results: + Task: Semantic Segmentation + Dataset: STARE + Metrics: + mDice: 89.78 + Dice: 81.02 + Config: configs/unet/unet-s5-d16_fcn_4xb4-40k_stare-128x128.py + Metadata: + Training Data: STARE + Batch Size: 16 + Architecture: + - UNet-S5-D16 + - UNet + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 0.968 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_128x128_40k_stare/fcn_unet_s5-d16_128x128_40k_stare_20201223_191051-7d77e78b.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/unet/unet_s5-d16_128x128_40k_stare/unet_s5-d16_128x128_40k_stare-20201223_191051.log.json + Paper: + Title: 'U-Net: Convolutional Networks for Biomedical Image Segmentation' + URL: https://arxiv.org/abs/1505.04597 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/unet.py#L225 + Framework: PyTorch +- Name: unet-s5-d16_fcn_4xb4-ce-1.0-dice-3.0-40k_stare-128x128 + In Collection: UNet + Results: + Task: Semantic Segmentation + Dataset: STARE + Metrics: + mDice: 90.65 + Dice: 82.7 + Config: configs/unet/unet-s5-d16_fcn_4xb4-ce-1.0-dice-3.0-40k_stare-128x128.py + Metadata: + Training Data: STARE + Batch Size: 16 + Architecture: + - UNet-S5-D16 + - UNet + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 0.986 + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare_20211210_201821-f75705a9.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare_20211210_201821.log.json + Paper: + Title: 'U-Net: Convolutional Networks for Biomedical Image Segmentation' + URL: https://arxiv.org/abs/1505.04597 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/unet.py#L225 + Framework: PyTorch +- Name: unet-s5-d16_pspnet_4xb4-40k_stare-128x128 + In Collection: UNet + Results: + Task: Semantic Segmentation + Dataset: STARE + Metrics: + mDice: 89.89 + Dice: 81.22 + Config: configs/unet/unet-s5-d16_pspnet_4xb4-40k_stare-128x128.py + Metadata: + Training Data: STARE + Batch Size: 16 + Architecture: + - UNet-S5-D16 + - UNet + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 0.982 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_128x128_40k_stare/pspnet_unet_s5-d16_128x128_40k_stare_20201227_181818-3c2923c4.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_128x128_40k_stare/pspnet_unet_s5-d16_128x128_40k_stare-20201227_181818.log.json + Paper: + Title: 'U-Net: Convolutional Networks for Biomedical Image Segmentation' + URL: https://arxiv.org/abs/1505.04597 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/unet.py#L225 + Framework: PyTorch +- Name: unet-s5-d16_pspnet_4xb4-ce-1.0-dice-3.0-40k_stare-128x128 + In Collection: UNet + Results: + Task: Semantic Segmentation + Dataset: STARE + Metrics: + mDice: 90.72 + Dice: 82.84 + Config: configs/unet/unet-s5-d16_pspnet_4xb4-ce-1.0-dice-3.0-40k_stare-128x128.py + Metadata: + Training Data: STARE + Batch Size: 16 + Architecture: + - UNet-S5-D16 + - UNet + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 1.028 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare_20211210_201823-f1063ef7.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare_20211210_201823.log.json + Paper: + Title: 'U-Net: Convolutional Networks for Biomedical Image Segmentation' + URL: https://arxiv.org/abs/1505.04597 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/unet.py#L225 + Framework: PyTorch +- Name: unet-s5-d16_deeplabv3_4xb4-40k_stare-128x128 + In Collection: UNet + Results: + Task: Semantic Segmentation + Dataset: STARE + Metrics: + mDice: 89.73 + Dice: 80.93 + Config: configs/unet/unet-s5-d16_deeplabv3_4xb4-40k_stare-128x128.py + Metadata: + Training Data: STARE + Batch Size: 16 + Architecture: + - UNet-S5-D16 + - UNet + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 0.999 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_128x128_40k_stare/deeplabv3_unet_s5-d16_128x128_40k_stare_20201226_094047-93dcb93c.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_128x128_40k_stare/deeplabv3_unet_s5-d16_128x128_40k_stare-20201226_094047.log.json + Paper: + Title: 'U-Net: Convolutional Networks for Biomedical Image Segmentation' + URL: 
https://arxiv.org/abs/1505.04597 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/unet.py#L225 + Framework: PyTorch +- Name: unet-s5-d16_deeplabv3_4xb4-ce-1.0-dice-3.0-40k_stare-128x128 + In Collection: UNet + Results: + Task: Semantic Segmentation + Dataset: STARE + Metrics: + mDice: 90.65 + Dice: 82.71 + Config: configs/unet/unet-s5-d16_deeplabv3_4xb4-ce-1.0-dice-3.0-40k_stare-128x128.py + Metadata: + Training Data: STARE + Batch Size: 16 + Architecture: + - UNet-S5-D16 + - UNet + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 1.01 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare_20211210_201825-21db614c.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare_20211210_201825.log.json + Paper: + Title: 'U-Net: Convolutional Networks for Biomedical Image Segmentation' + URL: https://arxiv.org/abs/1505.04597 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/unet.py#L225 + Framework: PyTorch +- Name: unet-s5-d16_fcn_4xb4-40k_chase-db1-128x128 + In Collection: UNet + Results: + Task: Semantic Segmentation + Dataset: CHASE_DB1 + Metrics: + mDice: 89.46 + Dice: 80.24 + Config: configs/unet/unet-s5-d16_fcn_4xb4-40k_chase-db1-128x128.py + Metadata: + Training Data: CHASE_DB1 + Batch Size: 16 + Architecture: + - UNet-S5-D16 + - UNet + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 0.968 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_128x128_40k_chase_db1/fcn_unet_s5-d16_128x128_40k_chase_db1_20201223_191051-11543527.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/unet/unet_s5-d16_128x128_40k_chase_db1/unet_s5-d16_128x128_40k_chase_db1-20201223_191051.log.json + Paper: + Title: 'U-Net: Convolutional Networks for Biomedical Image Segmentation' + URL: https://arxiv.org/abs/1505.04597 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/unet.py#L225 + Framework: PyTorch +- Name: unet-s5-d16_fcn_4xb4-ce-1.0-dice-3.0-40k_chase-db1-128x128 + In Collection: UNet + Results: + Task: Semantic Segmentation + Dataset: CHASE_DB1 + Metrics: + mDice: 89.52 + Dice: 80.4 + Config: configs/unet/unet-s5-d16_fcn_4xb4-ce-1.0-dice-3.0-40k_chase-db1-128x128.py + Metadata: + Training Data: CHASE_DB1 + Batch Size: 16 + Architecture: + - UNet-S5-D16 + - UNet + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 0.986 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1_20211210_201821-1c4eb7cf.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1_20211210_201821.log.json + Paper: + Title: 'U-Net: Convolutional Networks for Biomedical Image Segmentation' + URL: https://arxiv.org/abs/1505.04597 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/unet.py#L225 + Framework: PyTorch +- Name: unet-s5-d16_pspnet_4xb4-40k_chase-db1-128x128 + In Collection: UNet + Results: + Task: Semantic Segmentation + Dataset: CHASE_DB1 + Metrics: + mDice: 89.52 + Dice: 80.36 + Config: 
configs/unet/unet-s5-d16_pspnet_4xb4-40k_chase-db1-128x128.py + Metadata: + Training Data: CHASE_DB1 + Batch Size: 16 + Architecture: + - UNet-S5-D16 + - UNet + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 0.982 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_128x128_40k_chase_db1/pspnet_unet_s5-d16_128x128_40k_chase_db1_20201227_181818-68d4e609.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_128x128_40k_chase_db1/pspnet_unet_s5-d16_128x128_40k_chase_db1-20201227_181818.log.json + Paper: + Title: 'U-Net: Convolutional Networks for Biomedical Image Segmentation' + URL: https://arxiv.org/abs/1505.04597 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/unet.py#L225 + Framework: PyTorch +- Name: unet-s5-d16_pspnet_4xb4-ce-1.0-dice-3.0-40k_chase-db1-128x128 + In Collection: UNet + Results: + Task: Semantic Segmentation + Dataset: CHASE_DB1 + Metrics: + mDice: 89.45 + Dice: 80.28 + Config: configs/unet/unet-s5-d16_pspnet_4xb4-ce-1.0-dice-3.0-40k_chase-db1-128x128.py + Metadata: + Training Data: CHASE_DB1 + Batch Size: 16 + Architecture: + - UNet-S5-D16 + - UNet + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 1.028 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1_20211210_201823-c0802c4d.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1_20211210_201823.log.json + Paper: + Title: 'U-Net: Convolutional Networks for Biomedical Image Segmentation' + URL: https://arxiv.org/abs/1505.04597 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/unet.py#L225 + Framework: PyTorch +- Name: unet_s5-d16_deeplabv3_4xb4-40k_chase-db1-128x128 + In Collection: UNet + Results: + Task: Semantic Segmentation + Dataset: CHASE_DB1 + Metrics: + mDice: 89.57 + Dice: 80.47 + Config: configs/unet/unet_s5-d16_deeplabv3_4xb4-40k_chase-db1-128x128.py + Metadata: + Training Data: CHASE_DB1 + Batch Size: 16 + Architecture: + - UNet-S5-D16 + - UNet + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 0.999 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_128x128_40k_chase_db1/deeplabv3_unet_s5-d16_128x128_40k_chase_db1_20201226_094047-4c5aefa3.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_128x128_40k_chase_db1/deeplabv3_unet_s5-d16_128x128_40k_chase_db1-20201226_094047.log.json + Paper: + Title: 'U-Net: Convolutional Networks for Biomedical Image Segmentation' + URL: https://arxiv.org/abs/1505.04597 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/unet.py#L225 + Framework: PyTorch +- Name: unet-s5-d16_deeplabv3_4xb4-ce-1.0-dice-3.0-40k_chase-db1-128x128 + In Collection: UNet + Results: + Task: Semantic Segmentation + Dataset: CHASE_DB1 + Metrics: + mDice: 89.49 + Dice: 80.37 + Config: configs/unet/unet-s5-d16_deeplabv3_4xb4-ce-1.0-dice-3.0-40k_chase-db1-128x128.py + Metadata: + Training Data: CHASE_DB1 + Batch Size: 16 + Architecture: + - UNet-S5-D16 + - UNet + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 1.01 + Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1_20211210_201825-4ef29df5.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1_20211210_201825.log.json + Paper: + Title: 'U-Net: Convolutional Networks for Biomedical Image Segmentation' + URL: https://arxiv.org/abs/1505.04597 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/unet.py#L225 + Framework: PyTorch +- Name: unet-s5-d16_fcn_4xb4-40k_hrf-256x256 + In Collection: UNet + Results: + Task: Semantic Segmentation + Dataset: HRF + Metrics: + mDice: 88.92 + Dice: 79.45 + Config: configs/unet/unet-s5-d16_fcn_4xb4-40k_hrf-256x256.py + Metadata: + Training Data: HRF + Batch Size: 16 + Architecture: + - UNet-S5-D16 + - UNet + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 2.525 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_256x256_40k_hrf/fcn_unet_s5-d16_256x256_40k_hrf_20201223_173724-d89cf1ed.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/unet/unet_s5-d16_256x256_40k_hrf/unet_s5-d16_256x256_40k_hrf-20201223_173724.log.json + Paper: + Title: 'U-Net: Convolutional Networks for Biomedical Image Segmentation' + URL: https://arxiv.org/abs/1505.04597 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/unet.py#L225 + Framework: PyTorch +- Name: unet-s5-d16_fcn_4xb4-ce-1.0-dice-3.0-40k_hrf-256x256 + In Collection: UNet + Results: + Task: Semantic Segmentation + Dataset: HRF + Metrics: + mDice: 89.64 + Dice: 80.87 + Config: configs/unet/unet-s5-d16_fcn_4xb4-ce-1.0-dice-3.0-40k_hrf-256x256.py + Metadata: + Training Data: HRF + Batch Size: 16 + Architecture: + - UNet-S5-D16 + - UNet + - FCN + Training Resources: 4x V100 GPUS + Memory (GB): 2.623 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf/fcn_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf_20211210_201821-c314da8a.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf/fcn_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf_20211210_201821.log.json + Paper: + Title: 'U-Net: Convolutional Networks for Biomedical Image Segmentation' + URL: https://arxiv.org/abs/1505.04597 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/unet.py#L225 + Framework: PyTorch +- Name: unet-s5-d16_pspnet_4xb4-40k_hrf-256x256 + In Collection: UNet + Results: + Task: Semantic Segmentation + Dataset: HRF + Metrics: + mDice: 89.24 + Dice: 80.07 + Config: configs/unet/unet-s5-d16_pspnet_4xb4-40k_hrf-256x256.py + Metadata: + Training Data: HRF + Batch Size: 16 + Architecture: + - UNet-S5-D16 + - UNet + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 2.588 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_256x256_40k_hrf/pspnet_unet_s5-d16_256x256_40k_hrf_20201227_181818-fdb7e29b.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_256x256_40k_hrf/pspnet_unet_s5-d16_256x256_40k_hrf-20201227_181818.log.json + Paper: + Title: 'U-Net: Convolutional Networks for Biomedical Image Segmentation' + URL: https://arxiv.org/abs/1505.04597 + Code: 
https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/unet.py#L225 + Framework: PyTorch +- Name: unet-s5-d16_pspnet_4xb4-ce-1.0-dice-3.0-40k_hrf-256x256 + In Collection: UNet + Results: + Task: Semantic Segmentation + Dataset: HRF + Metrics: + mDice: 89.69 + Dice: 80.96 + Config: configs/unet/unet-s5-d16_pspnet_4xb4-ce-1.0-dice-3.0-40k_hrf-256x256.py + Metadata: + Training Data: HRF + Batch Size: 16 + Architecture: + - UNet-S5-D16 + - UNet + - PSPNet + Training Resources: 4x V100 GPUS + Memory (GB): 2.798 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf/pspnet_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf_20211210_201823-53d492fa.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf/pspnet_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf_20211210_201823.log.json + Paper: + Title: 'U-Net: Convolutional Networks for Biomedical Image Segmentation' + URL: https://arxiv.org/abs/1505.04597 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/unet.py#L225 + Framework: PyTorch +- Name: unet-s5-d16_deeplabv3_4xb4-40k_hrf-256x256 + In Collection: UNet + Results: + Task: Semantic Segmentation + Dataset: HRF + Metrics: + mDice: 89.32 + Dice: 80.21 + Config: configs/unet/unet-s5-d16_deeplabv3_4xb4-40k_hrf-256x256.py + Metadata: + Training Data: HRF + Batch Size: 16 + Architecture: + - UNet-S5-D16 + - UNet + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 2.604 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_256x256_40k_hrf/deeplabv3_unet_s5-d16_256x256_40k_hrf_20201226_094047-3a1fdf85.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_256x256_40k_hrf/deeplabv3_unet_s5-d16_256x256_40k_hrf-20201226_094047.log.json + Paper: + Title: 'U-Net: Convolutional Networks for Biomedical Image Segmentation' + URL: https://arxiv.org/abs/1505.04597 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/unet.py#L225 + Framework: PyTorch +- Name: unet-s5-d16_deeplabv3_4xb4-ce-1.0-dice-3.0-40k_hrf-256x256 + In Collection: UNet + Results: + Task: Semantic Segmentation + Dataset: HRF + Metrics: + mDice: 89.56 + Dice: 80.71 + Config: configs/unet/unet-s5-d16_deeplabv3_4xb4-ce-1.0-dice-3.0-40k_hrf-256x256.py + Metadata: + Training Data: HRF + Batch Size: 16 + Architecture: + - UNet-S5-D16 + - UNet + - DeepLabV3 + Training Resources: 4x V100 GPUS + Memory (GB): 2.607 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf_20211210_202032-59daf7a4.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf_20211210_202032.log.json + Paper: + Title: 'U-Net: Convolutional Networks for Biomedical Image Segmentation' + URL: https://arxiv.org/abs/1505.04597 + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/unet.py#L225 + Framework: PyTorch diff --git a/configs/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1.py b/configs/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1.py deleted file mode 100644 index a63dc11d57..0000000000 --- 
a/configs/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1.py +++ /dev/null @@ -1,6 +0,0 @@ -_base_ = './pspnet_unet_s5-d16_128x128_40k_chase_db1.py' -model = dict( - decode_head=dict(loss_decode=[ - dict(type='CrossEntropyLoss', loss_name='loss_ce', loss_weight=1.0), - dict(type='DiceLoss', loss_name='loss_dice', loss_weight=3.0) - ])) diff --git a/configs/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare.py b/configs/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare.py deleted file mode 100644 index 1a3b665821..0000000000 --- a/configs/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare.py +++ /dev/null @@ -1,6 +0,0 @@ -_base_ = './pspnet_unet_s5-d16_128x128_40k_stare.py' -model = dict( - decode_head=dict(loss_decode=[ - dict(type='CrossEntropyLoss', loss_name='loss_ce', loss_weight=1.0), - dict(type='DiceLoss', loss_name='loss_dice', loss_weight=3.0) - ])) diff --git a/configs/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf.py b/configs/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf.py deleted file mode 100644 index e19d6cf427..0000000000 --- a/configs/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf.py +++ /dev/null @@ -1,6 +0,0 @@ -_base_ = './pspnet_unet_s5-d16_256x256_40k_hrf.py' -model = dict( - decode_head=dict(loss_decode=[ - dict(type='CrossEntropyLoss', loss_name='loss_ce', loss_weight=1.0), - dict(type='DiceLoss', loss_name='loss_dice', loss_weight=3.0) - ])) diff --git a/configs/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive.py b/configs/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive.py deleted file mode 100644 index 7934923755..0000000000 --- a/configs/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive.py +++ /dev/null @@ -1,6 +0,0 @@ -_base_ = './pspnet_unet_s5-d16_64x64_40k_drive.py' -model = dict( - decode_head=dict(loss_decode=[ - dict(type='CrossEntropyLoss', loss_name='loss_ce', loss_weight=1.0), - dict(type='DiceLoss', loss_name='loss_dice', loss_weight=3.0) - ])) diff --git a/configs/unet/deeplabv3_unet_s5-d16_64x64_40k_drive.py b/configs/unet/unet-s5-d16_deeplabv3_4xb4-40k_drive-64x64.py similarity index 100% rename from configs/unet/deeplabv3_unet_s5-d16_64x64_40k_drive.py rename to configs/unet/unet-s5-d16_deeplabv3_4xb4-40k_drive-64x64.py diff --git a/configs/unet/deeplabv3_unet_s5-d16_256x256_40k_hrf.py b/configs/unet/unet-s5-d16_deeplabv3_4xb4-40k_hrf-256x256.py similarity index 100% rename from configs/unet/deeplabv3_unet_s5-d16_256x256_40k_hrf.py rename to configs/unet/unet-s5-d16_deeplabv3_4xb4-40k_hrf-256x256.py diff --git a/configs/unet/deeplabv3_unet_s5-d16_128x128_40k_stare.py b/configs/unet/unet-s5-d16_deeplabv3_4xb4-40k_stare-128x128.py similarity index 100% rename from configs/unet/deeplabv3_unet_s5-d16_128x128_40k_stare.py rename to configs/unet/unet-s5-d16_deeplabv3_4xb4-40k_stare-128x128.py diff --git a/configs/unet/unet-s5-d16_deeplabv3_4xb4-ce-1.0-dice-3.0-40k_chase-db1-128x128.py b/configs/unet/unet-s5-d16_deeplabv3_4xb4-ce-1.0-dice-3.0-40k_chase-db1-128x128.py new file mode 100644 index 0000000000..4f30bba9a7 --- /dev/null +++ b/configs/unet/unet-s5-d16_deeplabv3_4xb4-ce-1.0-dice-3.0-40k_chase-db1-128x128.py @@ -0,0 +1,6 @@ +_base_ = './unet_s5-d16_deeplabv3_4xb4-40k_chase-db1-128x128.py' +model = dict( + decode_head=dict(loss_decode=[ + dict(type='CrossEntropyLoss', loss_name='loss_ce', loss_weight=1.0), + dict(type='DiceLoss', loss_name='loss_dice', loss_weight=3.0) + ])) diff --git a/configs/unet/unet-s5-d16_deeplabv3_4xb4-ce-1.0-dice-3.0-40k_drive-64x64.py 
b/configs/unet/unet-s5-d16_deeplabv3_4xb4-ce-1.0-dice-3.0-40k_drive-64x64.py new file mode 100644 index 0000000000..823fc6dc51 --- /dev/null +++ b/configs/unet/unet-s5-d16_deeplabv3_4xb4-ce-1.0-dice-3.0-40k_drive-64x64.py @@ -0,0 +1,6 @@ +_base_ = './unet-s5-d16_deeplabv3_4xb4-40k_drive-64x64.py' +model = dict( + decode_head=dict(loss_decode=[ + dict(type='CrossEntropyLoss', loss_name='loss_ce', loss_weight=1.0), + dict(type='DiceLoss', loss_name='loss_dice', loss_weight=3.0) + ])) diff --git a/configs/unet/unet-s5-d16_deeplabv3_4xb4-ce-1.0-dice-3.0-40k_hrf-256x256.py b/configs/unet/unet-s5-d16_deeplabv3_4xb4-ce-1.0-dice-3.0-40k_hrf-256x256.py new file mode 100644 index 0000000000..174eaf8d93 --- /dev/null +++ b/configs/unet/unet-s5-d16_deeplabv3_4xb4-ce-1.0-dice-3.0-40k_hrf-256x256.py @@ -0,0 +1,6 @@ +_base_ = './unet-s5-d16_deeplabv3_4xb4-40k_hrf-256x256.py' +model = dict( + decode_head=dict(loss_decode=[ + dict(type='CrossEntropyLoss', loss_name='loss_ce', loss_weight=1.0), + dict(type='DiceLoss', loss_name='loss_dice', loss_weight=3.0) + ])) diff --git a/configs/unet/unet-s5-d16_deeplabv3_4xb4-ce-1.0-dice-3.0-40k_stare-128x128.py b/configs/unet/unet-s5-d16_deeplabv3_4xb4-ce-1.0-dice-3.0-40k_stare-128x128.py new file mode 100644 index 0000000000..35972bea93 --- /dev/null +++ b/configs/unet/unet-s5-d16_deeplabv3_4xb4-ce-1.0-dice-3.0-40k_stare-128x128.py @@ -0,0 +1,6 @@ +_base_ = './unet-s5-d16_deeplabv3_4xb4-40k_stare-128x128.py' +model = dict( + decode_head=dict(loss_decode=[ + dict(type='CrossEntropyLoss', loss_name='loss_ce', loss_weight=1.0), + dict(type='DiceLoss', loss_name='loss_dice', loss_weight=3.0) + ])) diff --git a/configs/unet/fcn_unet_s5-d16_4x4_512x1024_160k_cityscapes.py b/configs/unet/unet-s5-d16_fcn_4xb4-160k_cityscapes-512x1024.py similarity index 100% rename from configs/unet/fcn_unet_s5-d16_4x4_512x1024_160k_cityscapes.py rename to configs/unet/unet-s5-d16_fcn_4xb4-160k_cityscapes-512x1024.py diff --git a/configs/unet/fcn_unet_s5-d16_128x128_40k_chase_db1.py b/configs/unet/unet-s5-d16_fcn_4xb4-40k_chase-db1-128x128.py similarity index 100% rename from configs/unet/fcn_unet_s5-d16_128x128_40k_chase_db1.py rename to configs/unet/unet-s5-d16_fcn_4xb4-40k_chase-db1-128x128.py diff --git a/configs/unet/fcn_unet_s5-d16_64x64_40k_drive.py b/configs/unet/unet-s5-d16_fcn_4xb4-40k_drive-64x64.py similarity index 100% rename from configs/unet/fcn_unet_s5-d16_64x64_40k_drive.py rename to configs/unet/unet-s5-d16_fcn_4xb4-40k_drive-64x64.py diff --git a/configs/unet/fcn_unet_s5-d16_256x256_40k_hrf.py b/configs/unet/unet-s5-d16_fcn_4xb4-40k_hrf-256x256.py similarity index 100% rename from configs/unet/fcn_unet_s5-d16_256x256_40k_hrf.py rename to configs/unet/unet-s5-d16_fcn_4xb4-40k_hrf-256x256.py diff --git a/configs/unet/fcn_unet_s5-d16_128x128_40k_stare.py b/configs/unet/unet-s5-d16_fcn_4xb4-40k_stare-128x128.py similarity index 100% rename from configs/unet/fcn_unet_s5-d16_128x128_40k_stare.py rename to configs/unet/unet-s5-d16_fcn_4xb4-40k_stare-128x128.py diff --git a/configs/unet/unet-s5-d16_fcn_4xb4-ce-1.0-dice-3.0-40k_chase-db1-128x128.py b/configs/unet/unet-s5-d16_fcn_4xb4-ce-1.0-dice-3.0-40k_chase-db1-128x128.py new file mode 100644 index 0000000000..5a26ccbf96 --- /dev/null +++ b/configs/unet/unet-s5-d16_fcn_4xb4-ce-1.0-dice-3.0-40k_chase-db1-128x128.py @@ -0,0 +1,6 @@ +_base_ = './unet-s5-d16_fcn_4xb4-40k_chase-db1-128x128.py' +model = dict( + decode_head=dict(loss_decode=[ + dict(type='CrossEntropyLoss', loss_name='loss_ce', loss_weight=1.0), + dict(type='DiceLoss', 
loss_name='loss_dice', loss_weight=3.0) + ])) diff --git a/configs/unet/unet-s5-d16_fcn_4xb4-ce-1.0-dice-3.0-40k_drive-64x64.py b/configs/unet/unet-s5-d16_fcn_4xb4-ce-1.0-dice-3.0-40k_drive-64x64.py new file mode 100644 index 0000000000..c3b1488ad5 --- /dev/null +++ b/configs/unet/unet-s5-d16_fcn_4xb4-ce-1.0-dice-3.0-40k_drive-64x64.py @@ -0,0 +1,6 @@ +_base_ = './unet-s5-d16_fcn_4xb4-40k_drive-64x64.py' +model = dict( + decode_head=dict(loss_decode=[ + dict(type='CrossEntropyLoss', loss_name='loss_ce', loss_weight=1.0), + dict(type='DiceLoss', loss_name='loss_dice', loss_weight=3.0) + ])) diff --git a/configs/unet/unet-s5-d16_fcn_4xb4-ce-1.0-dice-3.0-40k_hrf-256x256.py b/configs/unet/unet-s5-d16_fcn_4xb4-ce-1.0-dice-3.0-40k_hrf-256x256.py new file mode 100644 index 0000000000..dd3a6afc02 --- /dev/null +++ b/configs/unet/unet-s5-d16_fcn_4xb4-ce-1.0-dice-3.0-40k_hrf-256x256.py @@ -0,0 +1,6 @@ +_base_ = './unet-s5-d16_fcn_4xb4-40k_hrf-256x256.py' +model = dict( + decode_head=dict(loss_decode=[ + dict(type='CrossEntropyLoss', loss_name='loss_ce', loss_weight=1.0), + dict(type='DiceLoss', loss_name='loss_dice', loss_weight=3.0) + ])) diff --git a/configs/unet/unet-s5-d16_fcn_4xb4-ce-1.0-dice-3.0-40k_stare-128x128.py b/configs/unet/unet-s5-d16_fcn_4xb4-ce-1.0-dice-3.0-40k_stare-128x128.py new file mode 100644 index 0000000000..c8fecf34e9 --- /dev/null +++ b/configs/unet/unet-s5-d16_fcn_4xb4-ce-1.0-dice-3.0-40k_stare-128x128.py @@ -0,0 +1,6 @@ +_base_ = './unet-s5-d16_fcn_4xb4-40k_stare-128x128.py' +model = dict( + decode_head=dict(loss_decode=[ + dict(type='CrossEntropyLoss', loss_name='loss_ce', loss_weight=1.0), + dict(type='DiceLoss', loss_name='loss_dice', loss_weight=3.0) + ])) diff --git a/configs/unet/pspnet_unet_s5-d16_128x128_40k_chase_db1.py b/configs/unet/unet-s5-d16_pspnet_4xb4-40k_chase-db1-128x128.py similarity index 100% rename from configs/unet/pspnet_unet_s5-d16_128x128_40k_chase_db1.py rename to configs/unet/unet-s5-d16_pspnet_4xb4-40k_chase-db1-128x128.py diff --git a/configs/unet/pspnet_unet_s5-d16_64x64_40k_drive.py b/configs/unet/unet-s5-d16_pspnet_4xb4-40k_drive-64x64.py similarity index 100% rename from configs/unet/pspnet_unet_s5-d16_64x64_40k_drive.py rename to configs/unet/unet-s5-d16_pspnet_4xb4-40k_drive-64x64.py diff --git a/configs/unet/pspnet_unet_s5-d16_256x256_40k_hrf.py b/configs/unet/unet-s5-d16_pspnet_4xb4-40k_hrf-256x256.py similarity index 100% rename from configs/unet/pspnet_unet_s5-d16_256x256_40k_hrf.py rename to configs/unet/unet-s5-d16_pspnet_4xb4-40k_hrf-256x256.py diff --git a/configs/unet/pspnet_unet_s5-d16_128x128_40k_stare.py b/configs/unet/unet-s5-d16_pspnet_4xb4-40k_stare-128x128.py similarity index 100% rename from configs/unet/pspnet_unet_s5-d16_128x128_40k_stare.py rename to configs/unet/unet-s5-d16_pspnet_4xb4-40k_stare-128x128.py diff --git a/configs/unet/unet-s5-d16_pspnet_4xb4-ce-1.0-dice-3.0-40k_chase-db1-128x128.py b/configs/unet/unet-s5-d16_pspnet_4xb4-ce-1.0-dice-3.0-40k_chase-db1-128x128.py new file mode 100644 index 0000000000..69a4bbaf82 --- /dev/null +++ b/configs/unet/unet-s5-d16_pspnet_4xb4-ce-1.0-dice-3.0-40k_chase-db1-128x128.py @@ -0,0 +1,6 @@ +_base_ = './unet-s5-d16_pspnet_4xb4-40k_chase-db1-128x128.py' +model = dict( + decode_head=dict(loss_decode=[ + dict(type='CrossEntropyLoss', loss_name='loss_ce', loss_weight=1.0), + dict(type='DiceLoss', loss_name='loss_dice', loss_weight=3.0) + ])) diff --git a/configs/unet/unet-s5-d16_pspnet_4xb4-ce-1.0-dice-3.0-40k_drive-64x64.py 
b/configs/unet/unet-s5-d16_pspnet_4xb4-ce-1.0-dice-3.0-40k_drive-64x64.py new file mode 100644 index 0000000000..1abbd53d8c --- /dev/null +++ b/configs/unet/unet-s5-d16_pspnet_4xb4-ce-1.0-dice-3.0-40k_drive-64x64.py @@ -0,0 +1,6 @@ +_base_ = './unet-s5-d16_pspnet_4xb4-40k_drive-64x64.py' +model = dict( + decode_head=dict(loss_decode=[ + dict(type='CrossEntropyLoss', loss_name='loss_ce', loss_weight=1.0), + dict(type='DiceLoss', loss_name='loss_dice', loss_weight=3.0) + ])) diff --git a/configs/unet/unet-s5-d16_pspnet_4xb4-ce-1.0-dice-3.0-40k_hrf-256x256.py b/configs/unet/unet-s5-d16_pspnet_4xb4-ce-1.0-dice-3.0-40k_hrf-256x256.py new file mode 100644 index 0000000000..b3256d759b --- /dev/null +++ b/configs/unet/unet-s5-d16_pspnet_4xb4-ce-1.0-dice-3.0-40k_hrf-256x256.py @@ -0,0 +1,6 @@ +_base_ = './unet-s5-d16_pspnet_4xb4-40k_hrf-256x256.py' +model = dict( + decode_head=dict(loss_decode=[ + dict(type='CrossEntropyLoss', loss_name='loss_ce', loss_weight=1.0), + dict(type='DiceLoss', loss_name='loss_dice', loss_weight=3.0) + ])) diff --git a/configs/unet/unet-s5-d16_pspnet_4xb4-ce-1.0-dice-3.0-40k_stare-128x128.py b/configs/unet/unet-s5-d16_pspnet_4xb4-ce-1.0-dice-3.0-40k_stare-128x128.py new file mode 100644 index 0000000000..82aa3da616 --- /dev/null +++ b/configs/unet/unet-s5-d16_pspnet_4xb4-ce-1.0-dice-3.0-40k_stare-128x128.py @@ -0,0 +1,6 @@ +_base_ = './unet-s5-d16_pspnet_4xb4-40k_stare-128x128.py' +model = dict( + decode_head=dict(loss_decode=[ + dict(type='CrossEntropyLoss', loss_name='loss_ce', loss_weight=1.0), + dict(type='DiceLoss', loss_name='loss_dice', loss_weight=3.0) + ])) diff --git a/configs/unet/unet.yml b/configs/unet/unet.yml deleted file mode 100644 index 5bb5014f81..0000000000 --- a/configs/unet/unet.yml +++ /dev/null @@ -1,377 +0,0 @@ -Collections: -- Name: UNet - Metadata: - Training Data: - - Cityscapes - - DRIVE - - STARE - - CHASE_DB1 - - HRF - Paper: - URL: https://arxiv.org/abs/1505.04597 - Title: 'U-Net: Convolutional Networks for Biomedical Image Segmentation' - README: configs/unet/README.md - Code: - URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/unet.py#L225 - Version: v0.17.0 - Converted From: - Code: http://lmb.informatik.uni-freiburg.de/people/ronneber/u-net -Models: -- Name: fcn_unet_s5-d16_4x4_512x1024_160k_cityscapes - In Collection: UNet - Metadata: - backbone: UNet-S5-D16 - crop size: (512,1024) - lr schd: 160000 - inference time (ms/im): - - value: 327.87 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 17.91 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 69.1 - mIoU(ms+flip): 71.05 - Config: configs/unet/fcn_unet_s5-d16_4x4_512x1024_160k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_4x4_512x1024_160k_cityscapes/fcn_unet_s5-d16_4x4_512x1024_160k_cityscapes_20211210_145204-6860854e.pth -- Name: fcn_unet_s5-d16_64x64_40k_drive - In Collection: UNet - Metadata: - backbone: UNet-S5-D16 - crop size: (64,64) - lr schd: 40000 - Training Memory (GB): 0.68 - Results: - - Task: Semantic Segmentation - Dataset: DRIVE - Metrics: - Dice: 78.67 - Config: configs/unet/fcn_unet_s5-d16_64x64_40k_drive.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_64x64_40k_drive/fcn_unet_s5-d16_64x64_40k_drive_20201223_191051-5daf6d3b.pth -- Name: fcn_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive - In Collection: UNet - Metadata: - backbone: UNet-S5-D16 - 
crop size: (64,64) - lr schd: 40000 - Training Memory (GB): 0.582 - Results: - - Task: Semantic Segmentation - Dataset: DRIVE - Metrics: - Dice: 79.32 - Config: configs/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive/fcn_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive_20211210_201820-785de5c2.pth -- Name: pspnet_unet_s5-d16_64x64_40k_drive - In Collection: UNet - Metadata: - backbone: UNet-S5-D16 - crop size: (64,64) - lr schd: 40000 - Training Memory (GB): 0.599 - Results: - - Task: Semantic Segmentation - Dataset: DRIVE - Metrics: - Dice: 78.62 - Config: configs/unet/pspnet_unet_s5-d16_64x64_40k_drive.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_64x64_40k_drive/pspnet_unet_s5-d16_64x64_40k_drive_20201227_181818-aac73387.pth -- Name: pspnet_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive - In Collection: UNet - Metadata: - backbone: UNet-S5-D16 - crop size: (64,64) - lr schd: 40000 - Training Memory (GB): 0.585 - Results: - - Task: Semantic Segmentation - Dataset: DRIVE - Metrics: - Dice: 79.42 - Config: configs/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive/pspnet_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive_20211210_201821-22b3e3ba.pth -- Name: deeplabv3_unet_s5-d16_64x64_40k_drive - In Collection: UNet - Metadata: - backbone: UNet-S5-D16 - crop size: (64,64) - lr schd: 40000 - Training Memory (GB): 0.596 - Results: - - Task: Semantic Segmentation - Dataset: DRIVE - Metrics: - Dice: 78.69 - Config: configs/unet/deeplabv3_unet_s5-d16_64x64_40k_drive.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_64x64_40k_drive/deeplabv3_unet_s5-d16_64x64_40k_drive_20201226_094047-0671ff20.pth -- Name: deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive - In Collection: UNet - Metadata: - backbone: UNet-S5-D16 - crop size: (64,64) - lr schd: 40000 - Training Memory (GB): 0.582 - Results: - - Task: Semantic Segmentation - Dataset: DRIVE - Metrics: - Dice: 79.56 - Config: configs/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_64x64_40k_drive_20211210_201825-6bf0efd7.pth -- Name: fcn_unet_s5-d16_128x128_40k_stare - In Collection: UNet - Metadata: - backbone: UNet-S5-D16 - crop size: (128,128) - lr schd: 40000 - Training Memory (GB): 0.968 - Results: - - Task: Semantic Segmentation - Dataset: STARE - Metrics: - Dice: 81.02 - Config: configs/unet/fcn_unet_s5-d16_128x128_40k_stare.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_128x128_40k_stare/fcn_unet_s5-d16_128x128_40k_stare_20201223_191051-7d77e78b.pth -- Name: fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare - In Collection: UNet - Metadata: - backbone: UNet-S5-D16 - crop size: (128,128) - lr schd: 40000 - Training Memory (GB): 0.986 - Results: - - Task: Semantic Segmentation - Dataset: STARE - Metrics: - Dice: 82.7 - Config: configs/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare_20211210_201821-f75705a9.pth -- Name: 
pspnet_unet_s5-d16_128x128_40k_stare - In Collection: UNet - Metadata: - backbone: UNet-S5-D16 - crop size: (128,128) - lr schd: 40000 - Training Memory (GB): 0.982 - Results: - - Task: Semantic Segmentation - Dataset: STARE - Metrics: - Dice: 81.22 - Config: configs/unet/pspnet_unet_s5-d16_128x128_40k_stare.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_128x128_40k_stare/pspnet_unet_s5-d16_128x128_40k_stare_20201227_181818-3c2923c4.pth -- Name: pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare - In Collection: UNet - Metadata: - backbone: UNet-S5-D16 - crop size: (128,128) - lr schd: 40000 - Training Memory (GB): 1.028 - Results: - - Task: Semantic Segmentation - Dataset: STARE - Metrics: - Dice: 82.84 - Config: configs/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare_20211210_201823-f1063ef7.pth -- Name: deeplabv3_unet_s5-d16_128x128_40k_stare - In Collection: UNet - Metadata: - backbone: UNet-S5-D16 - crop size: (128,128) - lr schd: 40000 - Training Memory (GB): 0.999 - Results: - - Task: Semantic Segmentation - Dataset: STARE - Metrics: - Dice: 80.93 - Config: configs/unet/deeplabv3_unet_s5-d16_128x128_40k_stare.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_128x128_40k_stare/deeplabv3_unet_s5-d16_128x128_40k_stare_20201226_094047-93dcb93c.pth -- Name: deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare - In Collection: UNet - Metadata: - backbone: UNet-S5-D16 - crop size: (128,128) - lr schd: 40000 - Training Memory (GB): 1.01 - Results: - - Task: Semantic Segmentation - Dataset: STARE - Metrics: - Dice: 82.71 - Config: configs/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_stare_20211210_201825-21db614c.pth -- Name: fcn_unet_s5-d16_128x128_40k_chase_db1 - In Collection: UNet - Metadata: - backbone: UNet-S5-D16 - crop size: (128,128) - lr schd: 40000 - Training Memory (GB): 0.968 - Results: - - Task: Semantic Segmentation - Dataset: CHASE_DB1 - Metrics: - Dice: 80.24 - Config: configs/unet/fcn_unet_s5-d16_128x128_40k_chase_db1.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_128x128_40k_chase_db1/fcn_unet_s5-d16_128x128_40k_chase_db1_20201223_191051-11543527.pth -- Name: fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1 - In Collection: UNet - Metadata: - backbone: UNet-S5-D16 - crop size: (128,128) - lr schd: 40000 - Training Memory (GB): 0.986 - Results: - - Task: Semantic Segmentation - Dataset: CHASE_DB1 - Metrics: - Dice: 80.4 - Config: configs/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1/fcn_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1_20211210_201821-1c4eb7cf.pth -- Name: pspnet_unet_s5-d16_128x128_40k_chase_db1 - In Collection: UNet - Metadata: - backbone: UNet-S5-D16 - crop size: (128,128) - lr schd: 40000 - Training Memory (GB): 0.982 - Results: - - Task: Semantic Segmentation - Dataset: CHASE_DB1 - Metrics: - Dice: 80.36 - Config: configs/unet/pspnet_unet_s5-d16_128x128_40k_chase_db1.py - Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_128x128_40k_chase_db1/pspnet_unet_s5-d16_128x128_40k_chase_db1_20201227_181818-68d4e609.pth -- Name: pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1 - In Collection: UNet - Metadata: - backbone: UNet-S5-D16 - crop size: (128,128) - lr schd: 40000 - Training Memory (GB): 1.028 - Results: - - Task: Semantic Segmentation - Dataset: CHASE_DB1 - Metrics: - Dice: 80.28 - Config: configs/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1/pspnet_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1_20211210_201823-c0802c4d.pth -- Name: deeplabv3_unet_s5-d16_128x128_40k_chase_db1 - In Collection: UNet - Metadata: - backbone: UNet-S5-D16 - crop size: (128,128) - lr schd: 40000 - Training Memory (GB): 0.999 - Results: - - Task: Semantic Segmentation - Dataset: CHASE_DB1 - Metrics: - Dice: 80.47 - Config: configs/unet/deeplabv3_unet_s5-d16_128x128_40k_chase_db1.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_128x128_40k_chase_db1/deeplabv3_unet_s5-d16_128x128_40k_chase_db1_20201226_094047-4c5aefa3.pth -- Name: deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1 - In Collection: UNet - Metadata: - backbone: UNet-S5-D16 - crop size: (128,128) - lr schd: 40000 - Training Memory (GB): 1.01 - Results: - - Task: Semantic Segmentation - Dataset: CHASE_DB1 - Metrics: - Dice: 80.37 - Config: configs/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_128x128_40k_chase-db1_20211210_201825-4ef29df5.pth -- Name: fcn_unet_s5-d16_256x256_40k_hrf - In Collection: UNet - Metadata: - backbone: UNet-S5-D16 - crop size: (256,256) - lr schd: 40000 - Training Memory (GB): 2.525 - Results: - - Task: Semantic Segmentation - Dataset: HRF - Metrics: - Dice: 79.45 - Config: configs/unet/fcn_unet_s5-d16_256x256_40k_hrf.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_256x256_40k_hrf/fcn_unet_s5-d16_256x256_40k_hrf_20201223_173724-d89cf1ed.pth -- Name: fcn_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf - In Collection: UNet - Metadata: - backbone: UNet-S5-D16 - crop size: (256,256) - lr schd: 40000 - Training Memory (GB): 2.623 - Results: - - Task: Semantic Segmentation - Dataset: HRF - Metrics: - Dice: 80.87 - Config: configs/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/fcn_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf/fcn_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf_20211210_201821-c314da8a.pth -- Name: pspnet_unet_s5-d16_256x256_40k_hrf - In Collection: UNet - Metadata: - backbone: UNet-S5-D16 - crop size: (256,256) - lr schd: 40000 - Training Memory (GB): 2.588 - Results: - - Task: Semantic Segmentation - Dataset: HRF - Metrics: - Dice: 80.07 - Config: configs/unet/pspnet_unet_s5-d16_256x256_40k_hrf.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_256x256_40k_hrf/pspnet_unet_s5-d16_256x256_40k_hrf_20201227_181818-fdb7e29b.pth -- Name: pspnet_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf - In Collection: UNet - Metadata: - backbone: UNet-S5-D16 - crop size: (256,256) - lr schd: 40000 - Training Memory (GB): 2.798 - Results: - - Task: 
Semantic Segmentation - Dataset: HRF - Metrics: - Dice: 80.96 - Config: configs/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/pspnet_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf/pspnet_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf_20211210_201823-53d492fa.pth -- Name: deeplabv3_unet_s5-d16_256x256_40k_hrf - In Collection: UNet - Metadata: - backbone: UNet-S5-D16 - crop size: (256,256) - lr schd: 40000 - Training Memory (GB): 2.604 - Results: - - Task: Semantic Segmentation - Dataset: HRF - Metrics: - Dice: 80.21 - Config: configs/unet/deeplabv3_unet_s5-d16_256x256_40k_hrf.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_256x256_40k_hrf/deeplabv3_unet_s5-d16_256x256_40k_hrf_20201226_094047-3a1fdf85.pth -- Name: deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf - In Collection: UNet - Metadata: - backbone: UNet-S5-D16 - crop size: (256,256) - lr schd: 40000 - Training Memory (GB): 2.607 - Results: - - Task: Semantic Segmentation - Dataset: HRF - Metrics: - Dice: 80.71 - Config: configs/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/unet/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf_20211210_202032-59daf7a4.pth diff --git a/configs/unet/deeplabv3_unet_s5-d16_128x128_40k_chase_db1.py b/configs/unet/unet_s5-d16_deeplabv3_4xb4-40k_chase-db1-128x128.py similarity index 100% rename from configs/unet/deeplabv3_unet_s5-d16_128x128_40k_chase_db1.py rename to configs/unet/unet_s5-d16_deeplabv3_4xb4-40k_chase-db1-128x128.py diff --git a/configs/upernet/README.md b/configs/upernet/README.md index dc8eadc6c6..c2babbd2a7 100644 --- a/configs/upernet/README.md +++ b/configs/upernet/README.md @@ -1,6 +1,6 @@ # UPerNet -[Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/pdf/1807.10221.pdf) +> [Unified Perceptual Parsing for Scene Understanding](https://arxiv.org/pdf/1807.10221.pdf) ## Introduction @@ -22,6 +22,39 @@ Humans recognize the visual world at multiple levels: we effortlessly categorize +## Results and models + +### Cityscapes + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------- | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ----------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| UPerNet | R-50 | 512x1024 | 40000 | 6.4 | 4.25 | V100 | 77.10 | 78.37 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/upernet/upernet_r50_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x1024_40k_cityscapes/upernet_r50_512x1024_40k_cityscapes_20200605_094827-aa54cb54.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x1024_40k_cityscapes/upernet_r50_512x1024_40k_cityscapes_20200605_094827.log.json) | +| UPerNet | R-101 | 512x1024 | 40000 | 7.4 | 3.79 | V100 | 78.69 | 80.11 | 
[config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/upernet/upernet_r101_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x1024_40k_cityscapes/upernet_r101_512x1024_40k_cityscapes_20200605_094933-ebce3b10.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x1024_40k_cityscapes/upernet_r101_512x1024_40k_cityscapes_20200605_094933.log.json) | +| UPerNet | R-50 | 769x769 | 40000 | 7.2 | 1.76 | V100 | 77.98 | 79.70 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/upernet/upernet_r50_4xb2-40k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_769x769_40k_cityscapes/upernet_r50_769x769_40k_cityscapes_20200530_033048-92d21539.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_769x769_40k_cityscapes/upernet_r50_769x769_40k_cityscapes_20200530_033048.log.json) | +| UPerNet | R-101 | 769x769 | 40000 | 8.4 | 1.56 | V100 | 79.03 | 80.77 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/upernet/upernet_r101_4xb2-40k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_769x769_40k_cityscapes/upernet_r101_769x769_40k_cityscapes_20200530_040819-83c95d01.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_769x769_40k_cityscapes/upernet_r101_769x769_40k_cityscapes_20200530_040819.log.json) | +| UPerNet | R-50 | 512x1024 | 80000 | - | - | V100 | 78.19 | 79.19 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/upernet/upernet_r50_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x1024_80k_cityscapes/upernet_r50_512x1024_80k_cityscapes_20200607_052207-848beca8.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x1024_80k_cityscapes/upernet_r50_512x1024_80k_cityscapes_20200607_052207.log.json) | +| UPerNet | R-101 | 512x1024 | 80000 | - | - | V100 | 79.40 | 80.46 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/upernet/upernet_r101_4xb2-80k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x1024_80k_cityscapes/upernet_r101_512x1024_80k_cityscapes_20200607_002403-f05f2345.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x1024_80k_cityscapes/upernet_r101_512x1024_80k_cityscapes_20200607_002403.log.json) | +| UPerNet | R-50 | 769x769 | 80000 | - | - | V100 | 79.39 | 80.92 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/upernet/upernet_r50_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_769x769_80k_cityscapes/upernet_r50_769x769_80k_cityscapes_20200607_005107-82ae7d15.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_769x769_80k_cityscapes/upernet_r50_769x769_80k_cityscapes_20200607_005107.log.json) | +| UPerNet | R-101 | 769x769 | 80000 | - | - | V100 | 80.10 | 81.49 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/upernet/upernet_r101_4xb2-80k_cityscapes-769x769.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_769x769_80k_cityscapes/upernet_r101_769x769_80k_cityscapes_20200607_001014-082fc334.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_769x769_80k_cityscapes/upernet_r101_769x769_80k_cityscapes_20200607_001014.log.json) | + +### ADE20K + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------- | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| UPerNet | R-50 | 512x512 | 80000 | 8.1 | 23.40 | V100 | 40.70 | 41.81 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/upernet/upernet_r50_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x512_80k_ade20k/upernet_r50_512x512_80k_ade20k_20200614_144127-ecc8377b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x512_80k_ade20k/upernet_r50_512x512_80k_ade20k_20200614_144127.log.json) | +| UPerNet | R-101 | 512x512 | 80000 | 9.1 | 20.34 | V100 | 42.91 | 43.96 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/upernet/upernet_r101_4xb4-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x512_80k_ade20k/upernet_r101_512x512_80k_ade20k_20200614_185117-32e4db94.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x512_80k_ade20k/upernet_r101_512x512_80k_ade20k_20200614_185117.log.json) | +| UPerNet | R-50 | 512x512 | 160000 | - | - | V100 | 42.05 | 42.78 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/upernet/upernet_r50_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x512_160k_ade20k/upernet_r50_512x512_160k_ade20k_20200615_184328-8534de8d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x512_160k_ade20k/upernet_r50_512x512_160k_ade20k_20200615_184328.log.json) | +| UPerNet | R-101 | 512x512 | 160000 | - | - | V100 | 43.82 | 44.85 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/upernet/upernet_r101_4xb4-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x512_160k_ade20k/upernet_r101_512x512_160k_ade20k_20200615_161951-91b32684.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x512_160k_ade20k/upernet_r101_512x512_160k_ade20k_20200615_161951.log.json) | + +### Pascal VOC 2012 + Aug + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------- | -------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | -------------------------------------------------------------------------------------------------------------------------- | 
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| UPerNet | R-50 | 512x512 | 20000 | 6.4 | 23.17 | V100 | 74.82 | 76.35 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/upernet/upernet_r50_4xb4-20k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x512_20k_voc12aug/upernet_r50_512x512_20k_voc12aug_20200617_165330-5b5890a7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x512_20k_voc12aug/upernet_r50_512x512_20k_voc12aug_20200617_165330.log.json) | +| UPerNet | R-101 | 512x512 | 20000 | 7.5 | 19.98 | V100 | 77.10 | 78.29 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/upernet/upernet_r101_4xb4-20k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x512_20k_voc12aug/upernet_r101_512x512_20k_voc12aug_20200617_165629-f14e7f27.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x512_20k_voc12aug/upernet_r101_512x512_20k_voc12aug_20200617_165629.log.json) | +| UPerNet | R-50 | 512x512 | 40000 | - | - | V100 | 75.92 | 77.44 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/upernet/upernet_r50_4xb4-40k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x512_40k_voc12aug/upernet_r50_512x512_40k_voc12aug_20200613_162257-ca9bcc6b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x512_40k_voc12aug/upernet_r50_512x512_40k_voc12aug_20200613_162257.log.json) | +| UPerNet | R-101 | 512x512 | 40000 | - | - | V100 | 77.43 | 78.56 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/upernet/upernet_r101_4xb4-40k_voc12aug-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x512_40k_voc12aug/upernet_r101_512x512_40k_voc12aug_20200613_163549-e26476ac.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x512_40k_voc12aug/upernet_r101_512x512_40k_voc12aug_20200613_163549.log.json) | + ## Citation ```bibtex @@ -33,36 +66,3 @@ Humans recognize the visual world at multiple levels: we effortlessly categorize year={2018} } ``` - -## Results and models - -### Cityscapes - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------- | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | -------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| UPerNet | R-50 | 512x1024 | 40000 | 6.4 | 4.25 | 77.10 | 78.37 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/upernet/upernet_r50_512x1024_40k_cityscapes.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x1024_40k_cityscapes/upernet_r50_512x1024_40k_cityscapes_20200605_094827-aa54cb54.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x1024_40k_cityscapes/upernet_r50_512x1024_40k_cityscapes_20200605_094827.log.json) | -| UPerNet | R-101 | 512x1024 | 40000 | 7.4 | 3.79 | 78.69 | 80.11 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/upernet/upernet_r101_512x1024_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x1024_40k_cityscapes/upernet_r101_512x1024_40k_cityscapes_20200605_094933-ebce3b10.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x1024_40k_cityscapes/upernet_r101_512x1024_40k_cityscapes_20200605_094933.log.json) | -| UPerNet | R-50 | 769x769 | 40000 | 7.2 | 1.76 | 77.98 | 79.70 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/upernet/upernet_r50_769x769_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_769x769_40k_cityscapes/upernet_r50_769x769_40k_cityscapes_20200530_033048-92d21539.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_769x769_40k_cityscapes/upernet_r50_769x769_40k_cityscapes_20200530_033048.log.json) | -| UPerNet | R-101 | 769x769 | 40000 | 8.4 | 1.56 | 79.03 | 80.77 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/upernet/upernet_r101_769x769_40k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_769x769_40k_cityscapes/upernet_r101_769x769_40k_cityscapes_20200530_040819-83c95d01.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_769x769_40k_cityscapes/upernet_r101_769x769_40k_cityscapes_20200530_040819.log.json) | -| UPerNet | R-50 | 512x1024 | 80000 | - | - | 78.19 | 79.19 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/upernet/upernet_r50_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x1024_80k_cityscapes/upernet_r50_512x1024_80k_cityscapes_20200607_052207-848beca8.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x1024_80k_cityscapes/upernet_r50_512x1024_80k_cityscapes_20200607_052207.log.json) | -| UPerNet | R-101 | 512x1024 | 80000 | - | - | 79.40 | 80.46 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/upernet/upernet_r101_512x1024_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x1024_80k_cityscapes/upernet_r101_512x1024_80k_cityscapes_20200607_002403-f05f2345.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x1024_80k_cityscapes/upernet_r101_512x1024_80k_cityscapes_20200607_002403.log.json) | -| UPerNet | R-50 | 769x769 | 80000 | - | - | 79.39 | 80.92 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/upernet/upernet_r50_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_769x769_80k_cityscapes/upernet_r50_769x769_80k_cityscapes_20200607_005107-82ae7d15.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_769x769_80k_cityscapes/upernet_r50_769x769_80k_cityscapes_20200607_005107.log.json) | -| UPerNet | R-101 | 769x769 | 80000 | - | - | 80.10 | 
81.49 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/upernet/upernet_r101_769x769_80k_cityscapes.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_769x769_80k_cityscapes/upernet_r101_769x769_80k_cityscapes_20200607_001014-082fc334.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_769x769_80k_cityscapes/upernet_r101_769x769_80k_cityscapes_20200607_001014.log.json) | - -### ADE20K - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------- | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ---------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| UPerNet | R-50 | 512x512 | 80000 | 8.1 | 23.40 | 40.70 | 41.81 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/upernet/upernet_r50_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x512_80k_ade20k/upernet_r50_512x512_80k_ade20k_20200614_144127-ecc8377b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x512_80k_ade20k/upernet_r50_512x512_80k_ade20k_20200614_144127.log.json) | -| UPerNet | R-101 | 512x512 | 80000 | 9.1 | 20.34 | 42.91 | 43.96 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/upernet/upernet_r101_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x512_80k_ade20k/upernet_r101_512x512_80k_ade20k_20200614_185117-32e4db94.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x512_80k_ade20k/upernet_r101_512x512_80k_ade20k_20200614_185117.log.json) | -| UPerNet | R-50 | 512x512 | 160000 | - | - | 42.05 | 42.78 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/upernet/upernet_r50_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x512_160k_ade20k/upernet_r50_512x512_160k_ade20k_20200615_184328-8534de8d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x512_160k_ade20k/upernet_r50_512x512_160k_ade20k_20200615_184328.log.json) | -| UPerNet | R-101 | 512x512 | 160000 | - | - | 43.82 | 44.85 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/upernet/upernet_r101_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x512_160k_ade20k/upernet_r101_512x512_160k_ade20k_20200615_161951-91b32684.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x512_160k_ade20k/upernet_r101_512x512_160k_ade20k_20200615_161951.log.json) | - -### Pascal VOC 2012 + Aug - -| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | -| ------- | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ----------------------------------------------------------------------------------------------------------------------- | 
-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| UPerNet | R-50 | 512x512 | 20000 | 6.4 | 23.17 | 74.82 | 76.35 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/upernet/upernet_r50_512x512_20k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x512_20k_voc12aug/upernet_r50_512x512_20k_voc12aug_20200617_165330-5b5890a7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x512_20k_voc12aug/upernet_r50_512x512_20k_voc12aug_20200617_165330.log.json) | -| UPerNet | R-101 | 512x512 | 20000 | 7.5 | 19.98 | 77.10 | 78.29 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/upernet/upernet_r101_512x512_20k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x512_20k_voc12aug/upernet_r101_512x512_20k_voc12aug_20200617_165629-f14e7f27.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x512_20k_voc12aug/upernet_r101_512x512_20k_voc12aug_20200617_165629.log.json) | -| UPerNet | R-50 | 512x512 | 40000 | - | - | 75.92 | 77.44 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/upernet/upernet_r50_512x512_40k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x512_40k_voc12aug/upernet_r50_512x512_40k_voc12aug_20200613_162257-ca9bcc6b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x512_40k_voc12aug/upernet_r50_512x512_40k_voc12aug_20200613_162257.log.json) | -| UPerNet | R-101 | 512x512 | 40000 | - | - | 77.43 | 78.56 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/upernet/upernet_r101_512x512_40k_voc12aug.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x512_40k_voc12aug/upernet_r101_512x512_40k_voc12aug_20200613_163549-e26476ac.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x512_40k_voc12aug/upernet_r101_512x512_40k_voc12aug_20200613_163549.log.json) | diff --git a/configs/upernet/metafile.yaml b/configs/upernet/metafile.yaml new file mode 100644 index 0000000000..f6ad8187f2 --- /dev/null +++ b/configs/upernet/metafile.yaml @@ -0,0 +1,391 @@ +Collections: +- Name: UPerNet + License: Apache License 2.0 + Metadata: + Training Data: + - Cityscapes + - ADE20K + - Pascal VOC 2012 + Aug + Paper: + Title: Unified Perceptual Parsing for Scene Understanding + URL: https://arxiv.org/pdf/1807.10221.pdf + README: configs/upernet/README.md + Frameworks: + - PyTorch +Models: +- Name: upernet_r50_4xb2-40k_cityscapes-512x1024 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 77.1 + mIoU(ms+flip): 78.37 + Config: configs/upernet/upernet_r50_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50 + - UPerNet + Training Resources: 4x V100 GPUS + Memory (GB): 6.4 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x1024_40k_cityscapes/upernet_r50_512x1024_40k_cityscapes_20200605_094827-aa54cb54.pth + Training log: 
https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x1024_40k_cityscapes/upernet_r50_512x1024_40k_cityscapes_20200605_094827.log.json + Paper: + Title: Unified Perceptual Parsing for Scene Understanding + URL: https://arxiv.org/pdf/1807.10221.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/uper_head.py#L13 + Framework: PyTorch +- Name: upernet_r101_4xb2-40k_cityscapes-512x1024 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.69 + mIoU(ms+flip): 80.11 + Config: configs/upernet/upernet_r101_4xb2-40k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101 + - UPerNet + Training Resources: 4x V100 GPUS + Memory (GB): 7.4 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x1024_40k_cityscapes/upernet_r101_512x1024_40k_cityscapes_20200605_094933-ebce3b10.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x1024_40k_cityscapes/upernet_r101_512x1024_40k_cityscapes_20200605_094933.log.json + Paper: + Title: Unified Perceptual Parsing for Scene Understanding + URL: https://arxiv.org/pdf/1807.10221.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/uper_head.py#L13 + Framework: PyTorch +- Name: upernet_r50_4xb2-40k_cityscapes-769x769 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 77.98 + mIoU(ms+flip): 79.7 + Config: configs/upernet/upernet_r50_4xb2-40k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50 + - UPerNet + Training Resources: 4x V100 GPUS + Memory (GB): 7.2 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_769x769_40k_cityscapes/upernet_r50_769x769_40k_cityscapes_20200530_033048-92d21539.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_769x769_40k_cityscapes/upernet_r50_769x769_40k_cityscapes_20200530_033048.log.json + Paper: + Title: Unified Perceptual Parsing for Scene Understanding + URL: https://arxiv.org/pdf/1807.10221.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/uper_head.py#L13 + Framework: PyTorch +- Name: upernet_r101_4xb2-40k_cityscapes-769x769 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.03 + mIoU(ms+flip): 80.77 + Config: configs/upernet/upernet_r101_4xb2-40k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101 + - UPerNet + Training Resources: 4x V100 GPUS + Memory (GB): 8.4 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_769x769_40k_cityscapes/upernet_r101_769x769_40k_cityscapes_20200530_040819-83c95d01.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_769x769_40k_cityscapes/upernet_r101_769x769_40k_cityscapes_20200530_040819.log.json + Paper: + Title: Unified Perceptual Parsing for Scene Understanding + URL: https://arxiv.org/pdf/1807.10221.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/uper_head.py#L13 + Framework: PyTorch +- Name: upernet_r50_4xb2-80k_cityscapes-512x1024 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 78.19 + mIoU(ms+flip): 79.19 + 
Config: configs/upernet/upernet_r50_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50 + - UPerNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x1024_80k_cityscapes/upernet_r50_512x1024_80k_cityscapes_20200607_052207-848beca8.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x1024_80k_cityscapes/upernet_r50_512x1024_80k_cityscapes_20200607_052207.log.json + Paper: + Title: Unified Perceptual Parsing for Scene Understanding + URL: https://arxiv.org/pdf/1807.10221.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/uper_head.py#L13 + Framework: PyTorch +- Name: upernet_r101_4xb2-80k_cityscapes-512x1024 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.4 + mIoU(ms+flip): 80.46 + Config: configs/upernet/upernet_r101_4xb2-80k_cityscapes-512x1024.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101 + - UPerNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x1024_80k_cityscapes/upernet_r101_512x1024_80k_cityscapes_20200607_002403-f05f2345.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x1024_80k_cityscapes/upernet_r101_512x1024_80k_cityscapes_20200607_002403.log.json + Paper: + Title: Unified Perceptual Parsing for Scene Understanding + URL: https://arxiv.org/pdf/1807.10221.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/uper_head.py#L13 + Framework: PyTorch +- Name: upernet_r50_4xb2-80k_cityscapes-769x769 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 79.39 + mIoU(ms+flip): 80.92 + Config: configs/upernet/upernet_r50_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-50 + - UPerNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_769x769_80k_cityscapes/upernet_r50_769x769_80k_cityscapes_20200607_005107-82ae7d15.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_769x769_80k_cityscapes/upernet_r50_769x769_80k_cityscapes_20200607_005107.log.json + Paper: + Title: Unified Perceptual Parsing for Scene Understanding + URL: https://arxiv.org/pdf/1807.10221.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/uper_head.py#L13 + Framework: PyTorch +- Name: upernet_r101_4xb2-80k_cityscapes-769x769 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: Cityscapes + Metrics: + mIoU: 80.1 + mIoU(ms+flip): 81.49 + Config: configs/upernet/upernet_r101_4xb2-80k_cityscapes-769x769.py + Metadata: + Training Data: Cityscapes + Batch Size: 8 + Architecture: + - R-101 + - UPerNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_769x769_80k_cityscapes/upernet_r101_769x769_80k_cityscapes_20200607_001014-082fc334.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_769x769_80k_cityscapes/upernet_r101_769x769_80k_cityscapes_20200607_001014.log.json + Paper: + Title: Unified Perceptual Parsing for Scene Understanding + URL: 
https://arxiv.org/pdf/1807.10221.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/uper_head.py#L13 + Framework: PyTorch +- Name: upernet_r50_4xb4-80k_ade20k-512x512 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 40.7 + mIoU(ms+flip): 41.81 + Config: configs/upernet/upernet_r50_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50 + - UPerNet + Training Resources: 4x V100 GPUS + Memory (GB): 8.1 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x512_80k_ade20k/upernet_r50_512x512_80k_ade20k_20200614_144127-ecc8377b.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x512_80k_ade20k/upernet_r50_512x512_80k_ade20k_20200614_144127.log.json + Paper: + Title: Unified Perceptual Parsing for Scene Understanding + URL: https://arxiv.org/pdf/1807.10221.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/uper_head.py#L13 + Framework: PyTorch +- Name: upernet_r101_4xb4-80k_ade20k-512x512 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 42.91 + mIoU(ms+flip): 43.96 + Config: configs/upernet/upernet_r101_4xb4-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-101 + - UPerNet + Training Resources: 4x V100 GPUS + Memory (GB): 9.1 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x512_80k_ade20k/upernet_r101_512x512_80k_ade20k_20200614_185117-32e4db94.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x512_80k_ade20k/upernet_r101_512x512_80k_ade20k_20200614_185117.log.json + Paper: + Title: Unified Perceptual Parsing for Scene Understanding + URL: https://arxiv.org/pdf/1807.10221.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/uper_head.py#L13 + Framework: PyTorch +- Name: upernet_r50_4xb4-160k_ade20k-512x512 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 42.05 + mIoU(ms+flip): 42.78 + Config: configs/upernet/upernet_r50_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-50 + - UPerNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x512_160k_ade20k/upernet_r50_512x512_160k_ade20k_20200615_184328-8534de8d.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x512_160k_ade20k/upernet_r50_512x512_160k_ade20k_20200615_184328.log.json + Paper: + Title: Unified Perceptual Parsing for Scene Understanding + URL: https://arxiv.org/pdf/1807.10221.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/uper_head.py#L13 + Framework: PyTorch +- Name: upernet_r101_4xb4-160k_ade20k-512x512 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 43.82 + mIoU(ms+flip): 44.85 + Config: configs/upernet/upernet_r101_4xb4-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - R-101 + - UPerNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x512_160k_ade20k/upernet_r101_512x512_160k_ade20k_20200615_161951-91b32684.pth + 
Training log: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x512_160k_ade20k/upernet_r101_512x512_160k_ade20k_20200615_161951.log.json + Paper: + Title: Unified Perceptual Parsing for Scene Understanding + URL: https://arxiv.org/pdf/1807.10221.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/uper_head.py#L13 + Framework: PyTorch +- Name: upernet_r50_4xb4-20k_voc12aug-512x512 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 74.82 + mIoU(ms+flip): 76.35 + Config: configs/upernet/upernet_r50_4xb4-20k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-50 + - UPerNet + Training Resources: 4x V100 GPUS + Memory (GB): 6.4 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x512_20k_voc12aug/upernet_r50_512x512_20k_voc12aug_20200617_165330-5b5890a7.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x512_20k_voc12aug/upernet_r50_512x512_20k_voc12aug_20200617_165330.log.json + Paper: + Title: Unified Perceptual Parsing for Scene Understanding + URL: https://arxiv.org/pdf/1807.10221.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/uper_head.py#L13 + Framework: PyTorch +- Name: upernet_r101_4xb4-20k_voc12aug-512x512 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 77.1 + mIoU(ms+flip): 78.29 + Config: configs/upernet/upernet_r101_4xb4-20k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-101 + - UPerNet + Training Resources: 4x V100 GPUS + Memory (GB): 7.5 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x512_20k_voc12aug/upernet_r101_512x512_20k_voc12aug_20200617_165629-f14e7f27.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x512_20k_voc12aug/upernet_r101_512x512_20k_voc12aug_20200617_165629.log.json + Paper: + Title: Unified Perceptual Parsing for Scene Understanding + URL: https://arxiv.org/pdf/1807.10221.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/uper_head.py#L13 + Framework: PyTorch +- Name: upernet_r50_4xb4-40k_voc12aug-512x512 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 75.92 + mIoU(ms+flip): 77.44 + Config: configs/upernet/upernet_r50_4xb4-40k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-50 + - UPerNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x512_40k_voc12aug/upernet_r50_512x512_40k_voc12aug_20200613_162257-ca9bcc6b.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x512_40k_voc12aug/upernet_r50_512x512_40k_voc12aug_20200613_162257.log.json + Paper: + Title: Unified Perceptual Parsing for Scene Understanding + URL: https://arxiv.org/pdf/1807.10221.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/uper_head.py#L13 + Framework: PyTorch +- Name: upernet_r101_4xb4-40k_voc12aug-512x512 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: Pascal VOC 2012 + Aug + Metrics: + mIoU: 77.43 + 
mIoU(ms+flip): 78.56 + Config: configs/upernet/upernet_r101_4xb4-40k_voc12aug-512x512.py + Metadata: + Training Data: Pascal VOC 2012 + Aug + Batch Size: 16 + Architecture: + - R-101 + - UPerNet + Training Resources: 4x V100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x512_40k_voc12aug/upernet_r101_512x512_40k_voc12aug_20200613_163549-e26476ac.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x512_40k_voc12aug/upernet_r101_512x512_40k_voc12aug_20200613_163549.log.json + Paper: + Title: Unified Perceptual Parsing for Scene Understanding + URL: https://arxiv.org/pdf/1807.10221.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/uper_head.py#L13 + Framework: PyTorch diff --git a/configs/upernet/upernet.yml b/configs/upernet/upernet.yml deleted file mode 100644 index 7c3872a8dd..0000000000 --- a/configs/upernet/upernet.yml +++ /dev/null @@ -1,305 +0,0 @@ -Collections: -- Name: UPerNet - Metadata: - Training Data: - - Cityscapes - - ADE20K - - Pascal VOC 2012 + Aug - Paper: - URL: https://arxiv.org/pdf/1807.10221.pdf - Title: Unified Perceptual Parsing for Scene Understanding - README: configs/upernet/README.md - Code: - URL: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/decode_heads/uper_head.py#L13 - Version: v0.17.0 - Converted From: - Code: https://github.com/CSAILVision/unifiedparsing -Models: -- Name: upernet_r50_512x1024_40k_cityscapes - In Collection: UPerNet - Metadata: - backbone: R-50 - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 235.29 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 6.4 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 77.1 - mIoU(ms+flip): 78.37 - Config: configs/upernet/upernet_r50_512x1024_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x1024_40k_cityscapes/upernet_r50_512x1024_40k_cityscapes_20200605_094827-aa54cb54.pth -- Name: upernet_r101_512x1024_40k_cityscapes - In Collection: UPerNet - Metadata: - backbone: R-101 - crop size: (512,1024) - lr schd: 40000 - inference time (ms/im): - - value: 263.85 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,1024) - Training Memory (GB): 7.4 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.69 - mIoU(ms+flip): 80.11 - Config: configs/upernet/upernet_r101_512x1024_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x1024_40k_cityscapes/upernet_r101_512x1024_40k_cityscapes_20200605_094933-ebce3b10.pth -- Name: upernet_r50_769x769_40k_cityscapes - In Collection: UPerNet - Metadata: - backbone: R-50 - crop size: (769,769) - lr schd: 40000 - inference time (ms/im): - - value: 568.18 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 7.2 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 77.98 - mIoU(ms+flip): 79.7 - Config: configs/upernet/upernet_r50_769x769_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_769x769_40k_cityscapes/upernet_r50_769x769_40k_cityscapes_20200530_033048-92d21539.pth -- Name: upernet_r101_769x769_40k_cityscapes - In Collection: UPerNet - Metadata: - backbone: R-101 - crop size: (769,769) - lr 
schd: 40000 - inference time (ms/im): - - value: 641.03 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (769,769) - Training Memory (GB): 8.4 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.03 - mIoU(ms+flip): 80.77 - Config: configs/upernet/upernet_r101_769x769_40k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_769x769_40k_cityscapes/upernet_r101_769x769_40k_cityscapes_20200530_040819-83c95d01.pth -- Name: upernet_r50_512x1024_80k_cityscapes - In Collection: UPerNet - Metadata: - backbone: R-50 - crop size: (512,1024) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 78.19 - mIoU(ms+flip): 79.19 - Config: configs/upernet/upernet_r50_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x1024_80k_cityscapes/upernet_r50_512x1024_80k_cityscapes_20200607_052207-848beca8.pth -- Name: upernet_r101_512x1024_80k_cityscapes - In Collection: UPerNet - Metadata: - backbone: R-101 - crop size: (512,1024) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.4 - mIoU(ms+flip): 80.46 - Config: configs/upernet/upernet_r101_512x1024_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x1024_80k_cityscapes/upernet_r101_512x1024_80k_cityscapes_20200607_002403-f05f2345.pth -- Name: upernet_r50_769x769_80k_cityscapes - In Collection: UPerNet - Metadata: - backbone: R-50 - crop size: (769,769) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 79.39 - mIoU(ms+flip): 80.92 - Config: configs/upernet/upernet_r50_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_769x769_80k_cityscapes/upernet_r50_769x769_80k_cityscapes_20200607_005107-82ae7d15.pth -- Name: upernet_r101_769x769_80k_cityscapes - In Collection: UPerNet - Metadata: - backbone: R-101 - crop size: (769,769) - lr schd: 80000 - Results: - - Task: Semantic Segmentation - Dataset: Cityscapes - Metrics: - mIoU: 80.1 - mIoU(ms+flip): 81.49 - Config: configs/upernet/upernet_r101_769x769_80k_cityscapes.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_769x769_80k_cityscapes/upernet_r101_769x769_80k_cityscapes_20200607_001014-082fc334.pth -- Name: upernet_r50_512x512_80k_ade20k - In Collection: UPerNet - Metadata: - backbone: R-50 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 42.74 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 8.1 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 40.7 - mIoU(ms+flip): 41.81 - Config: configs/upernet/upernet_r50_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x512_80k_ade20k/upernet_r50_512x512_80k_ade20k_20200614_144127-ecc8377b.pth -- Name: upernet_r101_512x512_80k_ade20k - In Collection: UPerNet - Metadata: - backbone: R-101 - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 49.16 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 9.1 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 42.91 - mIoU(ms+flip): 43.96 - Config: 
configs/upernet/upernet_r101_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x512_80k_ade20k/upernet_r101_512x512_80k_ade20k_20200614_185117-32e4db94.pth -- Name: upernet_r50_512x512_160k_ade20k - In Collection: UPerNet - Metadata: - backbone: R-50 - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 42.05 - mIoU(ms+flip): 42.78 - Config: configs/upernet/upernet_r50_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x512_160k_ade20k/upernet_r50_512x512_160k_ade20k_20200615_184328-8534de8d.pth -- Name: upernet_r101_512x512_160k_ade20k - In Collection: UPerNet - Metadata: - backbone: R-101 - crop size: (512,512) - lr schd: 160000 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 43.82 - mIoU(ms+flip): 44.85 - Config: configs/upernet/upernet_r101_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x512_160k_ade20k/upernet_r101_512x512_160k_ade20k_20200615_161951-91b32684.pth -- Name: upernet_r50_512x512_20k_voc12aug - In Collection: UPerNet - Metadata: - backbone: R-50 - crop size: (512,512) - lr schd: 20000 - inference time (ms/im): - - value: 43.16 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 6.4 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 74.82 - mIoU(ms+flip): 76.35 - Config: configs/upernet/upernet_r50_512x512_20k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x512_20k_voc12aug/upernet_r50_512x512_20k_voc12aug_20200617_165330-5b5890a7.pth -- Name: upernet_r101_512x512_20k_voc12aug - In Collection: UPerNet - Metadata: - backbone: R-101 - crop size: (512,512) - lr schd: 20000 - inference time (ms/im): - - value: 50.05 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 7.5 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 77.1 - mIoU(ms+flip): 78.29 - Config: configs/upernet/upernet_r101_512x512_20k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x512_20k_voc12aug/upernet_r101_512x512_20k_voc12aug_20200617_165629-f14e7f27.pth -- Name: upernet_r50_512x512_40k_voc12aug - In Collection: UPerNet - Metadata: - backbone: R-50 - crop size: (512,512) - lr schd: 40000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 75.92 - mIoU(ms+flip): 77.44 - Config: configs/upernet/upernet_r50_512x512_40k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r50_512x512_40k_voc12aug/upernet_r50_512x512_40k_voc12aug_20200613_162257-ca9bcc6b.pth -- Name: upernet_r101_512x512_40k_voc12aug - In Collection: UPerNet - Metadata: - backbone: R-101 - crop size: (512,512) - lr schd: 40000 - Results: - - Task: Semantic Segmentation - Dataset: Pascal VOC 2012 + Aug - Metrics: - mIoU: 77.43 - mIoU(ms+flip): 78.56 - Config: configs/upernet/upernet_r101_512x512_40k_voc12aug.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/upernet/upernet_r101_512x512_40k_voc12aug/upernet_r101_512x512_40k_voc12aug_20200613_163549-e26476ac.pth diff --git a/configs/upernet/upernet_r101_4xb2-40k_cityscapes-512x1024.py 
b/configs/upernet/upernet_r101_4xb2-40k_cityscapes-512x1024.py new file mode 100644 index 0000000000..8f5f6aecfe --- /dev/null +++ b/configs/upernet/upernet_r101_4xb2-40k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './upernet_r50_4xb2-40k_cityscapes-512x1024.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/upernet/upernet_r101_4xb2-40k_cityscapes-769x769.py b/configs/upernet/upernet_r101_4xb2-40k_cityscapes-769x769.py new file mode 100644 index 0000000000..28b5d3e968 --- /dev/null +++ b/configs/upernet/upernet_r101_4xb2-40k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './upernet_r50_4xb2-40k_cityscapes-769x769.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/upernet/upernet_r101_4xb2-80k_cityscapes-512x1024.py b/configs/upernet/upernet_r101_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..cafd8a2091 --- /dev/null +++ b/configs/upernet/upernet_r101_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,2 @@ +_base_ = './upernet_r50_4xb2-80k_cityscapes-512x1024.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/upernet/upernet_r101_4xb2-80k_cityscapes-769x769.py b/configs/upernet/upernet_r101_4xb2-80k_cityscapes-769x769.py new file mode 100644 index 0000000000..e17572054f --- /dev/null +++ b/configs/upernet/upernet_r101_4xb2-80k_cityscapes-769x769.py @@ -0,0 +1,2 @@ +_base_ = './upernet_r50_4xb2-80k_cityscapes-769x769.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/upernet/upernet_r101_4xb4-160k_ade20k-512x512.py b/configs/upernet/upernet_r101_4xb4-160k_ade20k-512x512.py new file mode 100644 index 0000000000..7a6152774c --- /dev/null +++ b/configs/upernet/upernet_r101_4xb4-160k_ade20k-512x512.py @@ -0,0 +1,2 @@ +_base_ = './upernet_r50_4xb4-160k_ade20k-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/upernet/upernet_r101_4xb4-20k_voc12aug-512x512.py b/configs/upernet/upernet_r101_4xb4-20k_voc12aug-512x512.py new file mode 100644 index 0000000000..be8f0848df --- /dev/null +++ b/configs/upernet/upernet_r101_4xb4-20k_voc12aug-512x512.py @@ -0,0 +1,2 @@ +_base_ = './upernet_r50_4xb4-20k_voc12aug-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/upernet/upernet_r101_4xb4-40k_voc12aug-512x512.py b/configs/upernet/upernet_r101_4xb4-40k_voc12aug-512x512.py new file mode 100644 index 0000000000..db1d976498 --- /dev/null +++ b/configs/upernet/upernet_r101_4xb4-40k_voc12aug-512x512.py @@ -0,0 +1,2 @@ +_base_ = './upernet_r50_4xb4-40k_voc12aug-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/upernet/upernet_r101_4xb4-80k_ade20k-512x512.py b/configs/upernet/upernet_r101_4xb4-80k_ade20k-512x512.py new file mode 100644 index 0000000000..84549a421d --- /dev/null +++ b/configs/upernet/upernet_r101_4xb4-80k_ade20k-512x512.py @@ -0,0 +1,2 @@ +_base_ = './upernet_r50_4xb4-80k_ade20k-512x512.py' +model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/upernet/upernet_r101_512x1024_40k_cityscapes.py b/configs/upernet/upernet_r101_512x1024_40k_cityscapes.py deleted file mode 100644 index b90b597d83..0000000000 --- a/configs/upernet/upernet_r101_512x1024_40k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = 
'./upernet_r50_512x1024_40k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/upernet/upernet_r101_512x1024_80k_cityscapes.py b/configs/upernet/upernet_r101_512x1024_80k_cityscapes.py deleted file mode 100644 index 420ca2e428..0000000000 --- a/configs/upernet/upernet_r101_512x1024_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './upernet_r50_512x1024_80k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/upernet/upernet_r101_512x512_160k_ade20k.py b/configs/upernet/upernet_r101_512x512_160k_ade20k.py deleted file mode 100644 index 146f13eb79..0000000000 --- a/configs/upernet/upernet_r101_512x512_160k_ade20k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './upernet_r50_512x512_160k_ade20k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/upernet/upernet_r101_512x512_20k_voc12aug.py b/configs/upernet/upernet_r101_512x512_20k_voc12aug.py deleted file mode 100644 index 56345d1806..0000000000 --- a/configs/upernet/upernet_r101_512x512_20k_voc12aug.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './upernet_r50_512x512_20k_voc12aug.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/upernet/upernet_r101_512x512_40k_voc12aug.py b/configs/upernet/upernet_r101_512x512_40k_voc12aug.py deleted file mode 100644 index 0669b741b9..0000000000 --- a/configs/upernet/upernet_r101_512x512_40k_voc12aug.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './upernet_r50_512x512_40k_voc12aug.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/upernet/upernet_r101_512x512_80k_ade20k.py b/configs/upernet/upernet_r101_512x512_80k_ade20k.py deleted file mode 100644 index abfb9c5d9f..0000000000 --- a/configs/upernet/upernet_r101_512x512_80k_ade20k.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './upernet_r50_512x512_80k_ade20k.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/upernet/upernet_r101_769x769_40k_cityscapes.py b/configs/upernet/upernet_r101_769x769_40k_cityscapes.py deleted file mode 100644 index e5f3a3fae1..0000000000 --- a/configs/upernet/upernet_r101_769x769_40k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './upernet_r50_769x769_40k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/upernet/upernet_r101_769x769_80k_cityscapes.py b/configs/upernet/upernet_r101_769x769_80k_cityscapes.py deleted file mode 100644 index a709165657..0000000000 --- a/configs/upernet/upernet_r101_769x769_80k_cityscapes.py +++ /dev/null @@ -1,2 +0,0 @@ -_base_ = './upernet_r50_769x769_80k_cityscapes.py' -model = dict(pretrained='open-mmlab://resnet101_v1c', backbone=dict(depth=101)) diff --git a/configs/upernet/upernet_r18_4xb2-40k_cityscapes-512x1024.py b/configs/upernet/upernet_r18_4xb2-40k_cityscapes-512x1024.py new file mode 100644 index 0000000000..dbff0e75a1 --- /dev/null +++ b/configs/upernet/upernet_r18_4xb2-40k_cityscapes-512x1024.py @@ -0,0 +1,6 @@ +_base_ = './upernet_r50_4xb2-40k_cityscapes-512x1024.py' +model = dict( + pretrained='open-mmlab://resnet18_v1c', + backbone=dict(depth=18), + decode_head=dict(in_channels=[64, 128, 256, 512]), + auxiliary_head=dict(in_channels=256)) diff --git a/configs/upernet/upernet_r18_4xb2-80k_cityscapes-512x1024.py 
b/configs/upernet/upernet_r18_4xb2-80k_cityscapes-512x1024.py new file mode 100644 index 0000000000..dee6349f64 --- /dev/null +++ b/configs/upernet/upernet_r18_4xb2-80k_cityscapes-512x1024.py @@ -0,0 +1,6 @@ +_base_ = './upernet_r50_4xb2-80k_cityscapes-512x1024.py' +model = dict( + pretrained='open-mmlab://resnet18_v1c', + backbone=dict(depth=18), + decode_head=dict(in_channels=[64, 128, 256, 512]), + auxiliary_head=dict(in_channels=256)) diff --git a/configs/upernet/upernet_r18_4xb4-160k_ade20k-512x512.py b/configs/upernet/upernet_r18_4xb4-160k_ade20k-512x512.py new file mode 100644 index 0000000000..9ac6c35527 --- /dev/null +++ b/configs/upernet/upernet_r18_4xb4-160k_ade20k-512x512.py @@ -0,0 +1,9 @@ +_base_ = [ + '../_base_/models/upernet_r50.py', '../_base_/datasets/ade20k.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_160k.py' +] +model = dict( + pretrained='open-mmlab://resnet18_v1c', + backbone=dict(depth=18), + decode_head=dict(in_channels=[64, 128, 256, 512], num_classes=150), + auxiliary_head=dict(in_channels=256, num_classes=150)) diff --git a/configs/upernet/upernet_r18_4xb4-20k_voc12aug-512x512.py b/configs/upernet/upernet_r18_4xb4-20k_voc12aug-512x512.py new file mode 100644 index 0000000000..5cae4f5435 --- /dev/null +++ b/configs/upernet/upernet_r18_4xb4-20k_voc12aug-512x512.py @@ -0,0 +1,10 @@ +_base_ = [ + '../_base_/models/upernet_r50.py', + '../_base_/datasets/pascal_voc12_aug.py', '../_base_/default_runtime.py', + '../_base_/schedules/schedule_20k.py' +] +model = dict( + pretrained='open-mmlab://resnet18_v1c', + backbone=dict(depth=18), + decode_head=dict(in_channels=[64, 128, 256, 512], num_classes=21), + auxiliary_head=dict(in_channels=256, num_classes=21)) diff --git a/configs/upernet/upernet_r18_4xb4-40k_voc12aug-512x512.py b/configs/upernet/upernet_r18_4xb4-40k_voc12aug-512x512.py new file mode 100644 index 0000000000..652ded7516 --- /dev/null +++ b/configs/upernet/upernet_r18_4xb4-40k_voc12aug-512x512.py @@ -0,0 +1,10 @@ +_base_ = [ + '../_base_/models/upernet_r50.py', + '../_base_/datasets/pascal_voc12_aug.py', '../_base_/default_runtime.py', + '../_base_/schedules/schedule_40k.py' +] +model = dict( + pretrained='open-mmlab://resnet18_v1c', + backbone=dict(depth=18), + decode_head=dict(in_channels=[64, 128, 256, 512], num_classes=21), + auxiliary_head=dict(in_channels=256, num_classes=21)) diff --git a/configs/upernet/upernet_r18_4xb4-80k_ade20k-512x512.py b/configs/upernet/upernet_r18_4xb4-80k_ade20k-512x512.py new file mode 100644 index 0000000000..1a7956d71f --- /dev/null +++ b/configs/upernet/upernet_r18_4xb4-80k_ade20k-512x512.py @@ -0,0 +1,9 @@ +_base_ = [ + '../_base_/models/upernet_r50.py', '../_base_/datasets/ade20k.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_80k.py' +] +model = dict( + pretrained='open-mmlab://resnet18_v1c', + backbone=dict(depth=18), + decode_head=dict(in_channels=[64, 128, 256, 512], num_classes=150), + auxiliary_head=dict(in_channels=256, num_classes=150)) diff --git a/configs/upernet/upernet_r50_512x1024_40k_cityscapes.py b/configs/upernet/upernet_r50_4xb2-40k_cityscapes-512x1024.py similarity index 100% rename from configs/upernet/upernet_r50_512x1024_40k_cityscapes.py rename to configs/upernet/upernet_r50_4xb2-40k_cityscapes-512x1024.py diff --git a/configs/upernet/upernet_r50_769x769_40k_cityscapes.py b/configs/upernet/upernet_r50_4xb2-40k_cityscapes-769x769.py similarity index 100% rename from configs/upernet/upernet_r50_769x769_40k_cityscapes.py rename to 
configs/upernet/upernet_r50_4xb2-40k_cityscapes-769x769.py diff --git a/configs/upernet/upernet_r50_512x1024_80k_cityscapes.py b/configs/upernet/upernet_r50_4xb2-80k_cityscapes-512x1024.py similarity index 100% rename from configs/upernet/upernet_r50_512x1024_80k_cityscapes.py rename to configs/upernet/upernet_r50_4xb2-80k_cityscapes-512x1024.py diff --git a/configs/upernet/upernet_r50_769x769_80k_cityscapes.py b/configs/upernet/upernet_r50_4xb2-80k_cityscapes-769x769.py similarity index 100% rename from configs/upernet/upernet_r50_769x769_80k_cityscapes.py rename to configs/upernet/upernet_r50_4xb2-80k_cityscapes-769x769.py diff --git a/configs/upernet/upernet_r50_512x512_160k_ade20k.py b/configs/upernet/upernet_r50_4xb4-160k_ade20k-512x512.py similarity index 100% rename from configs/upernet/upernet_r50_512x512_160k_ade20k.py rename to configs/upernet/upernet_r50_4xb4-160k_ade20k-512x512.py diff --git a/configs/upernet/upernet_r50_512x512_20k_voc12aug.py b/configs/upernet/upernet_r50_4xb4-20k_voc12aug-512x512.py similarity index 100% rename from configs/upernet/upernet_r50_512x512_20k_voc12aug.py rename to configs/upernet/upernet_r50_4xb4-20k_voc12aug-512x512.py diff --git a/configs/upernet/upernet_r50_512x512_40k_voc12aug.py b/configs/upernet/upernet_r50_4xb4-40k_voc12aug-512x512.py similarity index 100% rename from configs/upernet/upernet_r50_512x512_40k_voc12aug.py rename to configs/upernet/upernet_r50_4xb4-40k_voc12aug-512x512.py diff --git a/configs/upernet/upernet_r50_512x512_80k_ade20k.py b/configs/upernet/upernet_r50_4xb4-80k_ade20k-512x512.py similarity index 100% rename from configs/upernet/upernet_r50_512x512_80k_ade20k.py rename to configs/upernet/upernet_r50_4xb4-80k_ade20k-512x512.py diff --git a/configs/vit/README.md b/configs/vit/README.md index bfa20f4225..f75326e8e4 100644 --- a/configs/vit/README.md +++ b/configs/vit/README.md @@ -1,6 +1,6 @@ # Vision Transformer -[An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/pdf/2010.11929.pdf) +> [An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/pdf/2010.11929.pdf) ## Introduction @@ -22,17 +22,6 @@ While the Transformer architecture has become the de-facto standard for natural -## Citation - -```bibtex -@article{dosoViTskiy2020, - title={An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale}, - author={DosoViTskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and Gelly, Sylvain and Uszkoreit, Jakob and Houlsby, Neil}, - journal={arXiv preprint arXiv:2010.11929}, - year={2020} -} -``` - ## Usage To use other repositories' pre-trained models, it is necessary to convert keys. 
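To make the key conversion concrete: checkpoints from other repositories (e.g. timm) name their parameters differently from MMSegmentation's ViT backbone, so "converting keys" is essentially a renaming pass over the checkpoint's `state_dict`. The supported path is the converter script this README refers to (`tools/model_converters/vit2mmseg.py` at the time of this change); the sketch below only illustrates the general shape of such a conversion, and the specific rename and file paths in it are hypothetical placeholders, not the mapping the real script applies.

```python
# Illustrative sketch of checkpoint key conversion; NOT the repo's converter.
# The 'blocks.' -> 'layers.' rename and both file paths are placeholder
# assumptions; see tools/model_converters/vit2mmseg.py for the real mapping.
import torch


def convert_vit_keys(src_path: str, dst_path: str) -> None:
    checkpoint = torch.load(src_path, map_location='cpu')
    # Some checkpoints nest the weights under 'state_dict' or 'model'.
    if 'state_dict' in checkpoint:
        state_dict = checkpoint['state_dict']
    elif 'model' in checkpoint:
        state_dict = checkpoint['model']
    else:
        state_dict = checkpoint
    # Rename every key into the target naming scheme, keeping tensors as-is.
    converted = {
        key.replace('blocks.', 'layers.'): value
        for key, value in state_dict.items()
    }
    torch.save(converted, dst_path)


# Hypothetical paths for illustration.
convert_vit_keys('pretrain/vit_base_patch16_224.pth',
                 'pretrain/vit_base_patch16_224_mmseg.pth')
```

The converted file can then be referenced from a config's `pretrained` field, as the DeiT configs in this diff do with `pretrain/deit_base_patch16_224-b5f2ef4d.pth`.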
@@ -55,16 +44,27 @@
This script converts a model from `PRETRAIN_PATH` and stores the converted model in `STORE_PATH`.
### ADE20K
-| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download |
-| ------- | ----------------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ----------------------------------------------------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| UPerNet | ViT-B + MLN | 512x512 | 80000 | 9.20 | 6.94 | 47.71 | 49.51 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/vit/upernet_vit-b16_mln_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_vit-b16_mln_512x512_80k_ade20k/upernet_vit-b16_mln_512x512_80k_ade20k_20210624_130547-0403cee1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_vit-b16_mln_512x512_80k_ade20k/20210624_130547.log.json) |
-| UPerNet | ViT-B + MLN | 512x512 | 160000 | 9.20 | 7.58 | 46.75 | 48.46 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/vit/upernet_vit-b16_mln_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_vit-b16_mln_512x512_160k_ade20k/upernet_vit-b16_mln_512x512_160k_ade20k_20210624_130547-852fa768.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_vit-b16_mln_512x512_160k_ade20k/20210623_192432.log.json) |
-| UPerNet | ViT-B + LN + MLN | 512x512 | 160000 | 9.21 | 6.82 | 47.73 | 49.95 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/vit/upernet_vit-b16_ln_mln_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_vit-b16_ln_mln_512x512_160k_ade20k/upernet_vit-b16_ln_mln_512x512_160k_ade20k_20210621_172828-f444c077.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_vit-b16_ln_mln_512x512_160k_ade20k/20210621_172828.log.json) |
-| UPerNet | DeiT-S | 512x512 | 80000 | 4.68 | 29.85 | 42.96 | 43.79 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/vit/upernet_deit-s16_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_512x512_80k_ade20k/upernet_deit-s16_512x512_80k_ade20k_20210624_095228-afc93ec2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_512x512_80k_ade20k/20210624_095228.log.json) |
-| UPerNet | DeiT-S | 512x512 | 160000 | 4.68 | 29.19 | 42.87 | 43.79 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/vit/upernet_deit-s16_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_512x512_160k_ade20k/upernet_deit-s16_512x512_160k_ade20k_20210621_160903-5110d916.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_512x512_160k_ade20k/20210621_160903.log.json) |
-| UPerNet | DeiT-S + MLN | 512x512 | 160000 | 5.69 | 11.18 | 43.82 | 45.07 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/vit/upernet_deit-s16_mln_512x512_160k_ade20k.py) | 
[model](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_mln_512x512_160k_ade20k/upernet_deit-s16_mln_512x512_160k_ade20k_20210621_161021-fb9a5dfb.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_mln_512x512_160k_ade20k/20210621_161021.log.json) | -| UPerNet | DeiT-S + LN + MLN | 512x512 | 160000 | 5.69 | 12.39 | 43.52 | 45.01 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/vit/upernet_deit-s16_ln_mln_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_ln_mln_512x512_160k_ade20k/upernet_deit-s16_ln_mln_512x512_160k_ade20k_20210621_161021-c0cd652f.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_ln_mln_512x512_160k_ade20k/20210621_161021.log.json) | -| UPerNet | DeiT-B | 512x512 | 80000 | 7.75 | 9.69 | 45.24 | 46.73 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/vit/upernet_deit-b16_512x512_80k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_512x512_80k_ade20k/upernet_deit-b16_512x512_80k_ade20k_20210624_130529-1e090789.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_512x512_80k_ade20k/20210624_130529.log.json) | -| UPerNet | DeiT-B | 512x512 | 160000 | 7.75 | 10.39 | 45.36 | 47.16 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/vit/upernet_deit-b16_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_512x512_160k_ade20k/upernet_deit-b16_512x512_160k_ade20k_20210621_180100-828705d7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_512x512_160k_ade20k/20210621_180100.log.json) | -| UPerNet | DeiT-B + MLN | 512x512 | 160000 | 9.21 | 7.78 | 45.46 | 47.16 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/vit/upernet_deit-b16_mln_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_mln_512x512_160k_ade20k/upernet_deit-b16_mln_512x512_160k_ade20k_20210621_191949-4e1450f3.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_mln_512x512_160k_ade20k/20210621_191949.log.json) | -| UPerNet | DeiT-B + LN + MLN | 512x512 | 160000 | 9.21 | 7.75 | 45.37 | 47.23 | [config](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/vit/upernet_deit-b16_ln_mln_512x512_160k_ade20k.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_ln_mln_512x512_160k_ade20k/upernet_deit-b16_ln_mln_512x512_160k_ade20k_20210623_153535-8a959c14.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_ln_mln_512x512_160k_ade20k/20210623_153535.log.json) | +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | mIoU | mIoU(ms+flip) | config | download | +| ------- | ----------------- | --------- | ------: | -------- | -------------- | ------ | ----: | ------------: | ------------------------------------------------------------------------------------------------------------------------------------ | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| UPerNet | 
ViT-B + MLN | 512x512 | 80000 | 9.20 | 6.94 | V100 | 47.71 | 49.51 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/vit/vit_vit-b16_mln_upernet_8xb2-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_vit-b16_mln_512x512_80k_ade20k/upernet_vit-b16_mln_512x512_80k_ade20k_20210624_130547-0403cee1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_vit-b16_mln_512x512_80k_ade20k/20210624_130547.log.json) | +| UPerNet | ViT-B + MLN | 512x512 | 160000 | 9.20 | 7.58 | V100 | 46.75 | 48.46 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/vit/vit_vit-b16_mln_upernet_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_vit-b16_mln_512x512_160k_ade20k/upernet_vit-b16_mln_512x512_160k_ade20k_20210624_130547-852fa768.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_vit-b16_mln_512x512_160k_ade20k/20210623_192432.log.json) | +| UPerNet | ViT-B + LN + MLN | 512x512 | 160000 | 9.21 | 6.82 | V100 | 47.73 | 49.95 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/vit/vit_vit-b16-ln_mln_upernet_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_vit-b16_ln_mln_512x512_160k_ade20k/upernet_vit-b16_ln_mln_512x512_160k_ade20k_20210621_172828-f444c077.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_vit-b16_ln_mln_512x512_160k_ade20k/20210621_172828.log.json) | +| UPerNet | DeiT-S | 512x512 | 80000 | 4.68 | 29.85 | V100 | 42.96 | 43.79 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/vit/vit_deit-s16_upernet_8xb2-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_512x512_80k_ade20k/upernet_deit-s16_512x512_80k_ade20k_20210624_095228-afc93ec2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_512x512_80k_ade20k/20210624_095228.log.json) | +| UPerNet | DeiT-S | 512x512 | 160000 | 4.68 | 29.19 | V100 | 42.87 | 43.79 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/vit/vit_deit-s16_upernet_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_512x512_160k_ade20k/upernet_deit-s16_512x512_160k_ade20k_20210621_160903-5110d916.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_512x512_160k_ade20k/20210621_160903.log.json) | +| UPerNet | DeiT-S + MLN | 512x512 | 160000 | 5.69 | 11.18 | V100 | 43.82 | 45.07 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/vit/vit_deit-s16_mln_upernet_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_mln_512x512_160k_ade20k/upernet_deit-s16_mln_512x512_160k_ade20k_20210621_161021-fb9a5dfb.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_mln_512x512_160k_ade20k/20210621_161021.log.json) | +| UPerNet | DeiT-S + LN + MLN | 512x512 | 160000 | 5.69 | 12.39 | V100 | 43.52 | 45.01 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/vit/vit_deit-s16-ln_mln_upernet_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_ln_mln_512x512_160k_ade20k/upernet_deit-s16_ln_mln_512x512_160k_ade20k_20210621_161021-c0cd652f.pth) \| 
[log](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_ln_mln_512x512_160k_ade20k/20210621_161021.log.json) | +| UPerNet | DeiT-B | 512x512 | 80000 | 7.75 | 9.69 | V100 | 45.24 | 46.73 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/vit/vit_deit-b16_upernet_8xb2-80k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_512x512_80k_ade20k/upernet_deit-b16_512x512_80k_ade20k_20210624_130529-1e090789.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_512x512_80k_ade20k/20210624_130529.log.json) | +| UPerNet | DeiT-B | 512x512 | 160000 | 7.75 | 10.39 | V100 | 45.36 | 47.16 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/vit/vit_deit-b16_upernet_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_512x512_160k_ade20k/upernet_deit-b16_512x512_160k_ade20k_20210621_180100-828705d7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_512x512_160k_ade20k/20210621_180100.log.json) | +| UPerNet | DeiT-B + MLN | 512x512 | 160000 | 9.21 | 7.78 | V100 | 45.46 | 47.16 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/vit/vit_deit-b16_mln_upernet_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_mln_512x512_160k_ade20k/upernet_deit-b16_mln_512x512_160k_ade20k_20210621_191949-4e1450f3.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_mln_512x512_160k_ade20k/20210621_191949.log.json) | +| UPerNet | DeiT-B + LN + MLN | 512x512 | 160000 | 9.21 | 7.75 | V100 | 45.37 | 47.23 | [config](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/vit/vit_deit-b16-ln_mln_upernet_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_ln_mln_512x512_160k_ade20k/upernet_deit-b16_ln_mln_512x512_160k_ade20k_20210623_153535-8a959c14.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_ln_mln_512x512_160k_ade20k/20210623_153535.log.json) | + +## Citation + +```bibtex +@article{dosoViTskiy2020, + title={An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale}, + author={DosoViTskiy, Alexey and Beyer, Lucas and Kolesnikov, Alexander and Weissenborn, Dirk and Zhai, Xiaohua and Unterthiner, Thomas and Dehghani, Mostafa and Minderer, Matthias and Heigold, Georg and Gelly, Sylvain and Uszkoreit, Jakob and Houlsby, Neil}, + journal={arXiv preprint arXiv:2010.11929}, + year={2020} +} +``` diff --git a/configs/vit/metafile.yaml b/configs/vit/metafile.yaml new file mode 100644 index 0000000000..68e254a5f9 --- /dev/null +++ b/configs/vit/metafile.yaml @@ -0,0 +1,265 @@ +Models: +- Name: vit_vit-b16_mln_upernet_8xb2-80k_ade20k-512x512 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 47.71 + mIoU(ms+flip): 49.51 + Config: configs/vit/vit_vit-b16_mln_upernet_8xb2-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - ViT-B + - UPerNet + Training Resources: 8x V100 GPUS + Memory (GB): 9.2 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_vit-b16_mln_512x512_80k_ade20k/upernet_vit-b16_mln_512x512_80k_ade20k_20210624_130547-0403cee1.pth + Training log: 
https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_vit-b16_mln_512x512_80k_ade20k/20210624_130547.log.json + Paper: + Title: 'An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale' + URL: https://arxiv.org/pdf/2010.11929.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/vit.py#L98 + Framework: PyTorch +- Name: vit_vit-b16_mln_upernet_8xb2-160k_ade20k-512x512 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 46.75 + mIoU(ms+flip): 48.46 + Config: configs/vit/vit_vit-b16_mln_upernet_8xb2-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - ViT-B + - UPerNet + Training Resources: 8x V100 GPUS + Memory (GB): 9.2 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_vit-b16_mln_512x512_160k_ade20k/upernet_vit-b16_mln_512x512_160k_ade20k_20210624_130547-852fa768.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_vit-b16_mln_512x512_160k_ade20k/20210623_192432.log.json + Paper: + Title: 'An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale' + URL: https://arxiv.org/pdf/2010.11929.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/vit.py#L98 + Framework: PyTorch +- Name: vit_vit-b16-ln_mln_upernet_8xb2-160k_ade20k-512x512 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 47.73 + mIoU(ms+flip): 49.95 + Config: configs/vit/vit_vit-b16-ln_mln_upernet_8xb2-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - ViT-B + - UPerNet + Training Resources: 8x V100 GPUS + Memory (GB): 9.21 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_vit-b16_ln_mln_512x512_160k_ade20k/upernet_vit-b16_ln_mln_512x512_160k_ade20k_20210621_172828-f444c077.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_vit-b16_ln_mln_512x512_160k_ade20k/20210621_172828.log.json + Paper: + Title: 'An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale' + URL: https://arxiv.org/pdf/2010.11929.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/vit.py#L98 + Framework: PyTorch +- Name: vit_deit-s16_upernet_8xb2-80k_ade20k-512x512 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 42.96 + mIoU(ms+flip): 43.79 + Config: configs/vit/vit_deit-s16_upernet_8xb2-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - DeiT-S + - UPerNet + Training Resources: 8x V100 GPUS + Memory (GB): 4.68 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_512x512_80k_ade20k/upernet_deit-s16_512x512_80k_ade20k_20210624_095228-afc93ec2.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_512x512_80k_ade20k/20210624_095228.log.json + Paper: + Title: 'An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale' + URL: https://arxiv.org/pdf/2010.11929.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/vit.py#L98 + Framework: PyTorch +- Name: vit_deit-s16_upernet_8xb2-160k_ade20k-512x512 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 42.87 + mIoU(ms+flip): 43.79 + Config: 
configs/vit/vit_deit-s16_upernet_8xb2-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - DeiT-S + - UPerNet + Training Resources: 8x V100 GPUS + Memory (GB): 4.68 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_512x512_160k_ade20k/upernet_deit-s16_512x512_160k_ade20k_20210621_160903-5110d916.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_512x512_160k_ade20k/20210621_160903.log.json + Paper: + Title: 'An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale' + URL: https://arxiv.org/pdf/2010.11929.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/vit.py#L98 + Framework: PyTorch +- Name: vit_deit-s16_mln_upernet_8xb2-160k_ade20k-512x512 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 43.82 + mIoU(ms+flip): 45.07 + Config: configs/vit/vit_deit-s16_mln_upernet_8xb2-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - DeiT-S + - UPerNet + Training Resources: 8x V100 GPUS + Memory (GB): 5.69 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_mln_512x512_160k_ade20k/upernet_deit-s16_mln_512x512_160k_ade20k_20210621_161021-fb9a5dfb.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_mln_512x512_160k_ade20k/20210621_161021.log.json + Paper: + Title: 'An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale' + URL: https://arxiv.org/pdf/2010.11929.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/vit.py#L98 + Framework: PyTorch +- Name: vit_deit-s16-ln_mln_upernet_8xb2-160k_ade20k-512x512 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 43.52 + mIoU(ms+flip): 45.01 + Config: configs/vit/vit_deit-s16-ln_mln_upernet_8xb2-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - DeiT-S + - UPerNet + Training Resources: 8x V100 GPUS + Memory (GB): 5.69 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_ln_mln_512x512_160k_ade20k/upernet_deit-s16_ln_mln_512x512_160k_ade20k_20210621_161021-c0cd652f.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_ln_mln_512x512_160k_ade20k/20210621_161021.log.json + Paper: + Title: 'An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale' + URL: https://arxiv.org/pdf/2010.11929.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/vit.py#L98 + Framework: PyTorch +- Name: vit_deit-b16_upernet_8xb2-80k_ade20k-512x512 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 45.24 + mIoU(ms+flip): 46.73 + Config: configs/vit/vit_deit-b16_upernet_8xb2-80k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - DeiT-B + - UPerNet + Training Resources: 8x V100 GPUS + Memory (GB): 7.75 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_512x512_80k_ade20k/upernet_deit-b16_512x512_80k_ade20k_20210624_130529-1e090789.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_512x512_80k_ade20k/20210624_130529.log.json + Paper: + Title: 'An Image is Worth 16x16 Words: Transformers for Image Recognition at 
Scale' + URL: https://arxiv.org/pdf/2010.11929.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/vit.py#L98 + Framework: PyTorch +- Name: vit_deit-b16_upernet_8xb2-160k_ade20k-512x512 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 45.36 + mIoU(ms+flip): 47.16 + Config: configs/vit/vit_deit-b16_upernet_8xb2-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - DeiT-B + - UPerNet + Training Resources: 8x V100 GPUS + Memory (GB): 7.75 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_512x512_160k_ade20k/upernet_deit-b16_512x512_160k_ade20k_20210621_180100-828705d7.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_512x512_160k_ade20k/20210621_180100.log.json + Paper: + Title: 'An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale' + URL: https://arxiv.org/pdf/2010.11929.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/vit.py#L98 + Framework: PyTorch +- Name: vit_deit-b16_mln_upernet_8xb2-160k_ade20k-512x512 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 45.46 + mIoU(ms+flip): 47.16 + Config: configs/vit/vit_deit-b16_mln_upernet_8xb2-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - DeiT-B + - UPerNet + Training Resources: 8x V100 GPUS + Memory (GB): 9.21 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_mln_512x512_160k_ade20k/upernet_deit-b16_mln_512x512_160k_ade20k_20210621_191949-4e1450f3.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_mln_512x512_160k_ade20k/20210621_191949.log.json + Paper: + Title: 'An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale' + URL: https://arxiv.org/pdf/2010.11929.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/vit.py#L98 + Framework: PyTorch +- Name: vit_deit-b16-ln_mln_upernet_8xb2-160k_ade20k-512x512 + In Collection: UPerNet + Results: + Task: Semantic Segmentation + Dataset: ADE20K + Metrics: + mIoU: 45.37 + mIoU(ms+flip): 47.23 + Config: configs/vit/vit_deit-b16-ln_mln_upernet_8xb2-160k_ade20k-512x512.py + Metadata: + Training Data: ADE20K + Batch Size: 16 + Architecture: + - DeiT-B + - UPerNet + Training Resources: 8x V100 GPUS + Memory (GB): 9.21 + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_ln_mln_512x512_160k_ade20k/upernet_deit-b16_ln_mln_512x512_160k_ade20k_20210623_153535-8a959c14.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_ln_mln_512x512_160k_ade20k/20210623_153535.log.json + Paper: + Title: 'An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale' + URL: https://arxiv.org/pdf/2010.11929.pdf + Code: https://github.com/open-mmlab/mmsegmentation/blob/v0.17.0/mmseg/models/backbones/vit.py#L98 + Framework: PyTorch diff --git a/configs/vit/upernet_deit-b16_512x512_160k_ade20k.py b/configs/vit/upernet_deit-b16_512x512_160k_ade20k.py deleted file mode 100644 index 68f4bd42ba..0000000000 --- a/configs/vit/upernet_deit-b16_512x512_160k_ade20k.py +++ /dev/null @@ -1,6 +0,0 @@ -_base_ = './upernet_vit-b16_mln_512x512_160k_ade20k.py' - -model = dict( - pretrained='pretrain/deit_base_patch16_224-b5f2ef4d.pth', - 
backbone=dict(drop_path_rate=0.1), - neck=None) diff --git a/configs/vit/upernet_deit-b16_512x512_80k_ade20k.py b/configs/vit/upernet_deit-b16_512x512_80k_ade20k.py deleted file mode 100644 index 720482616d..0000000000 --- a/configs/vit/upernet_deit-b16_512x512_80k_ade20k.py +++ /dev/null @@ -1,6 +0,0 @@ -_base_ = './upernet_vit-b16_mln_512x512_80k_ade20k.py' - -model = dict( - pretrained='pretrain/deit_base_patch16_224-b5f2ef4d.pth', - backbone=dict(drop_path_rate=0.1), - neck=None) diff --git a/configs/vit/upernet_deit-b16_ln_mln_512x512_160k_ade20k.py b/configs/vit/upernet_deit-b16_ln_mln_512x512_160k_ade20k.py deleted file mode 100644 index 32909ffa13..0000000000 --- a/configs/vit/upernet_deit-b16_ln_mln_512x512_160k_ade20k.py +++ /dev/null @@ -1,5 +0,0 @@ -_base_ = './upernet_vit-b16_mln_512x512_160k_ade20k.py' - -model = dict( - pretrained='pretrain/deit_base_patch16_224-b5f2ef4d.pth', - backbone=dict(drop_path_rate=0.1, final_norm=True)) diff --git a/configs/vit/upernet_deit-b16_mln_512x512_160k_ade20k.py b/configs/vit/upernet_deit-b16_mln_512x512_160k_ade20k.py deleted file mode 100644 index 4abefe8dc1..0000000000 --- a/configs/vit/upernet_deit-b16_mln_512x512_160k_ade20k.py +++ /dev/null @@ -1,6 +0,0 @@ -_base_ = './upernet_vit-b16_mln_512x512_160k_ade20k.py' - -model = dict( - pretrained='pretrain/deit_base_patch16_224-b5f2ef4d.pth', - backbone=dict(drop_path_rate=0.1), -) diff --git a/configs/vit/upernet_deit-s16_512x512_160k_ade20k.py b/configs/vit/upernet_deit-s16_512x512_160k_ade20k.py deleted file mode 100644 index 290ff19ed3..0000000000 --- a/configs/vit/upernet_deit-s16_512x512_160k_ade20k.py +++ /dev/null @@ -1,8 +0,0 @@ -_base_ = './upernet_vit-b16_mln_512x512_160k_ade20k.py' - -model = dict( - pretrained='pretrain/deit_small_patch16_224-cd65a155.pth', - backbone=dict(num_heads=6, embed_dims=384, drop_path_rate=0.1), - decode_head=dict(num_classes=150, in_channels=[384, 384, 384, 384]), - neck=None, - auxiliary_head=dict(num_classes=150, in_channels=384)) diff --git a/configs/vit/upernet_deit-s16_512x512_80k_ade20k.py b/configs/vit/upernet_deit-s16_512x512_80k_ade20k.py deleted file mode 100644 index 605d264a74..0000000000 --- a/configs/vit/upernet_deit-s16_512x512_80k_ade20k.py +++ /dev/null @@ -1,8 +0,0 @@ -_base_ = './upernet_vit-b16_mln_512x512_80k_ade20k.py' - -model = dict( - pretrained='pretrain/deit_small_patch16_224-cd65a155.pth', - backbone=dict(num_heads=6, embed_dims=384, drop_path_rate=0.1), - decode_head=dict(num_classes=150, in_channels=[384, 384, 384, 384]), - neck=None, - auxiliary_head=dict(num_classes=150, in_channels=384)) diff --git a/configs/vit/upernet_deit-s16_ln_mln_512x512_160k_ade20k.py b/configs/vit/upernet_deit-s16_ln_mln_512x512_160k_ade20k.py deleted file mode 100644 index ef743a20e0..0000000000 --- a/configs/vit/upernet_deit-s16_ln_mln_512x512_160k_ade20k.py +++ /dev/null @@ -1,9 +0,0 @@ -_base_ = './upernet_vit-b16_mln_512x512_160k_ade20k.py' - -model = dict( - pretrained='pretrain/deit_small_patch16_224-cd65a155.pth', - backbone=dict( - num_heads=6, embed_dims=384, drop_path_rate=0.1, final_norm=True), - decode_head=dict(num_classes=150, in_channels=[384, 384, 384, 384]), - neck=dict(in_channels=[384, 384, 384, 384], out_channels=384), - auxiliary_head=dict(num_classes=150, in_channels=384)) diff --git a/configs/vit/upernet_deit-s16_mln_512x512_160k_ade20k.py b/configs/vit/upernet_deit-s16_mln_512x512_160k_ade20k.py deleted file mode 100644 index 069cab74f6..0000000000 --- a/configs/vit/upernet_deit-s16_mln_512x512_160k_ade20k.py +++ 
/dev/null @@ -1,8 +0,0 @@ -_base_ = './upernet_vit-b16_mln_512x512_160k_ade20k.py' - -model = dict( - pretrained='pretrain/deit_small_patch16_224-cd65a155.pth', - backbone=dict(num_heads=6, embed_dims=384, drop_path_rate=0.1), - decode_head=dict(num_classes=150, in_channels=[384, 384, 384, 384]), - neck=dict(in_channels=[384, 384, 384, 384], out_channels=384), - auxiliary_head=dict(num_classes=150, in_channels=384)) diff --git a/configs/vit/vit.yml b/configs/vit/vit.yml deleted file mode 100644 index 35e4952e03..0000000000 --- a/configs/vit/vit.yml +++ /dev/null @@ -1,243 +0,0 @@ -Models: -- Name: upernet_vit-b16_mln_512x512_80k_ade20k - In Collection: UPerNet - Metadata: - backbone: ViT-B + MLN - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 144.09 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 9.2 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 47.71 - mIoU(ms+flip): 49.51 - Config: configs/vit/upernet_vit-b16_mln_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_vit-b16_mln_512x512_80k_ade20k/upernet_vit-b16_mln_512x512_80k_ade20k_20210624_130547-0403cee1.pth -- Name: upernet_vit-b16_mln_512x512_160k_ade20k - In Collection: UPerNet - Metadata: - backbone: ViT-B + MLN - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 131.93 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 9.2 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 46.75 - mIoU(ms+flip): 48.46 - Config: configs/vit/upernet_vit-b16_mln_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_vit-b16_mln_512x512_160k_ade20k/upernet_vit-b16_mln_512x512_160k_ade20k_20210624_130547-852fa768.pth -- Name: upernet_vit-b16_ln_mln_512x512_160k_ade20k - In Collection: UPerNet - Metadata: - backbone: ViT-B + LN + MLN - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 146.63 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 9.21 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 47.73 - mIoU(ms+flip): 49.95 - Config: configs/vit/upernet_vit-b16_ln_mln_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_vit-b16_ln_mln_512x512_160k_ade20k/upernet_vit-b16_ln_mln_512x512_160k_ade20k_20210621_172828-f444c077.pth -- Name: upernet_deit-s16_512x512_80k_ade20k - In Collection: UPerNet - Metadata: - backbone: DeiT-S - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 33.5 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 4.68 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 42.96 - mIoU(ms+flip): 43.79 - Config: configs/vit/upernet_deit-s16_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_512x512_80k_ade20k/upernet_deit-s16_512x512_80k_ade20k_20210624_095228-afc93ec2.pth -- Name: upernet_deit-s16_512x512_160k_ade20k - In Collection: UPerNet - Metadata: - backbone: DeiT-S - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 34.26 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 4.68 - Results: - - Task: 
Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 42.87 - mIoU(ms+flip): 43.79 - Config: configs/vit/upernet_deit-s16_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_512x512_160k_ade20k/upernet_deit-s16_512x512_160k_ade20k_20210621_160903-5110d916.pth -- Name: upernet_deit-s16_mln_512x512_160k_ade20k - In Collection: UPerNet - Metadata: - backbone: DeiT-S + MLN - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 89.45 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 5.69 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 43.82 - mIoU(ms+flip): 45.07 - Config: configs/vit/upernet_deit-s16_mln_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_mln_512x512_160k_ade20k/upernet_deit-s16_mln_512x512_160k_ade20k_20210621_161021-fb9a5dfb.pth -- Name: upernet_deit-s16_ln_mln_512x512_160k_ade20k - In Collection: UPerNet - Metadata: - backbone: DeiT-S + LN + MLN - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 80.71 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 5.69 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 43.52 - mIoU(ms+flip): 45.01 - Config: configs/vit/upernet_deit-s16_ln_mln_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-s16_ln_mln_512x512_160k_ade20k/upernet_deit-s16_ln_mln_512x512_160k_ade20k_20210621_161021-c0cd652f.pth -- Name: upernet_deit-b16_512x512_80k_ade20k - In Collection: UPerNet - Metadata: - backbone: DeiT-B - crop size: (512,512) - lr schd: 80000 - inference time (ms/im): - - value: 103.2 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 7.75 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 45.24 - mIoU(ms+flip): 46.73 - Config: configs/vit/upernet_deit-b16_512x512_80k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_512x512_80k_ade20k/upernet_deit-b16_512x512_80k_ade20k_20210624_130529-1e090789.pth -- Name: upernet_deit-b16_512x512_160k_ade20k - In Collection: UPerNet - Metadata: - backbone: DeiT-B - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 96.25 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 7.75 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 45.36 - mIoU(ms+flip): 47.16 - Config: configs/vit/upernet_deit-b16_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_512x512_160k_ade20k/upernet_deit-b16_512x512_160k_ade20k_20210621_180100-828705d7.pth -- Name: upernet_deit-b16_mln_512x512_160k_ade20k - In Collection: UPerNet - Metadata: - backbone: DeiT-B + MLN - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 128.53 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 9.21 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 45.46 - mIoU(ms+flip): 47.16 - Config: configs/vit/upernet_deit-b16_mln_512x512_160k_ade20k.py - Weights: 
https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_mln_512x512_160k_ade20k/upernet_deit-b16_mln_512x512_160k_ade20k_20210621_191949-4e1450f3.pth -- Name: upernet_deit-b16_ln_mln_512x512_160k_ade20k - In Collection: UPerNet - Metadata: - backbone: DeiT-B + LN + MLN - crop size: (512,512) - lr schd: 160000 - inference time (ms/im): - - value: 129.03 - hardware: V100 - backend: PyTorch - batch size: 1 - mode: FP32 - resolution: (512,512) - Training Memory (GB): 9.21 - Results: - - Task: Semantic Segmentation - Dataset: ADE20K - Metrics: - mIoU: 45.37 - mIoU(ms+flip): 47.23 - Config: configs/vit/upernet_deit-b16_ln_mln_512x512_160k_ade20k.py - Weights: https://download.openmmlab.com/mmsegmentation/v0.5/vit/upernet_deit-b16_ln_mln_512x512_160k_ade20k/upernet_deit-b16_ln_mln_512x512_160k_ade20k_20210623_153535-8a959c14.pth diff --git a/configs/vit/vit_deit-b16-ln_mln_upernet_8xb2-160k_ade20k-512x512.py b/configs/vit/vit_deit-b16-ln_mln_upernet_8xb2-160k_ade20k-512x512.py new file mode 100644 index 0000000000..39d1c54faf --- /dev/null +++ b/configs/vit/vit_deit-b16-ln_mln_upernet_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,5 @@ +_base_ = './vit_vit-b16_mln_upernet_8xb2-160k_ade20k-512x512.py' + +model = dict( + pretrained='pretrain/deit_base_patch16_224-b5f2ef4d.pth', + backbone=dict(drop_path_rate=0.1, final_norm=True)) diff --git a/configs/vit/vit_deit-b16_mln_upernet_8xb2-160k_ade20k-512x512.py b/configs/vit/vit_deit-b16_mln_upernet_8xb2-160k_ade20k-512x512.py new file mode 100644 index 0000000000..706673f6b1 --- /dev/null +++ b/configs/vit/vit_deit-b16_mln_upernet_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,6 @@ +_base_ = './vit_vit-b16_mln_upernet_8xb2-160k_ade20k-512x512.py' + +model = dict( + pretrained='pretrain/deit_base_patch16_224-b5f2ef4d.pth', + backbone=dict(drop_path_rate=0.1), +) diff --git a/configs/vit/vit_deit-b16_upernet_8xb2-160k_ade20k-512x512.py b/configs/vit/vit_deit-b16_upernet_8xb2-160k_ade20k-512x512.py new file mode 100644 index 0000000000..23a23582d7 --- /dev/null +++ b/configs/vit/vit_deit-b16_upernet_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,6 @@ +_base_ = './vit_vit-b16_mln_upernet_8xb2-160k_ade20k-512x512.py' + +model = dict( + pretrained='pretrain/deit_base_patch16_224-b5f2ef4d.pth', + backbone=dict(drop_path_rate=0.1), + neck=None) diff --git a/configs/vit/vit_deit-b16_upernet_8xb2-80k_ade20k-512x512.py b/configs/vit/vit_deit-b16_upernet_8xb2-80k_ade20k-512x512.py new file mode 100644 index 0000000000..4c8bc939ee --- /dev/null +++ b/configs/vit/vit_deit-b16_upernet_8xb2-80k_ade20k-512x512.py @@ -0,0 +1,6 @@ +_base_ = './vit_vit-b16_mln_upernet_8xb2-80k_ade20k-512x512.py' + +model = dict( + pretrained='pretrain/deit_base_patch16_224-b5f2ef4d.pth', + backbone=dict(drop_path_rate=0.1), + neck=None) diff --git a/configs/vit/vit_deit-s16-ln_mln_upernet_8xb2-160k_ade20k-512x512.py b/configs/vit/vit_deit-s16-ln_mln_upernet_8xb2-160k_ade20k-512x512.py new file mode 100644 index 0000000000..8e626fe0de --- /dev/null +++ b/configs/vit/vit_deit-s16-ln_mln_upernet_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,9 @@ +_base_ = './vit_vit-b16_mln_upernet_8xb2-160k_ade20k-512x512.py' + +model = dict( + pretrained='pretrain/deit_small_patch16_224-cd65a155.pth', + backbone=dict( + num_heads=6, embed_dims=384, drop_path_rate=0.1, final_norm=True), + decode_head=dict(num_classes=150, in_channels=[384, 384, 384, 384]), + neck=dict(in_channels=[384, 384, 384, 384], out_channels=384), + auxiliary_head=dict(num_classes=150, in_channels=384)) diff --git 
a/configs/vit/vit_deit-s16_mln_upernet_8xb2-160k_ade20k-512x512.py b/configs/vit/vit_deit-s16_mln_upernet_8xb2-160k_ade20k-512x512.py new file mode 100644 index 0000000000..9a69a892b3 --- /dev/null +++ b/configs/vit/vit_deit-s16_mln_upernet_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,8 @@ +_base_ = './vit_vit-b16_mln_upernet_8xb2-160k_ade20k-512x512.py' + +model = dict( + pretrained='pretrain/deit_small_patch16_224-cd65a155.pth', + backbone=dict(num_heads=6, embed_dims=384, drop_path_rate=0.1), + decode_head=dict(num_classes=150, in_channels=[384, 384, 384, 384]), + neck=dict(in_channels=[384, 384, 384, 384], out_channels=384), + auxiliary_head=dict(num_classes=150, in_channels=384)) diff --git a/configs/vit/vit_deit-s16_upernet_8xb2-160k_ade20k-512x512.py b/configs/vit/vit_deit-s16_upernet_8xb2-160k_ade20k-512x512.py new file mode 100644 index 0000000000..9ef699d5d5 --- /dev/null +++ b/configs/vit/vit_deit-s16_upernet_8xb2-160k_ade20k-512x512.py @@ -0,0 +1,8 @@ +_base_ = './vit_vit-b16_mln_upernet_8xb2-80k_ade20k-512x512.py' + +model = dict( + pretrained='pretrain/deit_small_patch16_224-cd65a155.pth', + backbone=dict(num_heads=6, embed_dims=384, drop_path_rate=0.1), + decode_head=dict(num_classes=150, in_channels=[384, 384, 384, 384]), + neck=None, + auxiliary_head=dict(num_classes=150, in_channels=384)) diff --git a/configs/vit/vit_deit-s16_upernet_8xb2-80k_ade20k-512x512.py b/configs/vit/vit_deit-s16_upernet_8xb2-80k_ade20k-512x512.py new file mode 100644 index 0000000000..9ef699d5d5 --- /dev/null +++ b/configs/vit/vit_deit-s16_upernet_8xb2-80k_ade20k-512x512.py @@ -0,0 +1,8 @@ +_base_ = './vit_vit-b16_mln_upernet_8xb2-80k_ade20k-512x512.py' + +model = dict( + pretrained='pretrain/deit_small_patch16_224-cd65a155.pth', + backbone=dict(num_heads=6, embed_dims=384, drop_path_rate=0.1), + decode_head=dict(num_classes=150, in_channels=[384, 384, 384, 384]), + neck=None, + auxiliary_head=dict(num_classes=150, in_channels=384)) diff --git a/configs/vit/upernet_vit-b16_ln_mln_512x512_160k_ade20k.py b/configs/vit/vit_vit-b16-ln_mln_upernet_8xb2-160k_ade20k-512x512.py similarity index 100% rename from configs/vit/upernet_vit-b16_ln_mln_512x512_160k_ade20k.py rename to configs/vit/vit_vit-b16-ln_mln_upernet_8xb2-160k_ade20k-512x512.py diff --git a/configs/vit/upernet_vit-b16_mln_512x512_160k_ade20k.py b/configs/vit/vit_vit-b16_mln_upernet_8xb2-160k_ade20k-512x512.py similarity index 100% rename from configs/vit/upernet_vit-b16_mln_512x512_160k_ade20k.py rename to configs/vit/vit_vit-b16_mln_upernet_8xb2-160k_ade20k-512x512.py diff --git a/configs/vit/upernet_vit-b16_mln_512x512_80k_ade20k.py b/configs/vit/vit_vit-b16_mln_upernet_8xb2-80k_ade20k-512x512.py similarity index 100% rename from configs/vit/upernet_vit-b16_mln_512x512_80k_ade20k.py rename to configs/vit/vit_vit-b16_mln_upernet_8xb2-80k_ade20k-512x512.py diff --git a/configs/vpd/README.md b/configs/vpd/README.md new file mode 100644 index 0000000000..e90085bec9 --- /dev/null +++ b/configs/vpd/README.md @@ -0,0 +1,50 @@ +# VPD + +> [Unleashing Text-to-Image Diffusion Models for Visual Perception](https://arxiv.org/abs/2303.02153) + +## Introduction + + + +Official Repo + +## Abstract + + + +Diffusion models (DMs) have become the new trend of generative models and have demonstrated a powerful ability of conditional synthesis. Among those, text-to-image diffusion models pre-trained on large-scale image-text pairs are highly controllable by customizable prompts. 
Unlike the unconditional generative models that focus on low-level attributes and details, text-to-image diffusion models contain more high-level knowledge thanks to the vision-language pre-training. In this paper, we propose VPD (Visual Perception with a pre-trained Diffusion model), a new framework that exploits the semantic information of a pre-trained text-to-image diffusion model in visual perception tasks. Instead of using the pre-trained denoising autoencoder in a diffusion-based pipeline, we simply use it as a backbone and aim to study how to take full advantage of the learned knowledge. Specifically, we prompt the denoising decoder with proper textual inputs and refine the text features with an adapter, leading to a better alignment to the pre-trained stage and making the visual contents interact with the text prompts. We also propose to utilize the cross-attention maps between the visual features and the text features to provide explicit guidance. Compared with other pre-training methods, we show that vision-language pre-trained diffusion models can be faster adapted to downstream visual perception tasks using the proposed VPD. Extensive experiments on semantic segmentation, referring image segmentation and depth estimation demonstrates the effectiveness of our method. Notably, VPD attains 0.254 RMSE on NYUv2 depth estimation and 73.3% oIoU on RefCOCO-val referring image segmentation, establishing new records on these two benchmarks. + + + +
+ +
+ +## Usage + +To run training or inference with VPD model, please install the required packages via + +```sh +pip install -r requirements/albu.txt +pip install -r requirements/optional.txt +``` + +## Results and models + +### NYU + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device | RMSE | d1 | d2 | d3 | REL | log_10 | config | download | +| ------ | --------------------- | --------- | ------- | -------- | -------------- | ------ | ----- | ----- | ----- | ----- | ----- | ------ | ----------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| VPD | Stable-Diffusion-v1-5 | 480x480 | 25000 | - | - | A100 | 0.253 | 0.964 | 0.995 | 0.999 | 0.069 | 0.030 | [config](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/vpd/vpd_sd_4xb8-25k_nyu-480x480.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/vpd/vpd_sd_4xb8-25k_nyu-480x480_20230908-66144bc4.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/vpd/vpd_sd_4xb8-25k_nyu-480x480_20230908.json) | +| VPD | Stable-Diffusion-v1-5 | 512x512 | 25000 | - | - | A100 | 0.258 | 0.963 | 0.995 | 0.999 | 0.072 | 0.031 | [config](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/vpd/vpd_sd_4xb8-25k_nyu-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/vpd/vpd_sd_4xb8-25k_nyu-512x512_20230918-60cefcff.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/vpd/vpd_sd_4xb8-25k_nyu-512x512_20230918.json) | + +## Citation + +```bibtex +@article{zhao2023unleashing, + title={Unleashing Text-to-Image Diffusion Models for Visual Perception}, + author={Zhao, Wenliang and Rao, Yongming and Liu, Zuyan and Liu, Benlin and Zhou, Jie and Lu, Jiwen}, + journal={ICCV}, + year={2023} +} +``` diff --git a/configs/vpd/metafile.yaml b/configs/vpd/metafile.yaml new file mode 100644 index 0000000000..ccdc0e81eb --- /dev/null +++ b/configs/vpd/metafile.yaml @@ -0,0 +1,56 @@ +Collections: +- Name: VPD + License: Apache License 2.0 + Metadata: + Training Data: + - NYU + Paper: + Title: Unleashing Text-to-Image Diffusion Models for Visual Perception + URL: https://arxiv.org/abs/2303.02153 + README: configs/vpd/README.md + Frameworks: + - PyTorch +Models: +- Name: vpd_sd_4xb8-25k_nyu-480x480 + In Collection: VPD + Results: + Task: Depth Estimation + Dataset: NYU + Metrics: + RMSE: 0.253 + Config: configs/vpd/vpd_sd_4xb8-25k_nyu-480x480.py + Metadata: + Training Data: NYU + Batch Size: 32 + Architecture: + - Stable-Diffusion + Training Resources: 8x A100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/vpd/vpd_sd_4xb8-25k_nyu-480x480_20230908-66144bc4.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/vpd/vpd_sd_4xb8-25k_nyu-480x480_20230908.json + Paper: + Title: 'High-Resolution Image Synthesis with Latent Diffusion Models' + URL: https://arxiv.org/abs/2112.10752 + Code: https://github.com/open-mmlab/mmsegmentation/tree/main/mmseg/models/backbones/vpd.py#L333 + Framework: PyTorch +- Name: vpd_sd_4xb8-25k_nyu-512x512 + In Collection: VPD + Alias: vpd_depth + Results: + Task: Depth Estimation + Dataset: NYU + Metrics: + RMSE: 0.258 + Config: configs/vpd/vpd_sd_4xb8-25k_nyu-512x512.py + Metadata: + Training Data: NYU + Batch Size: 32 + Architecture: + - Stable-Diffusion 
+ Training Resources: 8x A100 GPUS + Weights: https://download.openmmlab.com/mmsegmentation/v0.5/vpd/vpd_sd_4xb8-25k_nyu-512x512_20230918-60cefcff.pth + Training log: https://download.openmmlab.com/mmsegmentation/v0.5/vpd/vpd_sd_4xb8-25k_nyu-512x512_20230918.json + Paper: + Title: 'High-Resolution Image Synthesis with Latent Diffusion Models' + URL: https://arxiv.org/abs/2112.10752 + Code: https://github.com/open-mmlab/mmsegmentation/tree/main/mmseg/models/backbones/vpd.py#L333 + Framework: PyTorch diff --git a/configs/vpd/vpd_sd_4xb8-25k_nyu-480x480.py b/configs/vpd/vpd_sd_4xb8-25k_nyu-480x480.py new file mode 100644 index 0000000000..0d14d8dd33 --- /dev/null +++ b/configs/vpd/vpd_sd_4xb8-25k_nyu-480x480.py @@ -0,0 +1,38 @@ +_base_ = [ + '../_base_/models/vpd_sd.py', '../_base_/datasets/nyu.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_25k.py' +] + +crop_size = (480, 480) + +model = dict( + type='DepthEstimator', + data_preprocessor=dict(size=crop_size), + backbone=dict( + class_embed_path='https://download.openmmlab.com/mmsegmentation/' + 'v0.5/vpd/nyu_class_embeddings.pth', + class_embed_select=True, + pad_shape=512, + unet_cfg=dict(use_attn=False), + ), + decode_head=dict( + type='VPDDepthHead', + in_channels=[320, 640, 1280, 1280], + max_depth=10, + fmap_border=(1, 1), + ), + test_cfg=dict(mode='slide_flip', crop_size=crop_size, stride=(160, 160))) + +default_hooks = dict( + checkpoint=dict(save_best='rmse', rule='less', max_keep_ckpts=1)) + +# custom optimizer +optim_wrapper = dict( + constructor='ForceDefaultOptimWrapperConstructor', + paramwise_cfg=dict( + bias_decay_mult=0, + force_default_settings=True, + custom_keys={ + 'backbone.encoder_vq': dict(lr_mult=0), + 'backbone.unet': dict(lr_mult=0.01), + })) diff --git a/configs/vpd/vpd_sd_4xb8-25k_nyu-512x512.py b/configs/vpd/vpd_sd_4xb8-25k_nyu-512x512.py new file mode 100644 index 0000000000..e89eb9c422 --- /dev/null +++ b/configs/vpd/vpd_sd_4xb8-25k_nyu-512x512.py @@ -0,0 +1,37 @@ +_base_ = [ + '../_base_/models/vpd_sd.py', '../_base_/datasets/nyu_512x512.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_25k.py' +] + +crop_size = (512, 512) + +model = dict( + type='DepthEstimator', + data_preprocessor=dict(size=crop_size), + backbone=dict( + class_embed_path='https://download.openmmlab.com/mmsegmentation/' + 'v0.5/vpd/nyu_class_embeddings.pth', + class_embed_select=True, + pad_shape=512, + unet_cfg=dict(use_attn=False), + ), + decode_head=dict( + type='VPDDepthHead', + in_channels=[320, 640, 1280, 1280], + max_depth=10, + ), + test_cfg=dict(mode='slide_flip', crop_size=crop_size, stride=(128, 128))) + +default_hooks = dict( + checkpoint=dict(save_best='rmse', rule='less', max_keep_ckpts=1)) + +# custom optimizer +optim_wrapper = dict( + constructor='ForceDefaultOptimWrapperConstructor', + paramwise_cfg=dict( + bias_decay_mult=0, + force_default_settings=True, + custom_keys={ + 'backbone.encoder_vq': dict(lr_mult=0), + 'backbone.unet': dict(lr_mult=0.01), + })) diff --git a/dataset-index.yml b/dataset-index.yml new file mode 100644 index 0000000000..30ff0ee005 --- /dev/null +++ b/dataset-index.yml @@ -0,0 +1,80 @@ +openxlab: true +ade20k: + dataset: OpenDataLab/ADE20K_2016 + download_root: data + data_root: data/ade + +cityscapes: + dataset: OpenDataLab/CityScapes + download_root: data + data_root: data/cityscapes + +voc2012: + dataset: OpenDataLab/PASCAL_VOC2012 + download_root: data + data_root: data/VOCdevkit/VOC2012 + +cocostuff: + dataset: OpenDataLab/COCO-Stuff + download_root: data 
+ data_root: data/coco_stuff164k + +mapillary: + dataset: OpenDataLab/Mapillary + download_root: data + data_root: data/mapillary + +pascal_context: + dataset: OpenDataLab/VOC2010 + download_root: data + data_root: data/VOCdevkit/VOC2010 + +isaid: + dataset: OpenDataLab/iSAID + download_root: data + data_root: data/iSAID + +isprs_potsdam: + dataset: OpenDataLab/ISPRS_Potsdam + download_root: data + data_root: data/potsdam + +loveda: + dataset: OpenDataLab/LoveDA + download_root: data + data_root: data/loveDA + +chase_db1: + dataset: OpenDataLab/CHASE_DB1 + download_root: data + data_root: data/CHASE_DB1 + +drive: + dataset: OpenDataLab/DRIVE + download_root: data + data_root: data/DRIVE + +hrf: + dataset: OpenDataLab/HRF + download_root: data + data_root: data/HRF + +stare: + dataset: OpenDataLab/STARE + download_root: data + data_root: data/STARE + +synapse: + dataset: OpenDataLab/SurgVisDom + download_root: data + data_root: data/synapse + +refuge: + dataset: OpenDataLab/REFUGE_Challenge + download_root: data + data_root: data/REFUGE + +lip: + dataset: OpenDataLab/LIP + download_root: data + data_root: data/LIP diff --git a/demo/MMSegmentation_Tutorial.ipynb b/demo/MMSegmentation_Tutorial.ipynb index 4a1dbfc58f..ac8601b321 100644 --- a/demo/MMSegmentation_Tutorial.ipynb +++ b/demo/MMSegmentation_Tutorial.ipynb @@ -7,7 +7,7 @@ "id": "view-in-github" }, "source": [ - "\"Open" + "\"Open" ] }, { @@ -33,7 +33,7 @@ "## Install MMSegmentation\n", "This step may take several minutes. \n", "\n", - "We use PyTorch 1.10 and CUDA 11.1 for this tutorial. You may install other versions by change the version number in pip install command. " + "We use PyTorch 1.12 and CUDA 11.3 for this tutorial. You may install other versions by change the version number in pip install command. " ] }, { @@ -67,9 +67,13 @@ "outputs": [], "source": [ "# Install PyTorch\n", - "!conda install pytorch=1.10.0 torchvision cudatoolkit=11.1 -c pytorch\n", + "!conda install pytorch==1.12.0 torchvision==0.13.0 torchaudio==0.12.0 cudatoolkit=11.3 -c pytorch\n", + "# Install mim\n", + "!pip install -U openmim\n", + "# Install mmengine\n", + "!mim install mmengine\n", "# Install MMCV\n", - "!pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu111/torch1.10/index.html" + "!mim install 'mmcv >= 2.0.0rc1'\n" ] }, { @@ -85,7 +89,7 @@ "outputs": [], "source": [ "!rm -rf mmsegmentation\n", - "!git clone https://github.com/open-mmlab/mmsegmentation.git \n", + "!git clone -b main https://github.com/open-mmlab/mmsegmentation.git \n", "%cd mmsegmentation\n", "!pip install -e ." 
] @@ -111,110 +115,15 @@ "print(mmseg.__version__)" ] }, - { - "cell_type": "markdown", - "metadata": { - "id": "eUcuC3dUv32I" - }, - "source": [ - "## Run Inference with MMSeg trained weight" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "2hd41IGaiNet", - "outputId": "b7b2aafc-edf2-43e4-ea43-0b5dd0aa4b4a" - }, - "outputs": [], - "source": [ - "!mkdir checkpoints\n", - "!wget https://download.openmmlab.com/mmsegmentation/v0.5/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth -P checkpoints" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "H8Fxg8i-wHJE" - }, - "outputs": [], - "source": [ - "from mmseg.apis import inference_model, init_model, show_result_pyplot\n", - "from mmseg.utils import get_palette" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "umk8sJ0Xuace" - }, - "outputs": [], - "source": [ - "config_file = 'configs/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes.py'\n", - "checkpoint_file = 'checkpoints/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth'" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "nWlQFuTgudxu", - "outputId": "5e45f4f6-5bcf-4d04-bb9c-0428ee84a576" - }, - "outputs": [], - "source": [ - "# build the model from a config file and a checkpoint file\n", - "model = init_model(config_file, checkpoint_file, device='cuda:0')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "izFv6pSRujk9" - }, - "outputs": [], - "source": [ - "# test a single image\n", - "img = 'demo/demo.png'\n", - "result = inference_model(model, img)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 504 - }, - "id": "bDcs9udgunQK", - "outputId": "7c55f713-4085-47fd-fa06-720a321d0795" - }, - "outputs": [], - "source": [ - "# show the results\n", - "show_result_pyplot(model, img, result, get_palette('cityscapes'))" - ] - }, { "cell_type": "markdown", "metadata": { "id": "Ta51clKX4cwM" }, "source": [ - "## Train a semantic segmentation model on a new dataset\n", + "## Finetune a semantic segmentation model on a new dataset\n", "\n", - "To train on a customized dataset, the following steps are necessary. \n", + "To finetune on a customized dataset, the following steps are necessary. \n", "1. Add a new dataset class. \n", "2. Create a config file accordingly. \n", "3. Perform training and evaluation. 
" @@ -268,8 +177,10 @@ "source": [ "# Let's take a look at the dataset\n", "import mmcv\n", + "import mmengine\n", "import matplotlib.pyplot as plt\n", "\n", + "\n", "img = mmcv.imread('iccv09Data/images/6000124.jpg')\n", "plt.figure(figsize=(8, 6))\n", "plt.imshow(mmcv.bgr2rgb(img))\n", @@ -293,18 +204,30 @@ }, "outputs": [], "source": [ - "import os.path as osp\n", - "import numpy as np\n", - "from PIL import Image\n", - "# convert dataset annotation to semantic segmentation map\n", + "# define dataset root and directory for images and annotations\n", "data_root = 'iccv09Data'\n", "img_dir = 'images'\n", "ann_dir = 'labels'\n", - "# define class and plaette for better visualization\n", + "# define class and palette for better visualization\n", "classes = ('sky', 'tree', 'road', 'grass', 'water', 'bldg', 'mntn', 'fg obj')\n", "palette = [[128, 128, 128], [129, 127, 38], [120, 69, 125], [53, 125, 34], \n", - " [0, 11, 123], [118, 20, 12], [122, 81, 25], [241, 134, 51]]\n", - "for file in mmcv.scandir(osp.join(data_root, ann_dir), suffix='.regions.txt'):\n", + " [0, 11, 123], [118, 20, 12], [122, 81, 25], [241, 134, 51]]" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "WnGZfribFHCx" + }, + "outputs": [], + "source": [ + "import os.path as osp\n", + "import numpy as np\n", + "from PIL import Image\n", + "\n", + "# convert dataset annotation to semantic segmentation map\n", + "for file in mmengine.scandir(osp.join(data_root, ann_dir), suffix='.regions.txt'):\n", " seg_map = np.loadtxt(osp.join(data_root, ann_dir, file)).astype(np.uint8)\n", " seg_img = Image.fromarray(seg_map).convert('P')\n", " seg_img.putpalette(np.array(palette, dtype=np.uint8))\n", @@ -351,8 +274,8 @@ "source": [ "# split train/val set randomly\n", "split_dir = 'splits'\n", - "mmcv.mkdir_or_exist(osp.join(data_root, split_dir))\n", - "filename_list = [osp.splitext(filename)[0] for filename in mmcv.scandir(\n", + "mmengine.mkdir_or_exist(osp.join(data_root, split_dir))\n", + "filename_list = [osp.splitext(filename)[0] for filename in mmengine.scandir(\n", " osp.join(data_root, ann_dir), suffix='.png')]\n", "with open(osp.join(data_root, split_dir, 'train.txt'), 'w') as f:\n", " # select first 4/5 as train set\n", @@ -380,18 +303,15 @@ }, "outputs": [], "source": [ - "from mmseg.datasets.builder import DATASETS\n", - "from mmseg.datasets.custom import CustomDataset\n", + "from mmseg.registry import DATASETS\n", + "from mmseg.datasets import BaseSegDataset\n", "\n", - "@DATASETS.register_module()\n", - "class StanfordBackgroundDataset(CustomDataset):\n", - " CLASSES = classes\n", - " PALETTE = palette\n", - " def __init__(self, split, **kwargs):\n", - " super().__init__(img_suffix='.jpg', seg_map_suffix='.png', \n", - " split=split, **kwargs)\n", - " assert osp.exists(self.img_dir) and self.split is not None\n", "\n", + "@DATASETS.register_module()\n", + "class StanfordBackgroundDataset(BaseSegDataset):\n", + " METAINFO = dict(classes = classes, palette = palette)\n", + " def __init__(self, **kwargs):\n", + " super().__init__(img_suffix='.jpg', seg_map_suffix='.png', **kwargs)\n", " " ] }, @@ -405,6 +325,16 @@ "In the next step, we need to modify the config for the training. To accelerate the process, we finetune the model from trained weights." 
] }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Download config and checkpoint files\n", + "!mim download mmsegmentation --config pspnet_r50-d8_4xb2-40k_cityscapes-512x1024 --dest ." + ] + }, { "cell_type": "code", "execution_count": null, @@ -413,8 +343,9 @@ }, "outputs": [], "source": [ - "from mmcv import Config\n", - "cfg = Config.fromfile('configs/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes.py')" + "from mmengine import Config\n", + "cfg = Config.fromfile('configs/pspnet/pspnet_r50-d8_4xb2-40k_cityscapes-512x1024.py')\n", + "print(f'Config:\\n{cfg.pretty_text}')" ] }, { @@ -438,10 +369,10 @@ }, "outputs": [], "source": [ - "from mmseg.apis import set_random_seed\n", - "\n", "# Since we use only one GPU, BN is used instead of SyncBN\n", "cfg.norm_cfg = dict(type='BN', requires_grad=True)\n", + "cfg.crop_size = (256, 256)\n", + "cfg.model.data_preprocessor.size = cfg.crop_size\n", "cfg.model.backbone.norm_cfg = cfg.norm_cfg\n", "cfg.model.decode_head.norm_cfg = cfg.norm_cfg\n", "cfg.model.auxiliary_head.norm_cfg = cfg.norm_cfg\n", @@ -453,79 +384,55 @@ "cfg.dataset_type = 'StanfordBackgroundDataset'\n", "cfg.data_root = data_root\n", "\n", - "cfg.data.samples_per_gpu = 8\n", - "cfg.data.workers_per_gpu=8\n", + "cfg.train_dataloader.batch_size = 8\n", "\n", - "cfg.img_norm_cfg = dict(\n", - " mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)\n", - "cfg.crop_size = (256, 256)\n", "cfg.train_pipeline = [\n", " dict(type='LoadImageFromFile'),\n", " dict(type='LoadAnnotations'),\n", - " dict(type='Resize', img_scale=(320, 240), ratio_range=(0.5, 2.0)),\n", + " dict(type='RandomResize', scale=(320, 240), ratio_range=(0.5, 2.0), keep_ratio=True),\n", " dict(type='RandomCrop', crop_size=cfg.crop_size, cat_max_ratio=0.75),\n", - " dict(type='RandomFlip', flip_ratio=0.5),\n", - " dict(type='PhotoMetricDistortion'),\n", - " dict(type='Normalize', **cfg.img_norm_cfg),\n", - " dict(type='Pad', size=cfg.crop_size, pad_val=0, seg_pad_val=255),\n", - " dict(type='DefaultFormatBundle'),\n", - " dict(type='Collect', keys=['img', 'gt_semantic_seg']),\n", + " dict(type='RandomFlip', prob=0.5),\n", + " dict(type='PackSegInputs')\n", "]\n", "\n", "cfg.test_pipeline = [\n", " dict(type='LoadImageFromFile'),\n", - " dict(\n", - " type='MultiScaleFlipAug',\n", - " img_scale=(320, 240),\n", - " # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],\n", - " flip=False,\n", - " transforms=[\n", - " dict(type='Resize', keep_ratio=True),\n", - " dict(type='RandomFlip'),\n", - " dict(type='Normalize', **cfg.img_norm_cfg),\n", - " dict(type='ImageToTensor', keys=['img']),\n", - " dict(type='Collect', keys=['img']),\n", - " ])\n", + " dict(type='Resize', scale=(320, 240), keep_ratio=True),\n", + " # add loading annotation after ``Resize`` because ground truth\n", + " # does not need to do resize data transform\n", + " dict(type='LoadAnnotations'),\n", + " dict(type='PackSegInputs')\n", "]\n", "\n", "\n", - "cfg.data.train.type = cfg.dataset_type\n", - "cfg.data.train.data_root = cfg.data_root\n", - "cfg.data.train.img_dir = img_dir\n", - "cfg.data.train.ann_dir = ann_dir\n", - "cfg.data.train.pipeline = cfg.train_pipeline\n", - "cfg.data.train.split = 'splits/train.txt'\n", + "cfg.train_dataloader.dataset.type = cfg.dataset_type\n", + "cfg.train_dataloader.dataset.data_root = cfg.data_root\n", + "cfg.train_dataloader.dataset.data_prefix = dict(img_path=img_dir, seg_map_path=ann_dir)\n", + "cfg.train_dataloader.dataset.pipeline = 
cfg.train_pipeline\n", + "cfg.train_dataloader.dataset.ann_file = 'splits/train.txt'\n", + "\n", + "cfg.val_dataloader.dataset.type = cfg.dataset_type\n", + "cfg.val_dataloader.dataset.data_root = cfg.data_root\n", + "cfg.val_dataloader.dataset.data_prefix = dict(img_path=img_dir, seg_map_path=ann_dir)\n", + "cfg.val_dataloader.dataset.pipeline = cfg.test_pipeline\n", + "cfg.val_dataloader.dataset.ann_file = 'splits/val.txt'\n", "\n", - "cfg.data.val.type = cfg.dataset_type\n", - "cfg.data.val.data_root = cfg.data_root\n", - "cfg.data.val.img_dir = img_dir\n", - "cfg.data.val.ann_dir = ann_dir\n", - "cfg.data.val.pipeline = cfg.test_pipeline\n", - "cfg.data.val.split = 'splits/val.txt'\n", + "cfg.test_dataloader = cfg.val_dataloader\n", "\n", - "cfg.data.test.type = cfg.dataset_type\n", - "cfg.data.test.data_root = cfg.data_root\n", - "cfg.data.test.img_dir = img_dir\n", - "cfg.data.test.ann_dir = ann_dir\n", - "cfg.data.test.pipeline = cfg.test_pipeline\n", - "cfg.data.test.split = 'splits/val.txt'\n", "\n", - "# We can still use the pre-trained Mask RCNN model though we do not need to\n", - "# use the mask branch\n", - "cfg.load_from = 'checkpoints/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth'\n", + "# Load the pretrained weights\n", + "cfg.load_from = 'pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth'\n", "\n", "# Set up working dir to save files and logs.\n", "cfg.work_dir = './work_dirs/tutorial'\n", "\n", - "cfg.runner.max_iters = 200\n", - "cfg.log_config.interval = 10\n", - "cfg.evaluation.interval = 200\n", - "cfg.checkpoint_config.interval = 200\n", + "cfg.train_cfg.max_iters = 200\n", + "cfg.train_cfg.val_interval = 200\n", + "cfg.default_hooks.logger.interval = 10\n", + "cfg.default_hooks.checkpoint.interval = 200\n", "\n", - "# Set seed to facitate reproducing the result\n", - "cfg.seed = 0\n", - "set_random_seed(0, deterministic=False)\n", - "cfg.gpu_ids = range(1)\n", + "# Set seed to facilitate reproducing the result\n", + "cfg['randomness'] = dict(seed=0)\n", "\n", "# Let's have a look at the final config used for training\n", "print(f'Config:\\n{cfg.pretty_text}')" @@ -552,23 +459,19 @@ }, "outputs": [], "source": [ - "from mmseg.datasets import build_dataset\n", - "from mmseg.models import build_segmentor\n", - "from mmseg.apis import train_segmentor\n", - "\n", - "\n", - "# Build the dataset\n", - "datasets = [build_dataset(cfg.data.train)]\n", + "from mmengine.runner import Runner\n", "\n", - "# Build the detector\n", - "model = build_segmentor(cfg.model)\n", - "# Add an attribute for visualization convenience\n", - "model.CLASSES = datasets[0].CLASSES\n", - "\n", - "# Create work_dir\n", - "mmcv.mkdir_or_exist(osp.abspath(cfg.work_dir))\n", - "train_segmentor(model, datasets, cfg, distributed=False, validate=True, \n", - " meta=dict())" + "runner = Runner.from_cfg(cfg)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# start training\n", + "runner.train()" ] }, { @@ -593,20 +496,18 @@ }, "outputs": [], "source": [ - "img = mmcv.imread('iccv09Data/images/6000124.jpg')\n", + "from mmseg.apis import init_model, inference_model, show_result_pyplot\n", "\n", - "model.cfg = cfg\n", + "# Init the model from the config and the checkpoint\n", + "checkpoint_path = './work_dirs/tutorial/iter_200.pth'\n", + "model = init_model(cfg, checkpoint_path, 'cuda:0')\n", + "\n", + "img = mmcv.imread('iccv09Data/images/6000124.jpg')\n", "result = inference_model(model, img)\n", 
"plt.figure(figsize=(8, 6))\n", - "show_result_pyplot(model, img, result, palette)" + "vis_result = show_result_pyplot(model, img, result)\n", + "plt.imshow(mmcv.bgr2rgb(vis_result))\n" ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] } ], "metadata": { @@ -618,7 +519,7 @@ "provenance": [] }, "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3.10.6 ('pt1.12')", "language": "python", "name": "python3" }, @@ -632,7 +533,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.0" + "version": "3.10.6" }, "pycharm": { "stem_cell": { @@ -642,6 +543,11 @@ }, "source": [] } + }, + "vscode": { + "interpreter": { + "hash": "0442e67aee3d9cbb788fa6e86d60c4ffa94ad7f1943c65abfecb99a6f4696c58" + } } }, "nbformat": 4, diff --git a/demo/classroom__rgb_00283.jpg b/demo/classroom__rgb_00283.jpg new file mode 100644 index 0000000000..1df37e9248 Binary files /dev/null and b/demo/classroom__rgb_00283.jpg differ diff --git a/demo/image_demo.py b/demo/image_demo.py index 5cde1ac9cd..ebc34c80b2 100644 --- a/demo/image_demo.py +++ b/demo/image_demo.py @@ -1,8 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. from argparse import ArgumentParser +from mmengine.model import revert_sync_batchnorm + from mmseg.apis import inference_model, init_model, show_result_pyplot -from mmseg.utils import get_palette def main(): @@ -10,21 +11,27 @@ def main(): parser.add_argument('img', help='Image file') parser.add_argument('config', help='Config file') parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument('--out-file', default=None, help='Path to output file') parser.add_argument( '--device', default='cuda:0', help='Device used for inference') - parser.add_argument( - '--palette', - default='cityscapes', - help='Color palette used for segmentation map') parser.add_argument( '--opacity', type=float, default=0.5, help='Opacity of painted segmentation map. In (0, 1] range.') + parser.add_argument( + '--with-labels', + action='store_true', + default=False, + help='Whether to display the class labels.') + parser.add_argument( + '--title', default='result', help='The image identifier.') args = parser.parse_args() # build the model from a config file and a checkpoint file model = init_model(args.config, args.checkpoint, device=args.device) + if args.device == 'cpu': + model = revert_sync_batchnorm(model) # test a single image result = inference_model(model, args.img) # show the results @@ -32,8 +39,12 @@ def main(): model, args.img, result, - get_palette(args.palette), - opacity=args.opacity) + title=args.title, + opacity=args.opacity, + with_labels=args.with_labels, + draw_gt=False, + show=False if args.out_file is not None else True, + out_file=args.out_file) if __name__ == '__main__': diff --git a/demo/image_demo_with_inferencer.py b/demo/image_demo_with_inferencer.py new file mode 100644 index 0000000000..d1fa9deb9e --- /dev/null +++ b/demo/image_demo_with_inferencer.py @@ -0,0 +1,54 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from argparse import ArgumentParser + +from mmseg.apis import MMSegInferencer + + +def main(): + parser = ArgumentParser() + parser.add_argument('img', help='Image file') + parser.add_argument('model', help='Config file') + parser.add_argument('--checkpoint', default=None, help='Checkpoint file') + parser.add_argument( + '--out-dir', default='', help='Path to save result file') + parser.add_argument( + '--show', + action='store_true', + default=False, + help='Whether to display the drawn image.') + parser.add_argument( + '--dataset-name', + default='cityscapes', + help='Color palette used for segmentation map') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--opacity', + type=float, + default=0.5, + help='Opacity of painted segmentation map. In (0, 1] range.') + parser.add_argument( + '--with-labels', + action='store_true', + default=False, + help='Whether to display the class labels.') + args = parser.parse_args() + + # build the model from a config file and a checkpoint file + mmseg_inferencer = MMSegInferencer( + args.model, + args.checkpoint, + dataset_name=args.dataset_name, + device=args.device) + + # test a single image + mmseg_inferencer( + args.img, + show=args.show, + out_dir=args.out_dir, + opacity=args.opacity, + with_labels=args.with_labels) + + +if __name__ == '__main__': + main() diff --git a/demo/inference_demo.ipynb b/demo/inference_demo.ipynb index e54d509ff7..455c5df4e1 100644 --- a/demo/inference_demo.ipynb +++ b/demo/inference_demo.ipynb @@ -20,8 +20,10 @@ }, "outputs": [], "source": [ - "from mmseg.apis import init_model, inference_model, show_result_pyplot\n", - "from mmseg.utils import get_palette" + "import torch\n", + "import matplotlib.pyplot as plt\n", + "from mmengine.model.utils import revert_sync_batchnorm\n", + "from mmseg.apis import init_model, inference_model, show_result_pyplot" ] }, { @@ -34,7 +36,7 @@ }, "outputs": [], "source": [ - "config_file = '../configs/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes.py'\n", + "config_file = '../configs/pspnet/pspnet_r50-d8_4xb2-40k_cityscapes-512x1024.py'\n", "checkpoint_file = '../checkpoints/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth'" ] }, @@ -45,7 +47,7 @@ "outputs": [], "source": [ "# build the model from a config file and a checkpoint file\n", - "model = init_model(config_file, checkpoint_file, device='cuda:0')" + "model = init_model(config_file, checkpoint_file, device='cpu')" ] }, { @@ -56,6 +58,8 @@ "source": [ "# test a single image\n", "img = 'demo.png'\n", + "if not torch.cuda.is_available():\n", + " model = revert_sync_batchnorm(model)\n", "result = inference_model(model, img)" ] }, @@ -66,7 +70,8 @@ "outputs": [], "source": [ "# show the results\n", - "show_result_pyplot(model, img, result, get_palette('cityscapes'))" + "vis_result = show_result_pyplot(model, img, result, show=False)\n", + "plt.imshow(vis_result)" ] }, { @@ -79,7 +84,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "pt1.13", "language": "python", "name": "python3" }, @@ -93,7 +98,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.7.0" + "version": "3.10.11" }, "pycharm": { "stem_cell": { @@ -103,6 +108,11 @@ }, "source": [] } + }, + "vscode": { + "interpreter": { + "hash": "f61d5b8fecdd960739697f6c2860080d7b76a5be5d896cb034bdb275ab3ddda0" + } } }, "nbformat": 4, diff --git a/demo/rs_image_inference.py b/demo/rs_image_inference.py new file mode 100644 index 
0000000000..799181f93c --- /dev/null +++ b/demo/rs_image_inference.py @@ -0,0 +1,50 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from argparse import ArgumentParser + +from mmseg.apis import RSImage, RSInferencer + + +def main(): + parser = ArgumentParser() + parser.add_argument('image', help='Image file path') + parser.add_argument('config', help='Config file') + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument( + '--output-path', + help='Path to save result image', + default='result.png') + parser.add_argument( + '--batch-size', + type=int, + default=1, + help='maximum number of windows inferred simultaneously') + parser.add_argument( + '--window-size', + help='window xsize,ysize', + default=(224, 224), + type=int, + nargs=2) + parser.add_argument( + '--stride', + help='window xstride,ystride', + default=(224, 224), + type=int, + nargs=2) + parser.add_argument( + '--thread', default=1, type=int, help='number of inference threads') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + args = parser.parse_args() + inferencer = RSInferencer.from_config_path( + args.config, + args.checkpoint, + batch_size=args.batch_size, + thread=args.thread, + device=args.device) + image = RSImage(args.image) + + inferencer.run(image, args.window_size, args.stride, args.output_path) + + +if __name__ == '__main__': + main() diff --git a/demo/video_demo.py b/demo/video_demo.py index 5b844f1617..7e6f3d605c 100644 --- a/demo/video_demo.py +++ b/demo/video_demo.py @@ -2,9 +2,10 @@ from argparse import ArgumentParser import cv2 +from mmengine.model.utils import revert_sync_batchnorm from mmseg.apis import inference_model, init_model -from mmseg.utils import get_palette +from mmseg.apis.inference import show_result_pyplot def main(): @@ -53,8 +54,12 @@ def main(): # build the model from a config file and a checkpoint file model = init_model(args.config, args.checkpoint, device=args.device) + if args.device == 'cpu': + model = revert_sync_batchnorm(model) # build input video + if args.video.isdigit(): + args.video = int(args.video) cap = cv2.VideoCapture(args.video) assert (cap.isOpened()) input_height = cap.get(cv2.CAP_PROP_FRAME_HEIGHT) @@ -86,12 +91,7 @@ def main(): result = inference_model(model, frame) # blend raw image and prediction - draw_img = model.show_result( - frame, - result, - palette=get_palette(args.palette), - show=False, - opacity=args.opacity) + draw_img = show_result_pyplot(model, frame, result) if args.show: cv2.imshow('video_demo', draw_img) diff --git a/docker/Dockerfile b/docker/Dockerfile index 64482b4725..26420ddbbb 100644 --- a/docker/Dockerfile +++ b/docker/Dockerfile @@ -1,6 +1,7 @@ ARG PYTORCH="1.11.0" ARG CUDA="11.3" ARG CUDNN="8" +ARG MMCV="2.0.1" FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel @@ -12,7 +13,7 @@ ENV CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64/3bf863cc.pub RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64/7fa2af80.pub -RUN apt-get update && apt-get install -y git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 \ +RUN apt-get update && apt-get install -y git ninja-build libglib2.0-0 libsm6 libxrender-dev libxext6 libgl1-mesa-dev \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* @@ -22,10 +23,12 @@ RUN conda clean --all ARG PYTORCH ARG CUDA ARG MMCV -RUN ["/bin/bash", "-c", "pip install 
--no-cache-dir mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu${CUDA//./}/torch${PYTORCH}/index.html"]
+RUN ["/bin/bash", "-c", "pip install openmim"]
+RUN ["/bin/bash", "-c", "mim install mmengine"]
+RUN ["/bin/bash", "-c", "mim install mmcv==${MMCV}"]

 # Install MMSegmentation
-RUN git clone https://github.com/open-mmlab/mmsegmentation.git /mmsegmentation
+RUN git clone -b main https://github.com/open-mmlab/mmsegmentation.git /mmsegmentation
 WORKDIR /mmsegmentation
 ENV FORCE_CUDA="1"
 RUN pip install -r requirements.txt
diff --git a/docker/serve/Dockerfile b/docker/serve/Dockerfile
index c1d154528c..38f91baa5b 100644
--- a/docker/serve/Dockerfile
+++ b/docker/serve/Dockerfile
@@ -3,8 +3,8 @@ ARG CUDA="11.3"
 ARG CUDNN="8"
 FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel

-ARG MMCV="1.4.8"
-ARG MMSEG="0.24.1"
+ARG MMCV="2.0.1"
+ARG MMSEG="1.2.2"

 ENV PYTHONUNBUFFERED TRUE
@@ -26,7 +26,9 @@ RUN pip install torchserve torch-model-archiver
 # MMLAB
 ARG PYTORCH
 ARG CUDA
-RUN ["/bin/bash", "-c", "pip install mmcv-full==${MMCV} -f https://download.openmmlab.com/mmcv/dist/cu${CUDA//./}/torch${PYTORCH}/index.html"]
+RUN ["/bin/bash", "-c", "pip install openmim"]
+RUN ["/bin/bash", "-c", "mim install mmengine"]
+RUN ["/bin/bash", "-c", "mim install mmcv==${MMCV}"]
 RUN pip install mmsegmentation==${MMSEG}

 RUN useradd -m model-server \
diff --git a/docs/en/advanced_guides/add_datasets.md b/docs/en/advanced_guides/add_datasets.md
new file mode 100644
index 0000000000..fbfa36940c
--- /dev/null
+++ b/docs/en/advanced_guides/add_datasets.md
@@ -0,0 +1,199 @@
+# Add New Datasets
+
+## Add new custom dataset
+
+Here we show how to develop a new custom dataset.
+
+1. Create a new file `mmseg/datasets/example.py`
+
+   ```python
+   from mmseg.registry import DATASETS
+   from .basesegdataset import BaseSegDataset
+
+
+   @DATASETS.register_module()
+   class ExampleDataset(BaseSegDataset):
+
+       METAINFO = dict(
+           classes=('xxx', 'xxx', ...),
+           palette=[[x, x, x], [x, x, x], ...])
+
+       def __init__(self, arg1, arg2):
+           pass
+   ```
+
+2. Import the module in `mmseg/datasets/__init__.py`
+
+   ```python
+   from .example import ExampleDataset
+   ```
+
+3. Use it by creating a new dataset config file `configs/_base_/datasets/example_dataset.py`
+
+   ```python
+   dataset_type = 'ExampleDataset'
+   data_root = 'data/example/'
+   ...
+   ```
+
+4. Add dataset meta information in `mmseg/utils/class_names.py`
+
+   ```python
+   def example_classes():
+       return [
+           'xxx', 'xxx',
+           ...
+       ]
+
+   def example_palette():
+       return [
+           [x, x, x], [x, x, x],
+           ...
+       ]
+   dataset_aliases = {
+       'example': ['example', ...],
+       ...
+   }
+   ```
+
+**Note:** If the new dataset does not satisfy the mmseg requirements, a data preprocessing script needs to be prepared in `tools/dataset_converters/`.
+
+## Customize datasets by reorganizing data
+
+The simplest way is to convert your dataset so that your data is organized into folders.
+
+An example of the file structure is as follows.
+
+```none
+├── data
+│   ├── my_dataset
+│   │   ├── img_dir
+│   │   │   ├── train
+│   │   │   │   ├── xxx{img_suffix}
+│   │   │   │   ├── yyy{img_suffix}
+│   │   │   │   ├── zzz{img_suffix}
+│   │   │   ├── val
+│   │   ├── ann_dir
+│   │   │   ├── train
+│   │   │   │   ├── xxx{seg_map_suffix}
+│   │   │   │   ├── yyy{seg_map_suffix}
+│   │   │   │   ├── zzz{seg_map_suffix}
+│   │   │   ├── val

+```
+
+A training pair consists of the files with the same suffix in `img_dir`/`ann_dir`.
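+
+As a rough sketch, a dataset config for this layout could look like the following. It reuses the hypothetical `ExampleDataset` registered above; the `data_root` path, the suffixes and `train_pipeline` are placeholders you need to adapt to your own data:
+
+```python
+train_dataloader = dict(
+    dataset=dict(
+        type='ExampleDataset',  # any dataset class registered in DATASETS
+        data_root='data/my_dataset',
+        # point img_path/seg_map_path at the folders shown above
+        data_prefix=dict(img_path='img_dir/train', seg_map_path='ann_dir/train'),
+        img_suffix='.jpg',       # replace with your {img_suffix}
+        seg_map_suffix='.png',   # replace with your {seg_map_suffix}
+        pipeline=train_pipeline))
+```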
+
+Some datasets don't release the test set or its ground truth; since we cannot evaluate models locally without the ground truth, we set the validation set as the default test set in the config files.
+
+For how to build your own dataset or implement a new dataset class, please refer to the [datasets guide](./datasets.md) for more detailed information.
+
+**Note:** The annotations are images of shape (H, W); the pixel values should fall in the range `[0, num_classes - 1]`.
+You may use the `'P'` mode of [pillow](https://pillow.readthedocs.io/en/stable/handbook/concepts.html#palette) to create your annotation images with color.
+
+## Customize datasets by mixing datasets
+
+MMSegmentation also supports mixing datasets for training.
+Currently it supports concatenating, repeating and multi-image mixing of datasets.
+
+### Repeat dataset
+
+We use `RepeatDataset` as a wrapper to repeat a dataset.
+For example, suppose the original dataset is `Dataset_A`; to repeat it, the config looks like the following.
+
+```python
+dataset_A_train = dict(
+    type='RepeatDataset',
+    times=N,
+    dataset=dict(  # This is the original config of Dataset_A
+        type='Dataset_A',
+        ...
+        pipeline=train_pipeline
+    )
+)
+```
+
+### Concatenate dataset
+
+If the datasets you want to concatenate are different, you can concatenate their configs like the following.
+
+```python
+dataset_A_train = dict()
+dataset_B_train = dict()
+concatenate_dataset = dict(
+    type='ConcatDataset',
+    datasets=[dataset_A_train, dataset_B_train])
+```
+
+A more complex example that repeats `Dataset_A` and `Dataset_B` by N and M times, respectively, and then concatenates the repeated datasets is as follows.
+
+```python
+dataset_A_train = dict(
+    type='RepeatDataset',
+    times=N,
+    dataset=dict(
+        type='Dataset_A',
+        ...
+        pipeline=train_pipeline
+    )
+)
+dataset_A_val = dict(
+    ...
+    pipeline=test_pipeline
+)
+dataset_A_test = dict(
+    ...
+    pipeline=test_pipeline
+)
+dataset_B_train = dict(
+    type='RepeatDataset',
+    times=M,
+    dataset=dict(
+        type='Dataset_B',
+        ...
+        pipeline=train_pipeline
+    )
+)
+train_dataloader = dict(
+    dataset=dict(
+        type='ConcatDataset',
+        datasets=[dataset_A_train, dataset_B_train]))
+
+val_dataloader = dict(dataset=dataset_A_val)
+test_dataloader = dict(dataset=dataset_A_test)
+
+```
+
+You can refer to the base dataset [tutorial](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/basedataset.html) from MMEngine for more details.
+
+### Multi-image Mix Dataset
+
+We use `MultiImageMixDataset` as a wrapper to mix images from multiple datasets.
+`MultiImageMixDataset` can be used with multi-image mixed data augmentations such as mosaic and mixup.
+
+An example of using `MultiImageMixDataset` with `Mosaic` data augmentation:
+
+```python
+train_pipeline = [
+    dict(type='RandomMosaic', prob=1),
+    dict(type='Resize', img_scale=(1024, 512), keep_ratio=True),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PackSegInputs')
+]
+
+train_dataset = dict(
+    type='MultiImageMixDataset',
+    dataset=dict(
+        type=dataset_type,
+        reduce_zero_label=False,
+        img_dir=data_root + "images/train",
+        ann_dir=data_root + "annotations/train",
+        pipeline=[
+            dict(type='LoadImageFromFile'),
+            dict(type='LoadAnnotations'),
+        ]
+    ),
+    pipeline=train_pipeline
+)
+
+```
diff --git a/docs/en/advanced_guides/add_metrics.md b/docs/en/advanced_guides/add_metrics.md
new file mode 100644
index 0000000000..0298826f05
--- /dev/null
+++ b/docs/en/advanced_guides/add_metrics.md
@@ -0,0 +1,81 @@
+# Add New Metrics
+
+## Develop with the source code of MMSegmentation
+
+Here we show how to develop a new metric, using `CustomMetric` as an example.
+
+1. Create a new file `mmseg/evaluation/metrics/custom_metric.py`.
+
+   ```python
+   from typing import List, Sequence
+
+   from mmengine.evaluator import BaseMetric
+
+   from mmseg.registry import METRICS
+
+
+   @METRICS.register_module()
+   class CustomMetric(BaseMetric):
+
+       def __init__(self, arg1, arg2):
+           """
+           The metric first processes each batch of data_samples and predictions,
+           and appends the processed results to the results list. Then it
+           collects all results together from all ranks if distributed training
+           is used. Finally, it computes the metrics of the entire dataset.
+           """
+
+       def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+           pass
+
+       def compute_metrics(self, results: list) -> dict:
+           pass
+
+       def evaluate(self, size: int) -> dict:
+           pass
+   ```
+
+   In the above example, `CustomMetric` is a subclass of `BaseMetric`. It has three methods: `process`, `compute_metrics` and `evaluate`.
+
+   - `process()` processes one batch of data samples and predictions. The processed results are stored in `self.results`, which will be used to compute the metrics after all the data samples are processed. Please refer to the [MMEngine documentation](https://github.com/open-mmlab/mmengine/blob/main/docs/en/design/evaluation.md) for more details.
+
+   - `compute_metrics()` is used to compute the metrics from the processed results.
+
+   - `evaluate()` is an interface to compute the metrics and return the results. It will be called by `ValLoop` or `TestLoop` in the `Runner`. In most cases, you don't need to override this method, but you can override it if you want to do some extra work.
+
+   **Note:** You can find the details of how the `evaluate()` method is called in the `Runner` [here](https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py#L366). The `Runner` is the executor of the training and testing process; you can find more details about it in the [engine document](./engine.md).
+
+2. Import the new metric in `mmseg/evaluation/metrics/__init__.py`.
+
+   ```python
+   from .custom_metric import CustomMetric
+   __all__ = ['CustomMetric', ...]
+   ```
+
+3. Add the new metric to the config file.
+
+   ```python
+   val_evaluator = dict(type='CustomMetric', arg1=xxx, arg2=xxx)
+   test_evaluator = dict(type='CustomMetric', arg1=xxx, arg2=xxx)
+   ```
+
+## Develop with the released version of MMSegmentation
+
+The above example shows how to develop a new metric with the source code of MMSegmentation. If you want to develop a new metric with the released version of MMSegmentation, follow these steps.
+
+1. Create a new file `/Path/to/metrics/custom_metric.py` and implement the `process`, `compute_metrics` and `evaluate` methods; the `evaluate` method is optional.
+
+2. Import the new metric in your code or config file.
+
+   ```python
+   from path.to.metrics import CustomMetric
+   ```
+
+   or
+
+   ```python
+   custom_imports = dict(imports=['/Path/to/metrics'], allow_failed_imports=False)
+
+   val_evaluator = dict(type='CustomMetric', arg1=xxx, arg2=xxx)
+   test_evaluator = dict(type='CustomMetric', arg1=xxx, arg2=xxx)
+   ```
diff --git a/docs/en/advanced_guides/add_models.md b/docs/en/advanced_guides/add_models.md
new file mode 100644
index 0000000000..ed5c9ce611
--- /dev/null
+++ b/docs/en/advanced_guides/add_models.md
@@ -0,0 +1,260 @@
+# Add New Modules
+
+## Develop new components
+
+We can customize all the components introduced in [the model documentation](./models.md), such as **backbone**, **head**, **loss function** and **data preprocessor**.
+
+### Add new backbones
+
+Here we show how to develop a new backbone, with MobileNet as an example.
+
+1. Create a new file `mmseg/models/backbones/mobilenet.py`.
+
+   ```python
+   import torch.nn as nn
+
+   from mmseg.registry import MODELS
+
+
+   @MODELS.register_module()
+   class MobileNet(nn.Module):
+
+       def __init__(self, arg1, arg2):
+           pass
+
+       def forward(self, x):  # should return a tuple
+           pass
+
+       def init_weights(self, pretrained=None):
+           pass
+   ```
+
+2. Import the module in `mmseg/models/backbones/__init__.py`.
+
+   ```python
+   from .mobilenet import MobileNet
+   ```
+
+3. Use it in your config file.
+
+   ```python
+   model = dict(
+       ...
+       backbone=dict(
+           type='MobileNet',
+           arg1=xxx,
+           arg2=xxx),
+       ...
+   ```
+
+### Add new heads
+
+In MMSegmentation, we provide a [BaseDecodeHead](https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/models/decode_heads/decode_head.py#L17) for developing all segmentation heads.
+All newly implemented decode heads should be derived from it.
+Here we show how to develop a new head, with [PSPNet](https://arxiv.org/abs/1612.01105) as an example.
+
+First, add a new decode head in `mmseg/models/decode_heads/psp_head.py`.
+PSPNet implements a decode head for segmentation decoding.
+To implement a decode head, we need to implement the following three functions of the new module.
+
+```python
+from mmseg.registry import MODELS
+
+@MODELS.register_module()
+class PSPHead(BaseDecodeHead):
+
+    def __init__(self, pool_scales=(1, 2, 3, 6), **kwargs):
+        super(PSPHead, self).__init__(**kwargs)
+
+    def init_weights(self):
+        pass
+
+    def forward(self, inputs):
+        pass
+```
+
+Next, users need to add the module in `mmseg/models/decode_heads/__init__.py` so that the corresponding registry can find and load it.
The config file of PSPNet is as follows:

```python
norm_cfg = dict(type='SyncBN', requires_grad=True)
model = dict(
    type='EncoderDecoder',
    pretrained='pretrain_model/resnet50_v1c_trick-2cccc1ad.pth',
    backbone=dict(
        type='ResNetV1c',
        depth=50,
        num_stages=4,
        out_indices=(0, 1, 2, 3),
        dilations=(1, 1, 2, 4),
        strides=(1, 2, 1, 1),
        norm_cfg=norm_cfg,
        norm_eval=False,
        style='pytorch',
        contract_dilation=True),
    decode_head=dict(
        type='PSPHead',
        in_channels=2048,
        in_index=3,
        channels=512,
        pool_scales=(1, 2, 3, 6),
        dropout_ratio=0.1,
        num_classes=19,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)))
```

### Add new loss

Assume you want to add a new loss `MyLoss` for segmentation decoding.
To add a new loss function, users need to implement it in `mmseg/models/losses/my_loss.py`.
The decorator `weighted_loss` enables the loss to be weighted for each element.

```python
import torch
import torch.nn as nn

from mmseg.registry import MODELS
from .utils import weighted_loss


@weighted_loss
def my_loss(pred, target):
    assert pred.size() == target.size() and target.numel() > 0
    loss = torch.abs(pred - target)
    return loss


@MODELS.register_module()
class MyLoss(nn.Module):

    def __init__(self, reduction='mean', loss_weight=1.0):
        super(MyLoss, self).__init__()
        self.reduction = reduction
        self.loss_weight = loss_weight

    def forward(self,
                pred,
                target,
                weight=None,
                avg_factor=None,
                reduction_override=None):
        assert reduction_override in (None, 'none', 'mean', 'sum')
        reduction = (
            reduction_override if reduction_override else self.reduction)
        loss = self.loss_weight * my_loss(
            pred, target, weight, reduction=reduction, avg_factor=avg_factor)
        return loss
```

Then users need to add it to `mmseg/models/losses/__init__.py`.

```python
from .my_loss import MyLoss, my_loss
```

To use it, modify the `loss_decode` field in the head; `loss_weight` can be used to balance multiple losses.

```python
loss_decode=dict(type='MyLoss', loss_weight=1.0)
```

### Add new data preprocessor

In MMSegmentation 1.x versions, we use [SegDataPreProcessor](https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/models/data_preprocessor.py#L13) by default to copy data to the target device and preprocess the data into the model input format. Here we show how to develop a new data preprocessor.

1. Create a new file `mmseg/models/my_datapreprocessor.py`.

   ```python
   from typing import Any, Dict

   from mmengine.model import BaseDataPreprocessor

   from mmseg.registry import MODELS


   @MODELS.register_module()
   class MyDataPreProcessor(BaseDataPreprocessor):
       def __init__(self, **kwargs):
           super().__init__(**kwargs)

       def forward(self, data: dict, training: bool = False) -> Dict[str, Any]:
           # TODO Define the logic for data pre-processing in the forward method
           pass
   ```

2. Import your data preprocessor in `mmseg/models/__init__.py`

   ```python
   from .my_datapreprocessor import MyDataPreProcessor
   ```

3. Use it in your config file.

   ```python
   model = dict(
       data_preprocessor=dict(type='MyDataPreProcessor'),
       ...
   )
   ```

## Develop new segmentors

The segmentor is an algorithmic architecture in which users can customize their algorithms by adding customized components and defining the logic of algorithm execution.
Please refer to [the model document](./models.md) for more details.

Since the [BaseSegmentor](https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/models/segmentors/base.py#L15) in MMSegmentation unifies three modes for a forward process, to develop a new segmentor, users need to overwrite the `loss`, `predict` and `_forward` methods corresponding to the `loss`, `predict` and `tensor` modes.

Here we show how to develop a new segmentor.

1. Create a new file `mmseg/models/segmentors/my_segmentor.py`.

   ```python
   from typing import List, Tuple

   from torch import Tensor

   from mmseg.registry import MODELS
   from mmseg.models import BaseSegmentor
   from mmseg.utils import OptSampleList, SampleList


   @MODELS.register_module()
   class MySegmentor(BaseSegmentor):
       def __init__(self, **kwargs):
           super().__init__(**kwargs)
           # TODO users should build components of the network here

       def loss(self, inputs: Tensor, data_samples: SampleList) -> dict:
           """Calculate losses from a batch of inputs and data samples."""
           pass

       def predict(self, inputs: Tensor, data_samples: OptSampleList = None) -> SampleList:
           """Predict results from a batch of inputs and data samples with post-
           processing."""
           pass

       def _forward(self,
                    inputs: Tensor,
                    data_samples: OptSampleList = None) -> Tuple[List[Tensor]]:
           """Network forward process.

           Usually includes backbone, neck and head forward without any post-
           processing.
           """
           pass
   ```

2. Import your segmentor in `mmseg/models/segmentors/__init__.py`.

   ```python
   from .my_segmentor import MySegmentor
   ```

3. Use it in your config file.

   ```python
   model = dict(
       type='MySegmentor',
       ...
   )
   ```

diff --git a/docs/en/advanced_guides/add_transforms.md b/docs/en/advanced_guides/add_transforms.md
new file mode 100644
index 0000000000..ca336ce046
--- /dev/null
+++ b/docs/en/advanced_guides/add_transforms.md
@@ -0,0 +1,52 @@

# Adding New Data Transforms

## Customize data transformations

A customized data transform must inherit from `BaseTransform` and implement the `transform` function.
Here we use a simple flipping transform as an example:

```python
import mmcv
from mmcv.transforms import BaseTransform, TRANSFORMS


@TRANSFORMS.register_module()
class MyFlip(BaseTransform):
    def __init__(self, direction: str):
        super().__init__()
        self.direction = direction

    def transform(self, results: dict) -> dict:
        img = results['img']
        results['img'] = mmcv.imflip(img, direction=self.direction)
        return results
```

Then, import the new class.

```python
from .my_pipeline import MyFlip
```

Thus, we can instantiate a `MyFlip` object and use it to process the data dict.

```python
import numpy as np

transform = MyFlip(direction='horizontal')
data_dict = {'img': np.random.rand(224, 224, 3)}
data_dict = transform(data_dict)
processed_img = data_dict['img']
```

Alternatively, we can use the `MyFlip` transform in the data pipeline of our config file.

```python
pipeline = [
    ...
    dict(type='MyFlip', direction='horizontal'),
    ...
]
```

Note that if you want to use `MyFlip` in a config, you must ensure that the file containing `MyFlip` is imported at runtime, for example as sketched below.
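One way to guarantee this is the `custom_imports` mechanism used throughout these guides; the module path below is an assumption matching the `my_pipeline` import above:

```python
# Make sure the module defining MyFlip is imported when the config is loaded.
custom_imports = dict(imports=['path.to.my_pipeline'], allow_failed_imports=False)
```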
diff --git a/docs/en/advanced_guides/customize_runtime.md b/docs/en/advanced_guides/customize_runtime.md
new file mode 100644
index 0000000000..33281bfe4a
--- /dev/null
+++ b/docs/en/advanced_guides/customize_runtime.md
@@ -0,0 +1,168 @@

# Customize Runtime Settings

## Customize hooks

### Step 1: Implement a new hook

MMEngine provides commonly used [hooks](https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/hook.md) for training and testing.
When users have customization requirements, they can follow the examples below.
For example, if some hyper-parameter of the model needs to be changed during training, we can implement a new hook for it:

```python
# Copyright (c) OpenMMLab. All rights reserved.
from typing import Optional, Sequence

from mmengine.hooks import Hook
from mmengine.model import is_model_wrapper

from mmseg.registry import HOOKS


@HOOKS.register_module()
class NewHook(Hook):
    """Docstring for NewHook."""

    def __init__(self, a: int, b: int) -> None:
        self.a = a
        self.b = b

    def before_train_iter(self,
                          runner,
                          batch_idx: int,
                          data_batch: Optional[Sequence[dict]] = None) -> None:
        cur_iter = runner.iter
        model = runner.model
        # unwrap the model when it is in a wrapper (e.g. DDP)
        if is_model_wrapper(model):
            model = model.module
        model.hyper_parameter = self.a * cur_iter + self.b
```

### Step 2: Import a new hook

The module defined above needs to be imported into the main namespace first so that it gets registered.
We assume `NewHook` is implemented in `mmseg/engine/hooks/new_hook.py`; there are two ways to import it:

- Import it by modifying `mmseg/engine/hooks/__init__.py`.
  Modules should be imported in `mmseg/engine/hooks/__init__.py` so that the new modules can be found and added by the registry.

```python
from .new_hook import NewHook

__all__ = [..., 'NewHook']
```

- Import it manually with `custom_imports` in the config file.

```python
custom_imports = dict(imports=['mmseg.engine.hooks.new_hook'], allow_failed_imports=False)
```

### Step 3: Modify config file

Users can set and use customized hooks in training and testing as follows.
The execution priority of hooks registered at the same point in the `Runner` is documented [here](https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/hook.md#built-in-hooks).
The default priority of a customized hook is `NORMAL`.

```python
custom_hooks = [
    dict(type='NewHook', a=a_value, b=b_value, priority='ABOVE_NORMAL')
]
```

## Customize optimizer

### Step 1: Implement a new optimizer

We recommend implementing the customized optimizer in `mmseg/engine/optimizers/my_optimizer.py`. Here is an example of a new optimizer `MyOptimizer`, which has parameters `a`, `b` and `c`:

```python
from mmseg.registry import OPTIMIZERS
from torch.optim import Optimizer


@OPTIMIZERS.register_module()
class MyOptimizer(Optimizer):

    def __init__(self, a, b, c):
        pass
```

### Step 2: Import a new optimizer

The module defined above needs to be imported into the main namespace first so that it gets registered.
We assume `MyOptimizer` is implemented in `mmseg/engine/optimizers/my_optimizer.py`; there are two ways to import it:

- Import it by modifying `mmseg/engine/optimizers/__init__.py`.
  Modules should be imported in `mmseg/engine/optimizers/__init__.py` so that the new modules can be found and added by the registry.

```python
from .my_optimizer import MyOptimizer
```

- Import it manually with `custom_imports` in the config file.
```python
custom_imports = dict(imports=['mmseg.engine.optimizers.my_optimizer'], allow_failed_imports=False)
```

### Step 3: Modify config file

Then modify the `optimizer` field in the `optim_wrapper` of the config file. To use the customized `MyOptimizer`, it can be set as follows:

```python
optim_wrapper = dict(type='OptimWrapper',
                     optimizer=dict(type='MyOptimizer',
                                    a=a_value, b=b_value, c=c_value),
                     clip_grad=None)
```

## Customize optimizer constructor

### Step 1: Implement a new optimizer constructor

The optimizer constructor is used to create the optimizer and optimizer wrapper for model training. It provides powerful functions such as specifying different learning rates and weight decays for different model layers.
Here is an example of a customized optimizer constructor.

```python
from mmengine.optim import DefaultOptimWrapperConstructor
from mmseg.registry import OPTIM_WRAPPER_CONSTRUCTORS


@OPTIM_WRAPPER_CONSTRUCTORS.register_module()
class LearningRateDecayOptimizerConstructor(DefaultOptimWrapperConstructor):
    def __init__(self, optim_wrapper_cfg, paramwise_cfg=None):
        pass

    def __call__(self, model):
        ...  # build `my_optimizer` for the given model here
        return my_optimizer
```

The default optimizer constructor is implemented [here](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/default_constructor.py#L19).
It can also be used as the base class of new optimizer constructors.

### Step 2: Import a new optimizer constructor

The module defined above needs to be imported into the main namespace first so that it gets registered.
We assume `MyOptimizerConstructor` is implemented in `mmseg/engine/optimizers/my_optimizer_constructor.py`; there are two ways to import it:

- Import it by modifying `mmseg/engine/optimizers/__init__.py`.
  Modules should be imported in `mmseg/engine/optimizers/__init__.py` so that the new modules can be found and added by the registry.

```python
from .my_optimizer_constructor import MyOptimizerConstructor
```

- Import it manually with `custom_imports` in the config file.

```python
custom_imports = dict(imports=['mmseg.engine.optimizers.my_optimizer_constructor'], allow_failed_imports=False)
```

### Step 3: Modify config file

Then modify the `constructor` field in the `optim_wrapper` of the config file. To use the customized `MyOptimizerConstructor`, it can be set as follows:

```python
optim_wrapper = dict(type='OptimWrapper',
                     constructor='MyOptimizerConstructor',
                     clip_grad=None)
```

diff --git a/docs/en/advanced_guides/data_flow.md b/docs/en/advanced_guides/data_flow.md
new file mode 100644
index 0000000000..404035aee4
--- /dev/null
+++ b/docs/en/advanced_guides/data_flow.md
@@ -0,0 +1,87 @@

# Dataflow

In this chapter, we will introduce the dataflow and data format conventions between the internal modules managed by the [Runner](https://mmengine.readthedocs.io/en/latest/tutorials/runner.html).

## Overview of dataflow

The [Runner](https://github.com/open-mmlab/mmengine/blob/main/docs/en/design/runner.md) is an "integrator" in MMEngine. It covers all aspects of the framework and shoulders the responsibility of organizing and scheduling nearly all modules, which means the dataflow between all modules is also controlled by the `Runner`. As illustrated in the [Runner document of MMEngine](https://mmengine.readthedocs.io/en/latest/tutorials/runner.html), the following diagram shows the basic dataflow.
![Basic dataflow](https://user-images.githubusercontent.com/112053249/199228350-5f80699e-7fd2-4b4c-ac32-0b16b1922c2e.png)

The dashed-border, gray-filled shapes represent different data formats, while solid boxes represent modules/methods. Due to the great flexibility and extensibility of MMEngine, some critical base classes can be inherited and their methods can be overridden. The diagram above only holds when users are not customizing `TrainLoop`, `ValLoop`, and `TestLoop` in `Runner`, and are not overriding the `train_step`, `val_step` and `test_step` methods in their custom model. The default setting of loops in MMSegmentation is as follows: it uses `IterBasedTrainLoop` to train models for 20000 iterations in total and performs evaluation every 2000 iterations.

```python
train_cfg = dict(type='IterBasedTrainLoop', max_iters=20000, val_interval=2000)
val_cfg = dict(type='ValLoop')
test_cfg = dict(type='TestLoop')
```

In the above diagram, the red line indicates the [train_step](./models.md#train_step). At each training iteration, the dataloader loads images from storage and transfers them to the data preprocessor; the data preprocessor puts the images on the specific device and stacks the data into a batch; the model then accepts the batch data as inputs, and finally the outputs of the model are sent to the optimizer. The blue line indicates [val_step](./models.md#val_step) and [test_step](./models.md#test_step). The dataflow of these two processes is similar to that of `train_step`, except for the outputs of the model: since the model parameters are frozen during evaluation, the model outputs are transferred to the [Evaluator](./evaluation.md#ioumetric) to compute metrics.

## Dataflow convention in MMSegmentation

The diagram above shows the basic dataflow. In this section, we introduce the format conventions of the data involved in this dataflow.

### DataLoader to Data Preprocessor

DataLoader is an essential component in the training and testing pipelines of MMEngine. Conceptually, it is derived from and consistent with [PyTorch](https://pytorch.org/). The DataLoader loads data from the filesystem, the original data passes through the data preparation pipeline, and the result is then sent to the data preprocessor.

MMSegmentation defines the default data format in [PackSegInputs](https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/datasets/transforms/formatting.py#L12); it is the last component of `train_pipeline` and `test_pipeline`. Please refer to the [data transform documentation](./transforms.md) for more information about the data transform `pipeline`.

Without any modifications, the return value of PackSegInputs is usually a `dict` that has only two keys, `inputs` and `data_samples`. The following pseudo-code shows the data types of the dataloader output in mmseg: a batch of data samples fetched from the dataset, which the dataloader packs into a dictionary of lists. `inputs` is the list of input tensors to the model, and `data_samples` contains a list of the input images' meta information and corresponding ground truth.

```python
dict(
    inputs=List[torch.Tensor],
    data_samples=List[SegDataSample]
)
```

**Note:** [SegDataSample](https://github.com/open-mmlab/mmsegmentation/blob/1.x/mmseg/structures/seg_data_sample.py) is a data structure interface of MMSegmentation, used as an interface between different components.
`SegDataSample` implements the abstract data element `mmengine.structures.BaseDataElement`; please refer to [the SegDataSample documentation](./structures.md) and the [data element documentation](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/data_element.html) in [MMEngine](https://github.com/open-mmlab/mmengine) for more information.

### Data Preprocessor to Model

Though drawn separately in the diagram [above](#overview-of-dataflow), the data_preprocessor is a part of the model and is therefore covered in the data preprocessor chapter of the [model tutorial](./models.md).

The return value of the data preprocessor is a dictionary containing `inputs` and `data_samples`: `inputs` is the batched images, a 4D tensor, and some additional meta information used in data preprocessing is added to `data_samples`. When transferred to the network, the dictionary is unpacked into two values. The following pseudo-code shows the return value of the data preprocessor and the input values of the model.

```python
dict(
    inputs=torch.Tensor,
    data_samples=List[SegDataSample]
)
```

```python
class Network(BaseSegmentor):

    def forward(self, inputs: torch.Tensor, data_samples: List[SegDataSample], mode: str):
        pass
```

**Note:** The model forward has 3 kinds of modes, controlled by the input argument `mode`; please refer to the [model tutorial](./models.md) for more details.

### Model output

As the [model tutorial](./models.md#forward) mentioned, the 3 kinds of forward modes produce 3 kinds of outputs. `train_step` and `test_step` (or `val_step`) correspond to `'loss'` and `'predict'`, respectively.

In `test_step` or `val_step`, the inference results are transferred to the `Evaluator`. You can read the [evaluation document](./evaluation.md) for more information about the `Evaluator`.

After inference, the [BaseSegmentor](https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/models/segmentors/base.py#L15) in MMSegmentation does a simple post-processing to pack the inference results: the segmentation logits produced by the neural network, the segmentation mask after the `argmax` operation, and the ground truth (if it exists) are packed into a similar `SegDataSample` instance. The return value of [postprocess_result](https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/models/segmentors/base.py#L132) is a **`List` of `SegDataSample`**. The following diagram shows the key properties of these `SegDataSample` instances.

![SegDataSample](https://user-images.githubusercontent.com/15952744/209912225-ab46a8d9-904a-43cb-8bf1-8bec4938ed29.png)

Like the data preprocessor, the loss function is also a part of the model; it is a property of the [decode head](https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/models/decode_heads/decode_head.py#L142).

In MMSegmentation, the method [loss_by_feat](https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/models/decode_heads/decode_head.py#L291) of `decode_head` is a unified interface used to compute the loss.

Parameters:

- seg_logits (Tensor): The output from the decode head forward function.
- batch_data_samples (List\[:obj:`SegDataSample`\]): The seg data samples. It usually includes information such as `metainfo` and `gt_sem_seg`.

Returns:

- dict\[str, Tensor\]: a dictionary of loss components

**Note:** `train_step` passes the loss to the OptimWrapper to update the model weights; please refer to [train_step](./models.md#train_step) for more details.
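To make the `loss_by_feat` interface above concrete, here is a schematic sketch (not the actual MMSegmentation source) of how a decode head's `loss` method could wire its forward output into `loss_by_feat`, following the parameter and return conventions just described:

```python
# Schematic sketch: how a decode head turns features into a loss dict.
def loss(self, inputs, batch_data_samples, train_cfg):
    seg_logits = self.forward(inputs)  # Tensor from the head forward
    # dict[str, Tensor] of loss components, e.g. {'loss_ce': ...}
    losses = self.loss_by_feat(seg_logits, batch_data_samples)
    return losses
```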
diff --git a/docs/en/advanced_guides/datasets.md b/docs/en/advanced_guides/datasets.md
new file mode 100644
index 0000000000..1efc3346fd
--- /dev/null
+++ b/docs/en/advanced_guides/datasets.md
@@ -0,0 +1,386 @@

# Dataset

Dataset classes in MMSegmentation have two functions: (1) load data information after [data preparation](../user_guides/2_dataset_prepare.md)
and (2) send data into the [dataset transform pipeline](https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/datasets/basesegdataset.py#L141) to do [data augmentation](./transforms.md).
There are 2 kinds of loaded information: (1) meta information, i.e. original dataset information such as the categories (classes) of the dataset and their corresponding palette information, and (2) data information, which includes
the paths of dataset images and labels.
This tutorial covers the main interfaces of the MMSegmentation 1.x dataset classes: the methods of loading data information and modifying dataset classes in the base dataset class, and the relationship between the dataset and the data transform pipeline.

## Main Interfaces

Take Cityscapes as an example. If you want to run the example, please download and [preprocess](../user_guides/2_dataset_prepare.md#cityscapes)
the Cityscapes dataset into the `data` directory before running the demo code.

Instantiate the Cityscapes training dataset:

```python
from mmseg.datasets import CityscapesDataset
from mmengine.registry import init_default_scope
init_default_scope('mmseg')

data_root = 'data/cityscapes/'
data_prefix = dict(img_path='leftImg8bit/train', seg_map_path='gtFine/train')
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations'),
    dict(type='RandomCrop', crop_size=(512, 1024), cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PackSegInputs')
]

dataset = CityscapesDataset(data_root=data_root, data_prefix=data_prefix, test_mode=False, pipeline=train_pipeline)
```

Get the length of the training set:

```python
print(len(dataset))

2975
```

Get data information: the type of the data information is `dict`, which includes several keys:

- `'img_path'`: path of the images
- `'seg_map_path'`: path of the segmentation labels
- `'seg_fields'`: fields for saving the segmentation labels
- `'sample_idx'`: the index of the current sample

There are also `'label_map'` and `'reduce_zero_label'`, whose functions are introduced in the next section.

```python
# Acquire data information of first sample in dataset
print(dataset.get_data_info(0))

{'img_path': 'data/cityscapes/leftImg8bit/train/aachen/aachen_000000_000019_leftImg8bit.png',
 'seg_map_path': 'data/cityscapes/gtFine/train/aachen/aachen_000000_000019_gtFine_labelTrainIds.png',
 'label_map': None,
 'reduce_zero_label': False,
 'seg_fields': [],
 'sample_idx': 0}
```

Get the dataset meta information: the type of MMSegmentation meta information is also `dict`, which includes the `'classes'` field for dataset classes, the `'palette'` field for the corresponding colors in visualization, and the `'label_map'` and `'reduce_zero_label'` fields.
```python
print(dataset.metainfo)

{'classes': ('road',
 'sidewalk',
 'building',
 'wall',
 'fence',
 'pole',
 'traffic light',
 'traffic sign',
 'vegetation',
 'terrain',
 'sky',
 'person',
 'rider',
 'car',
 'truck',
 'bus',
 'train',
 'motorcycle',
 'bicycle'),
 'palette': [[128, 64, 128],
 [244, 35, 232],
 [70, 70, 70],
 [102, 102, 156],
 [190, 153, 153],
 [153, 153, 153],
 [250, 170, 30],
 [220, 220, 0],
 [107, 142, 35],
 [152, 251, 152],
 [70, 130, 180],
 [220, 20, 60],
 [255, 0, 0],
 [0, 0, 142],
 [0, 0, 70],
 [0, 60, 100],
 [0, 80, 100],
 [0, 0, 230],
 [119, 11, 32]],
 'label_map': None,
 'reduce_zero_label': False}
```

The return value of the dataset `__getitem__` method is the data sample after data augmentation, whose type is also `dict`. It has two fields: `'inputs'`, corresponding to the image after data augmentation,
and `'data_samples'`, corresponding to [`SegDataSample`](./structures.md), which is the new data structure in MMSegmentation 1.x;
the `gt_sem_seg` field of `SegDataSample` holds the labels after the data augmentation operations.

```python
print(dataset[0])

{'inputs': tensor([[[131, 130, 130,  ...,  23,  23,  23],
         [132, 132, 132,  ...,  23,  22,  23],
         [134, 133, 133,  ...,  23,  23,  23],
         ...,
         [ 66,  67,  67,  ...,  71,  71,  71],
         [ 66,  67,  66,  ...,  68,  68,  68],
         [ 67,  67,  66,  ...,  70,  70,  70]],

        [[143, 143, 142,  ...,  28,  28,  29],
         [145, 145, 145,  ...,  28,  28,  29],
         [145, 145, 145,  ...,  27,  28,  29],
         ...,
         [ 75,  75,  76,  ...,  80,  81,  81],
         [ 75,  76,  75,  ...,  80,  80,  80],
         [ 77,  76,  76,  ...,  82,  82,  82]],

        [[126, 125, 126,  ...,  21,  21,  22],
         [127, 127, 128,  ...,  21,  21,  22],
         [127, 127, 126,  ...,  21,  21,  22],
         ...,
         [ 63,  63,  64,  ...,  69,  69,  70],
         [ 64,  65,  64,  ...,  69,  69,  69],
         [ 65,  66,  66,  ...,  72,  71,  71]]], dtype=torch.uint8),
 'data_samples': <SegDataSample(
    ...
    _gt_sem_seg: <PixelData(...)>
    ...
 )>}
```

## BaseSegDataset

As mentioned above, dataset classes share the same functions, so we implemented [`BaseSegDataset`](https://mmsegmentation.readthedocs.io/en/latest/api.html?highlight=BaseSegDataset#mmseg.datasets.BaseSegDataset) to reuse the common functions.
It inherits from [`BaseDataset` of MMEngine](https://github.com/open-mmlab/mmengine/blob/main/docs/en/advanced_tutorials/basedataset.md) and follows the unified initialization process of OpenMMLab. It supports an efficient internal data storage format and functions such as
dataset concatenation and repeated sampling. In the MMSegmentation `BaseSegDataset`, the **method of loading data information** (`load_data_list`) is redefined, and the new `get_label_map` method is added to **modify the dataset class information**.

### Loading Dataset Information

The loaded data information includes the paths of image samples and annotation samples; the detailed implementation can be found in
[`load_data_list`](https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/datasets/basesegdataset.py#L231) of `BaseSegDataset` in MMSegmentation.
There are two main ways to acquire the paths of images and labels:

1. Load file paths according to the directory and suffix of the input images and annotations
If the dataset directory structure is organized as below, [`load_data_list`](https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/datasets/basesegdataset.py#L231) can parse the dataset directory structure:

```
├── data
│   ├── my_dataset
│   │   ├── img_dir
│   │   │   ├── train
│   │   │   │   ├── xxx{img_suffix}
│   │   │   │   ├── yyy{img_suffix}
│   │   │   ├── val
│   │   │   │   ├── zzz{img_suffix}
│   │   ├── ann_dir
│   │   │   ├── train
│   │   │   │   ├── xxx{seg_map_suffix}
│   │   │   │   ├── yyy{seg_map_suffix}
│   │   │   ├── val
│   │   │   │   ├── zzz{seg_map_suffix}
```

Here is an example of ADE20K; below is the directory structure of the dataset:

```
├── ade
│   ├── ADEChallengeData2016
│   │   ├── annotations
│   │   │   ├── training
│   │   │   │   ├── ADE_train_00000001.png
│   │   │   │   ├── ...
│   │   │   │── validation
│   │   │   │   ├── ADE_val_00000001.png
│   │   │   │   ├── ...
│   │   ├── images
│   │   │   ├── training
│   │   │   │   ├── ADE_train_00000001.jpg
│   │   │   │   ├── ...
│   │   │   ├── validation
│   │   │   │   ├── ADE_val_00000001.jpg
│   │   │   │   ├── ...
```

```python
from mmseg.datasets import ADE20KDataset

ADE20KDataset(data_root='data/ade/ADEChallengeData2016',
    data_prefix=dict(img_path='images/training', seg_map_path='annotations/training'),
    img_suffix='.jpg',
    seg_map_suffix='.png',
    reduce_zero_label=True)
```

2. Load file paths from an annotation file

The dataset can also load an annotation file that includes the data sample paths of the dataset.
Take the PascalContext dataset as an example; its input annotation file is:

```
2008_000008
...
```

The `ann_file` needs to be defined at instantiation:

```python
PascalContextDataset(data_root='data/VOCdevkit/VOC2010/',
    data_prefix=dict(img_path='JPEGImages', seg_map_path='SegmentationClassContext'),
    ann_file='ImageSets/SegmentationContext/train.txt')
```

### Modification of Dataset Classes

- Use the `metainfo` input argument

Meta information is defined as class variables, such as the `METAINFO` variable of Cityscapes:

```python
class CityscapesDataset(BaseSegDataset):
    """Cityscapes dataset.

    The ``img_suffix`` is fixed to '_leftImg8bit.png' and ``seg_map_suffix`` is
    fixed to '_gtFine_labelTrainIds.png' for Cityscapes dataset.
    """
    METAINFO = dict(
        classes=('road', 'sidewalk', 'building', 'wall', 'fence', 'pole',
                 'traffic light', 'traffic sign', 'vegetation', 'terrain',
                 'sky', 'person', 'rider', 'car', 'truck', 'bus', 'train',
                 'motorcycle', 'bicycle'),
        palette=[[128, 64, 128], [244, 35, 232], [70, 70, 70], [102, 102, 156],
                 [190, 153, 153], [153, 153, 153], [250, 170, 30], [220, 220, 0],
                 [107, 142, 35], [152, 251, 152], [70, 130, 180],
                 [220, 20, 60], [255, 0, 0], [0, 0, 142], [0, 0, 70],
                 [0, 60, 100], [0, 80, 100], [0, 0, 230], [119, 11, 32]])
```

Here `'classes'` defines the class names of the Cityscapes dataset annotations. If users only care about some vehicle classes and want to **ignore the other classes**,
the meta information of the dataset can be modified via the `metainfo` input argument when instantiating the Cityscapes dataset:

```python
from mmseg.datasets import CityscapesDataset

data_root = 'data/cityscapes/'
data_prefix = dict(img_path='leftImg8bit/train', seg_map_path='gtFine/train')
# metainfo only keeps the classes below:
metainfo = dict(classes=('car', 'truck', 'bus', 'train', 'motorcycle', 'bicycle'))
dataset = CityscapesDataset(data_root=data_root, data_prefix=data_prefix, metainfo=metainfo)

print(dataset.metainfo)

{'classes': ('car', 'truck', 'bus', 'train', 'motorcycle', 'bicycle'),
 'palette': [[0, 0, 142],
 [0, 0, 70],
 [0, 60, 100],
 [0, 80, 100],
 [0, 0, 230],
 [119, 11, 32],
 [128, 64, 128],
 [244, 35, 232],
 [70, 70, 70],
 [102, 102, 156],
 [190, 153, 153],
 [153, 153, 153],
 [250, 170, 30],
 [220, 220, 0],
 [107, 142, 35],
 [152, 251, 152],
 [70, 130, 180],
 [220, 20, 60],
 [255, 0, 0]],
 # pixels whose label index is 255 are ignored when calculating the loss
 'label_map': {0: 255,
 1: 255,
 2: 255,
 3: 255,
 4: 255,
 5: 255,
 6: 255,
 7: 255,
 8: 255,
 9: 255,
 10: 255,
 11: 255,
 12: 255,
 13: 0,
 14: 1,
 15: 2,
 16: 3,
 17: 4,
 18: 5},
 'reduce_zero_label': False}
```

The meta information now differs from the default setting of the Cityscapes dataset. Moreover, the `label_map` field is defined, which is used to modify the label index of each pixel on the segmentation mask.
The segmentation label re-maps the class information according to `label_map`; the detailed implementation is [here](https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/datasets/basesegdataset.py#L151):

```python
gt_semantic_seg_copy = gt_semantic_seg.copy()
for old_id, new_id in results['label_map'].items():
    gt_semantic_seg[gt_semantic_seg_copy == old_id] = new_id
```

- Use the `reduce_zero_label` input argument

To ignore label 0 (as in the ADE20K dataset), we can use the `reduce_zero_label` argument (defaults to `False`) of BaseSegDataset and its subclasses.
When `reduce_zero_label` is `True`, label 0 in the segmentation annotations is set to 255 (MMSegmentation models ignore label 255 when calculating the loss) and the indices of the other labels are decreased by 1:

```python
gt_semantic_seg[gt_semantic_seg == 0] = 255
gt_semantic_seg = gt_semantic_seg - 1
gt_semantic_seg[gt_semantic_seg == 254] = 255
```

## Dataset and Data Transform Pipeline

If the `pipeline` argument is defined, the return value of the `__getitem__` method is the data sample after data augmentation.
If the `pipeline` argument is not defined, the return value is the same as that of the `get_data_info` method:
```python
from mmseg.datasets import CityscapesDataset

data_root = 'data/cityscapes/'
data_prefix = dict(img_path='leftImg8bit/train', seg_map_path='gtFine/train')
dataset = CityscapesDataset(data_root=data_root, data_prefix=data_prefix, test_mode=False)

print(dataset[0])

{'img_path': 'data/cityscapes/leftImg8bit/train/aachen/aachen_000000_000019_leftImg8bit.png',
 'seg_map_path': 'data/cityscapes/gtFine/train/aachen/aachen_000000_000019_gtFine_labelTrainIds.png',
 'label_map': None,
 'reduce_zero_label': False,
 'seg_fields': [],
 'sample_idx': 0}
```

diff --git a/docs/en/advanced_guides/engine.md b/docs/en/advanced_guides/engine.md
new file mode 100644
index 0000000000..7acfe5ad64
--- /dev/null
+++ b/docs/en/advanced_guides/engine.md
@@ -0,0 +1,279 @@

# Training Engine

MMEngine defines some [basic loop controllers](https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py) such as the epoch-based training loop (`EpochBasedTrainLoop`), the iteration-based training loop (`IterBasedTrainLoop`), the standard validation loop (`ValLoop`), and the standard testing loop (`TestLoop`).

OpenMMLab's algorithm libraries like MMSegmentation abstract model training, testing, and inference into a `Runner`. Users can use the default `Runner` in MMEngine directly or modify the `Runner` to meet customized needs. This document mainly introduces the basic concepts and usage of runtime settings, hooks, and optimizers, and how users can configure them.

## Configuring Runtime Settings

### Configuring Training Iterations

Loop controllers refer to the execution process during training, validation, and testing. `train_cfg`, `val_cfg`, and `test_cfg` are used to build these processes in the configuration file. MMSegmentation sets commonly used training iterations in `train_cfg` under the `configs/_base_/schedules` folder.
For example, to train for 80,000 iterations using the iteration-based training loop (`IterBasedTrainLoop`) and perform validation every 8,000 iterations, you can set it as follows:

```python
train_cfg = dict(type='IterBasedTrainLoop', max_iters=80000, val_interval=8000)
```

### Configuring Training Optimizers

Here's an example of an SGD optimizer:

```python
optim_wrapper = dict(
    type='OptimWrapper',
    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005),
    clip_grad=None)
```

OpenMMLab supports all optimizers in PyTorch. For more details, please refer to the [MMEngine optimizer documentation](https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/optim_wrapper.md).

It is worth emphasizing that `optim_wrapper` is a variable of `runner`, so when configuring the optimizer, the field to configure is the `optim_wrapper` field. For more information on using optimizers, see the [Optimizer](#Optimizer) section below.

### Configuring Training Parameter Schedulers

Before configuring the training parameter scheduler, it is recommended to first understand the basic concepts of parameter schedulers in the [MMEngine documentation](https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md).

Here's an example of a parameter scheduler. During training, a linearly changing learning rate strategy is used for warm-up in the first 1,000 iterations.
After the first 1,000 iterations and until the 160,000 iterations at the end, the default polynomial learning rate decay is used:

```python
param_scheduler = [
    dict(type='LinearLR', by_epoch=False, start_factor=0.1, begin=0, end=1000),
    dict(
        type='PolyLR',
        eta_min=1e-4,
        power=0.9,
        begin=1000,
        end=160000,
        by_epoch=False,
    )
]
```

Note: When modifying `max_iters` in `train_cfg`, make sure the parameters in the parameter scheduler `param_scheduler` are also modified accordingly.

## Hook

### Introduction

OpenMMLab abstracts the model training and testing process as `Runner`. Inserting hooks into the `Runner` can implement the corresponding functionality needed at different training and testing stages (such as "before and after each training iter", "before and after each validation iter", etc.). For more introduction on hook mechanisms, please refer to [here](https://www.calltutors.com/blog/what-is-hook).

Hooks used in `Runner` are divided into two categories:

- Default hooks:

They implement essential functions during training and are defined in the configuration file by `default_hooks` and passed to `Runner`. `Runner` registers them through the [`register_default_hooks`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/runner.py#L1780) method.

Hooks have corresponding priorities; the higher the priority, the earlier the runner calls them. If the priorities are the same, the calling order is consistent with the hook registration order.

It is not recommended for users to modify the default hook priorities. Please refer to the [MMEngine hooks documentation](https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/hook.md) to understand the hook priority definitions.

The following are the default hooks used in MMSegmentation:

| Hook | Function | Priority |
| :---: | :---: | :---: |
| [IterTimerHook](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/iter_timer_hook.py) | Record the time spent on each iteration. | NORMAL (50) |
| [LoggerHook](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/logger_hook.py) | Collect log records from different components in `Runner` and output them to terminal, JSON file, tensorboard, wandb, etc. | BELOW_NORMAL (60) |
| [ParamSchedulerHook](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/param_scheduler_hook.py) | Update some hyperparameters in the optimizer, such as the learning rate and momentum. | LOW (70) |
| [CheckpointHook](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py) | Regularly save checkpoint files. | VERY_LOW (90) |
| [DistSamplerSeedHook](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/sampler_seed_hook.py) | Ensure the distributed sampler shuffle is enabled. | NORMAL (50) |
| [SegVisualizationHook](https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/engine/hooks/visualization_hook.py) | Visualize prediction results during validation and testing. | NORMAL (50) |
MMSegmentation registers some hooks with essential training functions in `default_hooks`:

```python
default_hooks = dict(
    timer=dict(type='IterTimerHook'),
    logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False),
    param_scheduler=dict(type='ParamSchedulerHook'),
    checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=32000),
    sampler_seed=dict(type='DistSamplerSeedHook'),
    visualization=dict(type='SegVisualizationHook'))
```

All the default hooks mentioned above, except for `SegVisualizationHook`, are implemented in MMEngine. The `SegVisualizationHook` is a hook implemented in MMSegmentation, which will be introduced later.

- Modifying default hooks

We will use the `logger` and `checkpoint` entries as examples to demonstrate how to modify the default hooks in `default_hooks`.

(1) Model saving configuration

`default_hooks` uses the `checkpoint` field to initialize the [model saving hook (CheckpointHook)](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py#L19).

```python
checkpoint = dict(type='CheckpointHook', interval=1)
```

Users can set `max_keep_ckpts` to save only a small number of checkpoints or use `save_optimizer` to determine whether to save optimizer information. More details on the related parameters can be found [here](https://mmengine.readthedocs.io/en/latest/api/generated/mmengine.hooks.CheckpointHook.html#checkpointhook).

(2) Logging configuration

The `LoggerHook` is used to collect log information from different components in `Runner` and write it to terminal, JSON files, tensorboard, wandb, etc.

```python
logger=dict(type='LoggerHook', interval=10)
```

In the latest 1.x version of MMSegmentation, some logger hooks (LoggerHook) such as `TextLoggerHook`, `WandbLoggerHook`, and `TensorboardLoggerHook` are no longer used. Instead, MMEngine uses `LogProcessor` to handle the information processed by the aforementioned hooks, which are now in [`MessageHub`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/logging/message_hub.py#L17), [`WandbVisBackend`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py#L324), and [`TensorboardVisBackend`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py#L472).

Detailed usage is as follows: configure the visualizer and specify the visualization backend at the same time, here using Tensorboard as the visualizer's backend:

```python
# TensorboardVisBackend
visualizer = dict(
    type='SegLocalVisualizer', vis_backends=[dict(type='TensorboardVisBackend')], name='visualizer')
```

For more related usage, please refer to the [MMEngine Visualization Backend User Tutorial](https://github.com/open-mmlab/mmengine/blob/main/docs/en/advanced_tutorials/visualization.md).

- Custom hooks

Custom hooks are defined in the configuration through `custom_hooks`, and `Runner` registers them using the [`register_custom_hooks`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/runner.py#L1820) method.

The priority of custom hooks needs to be set in the configuration file; if not, it will be set to `NORMAL` by default.
The following are some custom hooks implemented in MMEngine:

| Hook | Usage |
| :---: | :---: |
| [EMAHook](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/ema_hook.py) | Use Exponential Moving Average (EMA) during model training. |
| [EmptyCacheHook](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/empty_cache_hook.py) | Release all GPU memory not occupied by the cache during training. |
| [SyncBuffersHook](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/sync_buffer_hook.py) | Synchronize the parameters in the model buffer, such as `running_mean` and `running_var` in BN, at the end of each training epoch. |

The following is a use case of `EMAHook`; the config file includes the configuration of the implemented custom hook as a member of the `custom_hooks` list.

```python
custom_hooks = [
    dict(type='EMAHook', start_iters=500, priority='NORMAL')
]
```

### SegVisualizationHook

MMSegmentation implements [`SegVisualizationHook`](https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/engine/hooks/visualization_hook.py#L17), which is used to visualize prediction results during validation and testing.
`SegVisualizationHook` overrides the `_after_iter` method of the base class `Hook`. During validation or testing, it calls the `add_datasample` method of the `visualizer` to draw semantic segmentation results according to the specified iteration interval. The specific implementation is as follows:

```python
...
@HOOKS.register_module()
class SegVisualizationHook(Hook):
...
    def _after_iter(self,
                    runner: Runner,
                    batch_idx: int,
                    data_batch: dict,
                    outputs: Sequence[SegDataSample],
                    mode: str = 'val') -> None:
...
        # If it's a training phase or self.draw is False, then skip it
        if self.draw is False or mode == 'train':
            return
...
        if self.every_n_inner_iters(batch_idx, self.interval):
            for output in outputs:
                img_path = output.img_path
                img_bytes = self.file_client.get(img_path)
                img = mmcv.imfrombytes(img_bytes, channel_order='rgb')
                window_name = f'{mode}_{osp.basename(img_path)}'

                self._visualizer.add_datasample(
                    window_name,
                    img,
                    data_sample=output,
                    show=self.show,
                    wait_time=self.wait_time,
                    step=runner.iter)

```

For more details about visualization, you can check [here](../user_guides/visualization.md).

## Optimizer

In the previous runtime settings, we provided a simple example of configuring the training optimizer. This section introduces in more detail how to configure optimizers in MMSegmentation.

### Optimizer Wrapper

OpenMMLab 2.0 introduces an optimizer wrapper that supports different training strategies, including mixed-precision training, gradient accumulation, and gradient clipping. Users can choose the appropriate training strategy according to their needs. The optimizer wrapper also defines a standard parameter update process, allowing users to switch between different training strategies within the same code. For more information, please refer to the [MMEngine optimizer wrapper documentation](https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/optim_wrapper.md).
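Of the strategies just listed, the subsections below cover gradient clipping and mixed-precision training. For completeness, gradient accumulation can be configured through the `accumulative_counts` field of MMEngine's `OptimWrapper`; a minimal sketch:

```python
# Accumulate gradients over 4 iterations before each parameter update,
# which effectively emulates a 4x larger batch size.
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
optim_wrapper = dict(
    type='OptimWrapper', optimizer=optimizer, accumulative_counts=4)
```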
Here are some common usage methods in MMSegmentation:

#### Configuring PyTorch Supported Optimizers

OpenMMLab 2.0 supports all native PyTorch optimizers, as referenced [here](https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/optim_wrapper.md).

To set the optimizer used by the `Runner` during training in the configuration file, you need to define `optim_wrapper` instead of `optimizer`. Below is an example of configuring an optimizer during training:

```python
optim_wrapper = dict(
    type='OptimWrapper',
    optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005),
    clip_grad=None)
```

#### Configuring Gradient Clipping

When the model training requires gradient clipping, you can configure it as shown in the following example:

```python
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer,
                     clip_grad=dict(max_norm=0.01, norm_type=2))
```

Here, `max_norm` refers to the maximum value of the gradient after clipping, and `norm_type` refers to the norm used when clipping the gradient. Related methods can be found in [torch.nn.utils.clip_grad_norm\_](https://pytorch.org/docs/stable/generated/torch.nn.utils.clip_grad_norm_.html).

#### Configuring Mixed Precision Training

When mixed precision training is needed to reduce memory usage, you can use `AmpOptimWrapper`. The specific configuration is as follows:

```python
optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001)
optim_wrapper = dict(type='AmpOptimWrapper', optimizer=optimizer)
```

The default setting for `loss_scale` in [`AmpOptimWrapper`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py#L20) is `dynamic`.

#### Configuring Hyperparameters for Different Layers of the Model Network

In model training, if you want to set different optimization strategies for different parameters in the optimizer, such as different learning rates, weight decays, and other hyperparameters, you can achieve this by setting `paramwise_cfg` in the `optim_wrapper` of the configuration file.

The following config file uses the [ViT `optim_wrapper`](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/vit/vit_vit-b16-ln_mln_upernet_8xb2-160k_ade20k-512x512.py#L15-L27) as an example to introduce the use of the `paramwise_cfg` parameters. During training, the weight decay coefficients for the `pos_embed`, `cls_token`, and `norm` modules are set to 0. That is, during training, the weight decay for these modules is changed to `weight_decay * decay_mult = 0`.

```python
optimizer = dict(
    type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.01)
optim_wrapper = dict(
    type='OptimWrapper',
    optimizer=optimizer,
    paramwise_cfg=dict(
        custom_keys={
            'pos_embed': dict(decay_mult=0.),
            'cls_token': dict(decay_mult=0.),
            'norm': dict(decay_mult=0.)
        }))
```

Here, `decay_mult` refers to the weight decay coefficient for the corresponding parameters. For more information on the usage of `paramwise_cfg`, please refer to the [MMEngine optimizer wrapper documentation](https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/optim_wrapper.md).
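Besides `decay_mult`, the `custom_keys` of `paramwise_cfg` also accepts an `lr_mult` field to scale the learning rate of the matched parameters. For instance, a sketch that trains the backbone with a smaller learning rate than the rest of the model (the values here are illustrative):

```python
optim_wrapper = dict(
    type='OptimWrapper',
    optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.05),
    paramwise_cfg=dict(
        custom_keys={
            # parameters whose names contain 'backbone' use lr * 0.1
            'backbone': dict(lr_mult=0.1),
        }))
```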
### Optimizer Wrapper Constructor

The default optimizer wrapper constructor [`DefaultOptimWrapperConstructor`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/default_constructor.py#L19) builds the optimizer used in training based on the input `optim_wrapper` and the `paramwise_cfg` defined in the `optim_wrapper`. When the functionality of [`DefaultOptimWrapperConstructor`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/default_constructor.py#L19) does not meet the requirements, you can customize the optimizer wrapper constructor to implement the configuration of hyperparameters.

MMSegmentation implements the [`LearningRateDecayOptimizerConstructor`](https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/engine/optimizers/layer_decay_optimizer_constructor.py#L104), which can decay the learning rate of the model parameters in the backbone networks of ConvNeXt, BEiT, and MAE models during training according to the defined decay ratio (`decay_rate`). The configuration in the config file is as follows:

```python
optim_wrapper = dict(
    _delete_=True,
    type='AmpOptimWrapper',
    optimizer=dict(
        type='AdamW', lr=0.0001, betas=(0.9, 0.999), weight_decay=0.05),
    paramwise_cfg={
        'decay_rate': 0.9,
        'decay_type': 'stage_wise',
        'num_layers': 12
    },
    constructor='LearningRateDecayOptimizerConstructor',
    loss_scale='dynamic')
```

The purpose of `_delete_=True` is to ignore the inherited configuration in the OpenMMLab Config. In this code snippet, the inherited `optim_wrapper` configuration is ignored. For more information on the `_delete_` field, please refer to the [MMEngine documentation](https://github.com/open-mmlab/mmengine/blob/main/docs/en/advanced_tutorials/config.md#delete-key-in-dict).

diff --git a/docs/en/advanced_guides/evaluation.md b/docs/en/advanced_guides/evaluation.md
new file mode 100644
index 0000000000..ca0beeeccf
--- /dev/null
+++ b/docs/en/advanced_guides/evaluation.md
@@ -0,0 +1,155 @@

# Evaluation

The evaluation procedure is executed in [ValLoop](https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py#L300) and [TestLoop](https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py#L373); users can evaluate model performance during training or using the test script with simple settings in the configuration file. The `ValLoop` and `TestLoop` are properties of the [Runner](https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/runner.py#L59); they are built the first time they are called. To build the `ValLoop` successfully, `val_dataloader` and `val_evaluator` must be set when building the `Runner`, since `dataloader` and `evaluator` are required parameters, and the same goes for the `TestLoop`. For more information about the Runner's design, please refer to the [documentation](https://github.com/open-mmlab/mmengine/blob/main/docs/en/design/runner.md) of [MMEngine](https://github.com/open-mmlab/mmengine).

![test_step/val_step dataflow](https://user-images.githubusercontent.com/15952744/228828179-3269baa3-bebd-4c9a-9787-59e7d785fbcf.png)

In MMSegmentation, we write the settings of the dataloader and metrics in the dataset config files, and the configuration of the evaluation loop in the `schedule_x` config files, by default.
For example, in the ADE20K config file `configs/_base_/datasets/ade20k.py`, we configure the `val_dataloader` on lines 37 to 48, and on line 51 we select `IoUMetric` as the evaluator and set `mIoU` as the metric:

```python
val_dataloader = dict(
    batch_size=1,
    num_workers=4,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        data_prefix=dict(
            img_path='images/validation',
            seg_map_path='annotations/validation'),
        pipeline=test_pipeline))

val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU'])
```

To be able to evaluate the model during training, we add the evaluation configuration to the file `configs/_base_/schedules/schedule_40k.py` on lines 15 to 16:

```python
train_cfg = dict(type='IterBasedTrainLoop', max_iters=40000, val_interval=4000)
val_cfg = dict(type='ValLoop')
```

With the above two settings, MMSegmentation evaluates the **mIoU** metric of the model once every 4000 iterations during the 40K-iteration training.

If we would like to test the model after training, we need to add the `test_dataloader`, `test_evaluator` and `test_cfg` configs to the config file.

```python
test_dataloader = dict(
    batch_size=1,
    num_workers=4,
    persistent_workers=True,
    sampler=dict(type='DefaultSampler', shuffle=False),
    dataset=dict(
        type=dataset_type,
        data_root=data_root,
        data_prefix=dict(
            img_path='images/validation',
            seg_map_path='annotations/validation'),
        pipeline=test_pipeline))

test_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU'])
test_cfg = dict(type='TestLoop')
```

In MMSegmentation, the settings of `test_dataloader` and `test_evaluator` are the same as the `ValLoop`'s dataloader and evaluator by default; we can modify these settings to meet our needs.

## IoUMetric

MMSegmentation implements [IoUMetric](https://github.com/open-mmlab/mmsegmentation/blob/1.x/mmseg/evaluation/metrics/iou_metric.py) and [CityscapesMetric](https://github.com/open-mmlab/mmsegmentation/blob/1.x/mmseg/evaluation/metrics/citys_metric.py) for evaluating the performance of models, based on the [BaseMetric](https://github.com/open-mmlab/mmengine/blob/main/mmengine/evaluator/metric.py) provided by [MMEngine](https://github.com/open-mmlab/mmengine). Please refer to [the documentation](https://mmengine.readthedocs.io/en/latest/tutorials/evaluation.html) for more details about the unified evaluation interface.

Here we briefly describe the arguments and the two main methods of `IoUMetric`.

The constructor of `IoUMetric` has some additional parameters besides the base `collect_device` and `prefix`.

The arguments of the constructor:

- ignore_index (int) - Index that will be ignored in evaluation. Default: 255.
- iou_metrics (list\[str\] | str) - Metrics to be calculated; the options include 'mIoU', 'mDice' and 'mFscore'.
- nan_to_num (int, optional) - If specified, NaN values will be replaced by the numbers defined by the user. Default: None.
- beta (int) - Determines the weight of recall in the combined score. Default: 1.
- collect_device (str) - Device name used for collecting results from different ranks during distributed training. Must be 'cpu' or 'gpu'. Defaults to 'cpu'.
- prefix (str, optional) - The prefix that will be added to the metric names to disambiguate homonymous metrics of different evaluators. If the prefix is not provided in the argument, `self.default_prefix` will be used instead. Defaults to None.
`IoUMetric` implements the IoU metric calculation; the two core methods of `IoUMetric` are `process` and `compute_metrics`.

- The `process` method processes one batch of data and data_samples.
- The `compute_metrics` method computes the metrics from the processed results.

### IoUMetric.process

Parameters:

- data_batch (Any) - A batch of data from the dataloader.
- data_samples (Sequence\[dict\]) - A batch of outputs from the model.

Returns:

This method has no return value, since the processed results are stored in `self.results`, which will be used to compute the metrics once all batches have been processed.

### IoUMetric.compute_metrics

Parameters:

- results (list) - The processed results of each batch.

Returns:

- Dict\[str, float\] - The computed metrics. The keys are the names of the metrics, and the values are the corresponding results. The keys mainly include **aAcc**, **mIoU**, **mAcc**, **mDice**, **mFscore**, **mPrecision**, **mRecall**.

## CityscapesMetric

`CityscapesMetric` uses the official [CityscapesScripts](https://github.com/mcordts/cityscapesScripts) provided by Cityscapes to evaluate model performance.

### Usage

Before using it, please install the `cityscapesscripts` package first:

```shell
pip install cityscapesscripts
```

Since `IoUMetric` is used as the default evaluator in MMSegmentation, if you would like to use `CityscapesMetric`, customizing the config file is required. In your customized config file, you should overwrite the default evaluator as follows.

```python
val_evaluator = dict(type='CityscapesMetric', output_dir='tmp')
test_evaluator = val_evaluator
```

### Interface

The arguments of the constructor:

- output_dir (str) - The directory for the output predictions.
- ignore_index (int) - Index that will be ignored in evaluation. Default: 255.
- format_only (bool) - Only format the results for submission without performing evaluation. It is useful when you want to format the results in a specific format and submit them to the test server. Defaults to False.
- keep_results (bool) - Whether to keep the results. When `format_only` is True, `keep_results` must be True. Defaults to False.
- collect_device (str) - Device name used for collecting results from different ranks during distributed training. Must be 'cpu' or 'gpu'. Defaults to 'cpu'.
- prefix (str, optional) - The prefix that will be added to the metric names to disambiguate homonymous metrics of different evaluators. If the prefix is not provided in the argument, `self.default_prefix` will be used instead. Defaults to None.

#### CityscapesMetric.process

This method draws the masks on the images and saves the painted images to `work_dir`.

Parameters:

- data_batch (dict) - A batch of data from the dataloader.
- data_samples (Sequence\[dict\]) - A batch of outputs from the model.

Returns:

This method has no return value; the paths of the annotations are stored in `self.results`, which will be used to compute the metrics once all batches have been processed.

#### CityscapesMetric.compute_metrics

This method calls the `cityscapesscripts.evaluation.evalPixelLevelSemanticLabeling` tool to calculate the metrics.

Parameters:

- results (list) - Testing results of the dataset.

Returns:

- dict\[str, float\] - Cityscapes evaluation results.
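As a concrete usage of the constructor arguments above, a config along these lines generates submission files for the Cityscapes test server without running evaluation locally (the `output_dir` value is illustrative):

```python
test_evaluator = dict(
    type='CityscapesMetric',
    output_dir='work_dirs/format_results',  # illustrative path
    format_only=True,
    keep_results=True)  # keep_results must be True when format_only is True
```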
diff --git a/docs/en/advanced_guides/index.rst b/docs/en/advanced_guides/index.rst new file mode 100644 index 0000000000..53ef8c5e73 --- /dev/null +++ b/docs/en/advanced_guides/index.rst @@ -0,0 +1,26 @@

Basic Concepts
***************

.. toctree::
   :maxdepth: 1

   data_flow.md
   structures.md
   models.md
   datasets.md
   transforms.md
   evaluation.md
   engine.md
   training_tricks.md

Component Customization
************************

.. toctree::
   :maxdepth: 1

   add_models.md
   add_datasets.md
   add_transforms.md
   add_metrics.md
   customize_runtime.md

diff --git a/docs/en/advanced_guides/models.md b/docs/en/advanced_guides/models.md new file mode 100644 index 0000000000..b0089869d9 --- /dev/null +++ b/docs/en/advanced_guides/models.md @@ -0,0 +1,164 @@

# Models

We usually define a neural network in a deep learning task as a model, and this model is the core of an algorithm. [MMEngine](https://github.com/open-mmlab/mmengine) abstracts a unified model [BaseModel](https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/base_model/base_model.py#L16) to standardize the interfaces for training, testing and other processes. All models implemented by MMSegmentation inherit from `BaseModel`; on top of it, MMSegmentation implements the forward interface and adds some functions for semantic segmentation algorithms.

## Common components

### Segmentor

In MMSegmentation, we abstract the network architecture as a **Segmentor**: a model that contains all components of a network. We have already implemented **EncoderDecoder** and **CascadeEncoderDecoder**, which typically consist of a **Data preprocessor**, **Backbone**, **Decode head** and **Auxiliary head**.

### Data preprocessor

**Data preprocessor** is the part that copies data to the target device and preprocesses the data into the model input format.

### Backbone

**Backbone** is the part that transforms an image into feature maps, such as a **ResNet-50** without the last fully connected layer.

### Neck

**Neck** is the part that connects the backbone and heads. It performs some refinements or reconfigurations on the raw feature maps produced by the backbone. An example is the **Feature Pyramid Network (FPN)**.

### Decode head

**Decode head** is the part that transforms the feature maps into a segmentation mask, such as **PSPNet**.

### Auxiliary head

**Auxiliary head** is an optional component that transforms the feature maps into segmentation masks which are only used for computing auxiliary losses.

## Basic interfaces

MMSegmentation wraps `BaseModel` and implements the [BaseSegmentor](https://github.com/open-mmlab/mmsegmentation/blob/1.x/mmseg/models/segmentors/base.py#L15) class, which mainly provides the interfaces `forward`, `train_step`, `val_step` and `test_step`. The following introduces these interfaces in detail.

### forward

![EncoderDecoder dataflow](https://user-images.githubusercontent.com/15952744/228827860-c0e34875-d370-4736-84f0-9560c26c9576.png)
![CascadeEncoderDecoder dataflow](https://user-images.githubusercontent.com/15952744/228827987-aa214507-0c6d-4a08-8ce4-679b2b200b79.png)

The `forward` method returns the losses or predictions of training, validation, testing, and the simple inference process.

The method should accept three modes: "tensor", "predict" and "loss":

- "tensor": Forward the whole network and return the tensor or tuple of tensors without any post-processing, same as a common `nn.Module`.
- "predict": Forward and return the predictions, which are fully processed to a list of `SegDataSample`.
- "loss": Forward and return a `dict` of losses according to the given inputs and data samples.

**Note:** [SegDataSample](https://github.com/open-mmlab/mmsegmentation/blob/1.x/mmseg/structures/seg_data_sample.py) is a data structure interface of MMSegmentation, used as an interface between different components. `SegDataSample` implements the abstract data element `mmengine.structures.BaseDataElement`; please refer to [the SegDataSample documentation](https://mmsegmentation.readthedocs.io/en/1.x/advanced_guides/structures.html) and the [data element documentation](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/data_element.html) in [MMEngine](https://github.com/open-mmlab/mmengine) for more information.

Note that this method handles neither backpropagation nor optimizer updating, which are done in the method `train_step`.

Parameters:

- inputs (torch.Tensor) - The input tensor with shape (N, C, ...) in general.
- data_sample (list\[[SegDataSample](https://github.com/open-mmlab/mmsegmentation/blob/1.x/mmseg/structures/seg_data_sample.py)\]) - The seg data samples. It usually includes information such as `metainfo` and `gt_sem_seg`. Defaults to None.
- mode (str) - Return what kind of value. Defaults to 'tensor'.

Returns:

- `dict` or `list`:
  - If `mode == "loss"`, return a `dict` of loss tensors used for backward and logging.
  - If `mode == "predict"`, return a `list` of `SegDataSample`. The inference results will be incrementally added to the `data_sample` parameter passed to the forward method, and each `SegDataSample` contains the following keys:
    - pred_sem_seg (`PixelData`): Prediction of semantic segmentation.
    - seg_logits (`PixelData`): Predicted logits of semantic segmentation before normalization.
  - If `mode == "tensor"`, return a `tensor` or `tuple of tensor` or `dict` of `tensor` for custom use.

### prediction modes

We briefly describe the fields of the model's configuration in [the config documentation](../user_guides/1_config.md); here we elaborate on the `model.test_cfg` field. `model.test_cfg` is used to control forward behavior; the `forward` method in `"predict"` mode can run in two modes:

- `whole_inference`: If `cfg.model.test_cfg.mode == 'whole'`, the model will inference with full images.

  A `whole_inference` mode example config:

  ```python
  model = dict(
      type='EncoderDecoder',
      ...
      test_cfg=dict(mode='whole'))
  ```

- `slide_inference`: If `cfg.model.test_cfg.mode == 'slide'`, the model will inference by sliding window. **Note:** if you select the `slide` mode, `cfg.model.test_cfg.stride` and `cfg.model.test_cfg.crop_size` should also be specified.

  A `slide_inference` mode example config:

  ```python
  model = dict(
      type='EncoderDecoder',
      ...
      test_cfg=dict(mode='slide', crop_size=256, stride=170))
  ```

### train_step

The `train_step` method calls the forward interface of the `loss` mode to get the loss `dict`. The `BaseModel` class implements the default model training process, including preprocessing, model forward propagation, loss calculation, optimization, and back-propagation.

Parameters:

- data (dict or tuple or list) - Data sampled from the dataset. In MMSegmentation, the data dict contains two fields: `inputs` and `data_samples`.
- optim_wrapper (OptimWrapper) - OptimWrapper instance used to update model parameters.
**Note:** [OptimWrapper](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/optimizer_wrapper.py#L17) provides a common interface for updating parameters; please refer to the optimizer wrapper [documentation](https://mmengine.readthedocs.io/en/latest/tutorials/optim_wrapper.html) in [MMEngine](https://github.com/open-mmlab/mmengine) for more information.

Returns:

- Dict\[str, `torch.Tensor`\]: A `dict` of tensors for logging.

![train_step dataflow](https://user-images.githubusercontent.com/15952744/228828089-a9ae1225-958d-4cf7-99af-9af8576f7ef7.png)

### val_step

The `val_step` method calls the forward interface of the `predict` mode and returns the prediction result, which is further passed to the process interface of the evaluator and the `after_val_iter` interface of the Hook.

Parameters:

- data (`dict` or `tuple` or `list`) - Data sampled from the dataset. In MMSegmentation, the data dict contains two fields: `inputs` and `data_samples`.

Returns:

- `list` - The predictions of the given data.

![test_step/val_step dataflow](https://user-images.githubusercontent.com/15952744/228828179-3269baa3-bebd-4c9a-9787-59e7d785fbcf.png)

### test_step

The `BaseModel` implements `test_step` in the same way as `val_step`.

## Data Preprocessor

The [SegDataPreProcessor](https://github.com/open-mmlab/mmsegmentation/blob/1.x/mmseg/models/data_preprocessor.py#L13) implemented by MMSegmentation inherits from the [BaseDataPreprocessor](https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/base_model/data_preprocessor.py#L18) implemented by [MMEngine](https://github.com/open-mmlab/mmengine) and provides the functions of data preprocessing and copying data to the target device.

The runner carries the model to the specified device during the construction stage, while the data is carried to the specified device by the [SegDataPreProcessor](https://github.com/open-mmlab/mmsegmentation/blob/1.x/mmseg/models/data_preprocessor.py#L13) in `train_step`, `val_step`, and `test_step`, and the processed data is further passed to the model.

The parameters of the `SegDataPreProcessor` constructor:

- mean (Sequence\[Number\], optional) - The pixel mean of R, G, B channels. Defaults to None.
- std (Sequence\[Number\], optional) - The pixel standard deviation of R, G, B channels. Defaults to None.
- size (tuple, optional) - Fixed padding size.
- size_divisor (int, optional) - The divisor of padded size.
- pad_val (float, optional) - Padding value. Default: 0.
- seg_pad_val (float, optional) - Padding value of segmentation map. Default: 255.
- bgr_to_rgb (bool) - Whether to convert the image from BGR to RGB. Defaults to False.
- rgb_to_bgr (bool) - Whether to convert the image from RGB to BGR. Defaults to False.
- batch_augments (list\[dict\], optional) - Batch-level augmentations. Defaults to None.

The data will be processed as follows:

- Collate and move data to the target device.
- Pad inputs to the input size with the defined `pad_val`, and pad the seg map with the defined `seg_pad_val`.
- Stack inputs to batch_inputs.
- Convert inputs from BGR to RGB if the shape of the input is (3, H, W).
- Normalize the image with the defined std and mean.
- Do batch augmentations like Mixup and CutMix during training.

The parameters of the `forward` method:

- data (dict) - Data sampled from the dataloader.
- training (bool) - Whether to enable training time augmentation.

The returns of the `forward` method:

- Dict: Data in the same format as the model input.
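
As a concrete reference for the arguments above, here is a minimal sketch of a `data_preprocessor` config. The mean/std values are the common ImageNet statistics used throughout OpenMMLab configs, and the `size` value is purely illustrative:

```python
# A sketch of a SegDataPreProcessor config (values are illustrative).
data_preprocessor = dict(
    type='SegDataPreProcessor',
    mean=[123.675, 116.28, 103.53],  # ImageNet RGB mean
    std=[58.395, 57.12, 57.375],     # ImageNet RGB std
    bgr_to_rgb=True,                 # loaded images are BGR; convert to RGB
    pad_val=0,                       # padding value for images
    seg_pad_val=255,                 # padding value for segmentation maps
    size=(512, 512))                 # fixed padding size
```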
diff --git a/docs/en/advanced_guides/structures.md b/docs/en/advanced_guides/structures.md new file mode 100644 index 0000000000..2607242e23 --- /dev/null +++ b/docs/en/advanced_guides/structures.md @@ -0,0 +1,104 @@

# Structures

To unify the input and output interfaces between different models and modules, OpenMMLab 2.0 MMEngine defines an abstract data structure. It implements basic `Create`, `Read`, `Update` and `Delete` operations, supports transferring data among different types of devices, and supports tensor-like or dictionary-like operations such as `.cpu()`, `.cuda()`, `.get()` and `.detach()`. More details can be found [here](https://github.com/open-mmlab/mmengine/blob/main/docs/en/advanced_tutorials/data_element.md).

MMSegmentation also follows this interface protocol and defines `SegDataSample`, which is used to encapsulate the data of the semantic segmentation task.

## Semantic Segmentation Data SegDataSample

[SegDataSample](mmseg.structures.SegDataSample) includes three main fields, `gt_sem_seg`, `pred_sem_seg` and `seg_logits`, which are used to store the annotation information and the prediction results.

| Field        | Type                      | Description                                |
| ------------ | ------------------------- | ------------------------------------------ |
| gt_sem_seg   | [`PixelData`](#pixeldata) | Annotation information.                    |
| pred_sem_seg | [`PixelData`](#pixeldata) | The predicted result.                      |
| seg_logits   | [`PixelData`](#pixeldata) | The raw (non-normalized) predicted result. |

The following sample code demonstrates the use of `SegDataSample`.

```python
import torch
from mmengine.structures import PixelData
from mmseg.structures import SegDataSample

img_meta = dict(img_shape=(4, 4, 3),
                pad_shape=(4, 4, 3))
data_sample = SegDataSample()
# define gt_segmentations to encapsulate the ground truth data
gt_segmentations = PixelData(metainfo=img_meta)
gt_segmentations.data = torch.randint(0, 2, (1, 4, 4))

# add and process property in SegDataSample
data_sample.gt_sem_seg = gt_segmentations
assert 'gt_sem_seg' in data_sample
assert 'sem_seg' in data_sample.gt_sem_seg
assert 'img_shape' in data_sample.gt_sem_seg.metainfo_keys()
print(data_sample.gt_sem_seg.shape)
'''
(4, 4)
'''
print(data_sample)
'''
<SegDataSample(
    ...
) at 0x1c2aae44d60>
'''

# delete and change property in SegDataSample
data_sample = SegDataSample()
gt_segmentations = PixelData(metainfo=img_meta)
gt_segmentations.data = torch.randint(0, 2, (1, 4, 4))
data_sample.gt_sem_seg = gt_segmentations
data_sample.gt_sem_seg.set_metainfo(dict(img_shape=(4, 4, 9), pad_shape=(4, 4, 9)))
del data_sample.gt_sem_seg.img_shape

# tensor-like operations
data_sample = SegDataSample()
gt_segmentations = PixelData(metainfo=img_meta)
gt_segmentations.data = torch.randint(0, 2, (1, 4, 4))
cuda_gt_segmentations = gt_segmentations.cuda()
cuda_gt_segmentations = gt_segmentations.to('cuda:0')
cpu_gt_segmentations = cuda_gt_segmentations.cpu()
cpu_gt_segmentations = cuda_gt_segmentations.to('cpu')
```

## Customize New Property in SegDataSample

If you want to customize a new property in `SegDataSample`, you may follow the pattern of [SegDataSample](https://github.com/open-mmlab/mmsegmentation/blob/1.x/mmseg/structures/seg_data_sample.py) below:

```python
class SegDataSample(BaseDataElement):
    ...
    @property
    def xxx_property(self) -> xxxData:
        return self._xxx_property

    @xxx_property.setter
    def xxx_property(self, value: xxxData) -> None:
        self.set_field(value, '_xxx_property', dtype=xxxData)

    @xxx_property.deleter
    def xxx_property(self) -> None:
        del self._xxx_property
```

Then a new property would be added to `SegDataSample`.

diff --git a/docs/en/advanced_guides/training_tricks.md b/docs/en/advanced_guides/training_tricks.md new file mode 100644 index 0000000000..bc4f72257d --- /dev/null +++ b/docs/en/advanced_guides/training_tricks.md @@ -0,0 +1,75 @@

# Training Tricks

MMSegmentation supports the following training tricks out of the box.

## Different Learning Rate (LR) for Backbone and Heads

In semantic segmentation, some methods make the LR of the heads larger than that of the backbone to achieve better performance or faster convergence.

In MMSegmentation, you may add the following lines to your config to make the LR of the heads 10 times that of the backbone.

```python
optim_wrapper = dict(
    paramwise_cfg=dict(
        custom_keys={
            'head': dict(lr_mult=10.)}))
```

With this modification, the LR of any parameter group whose name contains `'head'` will be multiplied by 10. You may refer to the [MMEngine documentation](https://mmengine.readthedocs.io/en/latest/tutorials/optim_wrapper.html#advanced-usages) for further details.

## Online Hard Example Mining (OHEM)

We implement a pixel sampler for training-time sampling, like OHEM (Online Hard Example Mining), which is used to remove "easy" examples during model training. Here is an example config of training PSPNet with OHEM enabled.

```python
_base_ = './pspnet_r50-d8_4xb2-40k_cityscapes-512x1024.py'
model = dict(
    decode_head=dict(
        sampler=dict(type='OHEMPixelSampler', thresh=0.7, min_kept=100000)))
```

In this way, only pixels with a confidence score under 0.7 are used for training, and at least 100000 pixels are kept during training. If `thresh` is not specified, the pixels with the top `min_kept` losses will be selected instead.

## Class Balanced Loss

For a dataset with an unbalanced class distribution, you may change the loss weight of each class. Here is an example for the Cityscapes dataset.

```python
_base_ = './pspnet_r50-d8_4xb2-40k_cityscapes-512x1024.py'
model = dict(
    decode_head=dict(
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0,
            # DeepLab used this class weight for cityscapes
            class_weight=[0.8373, 0.9180, 0.8660, 1.0345, 1.0166, 0.9969, 0.9754,
                          1.0489, 0.8786, 1.0023, 0.9539, 0.9843, 1.1116, 0.9037,
                          1.0865, 1.0955, 1.0865, 1.1529, 1.0507])))
```

`class_weight` will be passed into `CrossEntropyLoss` as the `weight` argument. Please refer to the [PyTorch documentation](https://pytorch.org/docs/stable/nn.html?highlight=crossentropy#torch.nn.CrossEntropyLoss) for details.

## Multiple Losses

For loss calculation, we support training with multiple losses concurrently.
Here is an example config of training `unet` on the `DRIVE` dataset, whose loss function is a `1:3` weighted sum of `CrossEntropyLoss` and `DiceLoss`:

```python
_base_ = './fcn_unet_s5-d16_64x64_40k_drive.py'
model = dict(
    decode_head=dict(loss_decode=[
        dict(type='CrossEntropyLoss', loss_name='loss_ce', loss_weight=1.0),
        dict(type='DiceLoss', loss_name='loss_dice', loss_weight=3.0)
    ]),
    auxiliary_head=dict(loss_decode=[
        dict(type='CrossEntropyLoss', loss_name='loss_ce', loss_weight=1.0),
        dict(type='DiceLoss', loss_name='loss_dice', loss_weight=3.0)
    ]),
)
```

In this way, `loss_weight` and `loss_name` will be the weight and the name of the corresponding loss in the training log, respectively.

Note: If you want this loss item to be included in the backward graph, `loss_` must be the prefix of the name.

diff --git a/docs/en/advanced_guides/transforms.md b/docs/en/advanced_guides/transforms.md new file mode 100644 index 0000000000..68b1f44bd3 --- /dev/null +++ b/docs/en/advanced_guides/transforms.md @@ -0,0 +1,119 @@

# Data Transforms

In this tutorial, we introduce the design of the transforms pipeline in MMSegmentation.

The structure of this guide is as follows:

- [Data Transforms](#data-transforms)
  - [Design of Data pipelines](#design-of-data-pipelines)
    - [Data loading](#data-loading)
    - [Pre-processing](#pre-processing)
    - [Formatting](#formatting)

## Design of Data pipelines

Following typical conventions, we use `Dataset` and `DataLoader` for data loading with multiple workers. `Dataset` returns a dict of data items corresponding to the arguments of the models' forward method. Since the data in semantic segmentation may not be the same size, we introduce a new `DataContainer` type in MMCV to help collect and distribute data of different sizes. See [here](https://github.com/open-mmlab/mmcv/blob/master/mmcv/parallel/data_container.py) for more details.

In the 1.x version of MMSegmentation, all data transformations are inherited from [`BaseTransform`](https://github.com/open-mmlab/mmcv/blob/2.x/mmcv/transforms/base.py#L6).

The input and output types of transformations are both dicts. A simple example is as follows:

```python
>>> from mmseg.datasets.transforms import LoadAnnotations
>>> transforms = LoadAnnotations()
>>> img_path = './data/cityscapes/leftImg8bit/train/aachen/aachen_000000_000019_leftImg8bit.png'
>>> gt_path = './data/cityscapes/gtFine/train/aachen/aachen_000015_000019_gtFine_instanceTrainIds.png'
>>> results = dict(
>>>     img_path=img_path,
>>>     seg_map_path=gt_path,
>>>     reduce_zero_label=False,
>>>     seg_fields=[])
>>> data_dict = transforms(results)
>>> print(data_dict.keys())
dict_keys(['img_path', 'seg_map_path', 'reduce_zero_label', 'seg_fields', 'gt_seg_map'])
```

The data preparation pipeline and the dataset are decomposed. Usually, a dataset defines how to process the annotations, and a data pipeline defines all the steps to prepare a data dict. A pipeline consists of a sequence of operations. Each operation takes a dict as input and outputs a dict for the next transform.

The operations are categorized into data loading, pre-processing, formatting and test-time augmentation.
Here is a pipeline example for PSPNet:

```python
crop_size = (512, 1024)
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations'),
    dict(
        type='RandomResize',
        scale=(2048, 1024),
        ratio_range=(0.5, 2.0),
        keep_ratio=True),
    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='PackSegInputs')
]
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='Resize', scale=(2048, 1024), keep_ratio=True),
    # add loading annotation after ``Resize`` because the ground truth
    # does not need the resize data transform
    dict(type='LoadAnnotations'),
    dict(type='PackSegInputs')
]
```

For each operation, we list the related dict fields that are `added`/`updated`/`removed`. Before the pipeline runs, the information we can directly obtain from the dataset is `img_path` and `seg_map_path`.

### Data loading

`LoadImageFromFile`: Load an image from file.

- add: `img`, `img_shape`, `ori_shape`

`LoadAnnotations`: Load semantic segmentation maps provided by the dataset.

- add: `seg_fields`, `gt_seg_map`

### Pre-processing

`RandomResize`: Randomly resize the image & segmentation map.

- add: `scale`, `scale_factor`, `keep_ratio`
- update: `img`, `img_shape`, `gt_seg_map`

`Resize`: Resize the image & segmentation map.

- add: `scale`, `scale_factor`, `keep_ratio`
- update: `img`, `gt_seg_map`, `img_shape`

`RandomCrop`: Randomly crop the image & segmentation map.

- update: `img`, `gt_seg_map`, `img_shape`

`RandomFlip`: Flip the image & segmentation map.

- add: `flip`, `flip_direction`
- update: `img`, `gt_seg_map`

`PhotoMetricDistortion`: Apply photometric distortions to the image sequentially; every transformation is applied with a probability of 0.5. The position of random contrast is second or second to last (mode 0 or 1 below, respectively).

```
1. random brightness
2. random contrast (mode 0)
3. convert color from BGR to HSV
4. random saturation
5. random hue
6. convert color from HSV to BGR
7. random contrast (mode 1)
```

- update: `img`

### Formatting

`PackSegInputs`: Pack the input data for semantic segmentation.

- add: `inputs`, `data_sample`
- remove: keys specified by `meta_keys` (merged into the metainfo of data_sample), all other keys

diff --git a/docs/en/api.rst b/docs/en/api.rst index 8285841dc6..2f1a25ef9d 100644 --- a/docs/en/api.rst +++ b/docs/en/api.rst @@ -3,56 +3,93 @@ mmseg.apis .. automodule:: mmseg.apis :members: -mmseg.core +mmseg.datasets -------------- -seg -^^^^^^^^^^ -.. automodule:: mmseg.core.seg - :members: - -evaluation +datasets ^^^^^^^^^^ -.. automodule:: mmseg.core.evaluation +.. automodule:: mmseg.datasets :members: -utils -^^^^^^^^^^ -.. automodule:: mmseg.core.utils +transforms +^^^^^^^^^^^^ +.. automodule:: mmseg.datasets.transforms :members: -mmseg.datasets +mmseg.engine -------------- -datasets +hooks ^^^^^^^^^^ -.. automodule:: mmseg.datasets +.. automodule:: mmseg.engine.hooks + :members: + +optimizers +^^^^^^^^^^^^^^^ +.. automodule:: mmseg.engine.optimizers :members: -pipelines +mmseg.evaluation +----------------- + +metrics ^^^^^^^^^^ -.. automodule:: mmseg.datasets.pipelines +.. automodule:: mmseg.evaluation.metrics :members: mmseg.models -------------- +backbones +^^^^^^^^^^^^^^^^^^ +.. automodule:: mmseg.models.backbones + :members: + +decode_heads +^^^^^^^^^^^^^^^ +.. automodule:: mmseg.models.decode_heads + :members: + segmentors ^^^^^^^^^^ ..
automodule:: mmseg.models.segmentors :members: -backbones +losses ^^^^^^^^^^ -.. automodule:: mmseg.models.backbones +.. automodule:: mmseg.models.losses :members: -decode_heads +necks ^^^^^^^^^^^^ -.. automodule:: mmseg.models.decode_heads +.. automodule:: mmseg.models.necks :members: -losses +utils ^^^^^^^^^^ -.. automodule:: mmseg.models.losses +.. automodule:: mmseg.models.utils + :members: + + +mmseg.structures +-------------------- + +structures +^^^^^^^^^^^^^^^^^ +.. automodule:: mmseg.structures + :members: + +sampler +^^^^^^^^^^ +.. automodule:: mmseg.structures.sampler + :members: + +mmseg.visualization +-------------------- +.. automodule:: mmseg.visualization + :members: + +mmseg.utils +-------------- +.. automodule:: mmseg.utils :members: diff --git a/docs/en/changelog.md b/docs/en/changelog.md deleted file mode 100644 index dc94fbdf2e..0000000000 --- a/docs/en/changelog.md +++ /dev/null @@ -1,720 +0,0 @@ -## Changelog - -### V0.24.1 (5/1/2022) - -**Bug Fixes** - -- Fix `LayerDecayOptimizerConstructor` for MAE training ([#1539](https://github.com/open-mmlab/mmsegmentation/pull/1539), [#1540](https://github.com/open-mmlab/mmsegmentation/pull/1540)) - -### V0.24.0 (4/29/2022) - -**Highlights** - -- Support MAE: Masked Autoencoders Are Scalable Vision Learners -- Support Resnet strikes back - -**New Features** - -- Support MAE: Masked Autoencoders Are Scalable Vision Learners ([1307](https://github.com/open-mmlab/mmsegmentation/pull/1307), [1523](https://github.com/open-mmlab/mmsegmentation/pull/1523)) -- Support Resnet strikes back ([1390](https://github.com/open-mmlab/mmsegmentation/pull/1390)) -- Support extra dataloader settings in configs ([1435](https://github.com/open-mmlab/mmsegmentation/pull/1435)) - -**Bug Fixes** - -- Fix input previous results for the last cascade_decode_head ([#1450](https://github.com/open-mmlab/mmsegmentation/pull/1450)) -- Fix validation loss logging ([#1494](https://github.com/open-mmlab/mmsegmentation/pull/1494)) -- Fix the bug in binary_cross_entropy ([1527](https://github.com/open-mmlab/mmsegmentation/pull/1527)) -- Support single channel prediction for Binary Cross Entropy Loss ([#1454](https://github.com/open-mmlab/mmsegmentation/pull/1454)) -- Fix potential bugs in accuracy.py ([1496](https://github.com/open-mmlab/mmsegmentation/pull/1496)) -- Avoid converting label ids twice by label map during evaluation ([1417](https://github.com/open-mmlab/mmsegmentation/pull/1417)) -- Fix bug about label_map ([1445](https://github.com/open-mmlab/mmsegmentation/pull/1445)) -- Fix image save path bug in Windows ([1423](https://github.com/open-mmlab/mmsegmentation/pull/1423)) -- Fix MMSegmentation Colab demo ([1501](https://github.com/open-mmlab/mmsegmentation/pull/1501), [1452](https://github.com/open-mmlab/mmsegmentation/pull/1452)) -- Migrate azure blob for beit checkpoints ([1503](https://github.com/open-mmlab/mmsegmentation/pull/1503)) -- Fix bug in `tools/analyse_logs.py` caused by wrong plot_iter in some cases ([1428](https://github.com/open-mmlab/mmsegmentation/pull/1428)) - -**Improvements** - -- Merge BEiT and ConvNext's LR decay optimizer constructors ([#1438](https://github.com/open-mmlab/mmsegmentation/pull/1438)) -- Register optimizer constructor with mmseg ([#1456](https://github.com/open-mmlab/mmsegmentation/pull/1456)) -- Refactor transformer encode layer in ViT and BEiT backbone ([#1481](https://github.com/open-mmlab/mmsegmentation/pull/1481)) -- Add `build_pos_embed` and `build_layers` for BEiT 
([1517](https://github.com/open-mmlab/mmsegmentation/pull/1517)) -- Add `with_cp` to mit and vit ([1431](https://github.com/open-mmlab/mmsegmentation/pull/1431)) -- Fix inconsistent dtype of `seg_label` in stdc decode ([1463](https://github.com/open-mmlab/mmsegmentation/pull/1463)) -- Delete random seed for training in `dist_train.sh` ([1519](https://github.com/open-mmlab/mmsegmentation/pull/1519)) -- Revise high `workers_per_gpus` in config file ([#1506](https://github.com/open-mmlab/mmsegmentation/pull/1506)) -- Add GPG keys and del mmcv version in Dockerfile ([1534](https://github.com/open-mmlab/mmsegmentation/pull/1534)) -- Update checkpoint for model in deeplabv3plus ([#1487](https://github.com/open-mmlab/mmsegmentation/pull/1487)) -- Add `DistSamplerSeedHook` to set epoch number to dataloader when runner is `EpochBasedRunner` ([1449](https://github.com/open-mmlab/mmsegmentation/pull/1449)) -- Provide URLs of Swin Transformer pretrained models ([1389](https://github.com/open-mmlab/mmsegmentation/pull/1389)) -- Updating Dockerfiles From Docker Directory and `get_started.md` to reach latest stable version of Python, PyTorch and MMCV ([1446](https://github.com/open-mmlab/mmsegmentation/pull/1446)) - -**Documentation** - -- Add more clearly statement of CPU training/inference ([1518](https://github.com/open-mmlab/mmsegmentation/pull/1518)) - -**Contributors** - -- @jiangyitong made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1431 -- @kahkeng made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1447 -- @Nourollah made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1446 -- @androbaza made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1452 -- @Yzichen made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1445 -- @whu-pzhang made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1423 -- @panfeng-hover made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1417 -- @Johnson-Wang made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1496 -- @jere357 made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1460 -- @mfernezir made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1494 -- @donglixp made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1503 -- @YuanLiuuuuuu made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1307 -- @Dawn-bin made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1527 - -### V0.23.0 (4/1/2022) - -**Highlights** - -- Support BEiT: BERT Pre-Training of Image Transformers -- Support K-Net: Towards Unified Image Segmentation -- Add `avg_non_ignore` of CELoss to support average loss over non-ignored elements -- Support dataset initialization with file client - -**New Features** - -- Support BEiT: BERT Pre-Training of Image Transformers ([#1404](https://github.com/open-mmlab/mmsegmentation/pull/1404)) -- Support K-Net: Towards Unified Image Segmentation ([#1289](https://github.com/open-mmlab/mmsegmentation/pull/1289)) -- Support dataset initialization with file client ([#1402](https://github.com/open-mmlab/mmsegmentation/pull/1402)) -- Add class name function for STARE datasets ([#1376](https://github.com/open-mmlab/mmsegmentation/pull/1376)) -- Support different seeds on different ranks when 
distributed training ([#1362](https://github.com/open-mmlab/mmsegmentation/pull/1362)) -- Add `nlc2nchw2nlc` and `nchw2nlc2nchw` to simplify tensor with different dimension operation ([#1249](https://github.com/open-mmlab/mmsegmentation/pull/1249)) - -**Improvements** - -- Synchronize random seed for distributed sampler ([#1411](https://github.com/open-mmlab/mmsegmentation/pull/1411)) -- Add script and documentation for multi-machine distributed training ([#1383](https://github.com/open-mmlab/mmsegmentation/pull/1383)) - -**Bug Fixes** - -- Add `avg_non_ignore` of CELoss to support average loss over non-ignored elements ([#1409](https://github.com/open-mmlab/mmsegmentation/pull/1409)) -- Fix some wrong URLs of models or logs in `./configs` ([#1336](https://github.com/open-mmlab/mmsegmentation/pull/1433)) -- Add title and color theme arguments to plot function in `tools/confusion_matrix.py` ([#1401](https://github.com/open-mmlab/mmsegmentation/pull/1401)) -- Fix outdated link in Colab demo ([#1392](https://github.com/open-mmlab/mmsegmentation/pull/1392)) -- Fix typos ([#1424](https://github.com/open-mmlab/mmsegmentation/pull/1424), [#1405](https://github.com/open-mmlab/mmsegmentation/pull/1405), [#1371](https://github.com/open-mmlab/mmsegmentation/pull/1371), [#1366](https://github.com/open-mmlab/mmsegmentation/pull/1366), [#1363](https://github.com/open-mmlab/mmsegmentation/pull/1363)) - -**Documentation** - -- Add FAQ document ([#1420](https://github.com/open-mmlab/mmsegmentation/pull/1420)) -- Fix the config name style description in official docs([#1414](https://github.com/open-mmlab/mmsegmentation/pull/1414)) - -**Contributors** - -- @kinglintianxia made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1371 -- @CCODING04 made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1376 -- @mob5566 made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1401 -- @xiongnemo made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1392 -- @Xiangxu-0103 made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1405 - -### V0.22.1 (3/9/2022) - -**Bug Fixes** - -- Fix the ZeroDivisionError that all pixels in one image is ignored. ([#1336](https://github.com/open-mmlab/mmsegmentation/pull/1336)) - -**Improvements** - -- Provide URLs of STDC, Segmenter and Twins pretrained models ([#1272](https://github.com/open-mmlab/mmsegmentation/pull/1357)) - -### V0.22 (3/04/2022) - -**Highlights** - -- Support ConvNeXt: A ConvNet for the 2020s. Please use the latest MMClassification (0.21.0) to try it out. -- Support iSAID aerial Dataset. -- Officially Support inference on Windows OS. - -**New Features** - -- Support ConvNeXt: A ConvNet for the 2020s. ([#1216](https://github.com/open-mmlab/mmsegmentation/pull/1216)) -- Support iSAID aerial Dataset. ([#1115](https://github.com/open-mmlab/mmsegmentation/pull/1115) -- Generating and plotting confusion matrix. ([#1301](https://github.com/open-mmlab/mmsegmentation/pull/1301)) - -**Improvements** - -- Refactor 4 decoder heads (ASPP, FCN, PSP, UPer): Split forward function into `_forward_feature` and `cls_seg`. ([#1299](https://github.com/open-mmlab/mmsegmentation/pull/1299)) -- Add `min_size` arg in `Resize` to keep the shape after resize bigger than slide window. ([#1318](https://github.com/open-mmlab/mmsegmentation/pull/1318)) -- Revise pre-commit-hooks. 
([#1315](https://github.com/open-mmlab/mmsegmentation/pull/1315)) -- Add win-ci. ([#1296](https://github.com/open-mmlab/mmsegmentation/pull/1296)) - -**Bug Fixes** - -- Fix `mlp_ratio` type in Swin Transformer. ([#1274](https://github.com/open-mmlab/mmsegmentation/pull/1274)) -- Fix path errors in `./demo` . ([#1269](https://github.com/open-mmlab/mmsegmentation/pull/1269)) -- Fix bug in conversion of potsdam. ([#1279](https://github.com/open-mmlab/mmsegmentation/pull/1279)) -- Make accuracy take into account `ignore_index`. ([#1259](https://github.com/open-mmlab/mmsegmentation/pull/1259)) -- Add Pytorch HardSwish assertion in unit test. ([#1294](https://github.com/open-mmlab/mmsegmentation/pull/1294)) -- Fix wrong palette value in vaihingen. ([#1292](https://github.com/open-mmlab/mmsegmentation/pull/1292)) -- Fix the bug that SETR cannot load pretrain. ([#1293](https://github.com/open-mmlab/mmsegmentation/pull/1293)) -- Update correct `In Collection` in metafile of each configs. ([#1239](https://github.com/open-mmlab/mmsegmentation/pull/1239)) -- Upload completed STDC models. ([#1332](https://github.com/open-mmlab/mmsegmentation/pull/1332)) -- Fix `DNLHead` exports onnx inference difference type Cast error. ([#1161](https://github.com/open-mmlab/mmsegmentation/pull/1332)) - -**Contributors** - -- @JiaYanhao made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1269 -- @andife made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1281 -- @SBCV made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1279 -- @HJoonKwon made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1259 -- @Tsingularity made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1290 -- @Waterman0524 made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1115 -- @MeowZheng made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1315 -- @linfangjian01 made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1318 - -### V0.21.1 (2/9/2022) - -**Bug Fixes** - -- Fix typos in docs. ([#1263](https://github.com/open-mmlab/mmsegmentation/pull/1263)) -- Fix repeating log by `setup_multi_processes`. ([#1267](https://github.com/open-mmlab/mmsegmentation/pull/1267)) -- Upgrade isort in pre-commit hook. ([#1270](https://github.com/open-mmlab/mmsegmentation/pull/1270)) - -**Improvements** - -- Use MMCV load_state_dict func in ViT/Swin. ([#1272](https://github.com/open-mmlab/mmsegmentation/pull/1272)) -- Add exception for PointRend for support CPU-only. ([#1271](https://github.com/open-mmlab/mmsegmentation/pull/1270)) - -### V0.21 (1/29/2022) - -**Highlights** - -- Officially Support CPUs training and inference, please use the latest MMCV (1.4.4) to try it out. -- Support Segmenter: Transformer for Semantic Segmentation (ICCV'2021). -- Support ISPRS Potsdam and Vaihingen Dataset. -- Add Mosaic transform and `MultiImageMixDataset` class in `dataset_wrappers`. 
- -**New Features** - -- Support Segmenter: Transformer for Semantic Segmentation (ICCV'2021) ([#955](https://github.com/open-mmlab/mmsegmentation/pull/955)) -- Support ISPRS Potsdam and Vaihingen Dataset ([#1097](https://github.com/open-mmlab/mmsegmentation/pull/1097), [#1171](https://github.com/open-mmlab/mmsegmentation/pull/1171)) -- Add segformer‘s benchmark on cityscapes ([#1155](https://github.com/open-mmlab/mmsegmentation/pull/1155)) -- Add auto resume ([#1172](https://github.com/open-mmlab/mmsegmentation/pull/1172)) -- Add Mosaic transform and `MultiImageMixDataset` class in `dataset_wrappers` ([#1093](https://github.com/open-mmlab/mmsegmentation/pull/1093), [#1105](https://github.com/open-mmlab/mmsegmentation/pull/1105)) -- Add log collector ([#1175](https://github.com/open-mmlab/mmsegmentation/pull/1175)) - -**Improvements** - -- New-style CPU training and inference ([#1251](https://github.com/open-mmlab/mmsegmentation/pull/1251)) -- Add UNet benchmark with multiple losses supervision ([#1143](https://github.com/open-mmlab/mmsegmentation/pull/1143)) - -**Bug Fixes** - -- Fix the model statistics in doc for readthedoc ([#1153](https://github.com/open-mmlab/mmsegmentation/pull/1153)) -- Set random seed for `palette` if not given ([#1152](https://github.com/open-mmlab/mmsegmentation/pull/1152)) -- Add `COCOStuffDataset` in `class_names.py` ([#1222](https://github.com/open-mmlab/mmsegmentation/pull/1222)) -- Fix bug in non-distributed multi-gpu training/testing ([#1247](https://github.com/open-mmlab/mmsegmentation/pull/1247)) -- Delete unnecessary lines of STDCHead ([#1231](https://github.com/open-mmlab/mmsegmentation/pull/1231)) - -**Contributors** - -- @jbwang1997 made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1152 -- @BeaverCC made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1206 -- @Echo-minn made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1214 -- @rstrudel made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/955 - -### V0.20.2 (12/15/2021) - -**Bug Fixes** - -- Revise --option to --options to avoid BC-breaking. ([#1140](https://github.com/open-mmlab/mmsegmentation/pull/1140)) - -### V0.20.1 (12/14/2021) - -**Improvements** - -- Change options to cfg-options ([#1129](https://github.com/open-mmlab/mmsegmentation/pull/1129)) - -**Bug Fixes** - -- Fix `` in metafile. 
([#1127](https://github.com/open-mmlab/mmsegmentation/pull/1127)) -- Fix correct `num_classes` of HRNet in `LoveDA` dataset ([#1136](https://github.com/open-mmlab/mmsegmentation/pull/1136)) - -### V0.20 (12/10/2021) - -**Highlights** - -- Support Twins ([#989](https://github.com/open-mmlab/mmsegmentation/pull/989)) -- Support a real-time segmentation model STDC ([#995](https://github.com/open-mmlab/mmsegmentation/pull/995)) -- Support a widely-used segmentation model in lane detection ERFNet ([#960](https://github.com/open-mmlab/mmsegmentation/pull/960)) -- Support A Remote Sensing Land-Cover Dataset LoveDA ([#1028](https://github.com/open-mmlab/mmsegmentation/pull/1028)) -- Support focal loss ([#1024](https://github.com/open-mmlab/mmsegmentation/pull/1024)) - -**New Features** - -- Support Twins ([#989](https://github.com/open-mmlab/mmsegmentation/pull/989)) -- Support a real-time segmentation model STDC ([#995](https://github.com/open-mmlab/mmsegmentation/pull/995)) -- Support a widely-used segmentation model in lane detection ERFNet ([#960](https://github.com/open-mmlab/mmsegmentation/pull/960)) -- Add SETR cityscapes benchmark ([#1087](https://github.com/open-mmlab/mmsegmentation/pull/1087)) -- Add BiSeNetV1 COCO-Stuff 164k benchmark ([#1019](https://github.com/open-mmlab/mmsegmentation/pull/1019)) -- Support focal loss ([#1024](https://github.com/open-mmlab/mmsegmentation/pull/1024)) -- Add Cutout transform ([#1022](https://github.com/open-mmlab/mmsegmentation/pull/1022)) - -**Improvements** - -- Set a random seed when the user does not set a seed ([#1039](https://github.com/open-mmlab/mmsegmentation/pull/1039)) -- Add CircleCI setup ([#1086](https://github.com/open-mmlab/mmsegmentation/pull/1086)) -- Skip CI on ignoring given paths ([#1078](https://github.com/open-mmlab/mmsegmentation/pull/1078)) -- Add abstract and image for every paper ([#1060](https://github.com/open-mmlab/mmsegmentation/pull/1060)) -- Create a symbolic link on windows ([#1090](https://github.com/open-mmlab/mmsegmentation/pull/1090)) -- Support video demo using trained model ([#1014](https://github.com/open-mmlab/mmsegmentation/pull/1014)) - -**Bug Fixes** - -- Fix incorrectly loading init_cfg or pretrained models of several transformer models ([#999](https://github.com/open-mmlab/mmsegmentation/pull/999), [#1069](https://github.com/open-mmlab/mmsegmentation/pull/1069), [#1102](https://github.com/open-mmlab/mmsegmentation/pull/1102)) -- Fix EfficientMultiheadAttention in SegFormer ([#1037](https://github.com/open-mmlab/mmsegmentation/pull/1037)) -- Remove `fp16` folder in `configs` ([#1031](https://github.com/open-mmlab/mmsegmentation/pull/1031)) -- Fix several typos in .yml file (Dice Metric [#1041](https://github.com/open-mmlab/mmsegmentation/pull/1041), ADE20K dataset [#1120](https://github.com/open-mmlab/mmsegmentation/pull/1120), Training Memory (GB) [#1083](https://github.com/open-mmlab/mmsegmentation/pull/1083)) -- Fix test error when using `--show-dir` ([#1091](https://github.com/open-mmlab/mmsegmentation/pull/1091)) -- Fix dist training infinite waiting issue ([#1035](https://github.com/open-mmlab/mmsegmentation/pull/1035)) -- Change the upper version of mmcv to 1.5.0 ([#1096](https://github.com/open-mmlab/mmsegmentation/pull/1096)) -- Fix symlink failure on Windows ([#1038](https://github.com/open-mmlab/mmsegmentation/pull/1038)) -- Cancel previous runs that are not completed ([#1118](https://github.com/open-mmlab/mmsegmentation/pull/1118)) -- Unified links of readthedocs in docs 
([#1119](https://github.com/open-mmlab/mmsegmentation/pull/1119)) - -**Contributors** - -- @Junjue-Wang made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1028 -- @ddebby made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1066 -- @del-zhenwu made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1078 -- @KangBK0120 made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1106 -- @zergzzlun made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1091 -- @fingertap made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1035 -- @irvingzhang0512 made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1014 -- @littleSunlxy made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/989 -- @lkm2835 -- @RockeyCoss -- @MengzhangLI -- @Junjun2016 -- @xiexinch -- @xvjiarui - -### V0.19 (11/02/2021) - -**Highlights** - -- Support TIMMBackbone wrapper ([#998](https://github.com/open-mmlab/mmsegmentation/pull/998)) -- Support custom hook ([#428](https://github.com/open-mmlab/mmsegmentation/pull/428)) -- Add codespell pre-commit hook ([#920](https://github.com/open-mmlab/mmsegmentation/pull/920)) -- Add FastFCN benchmark on ADE20K ([#972](https://github.com/open-mmlab/mmsegmentation/pull/972)) - -**New Features** - -- Support TIMMBackbone wrapper ([#998](https://github.com/open-mmlab/mmsegmentation/pull/998)) -- Support custom hook ([#428](https://github.com/open-mmlab/mmsegmentation/pull/428)) -- Add FastFCN benchmark on ADE20K ([#972](https://github.com/open-mmlab/mmsegmentation/pull/972)) -- Add codespell pre-commit hook and fix typos ([#920](https://github.com/open-mmlab/mmsegmentation/pull/920)) - -**Improvements** - -- Make inputs & channels smaller in unittests ([#1004](https://github.com/open-mmlab/mmsegmentation/pull/1004)) -- Change `self.loss_decode` back to `dict` in Single Loss situation ([#1002](https://github.com/open-mmlab/mmsegmentation/pull/1002)) - -**Bug Fixes** - -- Fix typo in usage example ([#1003](https://github.com/open-mmlab/mmsegmentation/pull/1003)) -- Add contiguous after permutation in ViT ([#992](https://github.com/open-mmlab/mmsegmentation/pull/992)) -- Fix the invalid link ([#985](https://github.com/open-mmlab/mmsegmentation/pull/985)) -- Fix bug in CI with python 3.9 ([#994](https://github.com/open-mmlab/mmsegmentation/pull/994)) -- Fix bug when loading class name form file in custom dataset ([#923](https://github.com/open-mmlab/mmsegmentation/pull/923)) - -**Contributors** - -- @ShoupingShan made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/923 -- @RockeyCoss made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/954 -- @HarborYuan made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/992 -- @lkm2835 made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1003 -- @gszh made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/428 -- @VVsssssk -- @MengzhangLI -- @Junjun2016 - -### V0.18 (10/07/2021) - -**Highlights** - -- Support three real-time segmentation models (ICNet [#884](https://github.com/open-mmlab/mmsegmentation/pull/884), BiSeNetV1 [#851](https://github.com/open-mmlab/mmsegmentation/pull/851), and BiSeNetV2 [#804](https://github.com/open-mmlab/mmsegmentation/pull/804)) -- Support one efficient 
segmentation model (FastFCN [#885](https://github.com/open-mmlab/mmsegmentation/pull/885)) -- Support one efficient non-local/self-attention based segmentation model (ISANet [#70](https://github.com/open-mmlab/mmsegmentation/pull/70)) -- Support COCO-Stuff 10k and 164k datasets ([#625](https://github.com/open-mmlab/mmsegmentation/pull/625)) -- Support evaluate concated dataset separately ([#833](https://github.com/open-mmlab/mmsegmentation/pull/833)) -- Support loading GT for evaluation from multi-file backend ([#867](https://github.com/open-mmlab/mmsegmentation/pull/867)) - -**New Features** - -- Support three real-time segmentation models (ICNet [#884](https://github.com/open-mmlab/mmsegmentation/pull/884), BiSeNetV1 [#851](https://github.com/open-mmlab/mmsegmentation/pull/851), and BiSeNetV2 [#804](https://github.com/open-mmlab/mmsegmentation/pull/804)) -- Support one efficient segmentation model (FastFCN [#885](https://github.com/open-mmlab/mmsegmentation/pull/885)) -- Support one efficient non-local/self-attention based segmentation model (ISANet [#70](https://github.com/open-mmlab/mmsegmentation/pull/70)) -- Support COCO-Stuff 10k and 164k datasets ([#625](https://github.com/open-mmlab/mmsegmentation/pull/625)) -- Support evaluate concated dataset separately ([#833](https://github.com/open-mmlab/mmsegmentation/pull/833)) - -**Improvements** - -- Support loading GT for evaluation from multi-file backend ([#867](https://github.com/open-mmlab/mmsegmentation/pull/867)) -- Auto-convert SyncBN to BN when training on DP automatly([#772](https://github.com/open-mmlab/mmsegmentation/pull/772)) -- Refactor Swin-Transformer ([#800](https://github.com/open-mmlab/mmsegmentation/pull/800)) - -**Bug Fixes** - -- Update mmcv installation in dockerfile ([#860](https://github.com/open-mmlab/mmsegmentation/pull/860)) -- Fix number of iteration bug when resuming checkpoint in distributed train ([#866](https://github.com/open-mmlab/mmsegmentation/pull/866)) -- Fix parsing parse in val_step ([#906](https://github.com/open-mmlab/mmsegmentation/pull/906)) - -### V0.17 (09/01/2021) - -**Highlights** - -- Support SegFormer -- Support DPT -- Support Dark Zurich and Nighttime Driving datasets -- Support progressive evaluation - -**New Features** - -- Support SegFormer ([#599](https://github.com/open-mmlab/mmsegmentation/pull/599)) -- Support DPT ([#605](https://github.com/open-mmlab/mmsegmentation/pull/605)) -- Support Dark Zurich and Nighttime Driving datasets ([#815](https://github.com/open-mmlab/mmsegmentation/pull/815)) -- Support progressive evaluation ([#709](https://github.com/open-mmlab/mmsegmentation/pull/709)) - -**Improvements** - -- Add multiscale_output interface and unittests for HRNet ([#830](https://github.com/open-mmlab/mmsegmentation/pull/830)) -- Support inherit cityscapes dataset ([#750](https://github.com/open-mmlab/mmsegmentation/pull/750)) -- Fix some typos in README.md ([#824](https://github.com/open-mmlab/mmsegmentation/pull/824)) -- Delete convert function and add instruction to ViT/Swin README.md ([#791](https://github.com/open-mmlab/mmsegmentation/pull/791)) -- Add vit/swin/mit convert weight scripts ([#783](https://github.com/open-mmlab/mmsegmentation/pull/783)) -- Add copyright files ([#796](https://github.com/open-mmlab/mmsegmentation/pull/796)) - -**Bug Fixes** - -- Fix invalid checkpoint link in inference_demo.ipynb ([#814](https://github.com/open-mmlab/mmsegmentation/pull/814)) -- Ensure that items in dataset have the same order across multi machine 
([#780](https://github.com/open-mmlab/mmsegmentation/pull/780)) -- Fix the log error ([#766](https://github.com/open-mmlab/mmsegmentation/pull/766)) - -### V0.16 (08/04/2021) - -**Highlights** - -- Support PyTorch 1.9 -- Support SegFormer backbone MiT -- Support md2yml pre-commit hook -- Support frozen stage for HRNet - -**New Features** - -- Support SegFormer backbone MiT ([#594](https://github.com/open-mmlab/mmsegmentation/pull/594)) -- Support md2yml pre-commit hook ([#732](https://github.com/open-mmlab/mmsegmentation/pull/732)) -- Support mim ([#717](https://github.com/open-mmlab/mmsegmentation/pull/717)) -- Add mmseg2torchserve tool ([#552](https://github.com/open-mmlab/mmsegmentation/pull/552)) - -**Improvements** - -- Support hrnet frozen stage ([#743](https://github.com/open-mmlab/mmsegmentation/pull/743)) -- Add template of reimplementation questions ([#741](https://github.com/open-mmlab/mmsegmentation/pull/741)) -- Output pdf and epub formats for readthedocs ([#742](https://github.com/open-mmlab/mmsegmentation/pull/742)) -- Refine the docstring of ResNet ([#723](https://github.com/open-mmlab/mmsegmentation/pull/723)) -- Replace interpolate with resize ([#731](https://github.com/open-mmlab/mmsegmentation/pull/731)) -- Update resource limit ([#700](https://github.com/open-mmlab/mmsegmentation/pull/700)) -- Update config.md ([#678](https://github.com/open-mmlab/mmsegmentation/pull/678)) - -**Bug Fixes** - -- Fix ATTENTION registry ([#729](https://github.com/open-mmlab/mmsegmentation/pull/729)) -- Fix analyze log script ([#716](https://github.com/open-mmlab/mmsegmentation/pull/716)) -- Fix doc api display ([#725](https://github.com/open-mmlab/mmsegmentation/pull/725)) -- Fix patch_embed and pos_embed mismatch error ([#685](https://github.com/open-mmlab/mmsegmentation/pull/685)) -- Fix efficient test for multi-node ([#707](https://github.com/open-mmlab/mmsegmentation/pull/707)) -- Fix init_cfg in resnet backbone ([#697](https://github.com/open-mmlab/mmsegmentation/pull/697)) -- Fix efficient test bug ([#702](https://github.com/open-mmlab/mmsegmentation/pull/702)) -- Fix url error in config docs ([#680](https://github.com/open-mmlab/mmsegmentation/pull/680)) -- Fix mmcv installation ([#676](https://github.com/open-mmlab/mmsegmentation/pull/676)) -- Fix torch version ([#670](https://github.com/open-mmlab/mmsegmentation/pull/670)) - -**Contributors** - -@sshuair @xiexinch @Junjun2016 @mmeendez8 @xvjiarui @sennnnn @puhsu @BIGWangYuDong @keke1u @daavoo - -### V0.15 (07/04/2021) - -**Highlights** - -- Support ViT, SETR, and Swin-Transformer -- Add Chinese documentation -- Unified parameter initialization - -**Bug Fixes** - -- Fix typo and links ([#608](https://github.com/open-mmlab/mmsegmentation/pull/608)) -- Fix Dockerfile ([#607](https://github.com/open-mmlab/mmsegmentation/pull/607)) -- Fix ViT init ([#609](https://github.com/open-mmlab/mmsegmentation/pull/609)) -- Fix mmcv version compatible table ([#658](https://github.com/open-mmlab/mmsegmentation/pull/658)) -- Fix model links of DMNEt ([#660](https://github.com/open-mmlab/mmsegmentation/pull/660)) - -**New Features** - -- Support loading DeiT weights ([#538](https://github.com/open-mmlab/mmsegmentation/pull/538)) -- Support SETR ([#531](https://github.com/open-mmlab/mmsegmentation/pull/531), [#635](https://github.com/open-mmlab/mmsegmentation/pull/635)) -- Add config and models for ViT backbone with UperHead ([#520](https://github.com/open-mmlab/mmsegmentation/pull/531), [#635](https://github.com/open-mmlab/mmsegmentation/pull/520)) 
-- Support Swin-Transformer ([#511](https://github.com/open-mmlab/mmsegmentation/pull/511)) -- Add higher accuracy FastSCNN ([#606](https://github.com/open-mmlab/mmsegmentation/pull/606)) -- Add Chinese documentation ([#666](https://github.com/open-mmlab/mmsegmentation/pull/666)) - -**Improvements** - -- Unified parameter initialization ([#567](https://github.com/open-mmlab/mmsegmentation/pull/567)) -- Separate CUDA and CPU in github action CI ([#602](https://github.com/open-mmlab/mmsegmentation/pull/602)) -- Support persistent dataloader worker ([#646](https://github.com/open-mmlab/mmsegmentation/pull/646)) -- Update meta file fields ([#661](https://github.com/open-mmlab/mmsegmentation/pull/661), [#664](https://github.com/open-mmlab/mmsegmentation/pull/664)) - -### V0.14 (06/02/2021) - -**Highlights** - -- Support ONNX to TensorRT -- Support MIM - -**Bug Fixes** - -- Fix ONNX to TensorRT verify ([#547](https://github.com/open-mmlab/mmsegmentation/pull/547)) -- Fix save best for EvalHook ([#575](https://github.com/open-mmlab/mmsegmentation/pull/575)) - -**New Features** - -- Support loading DeiT weights ([#538](https://github.com/open-mmlab/mmsegmentation/pull/538)) -- Support ONNX to TensorRT ([#542](https://github.com/open-mmlab/mmsegmentation/pull/542)) -- Support output results for ADE20k ([#544](https://github.com/open-mmlab/mmsegmentation/pull/544)) -- Support MIM ([#549](https://github.com/open-mmlab/mmsegmentation/pull/549)) - -**Improvements** - -- Add option for ViT output shape ([#530](https://github.com/open-mmlab/mmsegmentation/pull/530)) -- Infer batch size using len(result) ([#532](https://github.com/open-mmlab/mmsegmentation/pull/532)) -- Add compatible table between MMSeg and MMCV ([#558](https://github.com/open-mmlab/mmsegmentation/pull/558)) - -### V0.13 (05/05/2021) - -**Highlights** - -- Support Pascal Context Class-59 dataset. -- Support Visual Transformer Backbone. -- Support mFscore metric. 
- -**Bug Fixes** - -- Fixed Colaboratory tutorial ([#451](https://github.com/open-mmlab/mmsegmentation/pull/451)) -- Fixed mIoU calculation range ([#471](https://github.com/open-mmlab/mmsegmentation/pull/471)) -- Fixed sem_fpn, unet README.md ([#492](https://github.com/open-mmlab/mmsegmentation/pull/492)) -- Fixed `num_classes` in FCN for Pascal Context 60-class dataset ([#488](https://github.com/open-mmlab/mmsegmentation/pull/488)) -- Fixed FP16 inference ([#497](https://github.com/open-mmlab/mmsegmentation/pull/497)) - -**New Features** - -- Support dynamic export and visualize to pytorch2onnx ([#463](https://github.com/open-mmlab/mmsegmentation/pull/463)) -- Support export to torchscript ([#469](https://github.com/open-mmlab/mmsegmentation/pull/469), [#499](https://github.com/open-mmlab/mmsegmentation/pull/499)) -- Support Pascal Context Class-59 dataset ([#459](https://github.com/open-mmlab/mmsegmentation/pull/459)) -- Support Visual Transformer backbone ([#465](https://github.com/open-mmlab/mmsegmentation/pull/465)) -- Support UpSample Neck ([#512](https://github.com/open-mmlab/mmsegmentation/pull/512)) -- Support mFscore metric ([#509](https://github.com/open-mmlab/mmsegmentation/pull/509)) - -**Improvements** - -- Add more CI for PyTorch ([#460](https://github.com/open-mmlab/mmsegmentation/pull/460)) -- Add print model graph args for tools/print_config.py ([#451](https://github.com/open-mmlab/mmsegmentation/pull/451)) -- Add cfg links in modelzoo README.md ([#468](https://github.com/open-mmlab/mmsegmentation/pull/469)) -- Add BaseSegmentor import to segmentors/__init__.py ([#495](https://github.com/open-mmlab/mmsegmentation/pull/495)) -- Add MMOCR, MMGeneration links ([#501](https://github.com/open-mmlab/mmsegmentation/pull/501), [#506](https://github.com/open-mmlab/mmsegmentation/pull/506)) -- Add Chinese QR code ([#506](https://github.com/open-mmlab/mmsegmentation/pull/506)) -- Use MMCV MODEL_REGISTRY ([#515](https://github.com/open-mmlab/mmsegmentation/pull/515)) -- Add ONNX testing tools ([#498](https://github.com/open-mmlab/mmsegmentation/pull/498)) -- Replace data_dict calling 'img' key to support MMDet3D ([#514](https://github.com/open-mmlab/mmsegmentation/pull/514)) -- Support reading class_weight from file in loss function ([#513](https://github.com/open-mmlab/mmsegmentation/pull/513)) -- Make tags as comment ([#505](https://github.com/open-mmlab/mmsegmentation/pull/505)) -- Use MMCV EvalHook ([#438](https://github.com/open-mmlab/mmsegmentation/pull/438)) - -### V0.12 (04/03/2021) - -**Highlights** - -- Support FCN-Dilate 6 model. -- Support Dice Loss. 
- -**Bug Fixes** - -- Fixed PhotoMetricDistortion Doc ([#388](https://github.com/open-mmlab/mmsegmentation/pull/388)) -- Fixed install scripts ([#399](https://github.com/open-mmlab/mmsegmentation/pull/399)) -- Fixed Dice Loss multi-class ([#417](https://github.com/open-mmlab/mmsegmentation/pull/417)) - -**New Features** - -- Support Dice Loss ([#396](https://github.com/open-mmlab/mmsegmentation/pull/396)) -- Add plot logs tool ([#426](https://github.com/open-mmlab/mmsegmentation/pull/426)) -- Add opacity option to show_result ([#425](https://github.com/open-mmlab/mmsegmentation/pull/425)) -- Speed up mIoU metric ([#430](https://github.com/open-mmlab/mmsegmentation/pull/430)) - -**Improvements** - -- Refactor unittest file structure ([#440](https://github.com/open-mmlab/mmsegmentation/pull/440)) -- Fix typos in the repo ([#449](https://github.com/open-mmlab/mmsegmentation/pull/449)) -- Include class-level metrics in the log ([#445](https://github.com/open-mmlab/mmsegmentation/pull/445)) - -### V0.11 (02/02/2021) - -**Highlights** - -- Support memory efficient test, add more UNet models. - -**Bug Fixes** - -- Fixed TTA resize scale ([#334](https://github.com/open-mmlab/mmsegmentation/pull/334)) -- Fixed CI for pip 20.3 ([#307](https://github.com/open-mmlab/mmsegmentation/pull/307)) -- Fixed ADE20k test ([#359](https://github.com/open-mmlab/mmsegmentation/pull/359)) - -**New Features** - -- Support memory efficient test ([#330](https://github.com/open-mmlab/mmsegmentation/pull/330)) -- Add more UNet benchmarks ([#324](https://github.com/open-mmlab/mmsegmentation/pull/324)) -- Support Lovasz Loss ([#351](https://github.com/open-mmlab/mmsegmentation/pull/351)) - -**Improvements** - -- Move train_cfg/test_cfg inside model ([#341](https://github.com/open-mmlab/mmsegmentation/pull/341)) - -### V0.10 (01/01/2021) - -**Highlights** - -- Support MobileNetV3, DMNet, APCNet. Add models of ResNet18V1b, ResNet18V1c, ResNet50V1b. - -**Bug Fixes** - -- Fixed CPU TTA ([#276](https://github.com/open-mmlab/mmsegmentation/pull/276)) -- Fixed CI for pip 20.3 ([#307](https://github.com/open-mmlab/mmsegmentation/pull/307)) - -**New Features** - -- Add ResNet18V1b, ResNet18V1c, ResNet50V1b, ResNet101V1b models ([#316](https://github.com/open-mmlab/mmsegmentation/pull/316)) -- Support MobileNetV3 ([#268](https://github.com/open-mmlab/mmsegmentation/pull/268)) -- Add 4 retinal vessel segmentation benchmark ([#315](https://github.com/open-mmlab/mmsegmentation/pull/315)) -- Support DMNet ([#313](https://github.com/open-mmlab/mmsegmentation/pull/313)) -- Support APCNet ([#299](https://github.com/open-mmlab/mmsegmentation/pull/299)) - -**Improvements** - -- Refactor Documentation page ([#311](https://github.com/open-mmlab/mmsegmentation/pull/311)) -- Support resize data augmentation according to original image size ([#291](https://github.com/open-mmlab/mmsegmentation/pull/291)) - -### V0.9 (30/11/2020) - -**Highlights** - -- Support 4 medical dataset, UNet and CGNet. 
- -**New Features** - -- Support RandomRotate transform ([#215](https://github.com/open-mmlab/mmsegmentation/pull/215), [#260](https://github.com/open-mmlab/mmsegmentation/pull/260)) -- Support RGB2Gray transform ([#227](https://github.com/open-mmlab/mmsegmentation/pull/227)) -- Support Rerange transform ([#228](https://github.com/open-mmlab/mmsegmentation/pull/228)) -- Support ignore_index for BCE loss ([#210](https://github.com/open-mmlab/mmsegmentation/pull/210)) -- Add modelzoo statistics ([#263](https://github.com/open-mmlab/mmsegmentation/pull/263)) -- Support Dice evaluation metric ([#225](https://github.com/open-mmlab/mmsegmentation/pull/225)) -- Support Adjust Gamma transform ([#232](https://github.com/open-mmlab/mmsegmentation/pull/232)) -- Support CLAHE transform ([#229](https://github.com/open-mmlab/mmsegmentation/pull/229)) - -**Bug Fixes** - -- Fixed detail API link ([#267](https://github.com/open-mmlab/mmsegmentation/pull/267)) - -### V0.8 (03/11/2020) - -**Highlights** - -- Support 4 medical dataset, UNet and CGNet. - -**New Features** - -- Support customize runner ([#118](https://github.com/open-mmlab/mmsegmentation/pull/118)) -- Support UNet ([#161](https://github.com/open-mmlab/mmsegmentation/pull/162)) -- Support CHASE_DB1, DRIVE, STARE, HRD ([#203](https://github.com/open-mmlab/mmsegmentation/pull/203)) -- Support CGNet ([#223](https://github.com/open-mmlab/mmsegmentation/pull/223)) - -### V0.7 (07/10/2020) - -**Highlights** - -- Support Pascal Context dataset and customizing class dataset. - -**Bug Fixes** - -- Fixed CPU inference ([#153](https://github.com/open-mmlab/mmsegmentation/pull/153)) - -**New Features** - -- Add DeepLab OS16 models ([#154](https://github.com/open-mmlab/mmsegmentation/pull/154)) -- Support Pascal Context dataset ([#133](https://github.com/open-mmlab/mmsegmentation/pull/133)) -- Support customizing dataset classes ([#71](https://github.com/open-mmlab/mmsegmentation/pull/71)) -- Support customizing dataset palette ([#157](https://github.com/open-mmlab/mmsegmentation/pull/157)) - -**Improvements** - -- Support 4D tensor output in ONNX ([#150](https://github.com/open-mmlab/mmsegmentation/pull/150)) -- Remove redundancies in ONNX export ([#160](https://github.com/open-mmlab/mmsegmentation/pull/160)) -- Migrate to MMCV DepthwiseSeparableConv ([#158](https://github.com/open-mmlab/mmsegmentation/pull/158)) -- Migrate to MMCV collect_env ([#137](https://github.com/open-mmlab/mmsegmentation/pull/137)) -- Use img_prefix and seg_prefix for loading ([#153](https://github.com/open-mmlab/mmsegmentation/pull/153)) - -### V0.6 (10/09/2020) - -**Highlights** - -- Support new methods i.e. MobileNetV2, EMANet, DNL, PointRend, Semantic FPN, Fast-SCNN, ResNeSt. 
- -**Bug Fixes** - -- Fixed sliding inference ONNX export ([#90](https://github.com/open-mmlab/mmsegmentation/pull/90)) - -**New Features** - -- Support MobileNet v2 ([#86](https://github.com/open-mmlab/mmsegmentation/pull/86)) -- Support EMANet ([#34](https://github.com/open-mmlab/mmsegmentation/pull/34)) -- Support DNL ([#37](https://github.com/open-mmlab/mmsegmentation/pull/37)) -- Support PointRend ([#109](https://github.com/open-mmlab/mmsegmentation/pull/109)) -- Support Semantic FPN ([#94](https://github.com/open-mmlab/mmsegmentation/pull/94)) -- Support Fast-SCNN ([#58](https://github.com/open-mmlab/mmsegmentation/pull/58)) -- Support ResNeSt backbone ([#47](https://github.com/open-mmlab/mmsegmentation/pull/47)) -- Support ONNX export (experimental) ([#12](https://github.com/open-mmlab/mmsegmentation/pull/12)) - -**Improvements** - -- Support Upsample in ONNX ([#100](https://github.com/open-mmlab/mmsegmentation/pull/100)) -- Support Windows install (experimental) ([#75](https://github.com/open-mmlab/mmsegmentation/pull/75)) -- Add more OCRNet results ([#20](https://github.com/open-mmlab/mmsegmentation/pull/20)) -- Add PyTorch 1.6 CI ([#64](https://github.com/open-mmlab/mmsegmentation/pull/64)) -- Get version and githash automatically ([#55](https://github.com/open-mmlab/mmsegmentation/pull/55)) - -### v0.5.1 (11/08/2020) - -**Highlights** - -- Support FP16 and more generalized OHEM - -**Bug Fixes** - -- Fixed Pascal VOC conversion script (#19) -- Fixed OHEM weight assign bug (#54) -- Fixed palette type when palette is not given (#27) - -**New Features** - -- Support FP16 (#21) -- Generalized OHEM (#54) - -**Improvements** - -- Add load-from flag (#33) -- Fixed training tricks doc about different learning rates of model (#26) diff --git a/docs/en/conf.py b/docs/en/conf.py index 87b16f2667..e20aab14b1 100644 --- a/docs/en/conf.py +++ b/docs/en/conf.py @@ -28,7 +28,7 @@ def get_version(): - with open(version_file, 'r') as f: + with open(version_file) as f: exec(compile(f.read(), version_file, 'exec')) return locals()['__version__'] diff --git a/docs/en/dataset_prepare.md b/docs/en/dataset_prepare.md deleted file mode 100644 index 4982ce1828..0000000000 --- a/docs/en/dataset_prepare.md +++ /dev/null @@ -1,378 +0,0 @@ -## Prepare datasets - -It is recommended to symlink the dataset root to `$MMSEGMENTATION/data`. -If your folder structure is different, you may need to change the corresponding paths in config files. 
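For example, a minimal sketch of the symlink setup (`/data/datasets` below is just a placeholder for wherever your datasets actually live):

```shell
# link an existing dataset root into the repository so that configs
# can use the default data/ prefix without modification
cd mmsegmentation
ln -s /data/datasets data
```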
- -```none -mmsegmentation -├── mmseg -├── tools -├── configs -├── data -│ ├── cityscapes -│ │ ├── leftImg8bit -│ │ │ ├── train -│ │ │ ├── val -│ │ ├── gtFine -│ │ │ ├── train -│ │ │ ├── val -│ ├── VOCdevkit -│ │ ├── VOC2012 -│ │ │ ├── JPEGImages -│ │ │ ├── SegmentationClass -│ │ │ ├── ImageSets -│ │ │ │ ├── Segmentation -│ │ ├── VOC2010 -│ │ │ ├── JPEGImages -│ │ │ ├── SegmentationClassContext -│ │ │ ├── ImageSets -│ │ │ │ ├── SegmentationContext -│ │ │ │ │ ├── train.txt -│ │ │ │ │ ├── val.txt -│ │ │ ├── trainval_merged.json -│ │ ├── VOCaug -│ │ │ ├── dataset -│ │ │ │ ├── cls -│ ├── ade -│ │ ├── ADEChallengeData2016 -│ │ │ ├── annotations -│ │ │ │ ├── training -│ │ │ │ ├── validation -│ │ │ ├── images -│ │ │ │ ├── training -│ │ │ │ ├── validation -│ ├── coco_stuff10k -│ │ ├── images -│ │ │ ├── train2014 -│ │ │ ├── test2014 -│ │ ├── annotations -│ │ │ ├── train2014 -│ │ │ ├── test2014 -│ │ ├── imagesLists -│ │ │ ├── train.txt -│ │ │ ├── test.txt -│ │ │ ├── all.txt -│ ├── coco_stuff164k -│ │ ├── images -│ │ │ ├── train2017 -│ │ │ ├── val2017 -│ │ ├── annotations -│ │ │ ├── train2017 -│ │ │ ├── val2017 -│ ├── CHASE_DB1 -│ │ ├── images -│ │ │ ├── training -│ │ │ ├── validation -│ │ ├── annotations -│ │ │ ├── training -│ │ │ ├── validation -│ ├── DRIVE -│ │ ├── images -│ │ │ ├── training -│ │ │ ├── validation -│ │ ├── annotations -│ │ │ ├── training -│ │ │ ├── validation -│ ├── HRF -│ │ ├── images -│ │ │ ├── training -│ │ │ ├── validation -│ │ ├── annotations -│ │ │ ├── training -│ │ │ ├── validation -│ ├── STARE -│ │ ├── images -│ │ │ ├── training -│ │ │ ├── validation -│ │ ├── annotations -│ │ │ ├── training -│ │ │ ├── validation -| ├── dark_zurich -| │   ├── gps -| │   │   ├── val -| │   │   └── val_ref -| │   ├── gt -| │   │   └── val -| │   ├── LICENSE.txt -| │   ├── lists_file_names -| │   │   ├── val_filenames.txt -| │   │   └── val_ref_filenames.txt -| │   ├── README.md -| │   └── rgb_anon -| │   | ├── val -| │   | └── val_ref -| ├── NighttimeDrivingTest -| | ├── gtCoarse_daytime_trainvaltest -| | │   └── test -| | │   └── night -| | └── leftImg8bit -| | | └── test -| | | └── night -│ ├── loveDA -│ │ ├── img_dir -│ │ │ ├── train -│ │ │ ├── val -│ │ │ ├── test -│ │ ├── ann_dir -│ │ │ ├── train -│ │ │ ├── val -│ ├── potsdam -│ │ ├── img_dir -│ │ │ ├── train -│ │ │ ├── val -│ │ ├── ann_dir -│ │ │ ├── train -│ │ │ ├── val -│ ├── vaihingen -│ │ ├── img_dir -│ │ │ ├── train -│ │ │ ├── val -│ │ ├── ann_dir -│ │ │ ├── train -│ │ │ ├── val -│ ├── iSAID -│ │ ├── img_dir -│ │ │ ├── train -│ │ │ ├── val -│ │ │ ├── test -│ │ ├── ann_dir -│ │ │ ├── train -│ │ │ ├── val -``` - -### Cityscapes - -The data could be found [here](https://www.cityscapes-dataset.com/downloads/) after registration. - -By convention, `**labelTrainIds.png` are used for cityscapes training. -We provided a [scripts](https://github.com/open-mmlab/mmsegmentation/blob/master/tools/convert_datasets/cityscapes.py) based on [cityscapesscripts](https://github.com/mcordts/cityscapesScripts) -to generate `**labelTrainIds.png`. - -```shell -# --nproc means 8 process for conversion, which could be omitted as well. -python tools/convert_datasets/cityscapes.py data/cityscapes --nproc 8 -``` - -### Pascal VOC - -Pascal VOC 2012 could be downloaded from [here](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar). 
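A minimal sketch of fetching and unpacking the archive (assuming the tarball's usual `VOCdevkit/VOC2012` layout, which matches the directory tree above):

```shell
# download VOC2012 and extract it under data/, yielding data/VOCdevkit/VOC2012
wget http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar
tar -xf VOCtrainval_11-May-2012.tar -C data/
```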
-Beside, most recent works on Pascal VOC dataset usually exploit extra augmentation data, which could be found [here](http://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/semantic_contours/benchmark.tgz). - -If you would like to use augmented VOC dataset, please run following command to convert augmentation annotations into proper format. - -```shell -# --nproc means 8 process for conversion, which could be omitted as well. -python tools/convert_datasets/voc_aug.py data/VOCdevkit data/VOCdevkit/VOCaug --nproc 8 -``` - -Please refer to [concat dataset](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/en/tutorials/customize_datasets.md#concatenate-dataset) for details about how to concatenate them and train them together. - -### ADE20K - -The training and validation set of ADE20K could be download from this [link](http://data.csail.mit.edu/places/ADEchallenge/ADEChallengeData2016.zip). -We may also download test set from [here](http://data.csail.mit.edu/places/ADEchallenge/release_test.zip). - -### Pascal Context - -The training and validation set of Pascal Context could be download from [here](http://host.robots.ox.ac.uk/pascal/VOC/voc2010/VOCtrainval_03-May-2010.tar). You may also download test set from [here](http://host.robots.ox.ac.uk:8080/eval/downloads/VOC2010test.tar) after registration. - -To split the training and validation set from original dataset, you may download trainval_merged.json from [here](https://codalabuser.blob.core.windows.net/public/trainval_merged.json). - -If you would like to use Pascal Context dataset, please install [Detail](https://github.com/zhanghang1989/detail-api) and then run the following command to convert annotations into proper format. - -```shell -python tools/convert_datasets/pascal_context.py data/VOCdevkit data/VOCdevkit/VOC2010/trainval_merged.json -``` - -### COCO Stuff 10k - -The data could be downloaded [here](http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/cocostuff-10k-v1.1.zip) by wget. - -For COCO Stuff 10k dataset, please run the following commands to download and convert the dataset. - -```shell -# download -mkdir coco_stuff10k && cd coco_stuff10k -wget http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/cocostuff-10k-v1.1.zip - -# unzip -unzip cocostuff-10k-v1.1.zip - -# --nproc means 8 process for conversion, which could be omitted as well. -python tools/convert_datasets/coco_stuff10k.py /path/to/coco_stuff10k --nproc 8 -``` - -By convention, mask labels in `/path/to/coco_stuff164k/annotations/*2014/*_labelTrainIds.png` are used for COCO Stuff 10k training and testing. - -### COCO Stuff 164k - -For COCO Stuff 164k dataset, please run the following commands to download and convert the augmented dataset. - -```shell -# download -mkdir coco_stuff164k && cd coco_stuff164k -wget http://images.cocodataset.org/zips/train2017.zip -wget http://images.cocodataset.org/zips/val2017.zip -wget http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/stuffthingmaps_trainval2017.zip - -# unzip -unzip train2017.zip -d images/ -unzip val2017.zip -d images/ -unzip stuffthingmaps_trainval2017.zip -d annotations/ - -# --nproc means 8 process for conversion, which could be omitted as well. -python tools/convert_datasets/coco_stuff164k.py /path/to/coco_stuff164k --nproc 8 -``` - -By convention, mask labels in `/path/to/coco_stuff164k/annotations/*2017/*_labelTrainIds.png` are used for COCO Stuff 164k training and testing. 
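As a quick sanity check on the converted masks, the label ids can be inspected directly (a hedged sketch; the exact file name under `annotations/train2017` depends on your download):

```python
import numpy as np
from PIL import Image

# load one converted mask and list the label ids it contains;
# 255 is the conventional ignore index
mask = np.array(Image.open(
    'data/coco_stuff164k/annotations/train2017/000000000009_labelTrainIds.png'))
print(mask.shape, np.unique(mask))
```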
- -The details of this dataset could be found at [here](https://github.com/nightrome/cocostuff#downloads). - -### CHASE DB1 - -The training and validation set of CHASE DB1 could be download from [here](https://staffnet.kingston.ac.uk/~ku15565/CHASE_DB1/assets/CHASEDB1.zip). - -To convert CHASE DB1 dataset to MMSegmentation format, you should run the following command: - -```shell -python tools/convert_datasets/chase_db1.py /path/to/CHASEDB1.zip -``` - -The script will make directory structure automatically. - -### DRIVE - -The training and validation set of DRIVE could be download from [here](https://drive.grand-challenge.org/). Before that, you should register an account. Currently '1st_manual' is not provided officially. - -To convert DRIVE dataset to MMSegmentation format, you should run the following command: - -```shell -python tools/convert_datasets/drive.py /path/to/training.zip /path/to/test.zip -``` - -The script will make directory structure automatically. - -### HRF - -First, download [healthy.zip](https://www5.cs.fau.de/fileadmin/research/datasets/fundus-images/healthy.zip), [glaucoma.zip](https://www5.cs.fau.de/fileadmin/research/datasets/fundus-images/glaucoma.zip), [diabetic_retinopathy.zip](https://www5.cs.fau.de/fileadmin/research/datasets/fundus-images/diabetic_retinopathy.zip), [healthy_manualsegm.zip](https://www5.cs.fau.de/fileadmin/research/datasets/fundus-images/healthy_manualsegm.zip), [glaucoma_manualsegm.zip](https://www5.cs.fau.de/fileadmin/research/datasets/fundus-images/glaucoma_manualsegm.zip) and [diabetic_retinopathy_manualsegm.zip](https://www5.cs.fau.de/fileadmin/research/datasets/fundus-images/diabetic_retinopathy_manualsegm.zip). - -To convert HRF dataset to MMSegmentation format, you should run the following command: - -```shell -python tools/convert_datasets/hrf.py /path/to/healthy.zip /path/to/healthy_manualsegm.zip /path/to/glaucoma.zip /path/to/glaucoma_manualsegm.zip /path/to/diabetic_retinopathy.zip /path/to/diabetic_retinopathy_manualsegm.zip -``` - -The script will make directory structure automatically. - -### STARE - -First, download [stare-images.tar](http://cecas.clemson.edu/~ahoover/stare/probing/stare-images.tar), [labels-ah.tar](http://cecas.clemson.edu/~ahoover/stare/probing/labels-ah.tar) and [labels-vk.tar](http://cecas.clemson.edu/~ahoover/stare/probing/labels-vk.tar). - -To convert STARE dataset to MMSegmentation format, you should run the following command: - -```shell -python tools/convert_datasets/stare.py /path/to/stare-images.tar /path/to/labels-ah.tar /path/to/labels-vk.tar -``` - -The script will make directory structure automatically. - -### Dark Zurich - -Since we only support test models on this dataset, you may only download [the validation set](https://data.vision.ee.ethz.ch/csakarid/shared/GCMA_UIoU/Dark_Zurich_val_anon.zip). - -### Nighttime Driving - -Since we only support test models on this dataset, you may only download [the test set](http://data.vision.ee.ethz.ch/daid/NighttimeDriving/NighttimeDrivingTest.zip). - -### LoveDA - -The data could be downloaded from Google Drive [here](https://drive.google.com/drive/folders/1ibYV0qwn4yuuh068Rnc-w4tPi0U0c-ti?usp=sharing). 
- -Or it can be downloaded from [zenodo](https://zenodo.org/record/5706578#.YZvN7SYRXdF), you should run the following command: - -```shell -# Download Train.zip -wget https://zenodo.org/record/5706578/files/Train.zip -# Download Val.zip -wget https://zenodo.org/record/5706578/files/Val.zip -# Download Test.zip -wget https://zenodo.org/record/5706578/files/Test.zip -``` - -For LoveDA dataset, please run the following command to download and re-organize the dataset. - -```shell -python tools/convert_datasets/loveda.py /path/to/loveDA -``` - -Using trained model to predict test set of LoveDA and submit it to server can be found [here](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/en/inference.md). - -More details about LoveDA can be found [here](https://github.com/Junjue-Wang/LoveDA). - -### ISPRS Potsdam - -The [Potsdam](https://www2.isprs.org/commissions/comm2/wg4/benchmark/2d-sem-label-potsdam/) -dataset is for urban semantic segmentation used in the 2D Semantic Labeling Contest - Potsdam. - -The dataset can be requested at the challenge [homepage](https://www2.isprs.org/commissions/comm2/wg4/benchmark/data-request-form/). -The '2_Ortho_RGB.zip' and '5_Labels_all_noBoundary.zip' are required. - -For Potsdam dataset, please run the following command to download and re-organize the dataset. - -```shell -python tools/convert_datasets/potsdam.py /path/to/potsdam -``` - -In our default setting, it will generate 3456 images for training and 2016 images for validation. - -### ISPRS Vaihingen - -The [Vaihingen](https://www2.isprs.org/commissions/comm2/wg4/benchmark/2d-sem-label-vaihingen/) -dataset is for urban semantic segmentation used in the 2D Semantic Labeling Contest - Vaihingen. - -The dataset can be requested at the challenge [homepage](https://www2.isprs.org/commissions/comm2/wg4/benchmark/data-request-form/). -The 'ISPRS_semantic_labeling_Vaihingen.zip' and 'ISPRS_semantic_labeling_Vaihingen_ground_truth_eroded_COMPLETE.zip' are required. - -For Vaihingen dataset, please run the following command to download and re-organize the dataset. - -```shell -python tools/convert_datasets/vaihingen.py /path/to/vaihingen -``` - -In our default setting (`clip_size` =512, `stride_size`=256), it will generate 344 images for training and 398 images for validation. - -### iSAID - -The data images could be download from [DOTA-v1.0](https://captain-whu.github.io/DOTA/dataset.html) (train/val/test) - -The data annotations could be download from [iSAID](https://captain-whu.github.io/iSAID/dataset.html) (train/val) - -The dataset is a Large-scale Dataset for Instance Segmentation (also have segmantic segmentation) in Aerial Images. - -You may need to follow the following structure for dataset preparation after downloading iSAID dataset. - -``` -│ ├── iSAID -│ │ ├── train -│ │ │ ├── images -│ │ │ │ ├── part1.zip -│ │ │ │ ├── part2.zip -│ │ │ │ ├── part3.zip -│ │ │ ├── Semantic_masks -│ │ │ │ ├── images.zip -│ │ ├── val -│ │ │ ├── images -│ │ │ │ ├── part1.zip -│ │ │ ├── Semantic_masks -│ │ │ │ ├── images.zip -│ │ ├── test -│ │ │ ├── images -│ │ │ │ ├── part1.zip -│ │ │ │ ├── part2.zip -``` - -```shell -python tools/convert_datasets/isaid.py /path/to/iSAID -``` - -In our default setting (`patch_width`=896, `patch_height`=896, `overlap_area`=384), it will generate 33978 images for training and 11644 images for validation. 
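The patch counts follow from simple sliding-window arithmetic; here is a rough sketch of the per-dimension computation (an illustration of the default setting, not the converter's actual code):

```python
def num_patches(length: int, patch: int = 896, overlap: int = 384) -> int:
    """Number of sliding-window positions along one image dimension."""
    stride = patch - overlap  # 512 with the default setting
    if length <= patch:
        return 1
    # one window at the origin plus ceil((length - patch) / stride) more
    return 1 + -(-(length - patch) // stride)

# e.g. a hypothetical 4096 x 4096 DOTA image yields 8 * 8 = 64 patches
print(num_patches(4096) ** 2)
```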
diff --git a/docs/en/device/npu.md b/docs/en/device/npu.md
new file mode 100644
index 0000000000..a90d6ac433
--- /dev/null
+++ b/docs/en/device/npu.md
@@ -0,0 +1,39 @@
+# NPU (HUAWEI Ascend)
+
+## Usage
+
+Please refer to the [building documentation of MMCV](https://mmcv.readthedocs.io/en/latest/get_started/build.html#build-mmcv-full-on-ascend-npu-machine) to install MMCV on NPU devices.
+
+Here we use 4 NPUs to train the model with the following command:
+
+```shell
+bash tools/dist_train.sh configs/deeplabv3/deeplabv3_r50-d8_4xb2-40k_cityscapes-512x1024.py 4
+```
+
+You can also use a single NPU to train the model with the following command:
+
+```shell
+python tools/train.py configs/deeplabv3/deeplabv3_r50-d8_4xb2-40k_cityscapes-512x1024.py
+```
+
+## Model Results
+
| Model | mIoU | Config | Download |
| :-------------: | :---: | :--- | :--- |
| [deeplabv3](<>) | 78.85 | [config](https://github.com/open-mmlab/mmsegmentation/tree/1.x/configs/deeplabv3/deeplabv3_r50-d8_4xb2-40k_cityscapes-512x1024.py) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/device/npu/deeplabv3_r50-d8_4xb2-40k_cityscapes-512x1024_20230115_205626.json) |
| [deeplabv3plus](<>) | 79.23 | [config](https://github.com/open-mmlab/mmsegmentation/tree/1.x/configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb2-40k_cityscapes-512x1024.py) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/device/npu/deeplabv3plus_r50-d8_4xb2-40k_cityscapes-512x1024_20230116_043450.json) |
| [hrnet](<>) | 78.1 | [config](https://github.com/open-mmlab/mmsegmentation/tree/1.x/configs/hrnet/fcn_hr18_4xb2-40k_cityscapes-512x1024.py) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/device/npu/fcn_hr18_4xb2-40k_cityscapes-512x1024_20230116_215821.json) |
| [fcn](<>) | 74.15 | [config](https://github.com/open-mmlab/mmsegmentation/tree/1.x/configs/fcn/fcn_r50-d8_4xb2-40k_cityscapes-512x1024.py) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/device/npu/fcn_r50-d8_4xb2-40k_cityscapes-512x1024_20230111_083014.json) |
| [icnet](<>) | 69.25 | [config](https://github.com/open-mmlab/mmsegmentation/tree/1.x/configs/icnet/icnet_r50-d8_4xb2-80k_cityscapes-832x832.py) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/device/npu/icnet_r50-d8_4xb2-80k_cityscapes-832x832_20230119_002929.json) |
| [pspnet](<>) | 77.21 | [config](https://github.com/open-mmlab/mmsegmentation/tree/1.x/configs/pspnet/pspnet_r50b-d8_4xb2-80k_cityscapes-512x1024.py) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/device/npu/pspnet_r50b-d8_4xb2-80k_cityscapes-512x1024_20230114_042721.json) |
| [unet](<>) | 68.86 | [config](https://github.com/open-mmlab/mmsegmentation/tree/1.x/configs/unet/unet-s5-d16_fcn_4xb4-160k_cityscapes-512x1024.py) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/device/npu/unet-s5-d16_fcn_4xb4-160k_cityscapes-512x1024_20230129_224750.json) |
| [upernet](<>) | 77.81 | [config](https://github.com/open-mmlab/mmsegmentation/tree/1.x/configs/upernet/upernet_r50_4xb2-40k_cityscapes-512x1024.py) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/device/npu/upernet_r50_4xb2-40k_cityscapes-512x1024_20230129_014634.json) |
| [apcnet](<>) | 78.02 |
[config](https://github.com/open-mmlab/mmsegmentation/tree/1.x/configs/apcnet/apcnet_r50-d8_4xb2-40k_cityscapes-512x1024.py) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/device/npu/apcnet_r50-d8_4xb2-40k_cityscapes-512x1024_20230209_212545.json) |
| [bisenetv1](<>) | 76.04 | [config](https://github.com/open-mmlab/mmsegmentation/tree/1.x/configs/bisenetv1/bisenetv1_r50-d32_4xb4-160k_cityscapes-1024x1024.py) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/device/npu/bisenetv1_r50-d32_4xb4-160k_cityscapes-1024x1024_20230201_023946.json) |
| [bisenetv2](<>) | 72.44 | [config](https://github.com/open-mmlab/mmsegmentation/tree/1.x/configs/bisenetv2/bisenetv2_fcn_4xb4-amp-160k_cityscapes-1024x1024.py) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/device/npu/bisenetv2_fcn_4xb4-amp-160k_cityscapes-1024x1024_20230205_215606.json) |
+
+**Notes:**
+
+- Unless otherwise marked, the results on NPU with AMP are basically the same as those on the GPU with FP32.
+
+**All of the above models are provided by the Huawei Ascend group.**
diff --git a/docs/en/faq.md b/docs/en/faq.md
deleted file mode 100644
index 3de7addfa2..0000000000
--- a/docs/en/faq.md
+++ /dev/null
@@ -1,8 +0,0 @@
-# Frequently Asked Questions (FAQ)
-
-We list some common troubles faced by many users and their corresponding solutions here. Feel free to enrich the list if you find any frequent issues and have ways to help others to solve them. If the contents here do not cover your issue, please create an issue using the [provided templates](https://github.com/open-mmlab/mmsegmentation/blob/master/.github/ISSUE_TEMPLATE/error-report.md/) and make sure you fill in all required information in the template.
-
-## How to know the number of GPUs needed to train the model
-
-- Infer from the name of the config file of the model. You can refer to the `Config Name Style` part of [Learn about Configs](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/en/tutorials/config.md). For example, for config file with name `segformer_mit-b0_8x1_1024x1024_160k_cityscapes.py`, `8x1` means training the model corresponding to it needs 8 GPUs, and the batch size of each GPU is 1.
-- Infer from the log file. Open the log file of the model and search `nGPU` in the file. The number of figures following `nGPU` is the number of GPUs needed to train the model. For instance, searching for `nGPU` in the log file yields the record `nGPU 0,1,2,3,4,5,6,7`, which indicates that eight GPUs are needed to train the model.
diff --git a/docs/en/get_started.md b/docs/en/get_started.md
index 2762ea5198..3f957eb4e1 100644
--- a/docs/en/get_started.md
+++ b/docs/en/get_started.md
@@ -1,264 +1,211 @@
+# Get started: Install and Run MMSeg
+
## Prerequisites
-- Linux or macOS (Windows is in experimental support)
-- Python 3.6+
-- PyTorch 1.3+
-- CUDA 9.2+ (If you build PyTorch from source, CUDA 9.0 is also compatible)
-- GCC 5+
-- [MMCV](https://mmcv.readthedocs.io/en/latest/#installation)
-
-The compatible MMSegmentation and MMCV versions are as below. Please install the correct version of MMCV to avoid installation issues.
- -| MMSegmentation version | MMCV version | MMClassification version | -| :--------------------: | :-------------------------: | :----------------------: | -| master | mmcv-full>=1.4.4, \<=1.6.0 | mmcls>=0.20.1, \<=1.0.0 | -| 0.24.1 | mmcv-full>=1.4.4, \<=1.6.0 | mmcls>=0.20.1, \<=1.0.0 | -| 0.23.0 | mmcv-full>=1.4.4, \<=1.6.0 | mmcls>=0.20.1, \<=1.0.0 | -| 0.22.0 | mmcv-full>=1.4.4, \<=1.6.0 | mmcls>=0.20.1, \<=1.0.0 | -| 0.21.1 | mmcv-full>=1.4.4, \<=1.6.0 | Not required | -| 0.20.2 | mmcv-full>=1.3.13, \<=1.6.0 | Not required | -| 0.19.0 | mmcv-full>=1.3.13, \<1.3.17 | Not required | -| 0.18.0 | mmcv-full>=1.3.13, \<1.3.17 | Not required | -| 0.17.0 | mmcv-full>=1.3.7, \<1.3.17 | Not required | -| 0.16.0 | mmcv-full>=1.3.7, \<1.3.17 | Not required | -| 0.15.0 | mmcv-full>=1.3.7, \<1.3.17 | Not required | -| 0.14.1 | mmcv-full>=1.3.7, \<1.3.17 | Not required | -| 0.14.0 | mmcv-full>=1.3.1, \<1.3.2 | Not required | -| 0.13.0 | mmcv-full>=1.3.1, \<1.3.2 | Not required | -| 0.12.0 | mmcv-full>=1.1.4, \<1.3.2 | Not required | -| 0.11.0 | mmcv-full>=1.1.4, \<1.3.0 | Not required | -| 0.10.0 | mmcv-full>=1.1.4, \<1.3.0 | Not required | -| 0.9.0 | mmcv-full>=1.1.4, \<1.3.0 | Not required | -| 0.8.0 | mmcv-full>=1.1.4, \<1.2.0 | Not required | -| 0.7.0 | mmcv-full>=1.1.2, \<1.2.0 | Not required | -| 0.6.0 | mmcv-full>=1.1.2, \<1.2.0 | Not required | - -:::{note} -You need to run `pip uninstall mmcv` first if you have mmcv installed. -If mmcv and mmcv-full are both installed, there will be `ModuleNotFoundError`. -::: +In this section we demonstrate how to prepare an environment with PyTorch. -## Installation +MMSegmentation works on Linux, Windows and macOS. It requires Python 3.7+, CUDA 10.2+ and PyTorch 1.8+. -a. Create a conda virtual environment and activate it. +**Note:** +If you are experienced with PyTorch and have already installed it, just skip this part and jump to the [next section](##installation). Otherwise, you can follow these steps for the preparation. -```shell -conda create -n open-mmlab python=3.10 -y -conda activate open-mmlab -``` +**Step 0.** Download and install Miniconda from the [official website](https://docs.conda.io/en/latest/miniconda.html). -b. Install PyTorch and torchvision following the [official instructions](https://pytorch.org/). -Here we use PyTorch 1.11.0 and CUDA 11.3. -You may also switch to other version by specifying the version number. +**Step 1.** Create a conda environment and activate it. ```shell -conda install pytorch=1.11.0 torchvision cudatoolkit=11.3 -c pytorch +conda create --name openmmlab python=3.8 -y +conda activate openmmlab ``` -c. Install [MMCV](https://mmcv.readthedocs.io/en/latest/) following the [official instructions](https://mmcv.readthedocs.io/en/latest/#installation). -Either `mmcv` or `mmcv-full` is compatible with MMSegmentation, but for methods like CCNet and PSANet, CUDA ops in `mmcv-full` is required. - -**Install mmcv for Linux:** - -Install MMCV, we recommend you to install the pre-built mmcv as below. - -```shell -pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html -``` +**Step 2.** Install PyTorch following [official instructions](https://pytorch.org/get-started/locally/), e.g. -Please replace `{cu_version}` and `{torch_version}` in the url to your desired one. mmcv-full is only compiled on -PyTorch 1.x.0 because the compatibility usually holds between 1.x.0 and 1.x.1. 
If your PyTorch version is 1.x.1, -you can install mmcv-full compiled with PyTorch 1.x.0 and it usually works well. -For example, to install the `mmcv-full` with `CUDA 11.3` and `PyTorch 1.11.0`, use the following command: +On GPU platforms: ```shell -pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.11/index.html +conda install pytorch torchvision -c pytorch ``` -See [here](https://github.com/open-mmlab/mmcv#installation) for different versions of MMCV compatible to different PyTorch and CUDA versions. - -Optionally you can choose to compile mmcv from source by the following command +On CPU platforms: ```shell -git clone https://github.com/open-mmlab/mmcv.git -cd mmcv -MMCV_WITH_OPS=1 pip install -e . # package mmcv-full, which contains cuda ops, will be installed after this step -# OR pip install -e . # package mmcv, which contains no cuda ops, will be installed after this step -cd .. +conda install pytorch torchvision cpuonly -c pytorch ``` -**Important:** You need to run `pip uninstall mmcv` first if you have mmcv installed. Because if `mmcv` and `mmcv-full` are both installed, there will be `ModuleNotFoundError`. +## Installation -**Install mmcv for Windows (Experimental):** +We recommend that users follow our best practices to install MMSegmentation. However, the whole process is highly customizable. See [Customize Installation](#customize-installation) section for more information. -For Windows, the installation of MMCV requires native C++ compilers, such as cl.exe. Please add the compiler to %PATH%. +### Best Practices -A typical path for cl.exe looks like the following if you have Windows SDK and Visual Studio installed on your computer: +**Step 0.** Install [MMCV](https://github.com/open-mmlab/mmcv) using [MIM](https://github.com/open-mmlab/mim). ```shell -C:\Program Files (x86)\Microsoft Visual Studio\2019\Professional\VC\Tools\MSVC\14.26.28801\bin\Hostx86\x64 +pip install -U openmim +mim install mmengine +mim install "mmcv>=2.0.0" ``` -Or you should download the cl compiler from web and then set up the path. +**Step 1.** Install MMSegmentation. -Then, clone mmcv from github and install mmcv via pip: +Case a: If you develop and run mmseg directly, install it from source: ```shell -git clone https://github.com/open-mmlab/mmcv.git -cd mmcv -pip install -e . +git clone -b main https://github.com/open-mmlab/mmsegmentation.git +cd mmsegmentation +pip install -v -e . +# '-v' means verbose, or more output +# '-e' means installing a project in editable mode, +# thus any local modifications made to the code will take effect without reinstallation. ``` -Or simply: +Case b: If you use mmsegmentation as a dependency or third-party package, install it with pip: ```shell -pip install mmcv +pip install "mmsegmentation>=1.0.0" ``` -Currently, mmcv-full is not supported on Windows. +### Verify the installation -d. Install MMSegmentation. +To verify whether MMSegmentation is installed correctly, we provide some sample codes to run an inference demo. + +**Step 1.** We need to download config and checkpoint files. ```shell -pip install mmsegmentation # install the latest release +mim download mmsegmentation --config pspnet_r50-d8_4xb2-40k_cityscapes-512x1024 --dest . ``` -or +The downloading will take several seconds or more, depending on your network environment. When it is done, you will find two files `pspnet_r50-d8_4xb2-40k_cityscapes-512x1024.py` and `pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth` in your current folder. 
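Before running the demo, you can also confirm that the downloaded config parses correctly (a small sketch; the printed value is whatever segmentor type the config defines, e.g. `EncoderDecoder` for PSPNet):

```python
from mmengine import Config

# load the config fetched by `mim download` and peek at the model section
cfg = Config.fromfile('pspnet_r50-d8_4xb2-40k_cityscapes-512x1024.py')
print(cfg.model.type)
```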
-
-```shell
-pip install git+https://github.com/open-mmlab/mmsegmentation.git # install the master branch
-```
+**Step 2.** Verify the inference demo.
-Instead, if you would like to install MMSegmentation in `dev` mode, run following
+Option (a). If you installed mmsegmentation from source, just run the following command.
```shell
-git clone https://github.com/open-mmlab/mmsegmentation.git
-cd mmsegmentation
-pip install -e . # or "python setup.py develop"
+python demo/image_demo.py demo/demo.png configs/pspnet/pspnet_r50-d8_4xb2-40k_cityscapes-512x1024.py pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth --device cuda:0 --out-file result.jpg
```
-:::{note}
+You will see a new image `result.jpg` in your current folder, with segmentation masks overlaid on all the objects.
-
-1. When training or testing models on Windows, please ensure that all the '\\' in paths are replaced with '/'. Add .replace('\\', '/') to your python code wherever path strings occur.
-2. The `version+git_hash` will also be saved in trained models meta, e.g. 0.5.0+c415a2e.
-3. When MMsegmentation is installed on `dev` mode, any local modifications made to the code will take effect without the need to reinstall it.
-4. If you would like to use `opencv-python-headless` instead of `opencv-python`,
-   you can install it before installing MMCV.
-5. Some dependencies are optional. Simply running `pip install -e .` will only install the minimum runtime requirements.
-   To use optional dependencies like `cityscapessripts` either install them manually with `pip install -r requirements/optional.txt` or specify desired extras when calling `pip` (e.g. `pip install -e .[optional]`). Valid keys for the extras field are: `all`, `tests`, `build`, and `optional`.
-   :::
+Option (b). If you installed mmsegmentation with pip, open your Python interpreter and copy & paste the following code.
-### A from-scratch setup script
+```python
+from mmseg.apis import inference_model, init_model, show_result_pyplot
+import mmcv
-#### Linux
+config_file = 'pspnet_r50-d8_4xb2-40k_cityscapes-512x1024.py'
+checkpoint_file = 'pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth'
-Here is a full script for setting up mmsegmentation with conda and link the dataset path (supposing that your dataset path is $DATA_ROOT).
+# build the model from a config file and a checkpoint file
+model = init_model(config_file, checkpoint_file, device='cuda:0')
-```shell
-conda create -n open-mmlab python=3.10 -y
-conda activate open-mmlab
+# test a single image and show the results
+img = 'demo/demo.png' # or img = mmcv.imread(img), which will only load it once
+result = inference_model(model, img)
+# visualize the results in a new window
+show_result_pyplot(model, img, result, show=True)
+# or save the visualization results to image files
+# you can change the opacity of the painted segmentation map in (0, 1].
+show_result_pyplot(model, img, result, show=True, out_file='result.jpg', opacity=0.5)
+# test a video and show the results
+video = mmcv.VideoReader('video.mp4')
+for frame in video:
+    result = inference_model(model, frame)
+    show_result_pyplot(model, frame, result, wait_time=1)
+```
-conda install pytorch=1.11.0 torchvision cudatoolkit=11.3 -c pytorch
-pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.11.0/index.html
-git clone https://github.com/open-mmlab/mmsegmentation.git
-cd mmsegmentation
-pip install -e .
# or "python setup.py develop" +You can modify the code above to test a single image or a video, both of these options can verify that the installation was successful. -mkdir data -ln -s $DATA_ROOT data -``` +### Customize Installation -#### Windows(Experimental) +#### CUDA versions -Here is a full script for setting up mmsegmentation with conda and link the dataset path (supposing that your dataset path is -%DATA_ROOT%. Notice: It must be an absolute path). +When installing PyTorch, you need to specify the version of CUDA. If you are not clear on which to choose, follow our recommendations: -```shell -conda create -n open-mmlab python=3.10 -y -conda activate open-mmlab +- For Ampere-based NVIDIA GPUs, such as GeForce 30 series and NVIDIA A100, CUDA 11 is a must. +- For older NVIDIA GPUs, CUDA 11 is backward compatible, but CUDA 10.2 offers better compatibility and is more lightweight. -conda install pytorch=1.11.0 torchvision cudatoolkit=11.3 -c pytorch -set PATH=full\path\to\your\cpp\compiler;%PATH% -pip install mmcv +Please make sure the GPU driver satisfies the minimum version requirements. See [this table](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions__table-cuda-toolkit-driver-versions) for more information. -git clone https://github.com/open-mmlab/mmsegmentation.git -cd mmsegmentation -pip install -e . # or "python setup.py develop" +**Note:** +Installing CUDA runtime libraries is enough if you follow our best practices, because no CUDA code will be compiled locally. However if you hope to compile MMCV from source or develop other CUDA operators, you need to install the complete CUDA toolkit from NVIDIA's [website](https://developer.nvidia.com/cuda-downloads), and its version should match the CUDA version of PyTorch. i.e., the specified version of cudatoolkit in `conda install` command. -mklink /D data %DATA_ROOT% -``` +#### Install MMCV without MIM -#### Developing with multiple MMSegmentation versions +MMCV contains C++ and CUDA extensions, thus depending on PyTorch in a complex way. MIM solves such dependencies automatically and makes the installation easier. However, it is not a must. -The train and test scripts already modify the `PYTHONPATH` to ensure the script use the MMSegmentation in the current directory. +To install MMCV with pip instead of MIM, please follow [MMCV installation guides](https://mmcv.readthedocs.io/en/latest/get_started/installation.html). This requires manually specifying a find-url based on PyTorch version and its CUDA version. -To use the default MMSegmentation installed in the environment rather than that you are working with, you can remove the following line in those scripts +For example, the following command install mmcv==2.0.0 built for PyTorch 1.10.x and CUDA 11.3. ```shell -PYTHONPATH="$(dirname $0)/..":$PYTHONPATH +pip install mmcv==2.0.0 -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10/index.html ``` -## Verification +#### Install on CPU-only platforms -To verify whether MMSegmentation and the required environment are installed correctly, we can run sample python codes to initialize a segmentor and inference a demo image: +MMSegmentation can be built for CPU only environment. In CPU mode you can train (requires MMCV version >= 2.0.0), test or inference a model. 
-
-```python
-from mmseg.apis import inference_model, init_model
-import mmcv
+#### Install on Google Colab
-config_file = 'configs/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes.py'
-checkpoint_file = 'checkpoints/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth'
+
+[Google Colab](https://research.google.com/) usually has PyTorch installed,
+thus we only need to install MMCV and MMSegmentation with the following commands.
-
-# build the model from a config file and a checkpoint file
-model = init_model(config_file, checkpoint_file, device='cuda:0')
+
+**Step 1.** Install [MMCV](https://github.com/open-mmlab/mmcv) using [MIM](https://github.com/open-mmlab/mim).
-
-# test a single image and show the results
-img = 'test.jpg' # or img = mmcv.imread(img), which will only load it once
-result = inference_model(model, img)
-# visualize the results in a new window
-model.show_result(img, result, show=True)
-# or save the visualization results to image files
-# you can change the opacity of the painted segmentation map in (0, 1].
-model.show_result(img, result, out_file='result.jpg', opacity=0.5)
+
+```shell
+!pip3 install openmim
+!mim install mmengine
+!mim install "mmcv>=2.0.0"
+```
-
-# test a video and show the results
-video = mmcv.VideoReader('video.mp4')
-for frame in video:
-    result = inference_model(model, frame)
-    model.show_result(frame, result, wait_time=1)
+
+**Step 2.** Install MMSegmentation from source.
-```
+
+```shell
+!git clone https://github.com/open-mmlab/mmsegmentation.git
+%cd mmsegmentation
+!git checkout main
+!pip install -e .
+```
-
-The above code is supposed to run successfully upon you finish the installation.
+
+**Step 3.** Verification.
-
-We also provide a demo script to test a single image.
+
+```python
+import mmseg
+print(mmseg.__version__)
+# Example output: 1.0.0
+```
-
-```shell
-python demo/image_demo.py ${IMAGE_FILE} ${CONFIG_FILE} ${CHECKPOINT_FILE} [--device ${DEVICE_NAME}] [--palette-thr ${PALETTE}]
-```
+
+**Note:**
+Within Jupyter, the exclamation mark `!` is used to call external executables and `%cd` is a [magic command](https://ipython.readthedocs.io/en/stable/interactive/magics.html#magic-cd) to change the current working directory of Python.
-
-Examples:
+
+### Using MMSegmentation with Docker
-
-```shell
-python demo/image_demo.py demo/demo.png configs/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes.py \
-    checkpoints/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth --device cuda:0 --palette cityscapes
+We provide a [Dockerfile](https://github.com/open-mmlab/mmsegmentation/blob/main/docker/Dockerfile) to build an image. Ensure that your [docker version](https://docs.docker.com/engine/install/) is >=19.03.
-```
+
+```shell
+# build an image with PyTorch 1.11, CUDA 11.3
+# If you prefer other versions, just modify the Dockerfile
+docker build -t mmsegmentation docker/
+```
-
-A notebook demo can be found in [demo/inference_demo.ipynb](../demo/inference_demo.ipynb).
+
+Run it with
-
-Now we also provide a demo script to test a single video.
+
+```shell
+docker run --gpus all --shm-size=8g -it -v {DATA_DIR}:/mmsegmentation/data mmsegmentation
+```
+
+### Optional Dependencies
+
+#### Install GDAL
+
+[GDAL](https://gdal.org/) is a translator library for raster and vector geospatial data formats. Install GDAL to read complex formats and extremely large remote sensing images.
```shell -wget -O demo/demo.mp4 https://user-images.githubusercontent.com/22089207/144212749-44411ef4-b564-4b37-96d4-04bedec629ab.mp4 -python demo/video_demo.py ${VIDEO_FILE} ${CONFIG_FILE} ${CHECKPOINT_FILE} [--device ${DEVICE_NAME}] [--palette-thr ${PALETTE}] \ - [--show] [--show-wait-time {SHOW_WAIT_TIME}] [--output-file {OUTPUT_FILE}] [--output-fps {OUTPUT_FPS}] \ - [--output-height {OUTPUT_HEIGHT}] [--output-width {OUTPUT_WIDTH}] [--opacity {OPACITY}] +conda install GDAL ``` -Examples: +## Trouble shooting -```shell -wget -O demo/demo.mp4 https://user-images.githubusercontent.com/22089207/144212749-44411ef4-b564-4b37-96d4-04bedec629ab.mp4 -python demo/video_demo.py demo/demo.mp4 configs/cgnet/cgnet_680x680_60k_cityscapes.py \ - checkpoints/cgnet_680x680_60k_cityscapes_20201101_110253-4c0b2f2d.pth \ - --device cuda:0 --palette cityscapes --show -``` +If you have some issues during the installation, please first view the [FAQ](notes/faq.md) page. +You may [open an issue](https://github.com/open-mmlab/mmsegmentation/issues/new/choose) on GitHub if no solution is found. diff --git a/docs/en/index.rst b/docs/en/index.rst index ae009bf441..cdf8622f94 100644 --- a/docs/en/index.rst +++ b/docs/en/index.rst @@ -1,60 +1,60 @@ Welcome to MMSegmentation's documentation! -======================================= +=========================================== .. toctree:: - :maxdepth: 2 + :maxdepth: 1 :caption: Get Started + overview.md get_started.md .. toctree:: - :maxdepth: 1 - :caption: Dataset Preparation + :maxdepth: 2 + :caption: User Guides - dataset_prepare.md + user_guides/index.rst .. toctree:: - :maxdepth: 1 - :caption: Model Zoo + :maxdepth: 2 + :caption: Advanced Guides - model_zoo.md - modelzoo_statistics.md + advanced_guides/index.rst .. toctree:: - :maxdepth: 2 - :caption: Quick Run + :maxdepth: 1 + :caption: Migration - train.md - inference.md + migration/index.rst .. toctree:: - :maxdepth: 2 - :caption: Tutorials + :caption: API Reference - tutorials/index.rst + api.rst .. toctree:: - :maxdepth: 2 - :caption: Useful Tools and Scripts + :maxdepth: 1 + :caption: Model Zoo - useful_tools.md + model_zoo.md + modelzoo_statistics.md .. toctree:: - :maxdepth: 2 + :maxdepth: 1 :caption: Notes - changelog.md - faq.md + notes/changelog.md + notes/faq.md .. toctree:: - :caption: Switch Language + :caption: Device Support - switch_language.md + device/npu.md .. toctree:: - :caption: API Reference + :caption: Switch Language + + switch_language.md - api.rst Indices and tables ================== diff --git a/docs/en/inference.md b/docs/en/inference.md deleted file mode 100644 index 6175e6efcf..0000000000 --- a/docs/en/inference.md +++ /dev/null @@ -1,131 +0,0 @@ -## Inference with pretrained models - -We provide testing scripts to evaluate a whole dataset (Cityscapes, PASCAL VOC, ADE20k, etc.), -and also some high-level apis for easier integration to other projects. - -### Test a dataset - -- single GPU -- CPU -- single node multiple GPU -- multiple node - -You can use the following commands to test a dataset. 
- -```shell -# single-gpu testing -python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] [--show] - -# CPU: If GPU unavailable, directly running single-gpu testing command above -# CPU: If GPU available, disable GPUs and run single-gpu testing script -export CUDA_VISIBLE_DEVICES=-1 -python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] [--show] - -# multi-gpu testing -./tools/dist_test.sh ${CONFIG_FILE} ${CHECKPOINT_FILE} ${GPU_NUM} [--out ${RESULT_FILE}] [--eval ${EVAL_METRICS}] -``` - -Optional arguments: - -- `RESULT_FILE`: Filename of the output results in pickle format. If not specified, the results will not be saved to a file. (After mmseg v0.17, the output results become pre-evaluation results or format result paths) -- `EVAL_METRICS`: Items to be evaluated on the results. Allowed values depend on the dataset, e.g., `mIoU` is available for all dataset. Cityscapes could be evaluated by `cityscapes` as well as standard `mIoU` metrics. -- `--show`: If specified, segmentation results will be plotted on the images and shown in a new window. It is only applicable to single GPU testing and used for debugging and visualization. Please make sure that GUI is available in your environment, otherwise you may encounter the error like `cannot connect to X server`. -- `--show-dir`: If specified, segmentation results will be plotted on the images and saved to the specified directory. It is only applicable to single GPU testing and used for debugging and visualization. You do NOT need a GUI available in your environment for using this option. -- `--eval-options`: Optional parameters for `dataset.format_results` and `dataset.evaluate` during evaluation. When `efficient_test=True`, it will save intermediate results to local files to save CPU memory. Make sure that you have enough local storage space (more than 20GB). (`efficient_test` argument does not have effect after mmseg v0.17, we use a progressive mode to evaluation and format results which can largely save memory cost and evaluation time.) - -Examples: - -Assume that you have already downloaded the checkpoints to the directory `checkpoints/`. - -1. Test PSPNet and visualize the results. Press any key for the next image. - - ```shell - python tools/test.py configs/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes.py \ - checkpoints/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth \ - --show - ``` - -2. Test PSPNet and save the painted images for latter visualization. - - ```shell - python tools/test.py configs/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes.py \ - checkpoints/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth \ - --show-dir psp_r50_512x1024_40ki_cityscapes_results - ``` - -3. Test PSPNet on PASCAL VOC (without saving the test results) and evaluate the mIoU. - - ```shell - python tools/test.py configs/pspnet/pspnet_r50-d8_512x1024_20k_voc12aug.py \ - checkpoints/pspnet_r50-d8_512x1024_20k_voc12aug_20200605_003338-c57ef100.pth \ - --eval mAP - ``` - -4. Test PSPNet with 4 GPUs, and evaluate the standard mIoU and cityscapes metric. - - ```shell - ./tools/dist_test.sh configs/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes.py \ - checkpoints/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth \ - 4 --out results.pkl --eval mIoU cityscapes - ``` - -:::{note} -There is some gap (~0.1%) between cityscapes mIoU and our mIoU. The reason is that cityscapes average each class with class size by default. 
-We use the simple version without average for all datasets.
-:::
-
-5. Test PSPNet on cityscapes test split with 4 GPUs, and generate the png files to be submit to the official evaluation server.
-
-   First, add following to config file `configs/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes.py`,
-
-   ```python
-   data = dict(
-       test=dict(
-           img_dir='leftImg8bit/test',
-           ann_dir='gtFine/test'))
-   ```
-
-   Then run test.
-
-   ```shell
-   ./tools/dist_test.sh configs/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes.py \
-       checkpoints/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth \
-       4 --format-only --eval-options "imgfile_prefix=./pspnet_test_results"
-   ```
-
-   You will get png files under `./pspnet_test_results` directory.
-   You may run `zip -r results.zip pspnet_test_results/` and submit the zip file to [evaluation server](https://www.cityscapes-dataset.com/submit/).
-
-6. CPU memory efficient test DeeplabV3+ on Cityscapes (without saving the test results) and evaluate the mIoU.
-
-   ```shell
-   python tools/test.py \
-       configs/deeplabv3plus/deeplabv3plus_r18-d8_512x1024_80k_cityscapes.py \
-       deeplabv3plus_r18-d8_512x1024_80k_cityscapes_20201226_080942-cff257fe.pth \
-       --eval-options efficient_test=True \
-       --eval mIoU
-   ```
-
-   Using `pmap` to view CPU memory footprint, it used 2.25GB CPU memory with `efficient_test=True` and 11.06GB CPU memory with `efficient_test=False` . This optional parameter can save a lot of memory. (After mmseg v0.17, efficient_test has not effect and we use a progressive mode to evaluation and format results efficiently by default.)
-
-7. Test PSPNet on LoveDA test split with 1 GPU, and generate the png files to be submit to the official evaluation server.
-
-   First, add following to config file `configs/pspnet/pspnet_r50-d8_512x512_80k_loveda.py`,
-
-   ```python
-   data = dict(
-       test=dict(
-           img_dir='img_dir/test',
-           ann_dir='ann_dir/test'))
-   ```
-
-   Then run test.
-
-   ```shell
-   python ./tools/test.py configs/pspnet/pspnet_r50-d8_512x512_80k_loveda.py \
-       checkpoints/pspnet_r50-d8_512x512_80k_loveda_20211104_155728-88610f9f.pth \
-       --format-only --eval-options "imgfile_prefix=./pspnet_test_results"
-   ```
-
-   You will get png files under `./pspnet_test_results` directory.
-   You may run `zip -r -j Results.zip pspnet_test_results/` and submit the zip file to [evaluation server](https://codalab.lisn.upsaclay.fr/competitions/421).
diff --git a/docs/en/migration/index.rst b/docs/en/migration/index.rst
new file mode 100644
index 0000000000..2843bdbcfb
--- /dev/null
+++ b/docs/en/migration/index.rst
@@ -0,0 +1,8 @@
+Migration
+***************
+
+.. toctree::
+   :maxdepth: 1
+
+   interface.md
+   package.md
diff --git a/docs/en/migration/interface.md b/docs/en/migration/interface.md
new file mode 100644
index 0000000000..b83eeb9e4d
--- /dev/null
+++ b/docs/en/migration/interface.md
@@ -0,0 +1,525 @@
+# Migration from MMSegmentation 0.x
+
+## Introduction
+
+This guide describes the fundamental differences between MMSegmentation 0.x and MMSegmentation 1.x in terms of behavior and APIs, and how these differences affect your migration.
+
+## New dependencies
+
+MMSegmentation 1.x depends on some new packages. You can prepare a new, clean environment and install everything again according to the [installation tutorial](../get_started.md).
+
+Or install the packages below manually.
+
+1. [MMEngine](https://github.com/open-mmlab/mmengine): MMEngine is the core of the OpenMMLab 2.0 architecture; many components unrelated to computer vision were split out of MMCV and moved into MMEngine.
+
+2. [MMCV](https://github.com/open-mmlab/mmcv): The computer vision package of OpenMMLab. This is not a new dependency, but you need to upgrade it to version **2.0.0** or above.
+
+3. [MMClassification](https://github.com/open-mmlab/mmclassification) (Optional): The image classification toolbox and benchmark of OpenMMLab. This is not a new dependency, but you need to upgrade it to version **1.0.0rc6**.
+
+4. [MMDetection](https://github.com/open-mmlab/mmdetection) (Optional): The object detection toolbox and benchmark of OpenMMLab. This is not a new dependency, but you need to upgrade it to version **3.0.0** or above.
+
+## Train launch
+
+The main improvement of OpenMMLab 2.0 is the release of MMEngine, which provides a universal and powerful runner with unified interfaces to launch training jobs.
+
+Compared with MMSeg 0.x, MMSeg 1.x provides fewer command-line arguments in `tools/train.py`:
+
FunctionOriginalNew
Loading pre-trained checkpoint--load_from=$CHECKPOINT--cfg-options load_from=$CHECKPOINT
Resuming Train from specific checkpoint--resume-from=$CHECKPOINT--resume=$CHECKPOINT
Resuming Train from the latest checkpoint--auto-resume--resume='auto'
Whether not to evaluate the checkpoint during training--no-validate--cfg-options val_cfg=None val_dataloader=None val_evaluator=None
Training device assignment--gpu-id=$DEVICE_ID-
Whether or not set different seeds for different ranks--diff-seed--cfg-options randomness.diff_rank_seed=True
Whether to set deterministic options for CUDNN backend--deterministic--cfg-options randomness.deterministic=True
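+
+For example, a 0.x launch command and its 1.x equivalent look like this (a minimal sketch; `$CONFIG` and `$CHECKPOINT` are placeholder paths):
+
+```shell
+# MMSeg 0.x
+python tools/train.py $CONFIG --load_from=$CHECKPOINT --deterministic
+
+# MMSeg 1.x: the same options are passed through --cfg-options
+python tools/train.py $CONFIG --cfg-options load_from=$CHECKPOINT randomness.deterministic=True
+```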
+
+## Test launch
+
+Similar to the training launch, `tools/test.py` of MMSegmentation 1.x keeps only the common arguments.
+Below are the differences in the test scripts;
+please refer to [this documentation](../user_guides/4_train_test.md) for more details about the test launch.
+
+| Function | 0.x | 1.x |
+| :--- | :--- | :--- |
+| Evaluation metrics | `--eval mIoU` | `--cfg-options test_evaluator.type=IoUMetric` |
+| Whether to use test-time augmentation | `--aug-test` | `--tta` |
+| Whether to save the output results without performing evaluation | `--format-only` | `--cfg-options test_evaluator.format_only=True` |
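+
+For example, evaluating mIoU in 0.x and in 1.x (a minimal sketch; `$CONFIG` and `$CHECKPOINT` are placeholder paths):
+
+```shell
+# MMSeg 0.x
+python tools/test.py $CONFIG $CHECKPOINT --eval mIoU
+
+# MMSeg 1.x: the metric is configured on the evaluator instead
+python tools/test.py $CONFIG $CHECKPOINT --cfg-options test_evaluator.type=IoUMetric
+```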
+
+## Configuration file
+
+### Model settings
+
+No changes in `model.backbone`, `model.neck`, `model.decode_head` and `model.losses` fields.
+
+Add the `model.data_preprocessor` field to configure the `DataPreProcessor`, including:
+
+- `mean` (Sequence, optional): The pixel mean of R, G, B channels. Defaults to None.
+
+- `std` (Sequence, optional): The pixel standard deviation of R, G, B channels. Defaults to None.
+
+- `size` (Sequence, optional): Fixed padding size.
+
+- `size_divisor` (int, optional): The divisor of the padded size.
+
+- `seg_pad_val` (float, optional): Padding value of the segmentation map. Default: 255.
+
+- `padding_mode` (str): Type of padding. Default: 'constant'.
+
+  - constant: pads with a constant value; the value is specified by `pad_val`.
+
+- `bgr_to_rgb` (bool): Whether to convert the image from BGR to RGB. Defaults to False.
+
+- `rgb_to_bgr` (bool): Whether to convert the image from RGB to BGR. Defaults to False.
+
+**Note:**
+Please refer to the [models documentation](../advanced_guides/models.md) for more details.
+
+### Dataset settings
+
+Changes in **data**:
+
+The original `data` field is split into `train_dataloader`, `val_dataloader` and `test_dataloader`. This allows us to configure them in a fine-grained way. For example, you can specify different samplers and batch sizes for training and testing.
+The `samples_per_gpu` is renamed to `batch_size`.
+The `workers_per_gpu` is renamed to `num_workers`.
+**Original:**
+
+```python
+data = dict(
+    samples_per_gpu=4,
+    workers_per_gpu=4,
+    train=dict(...),
+    val=dict(...),
+    test=dict(...),
+)
+```
+
+**New:**
+
+```python
+train_dataloader = dict(
+    batch_size=4,
+    num_workers=4,
+    dataset=dict(...),
+    sampler=dict(type='DefaultSampler', shuffle=True)  # necessary
+)
+
+val_dataloader = dict(
+    batch_size=4,
+    num_workers=4,
+    dataset=dict(...),
+    sampler=dict(type='DefaultSampler', shuffle=False)  # necessary
+)
+
+test_dataloader = val_dataloader
+```
+
+Changes in **pipeline**:
+
+- The original formatting transforms **`ToTensor`**, **`ImageToTensor`** and **`Collect`** are combined as [`PackSegInputs`](mmseg.datasets.transforms.PackSegInputs).
+- We don't recommend doing **`Normalize`** and **`Pad`** in the dataset pipeline. Please remove them from the pipelines and set them in the `data_preprocessor` field instead.
+- The original **`Resize`** has been changed to **`RandomResize`** in MMSeg 1.x, its input argument `img_scale` is renamed to `scale`, and the default value of `keep_ratio` is modified to False.
+- The original `test_pipeline` combined single-scale and multi-scale testing together; in MMSeg 1.x we separate it into `test_pipeline` and `tta_pipeline`.
+
+**Note:**
+We move some work of the data transforms to the data preprocessor, such as normalization; see [the documentation](package.md) for more details.
+
+train_pipeline
+**Original:**
+
+```python
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', reduce_zero_label=True),
+    dict(type='Resize', img_scale=(2560, 640), ratio_range=(0.5, 2.0)),
+    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PhotoMetricDistortion'),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_semantic_seg']),
+]
+```
+
+**New:**
+
+```python
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', reduce_zero_label=True),
+    dict(
+        type='RandomResize',
+        scale=(2560, 640),
+        ratio_range=(0.5, 2.0),
+        keep_ratio=True),
+    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PhotoMetricDistortion'),
+    dict(type='PackSegInputs')
+]
```
+
+test_pipeline
+
+**Original:**
+
+```python
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(2560, 640),
+        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+```
+
+**New:**
+
+```python
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='Resize', scale=(2560, 640), keep_ratio=True),
+    dict(type='LoadAnnotations', reduce_zero_label=True),
+    dict(type='PackSegInputs')
+]
+img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
+tta_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=None),
+    dict(
+        type='TestTimeAug',
+        transforms=[
+            [
+                dict(type='Resize', scale_factor=r, keep_ratio=True)
+                for r in img_ratios
+            ],
+            [
+                dict(type='RandomFlip', prob=0., direction='horizontal'),
+                dict(type='RandomFlip', prob=1., direction='horizontal')
+            ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')]
+        ])
+]
+```
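+
+The `tta_pipeline` only takes effect when testing with the `--tta` flag (a sketch; `$CONFIG` and `$CHECKPOINT` are placeholder paths):
+
+```shell
+python tools/test.py $CONFIG $CHECKPOINT --tta
+```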
+
+Changes in **`evaluation`**:
+
+- The **`evaluation`** field is split into `val_evaluator` and `test_evaluator`, and it no longer supports the `interval` and `save_best` arguments.
+  The `interval` is moved to `train_cfg.val_interval`, the `save_best` is moved to `default_hooks.checkpoint.save_best`, and `pre_eval` has been removed.
+- `'mIoU'` has been changed to `'IoUMetric'`.
+**Original:**
+
+```python
+evaluation = dict(interval=2000, metric='mIoU', pre_eval=True)
+```
+
+**New:**
+
+```python
+val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU'])
+test_evaluator = val_evaluator
+```
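+
+For instance, the relocated `interval` and `save_best` settings end up in `train_cfg` and `default_hooks` respectively (a sketch; the `max_iters` value of 160000 and the `'mIoU'` criterion are placeholders):
+
+```python
+# `val_interval` replaces the old `evaluation.interval`.
+train_cfg = dict(type='IterBasedTrainLoop', max_iters=160000, val_interval=2000)
+# `save_best` replaces the old `evaluation.save_best`.
+default_hooks = dict(
+    checkpoint=dict(type='CheckpointHook', by_epoch=False, save_best='mIoU'))
+```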
+
+### Optimizer and Schedule settings
+
+Changes in **`optimizer`** and **`optimizer_config`**:
+
+- Now we use the `optim_wrapper` field to specify all configuration about the optimization process, and `optimizer` is a sub-field of `optim_wrapper`.
+- `paramwise_cfg` is also a sub-field of `optim_wrapper`, instead of `optimizer`.
+- `optimizer_config` is removed, and all of its configurations are moved to `optim_wrapper`.
+- `grad_clip` is renamed to `clip_grad`.
+**Original:**
+
+```python
+optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.0005)
+optimizer_config = dict(grad_clip=dict(max_norm=1, norm_type=2))
+```
+
+**New:**
+
+```python
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.0005),
+    clip_grad=dict(max_norm=1, norm_type=2))
+```
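+
+Accordingly, a `paramwise_cfg` that used to sit beside the optimizer settings now goes inside `optim_wrapper` (a sketch; the `custom_keys` entry and its multiplier value are illustrative only):
+
+```python
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.0005),
+    # paramwise_cfg is a sub-field of optim_wrapper, not of optimizer
+    paramwise_cfg=dict(custom_keys={'head': dict(lr_mult=10.)}))
+```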
+
+Changes in **`lr_config`**:
+
+- The `lr_config` field is removed and replaced by the new `param_scheduler`.
+- The `warmup`-related arguments are removed, since we use a combination of schedulers to implement this functionality.
+
+The new scheduler combination mechanism is very flexible, and you can use it to design many kinds of learning rate / momentum curves. See [the tutorial](TODO) for more details.
+**Original:**
+
+```python
+lr_config = dict(
+    policy='poly',
+    warmup='linear',
+    warmup_iters=1500,
+    warmup_ratio=1e-6,
+    power=1.0,
+    min_lr=0.0,
+    by_epoch=False)
+```
+
+**New:**
+
+```python
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=1e-6, by_epoch=False, begin=0, end=1500),
+    dict(
+        type='PolyLR',
+        power=1.0,
+        begin=1500,
+        end=160000,
+        eta_min=0.0,
+        by_epoch=False,
+    )
+]
+```
+
+Changes in **`runner`**:
+
+Most configuration in the original `runner` field is moved to `train_cfg`, `val_cfg` and `test_cfg`, which configure the loops of training, validation and testing.
+**Original:**
+
+```python
+runner = dict(type='IterBasedRunner', max_iters=20000)
+```
+
+**New:**
+
+```python
+# The `val_interval` is the original `evaluation.interval`.
+train_cfg = dict(type='IterBasedTrainLoop', max_iters=20000, val_interval=2000)
+val_cfg = dict(type='ValLoop')  # Use the default validation loop.
+test_cfg = dict(type='TestLoop')  # Use the default test loop.
+```
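+
+For epoch-based training, MMEngine's corresponding loop type can be used instead (a sketch; the numbers are placeholders):
+
+```python
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=100, val_interval=1)
+```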
+
+In fact, in OpenMMLab 2.0, we introduced `Loop` to control the behaviors in training, validation and testing. The functionalities of `Runner` have also changed. You can find more details in the [runner tutorial](https://github.com/open-mmlab/mmengine/blob/main/docs/en/design/runner.md) of [MMEngine](https://github.com/open-mmlab/mmengine/).
+
+### Runtime settings
+
+Changes in **`checkpoint_config`** and **`log_config`**:
+
+The `checkpoint_config` is moved to `default_hooks.checkpoint` and the `log_config` is moved to `default_hooks.logger`.
+We also move many hook settings from the script code to the `default_hooks` field in the runtime configuration.
+
+```python
+default_hooks = dict(
+    # record the time of every iteration.
+    timer=dict(type='IterTimerHook'),
+
+    # print log every 50 iterations.
+    logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False),
+
+    # enable the parameter scheduler.
+    param_scheduler=dict(type='ParamSchedulerHook'),
+
+    # save checkpoint every 2000 iterations.
+    checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=2000),
+
+    # set sampler seed in distributed environment.
+    sampler_seed=dict(type='DistSamplerSeedHook'),
+
+    # validation results visualization.
+    visualization=dict(type='SegVisualizationHook'))
+```
+
+In addition, we split the original logger into a logger and a visualizer. The logger is used to record information, and the visualizer is used to display the logs in different backends, such as the terminal and TensorBoard.
+**Original:**
+
+```python
+log_config = dict(
+    interval=100,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook'),
+    ])
+```
+
+**New:**
+
+```python
+default_hooks = dict(
+    ...
+    logger=dict(type='LoggerHook', interval=100),
+)
+vis_backends = [dict(type='LocalVisBackend'),
+                dict(type='TensorboardVisBackend')]
+visualizer = dict(
+    type='SegLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+```
+
+Changes in **`load_from`** and **`resume_from`**:
+
+- The `resume_from` is removed, and we use `resume` and `load_from` to replace it.
+  - If `resume=True` and `load_from` is **not None**, resume training from the checkpoint in `load_from`.
+  - If `resume=True` and `load_from` is **None**, try to resume from the latest checkpoint in the work directory.
+  - If `resume=False` and `load_from` is **not None**, only load the checkpoint, without resuming training.
+  - If `resume=False` and `load_from` is **None**, neither load nor resume.
+
+Changes in **`dist_params`**: The `dist_params` field is now a sub-field of `env_cfg`, and there are some new configurations in `env_cfg`.
+
+```python
+env_cfg = dict(
+    # whether to enable cudnn benchmark
+    cudnn_benchmark=False,
+
+    # set multi process parameters
+    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
+
+    # set distributed parameters
+    dist_cfg=dict(backend='nccl'),
+)
+```
+
+Changes in **`workflow`**: The `workflow`-related functionalities are removed.
+
+New field **`visualizer`**: The visualizer is a new design in the OpenMMLab 2.0 architecture. We use a visualizer instance in the runner to handle the visualization of results and logs and save them to different backends. See the [visualization tutorial](../user_guides/visualization.md) for more details.
+
+New field **`default_scope`**: The starting point for searching modules in all registries. The `default_scope` in MMSegmentation is `mmseg`. See [the registry tutorial](https://github.com/open-mmlab/mmengine/blob/main/docs/en/advanced_tutorials/registry.md) for more details.
diff --git a/docs/en/migration/package.md b/docs/en/migration/package.md
new file mode 100644
index 0000000000..728e9a9bb6
--- /dev/null
+++ b/docs/en/migration/package.md
@@ -0,0 +1,113 @@
+# Package structures changes
+
+This section is for those who are curious about what has changed between MMSeg 0.x and 1.x.
+
+| MMSegmentation 0.x | MMSegmentation 1.x |
+| :--- | :--- |
+| mmseg.apis | mmseg.apis |
+| - mmseg.core | + mmseg.engine |
+| mmseg.datasets | mmseg.datasets |
+| mmseg.models | mmseg.models |
+| - mmseg.ops | + mmseg.structure |
+| mmseg.utils | mmseg.utils |
+|  | + mmseg.evaluation |
+|  | + mmseg.registry |
+
+## Removed packages
+
+### `mmseg.core`
+
+In OpenMMLab 2.0, the `core` package has been removed. The `hooks` and `optimizers` of `core` have been moved to `mmseg.engine`, and the `evaluation` of `core` is currently `mmseg.evaluation`.
+
+### `mmseg.ops`
+
+The `ops` package included `encoding` and `wrappers`, which have been moved to `mmseg.models.utils`.
+
+## Added packages
+
+### `mmseg.engine`
+
+OpenMMLab 2.0 adds MMEngine, a new foundational library for training deep learning models. It serves as the training engine of all OpenMMLab codebases.
+The `engine` package of mmseg contains customized modules for the semantic segmentation task, such as `SegVisualizationHook`, which visualizes segmentation masks.
+
+### `mmseg.structure`
+
+In OpenMMLab 2.0, we designed data structures for computer vision tasks, and in mmseg, we implement `SegDataSample` in the `structure` package.
+
+### `mmseg.evaluation`
+
+We moved all evaluation metrics to `mmseg.evaluation`.
+
+### `mmseg.registry`
+
+We moved the registry implementations for all kinds of modules in MMSegmentation to `mmseg.registry`.
+
+## Modified packages
+
+### `mmseg.apis`
+
+OpenMMLab 2.0 supports unified interfaces for multiple computer vision tasks and releases a much stronger [`Runner`](https://github.com/open-mmlab/mmengine/blob/main/docs/en/design/runner.md), so MMSeg 1.x removed the modules in `train.py` and `test.py`, renamed `init_segmentor` to `init_model`, and renamed `inference_segmentor` to `inference_model`.
+
+Here are the changes to `mmseg.apis`:
+
+| Function | Changes |
+| :-------------------: | :---------------------------------------------- |
+| `init_segmentor` | Renamed to `init_model` |
+| `inference_segmentor` | Renamed to `inference_model` |
+| `show_result_pyplot` | Implemented based on `SegLocalVisualizer` |
+| `train_model` | Removed, use `runner.train` to train. |
+| `multi_gpu_test` | Removed, use `runner.test` to test. |
+| `single_gpu_test` | Removed, use `runner.test` to test. |
+| `set_random_seed` | Removed, use `mmengine.runner.set_random_seed`. |
+| `init_random_seed` | Removed, use `mmengine.dist.sync_random_seed`. |
+
+### `mmseg.datasets`
+
+OpenMMLab 2.0 defines `BaseDataset` to provide the functionality and interfaces of datasets, and MMSegmentation 1.x also follows this protocol, defining `BaseSegDataset` inherited from `BaseDataset`. MMCV 2.x collects general data transforms for multiple tasks, e.g. classification, detection and segmentation, so MMSegmentation 1.x uses these data transforms and removes them from mmseg.datasets.
+
+| Packages/Modules | Changes |
+| :-------------------: | :------------------------------------------------------------------------------------------ |
+| `mmseg.pipelines` | Moved to `mmcv.transforms` |
+| `mmseg.sampler` | Moved to `mmengine.dataset.sampler` |
+| `CustomDataset` | Renamed to `BaseSegDataset` and inherited from `BaseDataset` in MMEngine |
+| `DefaultFormatBundle` | Replaced with `PackSegInputs` |
+| `LoadImageFromFile` | Moved to `mmcv.transforms.LoadImageFromFile` |
+| `LoadAnnotations` | Moved to `mmcv.transforms.LoadAnnotations` |
+| `Resize` | Moved to `mmcv.transforms` and split into `Resize`, `RandomResize` and `RandomChoiceResize` |
+| `RandomFlip` | Moved to `mmcv.transforms.RandomFlip` |
+| `Pad` | Moved to `mmcv.transforms.Pad` |
+| `Normalize` | Moved to `mmcv.transforms.Normalize` |
+| `Compose` | Moved to `mmcv.transforms.Compose` |
+| `ImageToTensor` | Moved to `mmcv.transforms.ImageToTensor` |
+
+### `mmseg.models`
+
+`models` has not changed much; it just adds the `encoding` and `wrappers` modules from the former `mmseg.ops`.
diff --git a/docs/en/model_zoo.md b/docs/en/model_zoo.md
index 782a47002f..6717df6cc7 100644
--- a/docs/en/model_zoo.md
+++ b/docs/en/model_zoo.md
@@ -34,123 +34,123 @@

### FCN

-Please refer to [FCN](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn) for details.
+Please refer to [FCN](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn) for details.

### PSPNet

-Please refer to [PSPNet](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet) for details.
+Please refer to [PSPNet](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet) for details.

### DeepLabV3

-Please refer to [DeepLabV3](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3) for details.
+Please refer to [DeepLabV3](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3) for details.

### PSANet

-Please refer to [PSANet](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/psanet) for details.
+Please refer to [PSANet](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/psanet) for details.

### DeepLabV3+

-Please refer to [DeepLabV3+](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus) for details.
+Please refer to [DeepLabV3+](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus) for details.

### UPerNet

-Please refer to [UPerNet](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/upernet) for details.
+Please refer to [UPerNet](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/upernet) for details.

### NonLocal Net

-Please refer to [NonLocal Net](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/nonlocal_net) for details.
+Please refer to [NonLocal Net](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/nonlocal_net) for details.

### EncNet

-Please refer to [EncNet](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/encnet) for details.
+Please refer to [EncNet](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/encnet) for details.

### CCNet

-Please refer to [CCNet](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ccnet) for details.
+Please refer to [CCNet](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ccnet) for details.

### DANet

-Please refer to [DANet](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/danet) for details.
+Please refer to [DANet](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/danet) for details. ### APCNet -Please refer to [APCNet](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/apcnet) for details. +Please refer to [APCNet](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/apcnet) for details. ### HRNet -Please refer to [HRNet](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet) for details. +Please refer to [HRNet](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet) for details. ### GCNet -Please refer to [GCNet](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/gcnet) for details. +Please refer to [GCNet](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/gcnet) for details. ### DMNet -Please refer to [DMNet](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dmnet) for details. +Please refer to [DMNet](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/dmnet) for details. ### ANN -Please refer to [ANN](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ann) for details. +Please refer to [ANN](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ann) for details. ### OCRNet -Please refer to [OCRNet](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ocrnet) for details. +Please refer to [OCRNet](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ocrnet) for details. ### Fast-SCNN -Please refer to [Fast-SCNN](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fastscnn) for details. +Please refer to [Fast-SCNN](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fastscnn) for details. ### ResNeSt -Please refer to [ResNeSt](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/resnest) for details. +Please refer to [ResNeSt](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/resnest) for details. ### Semantic FPN -Please refer to [Semantic FPN](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/sem_fpn) for details. +Please refer to [Semantic FPN](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/sem_fpn) for details. ### PointRend -Please refer to [PointRend](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/point_rend) for details. +Please refer to [PointRend](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/point_rend) for details. ### MobileNetV2 -Please refer to [MobileNetV2](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/mobilenet_v2) for details. +Please refer to [MobileNetV2](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/mobilenet_v2) for details. ### MobileNetV3 -Please refer to [MobileNetV3](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/mobilenet_v3) for details. +Please refer to [MobileNetV3](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/mobilenet_v3) for details. ### EMANet -Please refer to [EMANet](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/emanet) for details. +Please refer to [EMANet](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/emanet) for details. ### DNLNet -Please refer to [DNLNet](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dnlnet) for details. +Please refer to [DNLNet](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/dnlnet) for details. 
### CGNet

-Please refer to [CGNet](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/cgnet) for details.
+Please refer to [CGNet](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/cgnet) for details.

### Mixed Precision (FP16) Training

-Please refer [Mixed Precision (FP16) Training on BiSeNetV2](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv2/bisenetv2_fcn_fp16_4x4_1024x1024_160k_cityscapes.py) for details.
+Please refer to [Mixed Precision (FP16) Training on BiSeNetV2](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/bisenetv2/bisenetv2_fcn_4xb4-160k_cityscapes-1024x1024.py) for details.

### U-Net

-Please refer to [U-Net](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/unet/README.md) for details.
+Please refer to [U-Net](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/unet/README.md) for details.

### ViT

-Please refer to [ViT](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/vit/README.md) for details.
+Please refer to [ViT](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/vit/README.md) for details.

### Swin

-Please refer to [Swin](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/swin/README.md) for details.
+Please refer to [Swin](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/swin/README.md) for details.

### SETR

-Please refer to [SETR](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/setr/README.md) for details.
+Please refer to [SETR](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/setr/README.md) for details.

## Speed benchmark
diff --git a/docs/en/modelzoo_statistics.md b/docs/en/modelzoo_statistics.md
new file mode 100644
index 0000000000..e5e21a1474
--- /dev/null
+++ b/docs/en/modelzoo_statistics.md
@@ -0,0 +1,102 @@
+# Model Zoo Statistics
+
+- Number of papers: 47
+
+  - ALGORITHM: 36
+  - BACKBONE: 11
+
+- Number of checkpoints: 612
+
+  - \[ALGORITHM\] [ANN](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ann) (16 ckpts)
+
+  - \[ALGORITHM\] [APCNet](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/apcnet) (12 ckpts)
+
+  - \[BACKBONE\] [BEiT](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/beit) (2 ckpts)
+
+  - \[ALGORITHM\] [BiSeNetV1](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/bisenetv1) (11 ckpts)
+
+  - \[ALGORITHM\] [BiSeNetV2](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/bisenetv2) (4 ckpts)
+
+  - \[ALGORITHM\] [CCNet](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ccnet) (16 ckpts)
+
+  - \[ALGORITHM\] [CGNet](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/cgnet) (2 ckpts)
+
+  - \[BACKBONE\] [ConvNeXt](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/convnext) (6 ckpts)
+
+  - \[ALGORITHM\] [DANet](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/danet) (16 ckpts)
+
+  - \[ALGORITHM\] [DeepLabV3](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3) (41 ckpts)
+
+  - \[ALGORITHM\] [DeepLabV3+](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/deeplabv3plus) (42 ckpts)
+
+  - \[ALGORITHM\] [DMNet](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/dmnet) (12 ckpts)
+
+  - \[ALGORITHM\] [DNLNet](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/dnlnet) (12 ckpts)
+
+  - \[ALGORITHM\] [DPT](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/dpt) (1 ckpts)
+
+  - \[ALGORITHM\]
[EMANet](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/emanet) (4 ckpts) + + - \[ALGORITHM\] [EncNet](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/encnet) (12 ckpts) + + - \[ALGORITHM\] [ERFNet](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/erfnet) (1 ckpts) + + - \[ALGORITHM\] [FastFCN](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fastfcn) (12 ckpts) + + - \[ALGORITHM\] [Fast-SCNN](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fastscnn) (1 ckpts) + + - \[ALGORITHM\] [FCN](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn) (41 ckpts) + + - \[ALGORITHM\] [GCNet](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/gcnet) (16 ckpts) + + - \[BACKBONE\] [HRNet](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/hrnet) (37 ckpts) + + - \[ALGORITHM\] [ICNet](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/icnet) (12 ckpts) + + - \[ALGORITHM\] [ISANet](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/isanet) (16 ckpts) + + - \[ALGORITHM\] [K-Net](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/knet) (7 ckpts) + + - \[BACKBONE\] [MAE](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/mae) (1 ckpts) + + - \[ALGORITHM\] [Mask2Former](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/mask2former) (13 ckpts) + + - \[ALGORITHM\] [MaskFormer](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/maskformer) (4 ckpts) + + - \[BACKBONE\] [MobileNetV2](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/mobilenet_v2) (8 ckpts) + + - \[BACKBONE\] [MobileNetV3](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/mobilenet_v3) (4 ckpts) + + - \[ALGORITHM\] [NonLocal Net](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/nonlocal_net) (16 ckpts) + + - \[ALGORITHM\] [OCRNet](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/ocrnet) (24 ckpts) + + - \[ALGORITHM\] [PointRend](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/point_rend) (4 ckpts) + + - \[BACKBONE\] [PoolFormer](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/poolformer) (5 ckpts) + + - \[ALGORITHM\] [PSANet](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/psanet) (16 ckpts) + + - \[ALGORITHM\] [PSPNet](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/pspnet) (54 ckpts) + + - \[BACKBONE\] [ResNeSt](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/resnest) (8 ckpts) + + - \[ALGORITHM\] [SegFormer](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/segformer) (13 ckpts) + + - \[ALGORITHM\] [Segmenter](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/segmenter) (5 ckpts) + + - \[ALGORITHM\] [Semantic FPN](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/sem_fpn) (4 ckpts) + + - \[ALGORITHM\] [SETR](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/setr) (7 ckpts) + + - \[ALGORITHM\] [STDC](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/stdc) (4 ckpts) + + - \[BACKBONE\] [Swin Transformer](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/swin) (6 ckpts) + + - \[BACKBONE\] [Twins](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/twins) (12 ckpts) + + - \[ALGORITHM\] [UNet](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/unet) (25 ckpts) + + - \[ALGORITHM\] 
[UPerNet](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/upernet) (16 ckpts)
+
+  - \[BACKBONE\] [Vision Transformer](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/vit) (11 ckpts)
diff --git a/docs/en/notes/changelog.md b/docs/en/notes/changelog.md
new file mode 100644
index 0000000000..8eaf3329b2
--- /dev/null
+++ b/docs/en/notes/changelog.md
@@ -0,0 +1,506 @@
+# Changelog of v1.x
+
+## v1.2.2 (12/14/2023)
+
+### Bug Fixes
+
+- Fix bug in cross entropy loss ([#3457](https://github.com/open-mmlab/mmsegmentation/pull/3457))
+- Allow custom visualizer ([#3455](https://github.com/open-mmlab/mmsegmentation/pull/3455))
+- test resize with pad_shape ([#3421](https://github.com/open-mmlab/mmsegmentation/pull/3421))
+- add with-labels args to inferencer for visualization without labels ([#3466](https://github.com/open-mmlab/mmsegmentation/pull/3466))
+
+### New Contributors
+
+- @okotaku made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/3421
+
+## v1.2.1 (10/17/2023)
+
+### Bug Fixes
+
+- Add bpe_simple_vocab_16e6.txt.gz to release ([#3386](https://github.com/open-mmlab/mmsegmentation/pull/3386))
+- Fix init api ([#3388](https://github.com/open-mmlab/mmsegmentation/pull/3388))
+
+## v1.2.0 (10/12/2023)
+
+### Features
+
+- Support Side Adapter Network ([#3232](https://github.com/open-mmlab/mmsegmentation/pull/3232))
+
+### Bug Fixes
+
+- fix wrong variables passing for `set_dataset_meta` ([#3348](https://github.com/open-mmlab/mmsegmentation/pull/3348))
+
+### Documentation
+
+- add documentation of Finetune ONNX Models (MMSegemetation) Inference for NVIDIA Jetson ([#3372](https://github.com/open-mmlab/mmsegmentation/pull/3372))
+
+## v1.1.2(09/20/2023)
+
+### Features
+
+- Add semantic label to the segmentation visualization results ([#3229](https://github.com/open-mmlab/mmsegmentation/pull/3229))
+- Support NYU depth estimation dataset ([#3269](https://github.com/open-mmlab/mmsegmentation/pull/3269))
+- Support Kullback-Leibler divergence Loss ([#3242](https://github.com/open-mmlab/mmsegmentation/pull/3242))
+- Support depth metrics ([#3297](https://github.com/open-mmlab/mmsegmentation/pull/3297))
+- Support Remote sensing inferencer ([#3131](https://github.com/open-mmlab/mmsegmentation/pull/3131))
+- Support VPD Depth Estimator ([#3321](https://github.com/open-mmlab/mmsegmentation/pull/3321))
+- Support inference and visualization of VPD ([#3331](https://github.com/open-mmlab/mmsegmentation/pull/3331))
+- Support using the pytorch-grad-cam tool to visualize Class Activation Maps (CAM) ([#3324](https://github.com/open-mmlab/mmsegmentation/pull/3324))
+
+### New projects
+
+- Support PP-Mobileseg ([#3239](https://github.com/open-mmlab/mmsegmentation/pull/3239))
+- Support CAT-Seg (CVPR'2023) ([#3098](https://github.com/open-mmlab/mmsegmentation/pull/3098))
+- Support Adabins ([#3257](https://github.com/open-mmlab/mmsegmentation/pull/3257))
+- Add pp_mobileseg onnx inference script ([#3268](https://github.com/open-mmlab/mmsegmentation/pull/3268))
+
+### Bug Fixes
+
+- Fix module PascalContextDataset ([#3235](https://github.com/open-mmlab/mmsegmentation/pull/3235))
+- Fix one hot encoding for dice loss ([#3237](https://github.com/open-mmlab/mmsegmentation/pull/3237))
+- Fix confusion_matrix.py ([#3291](https://github.com/open-mmlab/mmsegmentation/pull/3291))
+- Fix inferencer visualization ([#3333](https://github.com/open-mmlab/mmsegmentation/pull/3333))
+
+### Documentation
+
+- Translate doc for docs/zh_cn/user_guides/5_deployment.md
([#3281](https://github.com/open-mmlab/mmsegmentation/pull/3281)) + +### New Contributors + +- @angiecao made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/3235 +- @yeedrag made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/3237 +- @Yang-Changhui made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/3239 +- @ooooo-create made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/3261 +- @Ben-Louis made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/3269 +- @crazysteeaam made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/3284 +- @zen0no made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/3242 +- @XiandongWang made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/3291 +- @ZhaoQiiii made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/3332 +- @zhen6618 made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/3324 + +## v1.1.1(07/24/2023) + +### Features + +- Add bdd100K datasets ([#3158](https://github.com/open-mmlab/mmsegmentation/pull/3158)) +- Remove batch inference assertion ([#3210](https://github.com/open-mmlab/mmsegmentation/pull/3210)) + +### Bug Fixes + +- Fix train map path for coco-stuff164k.py ([#3187](https://github.com/open-mmlab/mmsegmentation/pull/3187)) +- Fix mim search error ([#3194](https://github.com/open-mmlab/mmsegmentation/pull/3194)) +- Fix SegTTAModel with no attribute '\_gt_sem_seg' error ([#3152](https://github.com/open-mmlab/mmsegmentation/pull/3152)) +- Fix Albumentations default key mapping mismatch ([#3195](https://github.com/open-mmlab/mmsegmentation/pull/3195)) + +### New Contributors + +- @OliverGrace made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/3187 +- @ZiAn-Su made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/3152 +- @CastleDream made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/3158 +- @coding-famer made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/3174 +- @Alias-z made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/3195 + +## v1.1.0(06/28/2023) + +## What's Changed + +### Features + +- Support albu transform ([#2943](https://github.com/open-mmlab/mmsegmentation/pull/2943)) +- Support DDRNet ([#2855](https://github.com/open-mmlab/mmsegmentation/pull/2855)) +- Add GDAL backend and Support LEVIR-CD Dataset ([#2903](https://github.com/open-mmlab/mmsegmentation/pull/2903)) +- Support DSDL Dataset ([#2925](https://github.com/open-mmlab/mmsegmentation/pull/2925)) +- huasdorff distance loss ([#2820](https://github.com/open-mmlab/mmsegmentation/pull/2820)) + +### New Projects + +- Support SAM inferencer ([#2897](https://github.com/open-mmlab/mmsegmentation/pull/2897)) +- Added a supported for Visual Attention Network (VAN) ([#2987](https://github.com/open-mmlab/mmsegmentation/pull/2987)) +- add GID dataset ([#3038](https://github.com/open-mmlab/mmsegmentation/pull/3038)) +- add Medical semantic seg dataset: Bactteria ([#2568](https://github.com/open-mmlab/mmsegmentation/pull/2568)) +- add Medical semantic seg dataset: Vampire ([#2633](https://github.com/open-mmlab/mmsegmentation/pull/2633)) +- add Medical semantic seg dataset: Ravir ([#2635](https://github.com/open-mmlab/mmsegmentation/pull/2635)) +- add 
Medical semantic seg dataset: Cranium ([#2675](https://github.com/open-mmlab/mmsegmentation/pull/2675)) +- add Medical semantic seg dataset: bccs ([#2861](https://github.com/open-mmlab/mmsegmentation/pull/2861)) +- add Medical semantic seg dataset: Gamma Task3 dataset ([#2695](https://github.com/open-mmlab/mmsegmentation/pull/2695)) +- add Medical semantic seg dataset: consep ([#2724](https://github.com/open-mmlab/mmsegmentation/pull/2724)) +- add Medical semantic seg dataset: breast_cancer_cell_seg dataset ([#2726](https://github.com/open-mmlab/mmsegmentation/pull/2726)) +- add Medical semantic seg dataset: chest_image_pneum dataset ([#2727](https://github.com/open-mmlab/mmsegmentation/pull/2727)) +- add Medical semantic seg dataset: conic2022 ([#2725](https://github.com/open-mmlab/mmsegmentation/pull/2725)) +- add Medical semantic seg dataset: dr_hagis ([#2729](https://github.com/open-mmlab/mmsegmentation/pull/2729)) +- add Medical semantic seg dataset: orvs ([#2728](https://github.com/open-mmlab/mmsegmentation/pull/2728)) +- add Medical semantic seg dataset: ISIC-2016 Task1 ([#2708](https://github.com/open-mmlab/mmsegmentation/pull/2708)) +- add Medical semantic seg dataset: ISIC-2017 Task1 ([#2709](https://github.com/open-mmlab/mmsegmentation/pull/2709)) +- add Medical semantic seg dataset: Kvasir seg ([#2677](https://github.com/open-mmlab/mmsegmentation/pull/2677)) +- add Medical semantic seg dataset: Kvasir seg aliyun ([#2678](https://github.com/open-mmlab/mmsegmentation/pull/2678)) +- add Medical semantic seg dataset: Rite ([#2680](https://github.com/open-mmlab/mmsegmentation/pull/2680)) +- add Medical semantic seg dataset: Fusc2021 ([#2682](https://github.com/open-mmlab/mmsegmentation/pull/2682)) +- add Medical semantic seg dataset: 2pm vessel ([#2685](https://github.com/open-mmlab/mmsegmentation/pull/2685)) +- add Medical semantic seg dataset: Pcam ([#2684](https://github.com/open-mmlab/mmsegmentation/pull/2684)) +- add Medical semantic seg dataset: Pannuke ([#2683](https://github.com/open-mmlab/mmsegmentation/pull/2683)) +- add Medical semantic seg dataset: Covid 19 ct cxr ([#2688](https://github.com/open-mmlab/mmsegmentation/pull/2688)) +- add Medical semantic seg dataset: Crass ([#2690](https://github.com/open-mmlab/mmsegmentation/pull/2690)) +- add Medical semantic seg dataset: Chest x ray images with pneumothorax masks ([#2687](https://github.com/open-mmlab/mmsegmentation/pull/2687)) + +### Enhancement + +- Robust mapping from image path to seg map path ([#3091](https://github.com/open-mmlab/mmsegmentation/pull/3091)) +- Change assertion logic inference cfg.model.test_cfg ([#3012](https://github.com/open-mmlab/mmsegmentation/pull/3012)) +- Refactor dice loss ([#3002](https://github.com/open-mmlab/mmsegmentation/pull/3002)) +- Update Dockerfile libgl1-mesa-dev ([#3095](https://github.com/open-mmlab/mmsegmentation/pull/3095)) +- Prevent passed `ann_file` from silently failing to load ([#2966](https://github.com/open-mmlab/mmsegmentation/pull/2966)) +- Update the translation of models documentation ([#2833](https://github.com/open-mmlab/mmsegmentation/pull/2833)) +- Add docs contents at README.md ([#3083](https://github.com/open-mmlab/mmsegmentation/pull/3083)) +- Enhance swin pretrained model loading ([#3097](https://github.com/open-mmlab/mmsegmentation/pull/3097)) + +### Bug Fixes + +- Handle case where device is neither CPU nor CUDA in HamHead ([#2868](https://github.com/open-mmlab/mmsegmentation/pull/2868)) +- Fix bugs when out_channels==1 
([#2911](https://github.com/open-mmlab/mmsegmentation/pull/2911)) +- Fix binary C=1 focal loss & dataset fileio ([#2935](https://github.com/open-mmlab/mmsegmentation/pull/2935)) +- Fix isaid dataset pre-processing tool ([#3010](https://github.com/open-mmlab/mmsegmentation/pull/3010)) +- Fix bug cannot use both '--tta' and '--out' while testing ([#3067](https://github.com/open-mmlab/mmsegmentation/pull/3067)) +- Fix inferencer ut ([#3117](https://github.com/open-mmlab/mmsegmentation/pull/3117)) +- Fix document ([#2863](https://github.com/open-mmlab/mmsegmentation/pull/2863), [#2896](https://github.com/open-mmlab/mmsegmentation/pull/2896), [#2919](https://github.com/open-mmlab/mmsegmentation/pull/2919), [#2951](https://github.com/open-mmlab/mmsegmentation/pull/2951), [#2970](https://github.com/open-mmlab/mmsegmentation/pull/2970), [#2961](https://github.com/open-mmlab/mmsegmentation/pull/2961), [#3042](https://github.com/open-mmlab/mmsegmentation/pull/3042), ) +- Fix squeeze error when N=1 and C=1 ([#2933](https://github.com/open-mmlab/mmsegmentation/pull/2933)) + +### New Contributors + +- @liu-mengyang made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/2896 +- @likyoo made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/2911 +- @1qh made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/2902 +- @JoshuaChou2018 made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/2951 +- @jts250 made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/2833 +- @MGAMZ made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/2970 +- @tianbinli made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/2568 +- @Provable0816 made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/2633 +- @Zoulinx made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/2903 +- @wufan-tb made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/2925 +- @haruishi43 made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/2966 +- @Masaaki-75 made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/2675 +- @tang576225574 made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/2987 +- @Kedreamix made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/3010 +- @nightrain01 made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/3067 +- @shigengtian made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/3095 +- @SheffieldCao made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/3097 +- @wangruohui made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/3091 +- @LHamnett made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/3012 + +## v1.0.0(04/06/2023) + +### Highlights + +- Add Mapillary Vistas Datasets support to MMSegmentation Core Package ([#2576](https://github.com/open-mmlab/mmsegmentation/pull/2576)) +- Support PIDNet ([#2609](https://github.com/open-mmlab/mmsegmentation/pull/2609)) +- Support SegNeXt ([#2654](https://github.com/open-mmlab/mmsegmentation/pull/2654)) + +### Features + +- Support calculating FLOPs of segmentors ([#2706](https://github.com/open-mmlab/mmsegmentation/pull/2706)) +- Support multi-band image 
for Mosaic ([#2748](https://github.com/open-mmlab/mmsegmentation/pull/2748)) +- Support dump segment prediction ([#2712](https://github.com/open-mmlab/mmsegmentation/pull/2712)) + +### Bug fix + +- Fix format_result and fix prefix param in cityscape metric, and rename CitysMetric to CityscapesMetric ([#2660](https://github.com/open-mmlab/mmsegmentation/pull/2660)) +- Support input gt seg map is not 2D ([#2739](https://github.com/open-mmlab/mmsegmentation/pull/2739)) +- Fix accepting an unexpected argument `local-rank` in PyTorch 2.0 ([#2812](https://github.com/open-mmlab/mmsegmentation/pull/2812)) + +### Documentation + +- Add Chinese version of various documentation ([#2673](https://github.com/open-mmlab/mmsegmentation/pull/2673), [#2702](https://github.com/open-mmlab/mmsegmentation/pull/2702), [#2703](https://github.com/open-mmlab/mmsegmentation/pull/2703), [#2701](https://github.com/open-mmlab/mmsegmentation/pull/2701), [#2722](https://github.com/open-mmlab/mmsegmentation/pull/2722), [#2733](https://github.com/open-mmlab/mmsegmentation/pull/2733), [#2769](https://github.com/open-mmlab/mmsegmentation/pull/2769), [#2790](https://github.com/open-mmlab/mmsegmentation/pull/2790), [#2798](https://github.com/open-mmlab/mmsegmentation/pull/2798)) +- Update and refine various English documentation ([#2715](https://github.com/open-mmlab/mmsegmentation/pull/2715), [#2755](https://github.com/open-mmlab/mmsegmentation/pull/2755), [#2745](https://github.com/open-mmlab/mmsegmentation/pull/2745), [#2797](https://github.com/open-mmlab/mmsegmentation/pull/2797), [#2799](https://github.com/open-mmlab/mmsegmentation/pull/2799), [#2821](https://github.com/open-mmlab/mmsegmentation/pull/2821), [#2827](https://github.com/open-mmlab/mmsegmentation/pull/2827), [#2831](https://github.com/open-mmlab/mmsegmentation/pull/2831)) +- Add deeplabv3 model structure documentation ([#2426](https://github.com/open-mmlab/mmsegmentation/pull/2426)) +- Add custom metrics documentation ([#2799](https://github.com/open-mmlab/mmsegmentation/pull/2799)) +- Add faq in dev-1.x branch ([#2765](https://github.com/open-mmlab/mmsegmentation/pull/2765)) + +### New Contributors + +- @liuruiqiang made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/2554 +- @wangjiangben-hw made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/2569 +- @jinxianwei made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/2557 +- @KKIEEK made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/2747 +- @Renzhihan made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/2765 + +## v1.0.0rc6(03/03/2023) + +### Highlights + +- Support MMSegInferencer ([#2413](https://github.com/open-mmlab/mmsegmentation/pull/2413), [#2658](https://github.com/open-mmlab/mmsegmentation/pull/2658)) +- Support REFUGE dataset ([#2554](https://github.com/open-mmlab/mmsegmentation/pull/2554)) + +### Features + +- Support auto import modules from registry ([#2481](https://github.com/open-mmlab/mmsegmentation/pull/2481)) +- Replace numpy ascontiguousarray with torch contiguous to speed-up ([#2604](https://github.com/open-mmlab/mmsegmentation/pull/2604)) +- Add browse_dataset.py tool ([#2649](https://github.com/open-mmlab/mmsegmentation/pull/2649)) + +### Bug fix + +- Rename and Fix bug of projects HieraSeg ([#2565](https://github.com/open-mmlab/mmsegmentation/pull/2565)) +- Add out_channels in `CascadeEncoderDecoder` and update OCRNet and MobileNet v2 
results ([#2656](https://github.com/open-mmlab/mmsegmentation/pull/2656)) + +### Documentation + +- Add dataflow documentation of Chinese version ([#2652](https://github.com/open-mmlab/mmsegmentation/pull/2652)) +- Add custmized runtime documentation of English version ([#2533](https://github.com/open-mmlab/mmsegmentation/pull/2533)) +- Add documentation for visualizing feature map using wandb backend ([#2557](https://github.com/open-mmlab/mmsegmentation/pull/2557)) +- Add documentation for benchmark results on NPU (HUAWEI Ascend) ([#2569](https://github.com/open-mmlab/mmsegmentation/pull/2569), [#2596](https://github.com/open-mmlab/mmsegmentation/pull/2596), [#2610](https://github.com/open-mmlab/mmsegmentation/pull/2610)) +- Fix api name error in the migration doc ([#2601](https://github.com/open-mmlab/mmsegmentation/pull/2601)) +- Refine projects documentation ([#2586](https://github.com/open-mmlab/mmsegmentation/pull/2586)) +- Refine MMSegmentation documentation ([#2668](https://github.com/open-mmlab/mmsegmentation/pull/2668), [#2659](https://github.com/open-mmlab/mmsegmentation/pull/2659)) + +### New Contributors + +- @zccjjj made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/2548 +- @liuruiqiang made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/2554 +- @wangjiangben-hw made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/2569 +- @jinxianwei made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/2557 + +## v1.0.0rc5(02/01/2023) + +### Bug fix + +- Fix MaskFormer and Mask2Former when install mmdet from source ([#2532](https://github.com/open-mmlab/mmsegmentation/pull/2532)) +- Support new fileio interface in `MMCV>=2.0.0rc4` ([#2543](https://github.com/open-mmlab/mmsegmentation/pull/2543)) +- Fix ERFNet URL in dev-1.x branch ([#2537](https://github.com/open-mmlab/mmsegmentation/pull/2537)) +- Fix misleading `List[Tensor]` types ([#2546](https://github.com/open-mmlab/mmsegmentation/pull/2546)) +- Rename typing.py to typing_utils.py ([#2548](https://github.com/open-mmlab/mmsegmentation/pull/2548)) + +## v1.0.0rc4(01/30/2023) + +### Highlights + +- Support ISNet (ICCV'2021) in projects ([#2400](https://github.com/open-mmlab/mmsegmentation/pull/2400)) +- Support HSSN (CVPR'2022) in projects ([#2444](https://github.com/open-mmlab/mmsegmentation/pull/2444)) + +### Features + +- Add Gaussian Noise and Blur for biomedical data ([#2373](https://github.com/open-mmlab/mmsegmentation/pull/2373)) +- Add BioMedicalRandomGamma ([#2406](https://github.com/open-mmlab/mmsegmentation/pull/2406)) +- Add BioMedical3DPad ([#2383](https://github.com/open-mmlab/mmsegmentation/pull/2383)) +- Add BioMedical3DRandomFlip ([#2404](https://github.com/open-mmlab/mmsegmentation/pull/2404)) +- Add `gt_edge_map` field to SegDataSample ([#2466](https://github.com/open-mmlab/mmsegmentation/pull/2466)) +- Support synapse dataset ([#2432](https://github.com/open-mmlab/mmsegmentation/pull/2432), [#2465](https://github.com/open-mmlab/mmsegmentation/pull/2465)) +- Support Mapillary Vistas Dataset in projects ([#2484](https://github.com/open-mmlab/mmsegmentation/pull/2484)) +- Switch order of `reduce_zero_label` and applying `label_map` ([#2517](https://github.com/open-mmlab/mmsegmentation/pull/2517)) + +### Documentation + +- Add ZN Customized_runtime Doc ([#2502](https://github.com/open-mmlab/mmsegmentation/pull/2502)) +- Add EN datasets.md ([#2464](https://github.com/open-mmlab/mmsegmentation/pull/2464)) 
+- Fix minor typo in migration `package.md` ([#2518](https://github.com/open-mmlab/mmsegmentation/pull/2518)) + +### Bug fix + +- Fix incorrect `img_shape` value assignment in RandomCrop ([#2469](https://github.com/open-mmlab/mmsegmentation/pull/2469)) +- Fix inference api and support setting palette to SegLocalVisualizer ([#2475](https://github.com/open-mmlab/mmsegmentation/pull/2475)) +- Unfinished label conversion from `-1` to `255` ([#2516](https://github.com/open-mmlab/mmsegmentation/pull/2516)) + +### New Contributors + +- @blueyo0 made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/2373 +- @Fivethousand5k made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/2406 +- @suyanzhou626 made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/2383 +- @unrealMJ made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/2400 +- @Dominic23331 made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/2432 +- @AI-Tianlong made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/2444 +- @morkovka1337 made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/2492 +- @Leeinsn made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/2404 +- @siddancha made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/2516 + +## v1.0.0rc3(31/12/2022) + +### Highlights + +- Support test time augmentation ([#2184](https://github.com/open-mmlab/mmsegmentation/pull/2184)) +- Add 'Projects/' folder and the first example project ([#2412](https://github.com/open-mmlab/mmsegmentation/pull/2412)) + +### Features + +- Add Biomedical 3D array random crop transform ([#2378](https://github.com/open-mmlab/mmsegmentation/pull/2378)) + +### Documentation + +- Add Chinese version of config tutorial ([#2371](https://github.com/open-mmlab/mmsegmentation/pull/2371)) +- Add Chinese version of train & test tutorial ([#2355](https://github.com/open-mmlab/mmsegmentation/pull/2355)) +- Add Chinese version of overview ([(#2397)](https://github.com/open-mmlab/mmsegmentation/pull/2397))) +- Add Chinese version of get_started ([#2417](https://github.com/open-mmlab/mmsegmentation/pull/2417)) +- Add datasets in Chinese ([#2387](https://github.com/open-mmlab/mmsegmentation/pull/2387)) +- Add dataflow document ([#2403](https://github.com/open-mmlab/mmsegmentation/pull/2403)) +- Add pspnet model structure graph ([#2437](https://github.com/open-mmlab/mmsegmentation/pull/2437)) +- Update some content of engine Chinese documentation ([#2341](https://github.com/open-mmlab/mmsegmentation/pull/2341)) +- Update TTA to migration documentation ([#2335](https://github.com/open-mmlab/mmsegmentation/pull/2335)) + +### Bug fix + +- Remove dependency mmdet when do not use MaskFormerHead and MMDET_Mask2FormerHead ([#2448](https://github.com/open-mmlab/mmsegmentation/pull/2448)) + +### Enhancement + +- Add torch1.13 checking in CI ([#2402](https://github.com/open-mmlab/mmsegmentation/pull/2402)) +- Fix pytorch version for merge stage test ([#2449](https://github.com/open-mmlab/mmsegmentation/pull/2449)) + +## v1.0.0rc2(6/12/2022) + +### Highlights + +- Support MaskFormer ([#2215](https://github.com/open-mmlab/mmsegmentation/pull/2215)) +- Support Mask2Former ([#2255](https://github.com/open-mmlab/mmsegmentation/pull/2255)) + +### Features + +- Add ResizeShortestEdge transform 
([#2339](https://github.com/open-mmlab/mmsegmentation/pull/2339)) +- Support padding in data pre-processor for model testing([#2290](https://github.com/open-mmlab/mmsegmentation/pull/2290)) +- Fix the problem of post-processing not removing padding ([#2367](https://github.com/open-mmlab/mmsegmentation/pull/2367)) + +### Bug fix + +- Fix links in README ([#2024](https://github.com/open-mmlab/mmsegmentation/pull/2024)) +- Fix swin load state_dict ([#2304](https://github.com/open-mmlab/mmsegmentation/pull/2304)) +- Fix typo of BaseSegDataset docstring ([#2322](https://github.com/open-mmlab/mmsegmentation/pull/2322)) +- Fix the bug in the visualization step ([#2326](https://github.com/open-mmlab/mmsegmentation/pull/2326)) +- Fix ignore class id from -1 to 255 in BaseSegDataset ([#2332](https://github.com/open-mmlab/mmsegmentation/pull/2332)) +- Fix KNet IterativeDecodeHead bug ([#2334](https://github.com/open-mmlab/mmsegmentation/pull/2334)) +- Add input argument for datasets ([#2379](https://github.com/open-mmlab/mmsegmentation/pull/2379)) +- Fix typo in warning on binary classification ([#2382](https://github.com/open-mmlab/mmsegmentation/pull/2382)) + +### Enhancement + +- Fix ci for 1.x ([#2011](https://github.com/open-mmlab/mmsegmentation/pull/2011), [#2019](https://github.com/open-mmlab/mmsegmentation/pull/2019)) +- Fix lint and pre-commit hook ([#2308](https://github.com/open-mmlab/mmsegmentation/pull/2308)) +- Add `data` string in .gitignore file in dev-1.x branch ([#2336](https://github.com/open-mmlab/mmsegmentation/pull/2336)) +- Make scipy as a default dependency in runtime ([#2362](https://github.com/open-mmlab/mmsegmentation/pull/2362)) +- Delete mmcls in runtime.txt ([#2368](https://github.com/open-mmlab/mmsegmentation/pull/2368)) + +### Documentation + +- Update configuration documentation ([#2048](https://github.com/open-mmlab/mmsegmentation/pull/2048)) +- Update inference documentation ([#2052](https://github.com/open-mmlab/mmsegmentation/pull/2052)) +- Update train test documentation ([#2061](https://github.com/open-mmlab/mmsegmentation/pull/2061)) +- Update get started documentatin ([#2148](https://github.com/open-mmlab/mmsegmentation/pull/2148)) +- Update transforms documentation ([#2088](https://github.com/open-mmlab/mmsegmentation/pull/2088)) +- Add MMEval projects like in README ([#2259](https://github.com/open-mmlab/mmsegmentation/pull/2259)) +- Translate the visualization.md ([#2298](https://github.com/open-mmlab/mmsegmentation/pull/2298)) + +## v1.0.0rc1 (2/11/2022) + +### Highlights + +- Support PoolFormer ([#2191](https://github.com/open-mmlab/mmsegmentation/pull/2191)) +- Add Decathlon dataset ([#2227](https://github.com/open-mmlab/mmsegmentation/pull/2227)) + +### Features + +- Add BioMedical data loading ([#2176](https://github.com/open-mmlab/mmsegmentation/pull/2176)) +- Add LIP dataset ([#2251](https://github.com/open-mmlab/mmsegmentation/pull/2251)) +- Add `GenerateEdge` data transform ([#2210](https://github.com/open-mmlab/mmsegmentation/pull/2210)) + +### Bug fix + +- Fix segmenter-vit-s_fcn config ([#2037](https://github.com/open-mmlab/mmsegmentation/pull/2037)) +- Fix binary segmentation ([#2101](https://github.com/open-mmlab/mmsegmentation/pull/2101)) +- Fix MMSegmentation colab demo ([#2089](https://github.com/open-mmlab/mmsegmentation/pull/2089)) +- Fix ResizeToMultiple transform ([#2185](https://github.com/open-mmlab/mmsegmentation/pull/2185)) +- Use SyncBN in mobilenet_v2 ([#2198](https://github.com/open-mmlab/mmsegmentation/pull/2198)) +- Fix typo in 
installation ([#2175](https://github.com/open-mmlab/mmsegmentation/pull/2175))
+- Fix typo in visualization.md ([#2116](https://github.com/open-mmlab/mmsegmentation/pull/2116))
+
+### Enhancement
+
+- Add mim extras_requires in setup.py ([#2012](https://github.com/open-mmlab/mmsegmentation/pull/2012))
+- Fix CI ([#2029](https://github.com/open-mmlab/mmsegmentation/pull/2029))
+- Remove ops module ([#2063](https://github.com/open-mmlab/mmsegmentation/pull/2063))
+- Add pyupgrade pre-commit hook ([#2078](https://github.com/open-mmlab/mmsegmentation/pull/2078))
+- Add `out_file` in `add_datasample` of `SegLocalVisualizer` to directly save image ([#2090](https://github.com/open-mmlab/mmsegmentation/pull/2090))
+- Upgrade pre-commit hooks ([#2154](https://github.com/open-mmlab/mmsegmentation/pull/2154))
+- Ignore test timm in CI when torch\<1.7 ([#2158](https://github.com/open-mmlab/mmsegmentation/pull/2158))
+- Update requirements ([#2186](https://github.com/open-mmlab/mmsegmentation/pull/2186))
+- Fix Windows platform CI ([#2202](https://github.com/open-mmlab/mmsegmentation/pull/2202))
+
+### Documentation
+
+- Add `Overview` documentation ([#2042](https://github.com/open-mmlab/mmsegmentation/pull/2042))
+- Add `Evaluation` documentation ([#2077](https://github.com/open-mmlab/mmsegmentation/pull/2077))
+- Add `Migration` documentation ([#2066](https://github.com/open-mmlab/mmsegmentation/pull/2066))
+- Add `Structures` documentation ([#2070](https://github.com/open-mmlab/mmsegmentation/pull/2070))
+- Add `Structures` ZN documentation ([#2129](https://github.com/open-mmlab/mmsegmentation/pull/2129))
+- Add `Engine` ZN documentation ([#2157](https://github.com/open-mmlab/mmsegmentation/pull/2157))
+- Update `Prepare datasets` and `Visualization` doc ([#2054](https://github.com/open-mmlab/mmsegmentation/pull/2054))
+- Update `Models` documentation ([#2160](https://github.com/open-mmlab/mmsegmentation/pull/2160))
+- Update `Add New Modules` documentation ([#2067](https://github.com/open-mmlab/mmsegmentation/pull/2067))
+- Fix the installation commands in get_started.md ([#2174](https://github.com/open-mmlab/mmsegmentation/pull/2174))
+- Add MMYOLO to README.md ([#2220](https://github.com/open-mmlab/mmsegmentation/pull/2220))
+
+## v1.0.0rc0 (31/8/2022)
+
+We are excited to announce the release of MMSegmentation 1.0.0rc0.
+MMSeg 1.0.0rc0 is the first version of MMSegmentation 1.x, a part of the OpenMMLab 2.0 projects.
+Built upon the new [training engine](https://github.com/open-mmlab/mmengine),
+MMSeg 1.x unifies the interfaces of datasets, models, evaluation, and visualization with faster training and testing speed.
+
+### Highlights
+
+1. **New engines** MMSeg 1.x is based on [MMEngine](https://github.com/open-mmlab/mmengine), which provides a general and powerful runner that allows more flexible customizations and significantly simplifies the entry points of high-level interfaces.
+
+2. **Unified interfaces** As a part of the OpenMMLab 2.0 projects, MMSeg 1.x unifies and refactors the interfaces and internal logic of training, testing, datasets, models, evaluation, and visualization. All the OpenMMLab 2.0 projects share the same design for these interfaces and logic to allow the emergence of multi-task/modality algorithms.
+
+3. **Faster speed** We optimize the training and inference speed for common models.
+
+4. **New features**:
+
+   - Support TverskyLoss function
+
+5. **More documentation and tutorials**. We add a bunch of documentation and tutorials to help users get started more smoothly. 
Read it [here](https://mmsegmentation.readthedocs.io/en/1.x/).
+
+### Breaking Changes
+
+We briefly list the major breaking changes here.
+We will update the [migration guide](../migration.md) to provide complete details and migration instructions.
+
+#### Training and testing
+
+- MMSeg 1.x runs on PyTorch>=1.6. We have deprecated the support of PyTorch 1.5 to embrace mixed precision training and other new features since PyTorch 1.6. Some models can still run on PyTorch 1.5, but the full functionality of MMSeg 1.x is not guaranteed.
+
+- MMSeg 1.x uses Runner in [MMEngine](https://github.com/open-mmlab/mmengine) rather than that in MMCV. The new Runner implements and unifies the building logic of the dataset, model, evaluation, and visualizer. Therefore, MMSeg 1.x no longer maintains the building logic of those modules in `mmseg.train.apis` and `tools/train.py`. That code has been migrated into [MMEngine](https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/runner.py). Please refer to the [migration guide of Runner in MMEngine](https://mmengine.readthedocs.io/en/latest/migration/runner.html) for more details.
+
+- The Runner in MMEngine also supports testing and validation. The testing scripts are also simplified, and build the runner with logic similar to that of the training scripts.
+
+- The execution points of hooks in the new Runner have been enriched to allow more flexible customization. Please refer to the [migration guide of Hook in MMEngine](https://mmengine.readthedocs.io/en/latest/migration/hook.html) for more details.
+
+- Learning rate and momentum scheduling has been migrated from `Hook` to `Parameter Scheduler` in MMEngine. Please refer to the [migration guide of Parameter Scheduler in MMEngine](https://mmengine.readthedocs.io/en/latest/migration/param_scheduler.html) for more details.
+
+#### Configs
+
+- The [Runner in MMEngine](https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/runner.py) uses a different config structure to ease the understanding of the components in the runner. Users can read the [config example of mmseg](../user_guides/config.md) or refer to the [migration guide in MMEngine](https://mmengine.readthedocs.io/en/latest/migration/runner.html) for migration details.
+- The file names of configs and models are also refactored to follow the new rules unified across OpenMMLab 2.0 projects. Please refer to the [user guides of config](../user_guides/1_config.md) for more details.
+
+#### Components
+
+- Dataset
+- Data Transforms
+- Model
+- Evaluation
+- Visualization
+
+### Improvements
+
+- Support mixed precision training of all the models. However, some models may get NaN results due to numerical issues. We will update the documentation and list their mixed precision training results (accuracy, or failure).
+
+### Bug Fixes
+
+- Fix several config file errors [#1994](https://github.com/open-mmlab/mmsegmentation/pull/1994)
+
+### New Features
+
+1. Support data structures and encapsulating `seg_logits` in data samples, which can be returned from models to support more common evaluation metrics.
+
+### Ongoing changes
+
+1. Test-time augmentation, which is supported in MMSeg 0.x, is not implemented in this version due to the limited time slot. We will support it in the following releases with a new and simplified design.
+
+2. Inference interfaces: a unified inference interface will be supported in the future to ease the use of released models.
+
+3. 
Interfaces of useful tools that can be used in notebooks: more useful tools implemented in the `tools` directory will have their Python interfaces so that they can be used through notebooks and in downstream libraries.
+
+4. Documentation: we will add more design docs, tutorials, and migration guidance so that the community can dive deep into our new design, participate in the future development, and smoothly migrate downstream libraries to MMSeg 1.x.
diff --git a/docs/en/notes/changelog_v0.x.md b/docs/en/notes/changelog_v0.x.md
new file mode 100644
index 0000000000..d347a444d8
--- /dev/null
+++ b/docs/en/notes/changelog_v0.x.md
@@ -0,0 +1,720 @@
+## Changelog
+
+### V0.24.1 (5/1/2022)
+
+**Bug Fixes**
+
+- Fix `LayerDecayOptimizerConstructor` for MAE training ([#1539](https://github.com/open-mmlab/mmsegmentation/pull/1539), [#1540](https://github.com/open-mmlab/mmsegmentation/pull/1540))
+
+### V0.24.0 (4/29/2022)
+
+**Highlights**
+
+- Support MAE: Masked Autoencoders Are Scalable Vision Learners
+- Support Resnet strikes back
+
+**New Features**
+
+- Support MAE: Masked Autoencoders Are Scalable Vision Learners ([#1307](https://github.com/open-mmlab/mmsegmentation/pull/1307), [#1523](https://github.com/open-mmlab/mmsegmentation/pull/1523))
+- Support Resnet strikes back ([#1390](https://github.com/open-mmlab/mmsegmentation/pull/1390))
+- Support extra dataloader settings in configs ([#1435](https://github.com/open-mmlab/mmsegmentation/pull/1435))
+
+**Bug Fixes**
+
+- Fix input previous results for the last cascade_decode_head ([#1450](https://github.com/open-mmlab/mmsegmentation/pull/1450))
+- Fix validation loss logging ([#1494](https://github.com/open-mmlab/mmsegmentation/pull/1494))
+- Fix the bug in binary_cross_entropy ([1527](https://github.com/open-mmlab/mmsegmentation/pull/1527))
+- Support single channel prediction for Binary Cross Entropy Loss ([#1454](https://github.com/open-mmlab/mmsegmentation/pull/1454))
+- Fix potential bugs in accuracy.py ([1496](https://github.com/open-mmlab/mmsegmentation/pull/1496))
+- Avoid converting label ids twice by label map during evaluation ([1417](https://github.com/open-mmlab/mmsegmentation/pull/1417))
+- Fix bug about label_map ([1445](https://github.com/open-mmlab/mmsegmentation/pull/1445))
+- Fix image save path bug on Windows ([1423](https://github.com/open-mmlab/mmsegmentation/pull/1423))
+- Fix MMSegmentation Colab demo ([1501](https://github.com/open-mmlab/mmsegmentation/pull/1501), [1452](https://github.com/open-mmlab/mmsegmentation/pull/1452))
+- Migrate azure blob for beit checkpoints ([1503](https://github.com/open-mmlab/mmsegmentation/pull/1503))
+- Fix bug in `tools/analyse_logs.py` caused by wrong plot_iter in some cases ([1428](https://github.com/open-mmlab/mmsegmentation/pull/1428))
+
+**Improvements**
+
+- Merge BEiT and ConvNext's LR decay optimizer constructors ([#1438](https://github.com/open-mmlab/mmsegmentation/pull/1438))
+- Register optimizer constructor with mmseg ([#1456](https://github.com/open-mmlab/mmsegmentation/pull/1456))
+- Refactor transformer encode layer in ViT and BEiT backbone ([#1481](https://github.com/open-mmlab/mmsegmentation/pull/1481))
+- Add `build_pos_embed` and `build_layers` for BEiT ([1517](https://github.com/open-mmlab/mmsegmentation/pull/1517))
+- Add `with_cp` to mit and vit ([1431](https://github.com/open-mmlab/mmsegmentation/pull/1431))
+- Fix inconsistent dtype of `seg_label` in stdc decode ([1463](https://github.com/open-mmlab/mmsegmentation/pull/1463))
+- Delete random seed for 
training in `dist_train.sh` ([1519](https://github.com/open-mmlab/mmsegmentation/pull/1519))
+- Revise high `workers_per_gpus` in config file ([#1506](https://github.com/open-mmlab/mmsegmentation/pull/1506))
+- Add GPG keys and del mmcv version in Dockerfile ([1534](https://github.com/open-mmlab/mmsegmentation/pull/1534))
+- Update checkpoint for model in deeplabv3plus ([#1487](https://github.com/open-mmlab/mmsegmentation/pull/1487))
+- Add `DistSamplerSeedHook` to set epoch number to dataloader when runner is `EpochBasedRunner` ([1449](https://github.com/open-mmlab/mmsegmentation/pull/1449))
+- Provide URLs of Swin Transformer pretrained models ([1389](https://github.com/open-mmlab/mmsegmentation/pull/1389))
+- Update Dockerfiles in the docker directory and `get_started.md` to use the latest stable versions of Python, PyTorch and MMCV ([1446](https://github.com/open-mmlab/mmsegmentation/pull/1446))
+
+**Documentation**
+
+- Add a clearer statement about CPU training/inference ([1518](https://github.com/open-mmlab/mmsegmentation/pull/1518))
+
+**Contributors**
+
+- @jiangyitong made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1431
+- @kahkeng made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1447
+- @Nourollah made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1446
+- @androbaza made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1452
+- @Yzichen made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1445
+- @whu-pzhang made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1423
+- @panfeng-hover made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1417
+- @Johnson-Wang made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1496
+- @jere357 made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1460
+- @mfernezir made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1494
+- @donglixp made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1503
+- @YuanLiuuuuuu made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1307
+- @Dawn-bin made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1527
+
+### V0.23.0 (4/1/2022)
+
+**Highlights**
+
+- Support BEiT: BERT Pre-Training of Image Transformers
+- Support K-Net: Towards Unified Image Segmentation
+- Add `avg_non_ignore` of CELoss to support average loss over non-ignored elements
+- Support dataset initialization with file client
+
+**New Features**
+
+- Support BEiT: BERT Pre-Training of Image Transformers ([#1404](https://github.com/open-mmlab/mmsegmentation/pull/1404))
+- Support K-Net: Towards Unified Image Segmentation ([#1289](https://github.com/open-mmlab/mmsegmentation/pull/1289))
+- Support dataset initialization with file client ([#1402](https://github.com/open-mmlab/mmsegmentation/pull/1402))
+- Add class name function for STARE datasets ([#1376](https://github.com/open-mmlab/mmsegmentation/pull/1376))
+- Support different seeds on different ranks in distributed training ([#1362](https://github.com/open-mmlab/mmsegmentation/pull/1362))
+- Add `nlc2nchw2nlc` and `nchw2nlc2nchw` to simplify operations on tensors with different dimensions ([#1249](https://github.com/open-mmlab/mmsegmentation/pull/1249))
+
+**Improvements**
+
+- Synchronize random seed for 
distributed sampler ([#1411](https://github.com/open-mmlab/mmsegmentation/pull/1411))
+- Add script and documentation for multi-machine distributed training ([#1383](https://github.com/open-mmlab/mmsegmentation/pull/1383))
+
+**Bug Fixes**
+
+- Add `avg_non_ignore` of CELoss to support average loss over non-ignored elements ([#1409](https://github.com/open-mmlab/mmsegmentation/pull/1409))
+- Fix some wrong URLs of models or logs in `./configs` ([#1336](https://github.com/open-mmlab/mmsegmentation/pull/1433))
+- Add title and color theme arguments to plot function in `tools/confusion_matrix.py` ([#1401](https://github.com/open-mmlab/mmsegmentation/pull/1401))
+- Fix outdated link in Colab demo ([#1392](https://github.com/open-mmlab/mmsegmentation/pull/1392))
+- Fix typos ([#1424](https://github.com/open-mmlab/mmsegmentation/pull/1424), [#1405](https://github.com/open-mmlab/mmsegmentation/pull/1405), [#1371](https://github.com/open-mmlab/mmsegmentation/pull/1371), [#1366](https://github.com/open-mmlab/mmsegmentation/pull/1366), [#1363](https://github.com/open-mmlab/mmsegmentation/pull/1363))
+
+**Documentation**
+
+- Add FAQ document ([#1420](https://github.com/open-mmlab/mmsegmentation/pull/1420))
+- Fix the config name style description in official docs ([#1414](https://github.com/open-mmlab/mmsegmentation/pull/1414))
+
+**Contributors**
+
+- @kinglintianxia made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1371
+- @CCODING04 made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1376
+- @mob5566 made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1401
+- @xiongnemo made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1392
+- @Xiangxu-0103 made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1405
+
+### V0.22.1 (3/9/2022)
+
+**Bug Fixes**
+
+- Fix the ZeroDivisionError raised when all pixels in an image are ignored. ([#1336](https://github.com/open-mmlab/mmsegmentation/pull/1336))
+
+**Improvements**
+
+- Provide URLs of STDC, Segmenter and Twins pretrained models ([#1272](https://github.com/open-mmlab/mmsegmentation/pull/1357))
+
+### V0.22 (3/04/2022)
+
+**Highlights**
+
+- Support ConvNeXt: A ConvNet for the 2020s. Please use the latest MMClassification (0.21.0) to try it out.
+- Support the iSAID aerial dataset.
+- Officially support inference on Windows.
+
+**New Features**
+
+- Support ConvNeXt: A ConvNet for the 2020s. ([#1216](https://github.com/open-mmlab/mmsegmentation/pull/1216))
+- Support the iSAID aerial dataset. ([#1115](https://github.com/open-mmlab/mmsegmentation/pull/1115))
+- Support generating and plotting confusion matrices. ([#1301](https://github.com/open-mmlab/mmsegmentation/pull/1301))
+
+**Improvements**
+
+- Refactor 4 decoder heads (ASPP, FCN, PSP, UPer): Split forward function into `_forward_feature` and `cls_seg`. ([#1299](https://github.com/open-mmlab/mmsegmentation/pull/1299))
+- Add `min_size` arg in `Resize` to keep the shape after resizing larger than the slide window. ([#1318](https://github.com/open-mmlab/mmsegmentation/pull/1318))
+- Revise pre-commit-hooks. ([#1315](https://github.com/open-mmlab/mmsegmentation/pull/1315))
+- Add win-ci. ([#1296](https://github.com/open-mmlab/mmsegmentation/pull/1296))
+
+**Bug Fixes**
+
+- Fix `mlp_ratio` type in Swin Transformer. ([#1274](https://github.com/open-mmlab/mmsegmentation/pull/1274))
+- Fix path errors in `./demo`. 
([#1269](https://github.com/open-mmlab/mmsegmentation/pull/1269))
+- Fix bug in conversion of Potsdam. ([#1279](https://github.com/open-mmlab/mmsegmentation/pull/1279))
+- Make accuracy take into account `ignore_index`. ([#1259](https://github.com/open-mmlab/mmsegmentation/pull/1259))
+- Add PyTorch HardSwish assertion in unit test. ([#1294](https://github.com/open-mmlab/mmsegmentation/pull/1294))
+- Fix wrong palette value in Vaihingen. ([#1292](https://github.com/open-mmlab/mmsegmentation/pull/1292))
+- Fix the bug that SETR cannot load pretrained weights. ([#1293](https://github.com/open-mmlab/mmsegmentation/pull/1293))
+- Update the correct `In Collection` field in the metafile of each config. ([#1239](https://github.com/open-mmlab/mmsegmentation/pull/1239))
+- Upload completed STDC models. ([#1332](https://github.com/open-mmlab/mmsegmentation/pull/1332))
+- Fix the Cast type error that made `DNLHead` ONNX export inference differ. ([#1161](https://github.com/open-mmlab/mmsegmentation/pull/1161))
+
+**Contributors**
+
+- @JiaYanhao made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1269
+- @andife made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1281
+- @SBCV made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1279
+- @HJoonKwon made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1259
+- @Tsingularity made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1290
+- @Waterman0524 made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1115
+- @MeowZheng made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1315
+- @linfangjian01 made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1318
+
+### V0.21.1 (2/9/2022)
+
+**Bug Fixes**
+
+- Fix typos in docs. ([#1263](https://github.com/open-mmlab/mmsegmentation/pull/1263))
+- Fix repeating log by `setup_multi_processes`. ([#1267](https://github.com/open-mmlab/mmsegmentation/pull/1267))
+- Upgrade isort in pre-commit hook. ([#1270](https://github.com/open-mmlab/mmsegmentation/pull/1270))
+
+**Improvements**
+
+- Use MMCV load_state_dict func in ViT/Swin. ([#1272](https://github.com/open-mmlab/mmsegmentation/pull/1272))
+- Add exception for PointRend to support CPU-only mode. ([#1271](https://github.com/open-mmlab/mmsegmentation/pull/1270))
+
+### V0.21 (1/29/2022)
+
+**Highlights**
+
+- Officially support CPU training and inference; please use the latest MMCV (1.4.4) to try it out.
+- Support Segmenter: Transformer for Semantic Segmentation (ICCV'2021).
+- Support ISPRS Potsdam and Vaihingen datasets.
+- Add Mosaic transform and `MultiImageMixDataset` class in `dataset_wrappers`. 
+
+**New Features**
+
+- Support Segmenter: Transformer for Semantic Segmentation (ICCV'2021) ([#955](https://github.com/open-mmlab/mmsegmentation/pull/955))
+- Support ISPRS Potsdam and Vaihingen datasets ([#1097](https://github.com/open-mmlab/mmsegmentation/pull/1097), [#1171](https://github.com/open-mmlab/mmsegmentation/pull/1171))
+- Add SegFormer's benchmark on Cityscapes ([#1155](https://github.com/open-mmlab/mmsegmentation/pull/1155))
+- Add auto resume ([#1172](https://github.com/open-mmlab/mmsegmentation/pull/1172))
+- Add Mosaic transform and `MultiImageMixDataset` class in `dataset_wrappers` ([#1093](https://github.com/open-mmlab/mmsegmentation/pull/1093), [#1105](https://github.com/open-mmlab/mmsegmentation/pull/1105))
+- Add log collector ([#1175](https://github.com/open-mmlab/mmsegmentation/pull/1175))
+
+**Improvements**
+
+- New-style CPU training and inference ([#1251](https://github.com/open-mmlab/mmsegmentation/pull/1251))
+- Add UNet benchmark with multiple losses supervision ([#1143](https://github.com/open-mmlab/mmsegmentation/pull/1143))
+
+**Bug Fixes**
+
+- Fix the model statistics in doc for readthedocs ([#1153](https://github.com/open-mmlab/mmsegmentation/pull/1153))
+- Set random seed for `palette` if not given ([#1152](https://github.com/open-mmlab/mmsegmentation/pull/1152))
+- Add `COCOStuffDataset` in `class_names.py` ([#1222](https://github.com/open-mmlab/mmsegmentation/pull/1222))
+- Fix bug in non-distributed multi-gpu training/testing ([#1247](https://github.com/open-mmlab/mmsegmentation/pull/1247))
+- Delete unnecessary lines of STDCHead ([#1231](https://github.com/open-mmlab/mmsegmentation/pull/1231))
+
+**Contributors**
+
+- @jbwang1997 made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1152
+- @BeaverCC made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1206
+- @Echo-minn made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1214
+- @rstrudel made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/955
+
+### V0.20.2 (12/15/2021)
+
+**Bug Fixes**
+
+- Revise --option to --options to avoid BC-breaking. ([#1140](https://github.com/open-mmlab/mmsegmentation/pull/1140))
+
+### V0.20.1 (12/14/2021)
+
+**Improvements**
+
+- Change options to cfg-options ([#1129](https://github.com/open-mmlab/mmsegmentation/pull/1129))
+
+**Bug Fixes**
+
+- Fix `` in metafile. 
([#1127](https://github.com/open-mmlab/mmsegmentation/pull/1127))
+- Use the correct `num_classes` of HRNet in `LoveDA` dataset ([#1136](https://github.com/open-mmlab/mmsegmentation/pull/1136))
+
+### V0.20 (12/10/2021)
+
+**Highlights**
+
+- Support Twins ([#989](https://github.com/open-mmlab/mmsegmentation/pull/989))
+- Support a real-time segmentation model, STDC ([#995](https://github.com/open-mmlab/mmsegmentation/pull/995))
+- Support a widely-used segmentation model in lane detection, ERFNet ([#960](https://github.com/open-mmlab/mmsegmentation/pull/960))
+- Support a remote sensing land-cover dataset, LoveDA ([#1028](https://github.com/open-mmlab/mmsegmentation/pull/1028))
+- Support focal loss ([#1024](https://github.com/open-mmlab/mmsegmentation/pull/1024))
+
+**New Features**
+
+- Support Twins ([#989](https://github.com/open-mmlab/mmsegmentation/pull/989))
+- Support a real-time segmentation model, STDC ([#995](https://github.com/open-mmlab/mmsegmentation/pull/995))
+- Support a widely-used segmentation model in lane detection, ERFNet ([#960](https://github.com/open-mmlab/mmsegmentation/pull/960))
+- Add SETR Cityscapes benchmark ([#1087](https://github.com/open-mmlab/mmsegmentation/pull/1087))
+- Add BiSeNetV1 COCO-Stuff 164k benchmark ([#1019](https://github.com/open-mmlab/mmsegmentation/pull/1019))
+- Support focal loss ([#1024](https://github.com/open-mmlab/mmsegmentation/pull/1024))
+- Add Cutout transform ([#1022](https://github.com/open-mmlab/mmsegmentation/pull/1022))
+
+**Improvements**
+
+- Set a random seed when the user does not set a seed ([#1039](https://github.com/open-mmlab/mmsegmentation/pull/1039))
+- Add CircleCI setup ([#1086](https://github.com/open-mmlab/mmsegmentation/pull/1086))
+- Skip CI when changes only touch ignored paths ([#1078](https://github.com/open-mmlab/mmsegmentation/pull/1078))
+- Add abstract and image for every paper ([#1060](https://github.com/open-mmlab/mmsegmentation/pull/1060))
+- Create a symbolic link on Windows ([#1090](https://github.com/open-mmlab/mmsegmentation/pull/1090))
+- Support video demo using trained model ([#1014](https://github.com/open-mmlab/mmsegmentation/pull/1014))
+
+**Bug Fixes**
+
+- Fix incorrectly loading init_cfg or pretrained models of several transformer models ([#999](https://github.com/open-mmlab/mmsegmentation/pull/999), [#1069](https://github.com/open-mmlab/mmsegmentation/pull/1069), [#1102](https://github.com/open-mmlab/mmsegmentation/pull/1102))
+- Fix EfficientMultiheadAttention in SegFormer ([#1037](https://github.com/open-mmlab/mmsegmentation/pull/1037))
+- Remove `fp16` folder in `configs` ([#1031](https://github.com/open-mmlab/mmsegmentation/pull/1031))
+- Fix several typos in .yml file (Dice Metric [#1041](https://github.com/open-mmlab/mmsegmentation/pull/1041), ADE20K dataset [#1120](https://github.com/open-mmlab/mmsegmentation/pull/1120), Training Memory (GB) [#1083](https://github.com/open-mmlab/mmsegmentation/pull/1083))
+- Fix test error when using `--show-dir` ([#1091](https://github.com/open-mmlab/mmsegmentation/pull/1091))
+- Fix dist training infinite waiting issue ([#1035](https://github.com/open-mmlab/mmsegmentation/pull/1035))
+- Change the upper version of mmcv to 1.5.0 ([#1096](https://github.com/open-mmlab/mmsegmentation/pull/1096))
+- Fix symlink failure on Windows ([#1038](https://github.com/open-mmlab/mmsegmentation/pull/1038))
+- Cancel previous runs that are not completed ([#1118](https://github.com/open-mmlab/mmsegmentation/pull/1118))
+- Unify readthedocs links in docs 
([#1119](https://github.com/open-mmlab/mmsegmentation/pull/1119))
+
+**Contributors**
+
+- @Junjue-Wang made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1028
+- @ddebby made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1066
+- @del-zhenwu made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1078
+- @KangBK0120 made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1106
+- @zergzzlun made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1091
+- @fingertap made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1035
+- @irvingzhang0512 made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1014
+- @littleSunlxy made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/989
+- @lkm2835
+- @RockeyCoss
+- @MengzhangLI
+- @Junjun2016
+- @xiexinch
+- @xvjiarui
+
+### V0.19 (11/02/2021)
+
+**Highlights**
+
+- Support TIMMBackbone wrapper ([#998](https://github.com/open-mmlab/mmsegmentation/pull/998))
+- Support custom hook ([#428](https://github.com/open-mmlab/mmsegmentation/pull/428))
+- Add codespell pre-commit hook ([#920](https://github.com/open-mmlab/mmsegmentation/pull/920))
+- Add FastFCN benchmark on ADE20K ([#972](https://github.com/open-mmlab/mmsegmentation/pull/972))
+
+**New Features**
+
+- Support TIMMBackbone wrapper ([#998](https://github.com/open-mmlab/mmsegmentation/pull/998))
+- Support custom hook ([#428](https://github.com/open-mmlab/mmsegmentation/pull/428))
+- Add FastFCN benchmark on ADE20K ([#972](https://github.com/open-mmlab/mmsegmentation/pull/972))
+- Add codespell pre-commit hook and fix typos ([#920](https://github.com/open-mmlab/mmsegmentation/pull/920))
+
+**Improvements**
+
+- Make inputs & channels smaller in unittests ([#1004](https://github.com/open-mmlab/mmsegmentation/pull/1004))
+- Change `self.loss_decode` back to `dict` in the single loss situation ([#1002](https://github.com/open-mmlab/mmsegmentation/pull/1002))
+
+**Bug Fixes**
+
+- Fix typo in usage example ([#1003](https://github.com/open-mmlab/mmsegmentation/pull/1003))
+- Add contiguous after permutation in ViT ([#992](https://github.com/open-mmlab/mmsegmentation/pull/992))
+- Fix the invalid link ([#985](https://github.com/open-mmlab/mmsegmentation/pull/985))
+- Fix bug in CI with Python 3.9 ([#994](https://github.com/open-mmlab/mmsegmentation/pull/994))
+- Fix bug when loading class name from file in custom dataset ([#923](https://github.com/open-mmlab/mmsegmentation/pull/923))
+
+**Contributors**
+
+- @ShoupingShan made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/923
+- @RockeyCoss made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/954
+- @HarborYuan made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/992
+- @lkm2835 made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/1003
+- @gszh made their first contribution in https://github.com/open-mmlab/mmsegmentation/pull/428
+- @VVsssssk
+- @MengzhangLI
+- @Junjun2016
+
+### V0.18 (10/07/2021)
+
+**Highlights**
+
+- Support three real-time segmentation models (ICNet [#884](https://github.com/open-mmlab/mmsegmentation/pull/884), BiSeNetV1 [#851](https://github.com/open-mmlab/mmsegmentation/pull/851), and BiSeNetV2 [#804](https://github.com/open-mmlab/mmsegmentation/pull/804))
+- Support one efficient 
segmentation model (FastFCN [#885](https://github.com/open-mmlab/mmsegmentation/pull/885))
+- Support one efficient non-local/self-attention based segmentation model (ISANet [#70](https://github.com/open-mmlab/mmsegmentation/pull/70))
+- Support COCO-Stuff 10k and 164k datasets ([#625](https://github.com/open-mmlab/mmsegmentation/pull/625))
+- Support evaluating concatenated datasets separately ([#833](https://github.com/open-mmlab/mmsegmentation/pull/833))
+- Support loading GT for evaluation from multi-file backend ([#867](https://github.com/open-mmlab/mmsegmentation/pull/867))
+
+**New Features**
+
+- Support three real-time segmentation models (ICNet [#884](https://github.com/open-mmlab/mmsegmentation/pull/884), BiSeNetV1 [#851](https://github.com/open-mmlab/mmsegmentation/pull/851), and BiSeNetV2 [#804](https://github.com/open-mmlab/mmsegmentation/pull/804))
+- Support one efficient segmentation model (FastFCN [#885](https://github.com/open-mmlab/mmsegmentation/pull/885))
+- Support one efficient non-local/self-attention based segmentation model (ISANet [#70](https://github.com/open-mmlab/mmsegmentation/pull/70))
+- Support COCO-Stuff 10k and 164k datasets ([#625](https://github.com/open-mmlab/mmsegmentation/pull/625))
+- Support evaluating concatenated datasets separately ([#833](https://github.com/open-mmlab/mmsegmentation/pull/833))
+
+**Improvements**
+
+- Support loading GT for evaluation from multi-file backend ([#867](https://github.com/open-mmlab/mmsegmentation/pull/867))
+- Auto-convert SyncBN to BN when training on DP ([#772](https://github.com/open-mmlab/mmsegmentation/pull/772))
+- Refactor Swin-Transformer ([#800](https://github.com/open-mmlab/mmsegmentation/pull/800))
+
+**Bug Fixes**
+
+- Update mmcv installation in dockerfile ([#860](https://github.com/open-mmlab/mmsegmentation/pull/860))
+- Fix number of iteration bug when resuming checkpoint in distributed training ([#866](https://github.com/open-mmlab/mmsegmentation/pull/866))
+- Fix argument parsing in val_step ([#906](https://github.com/open-mmlab/mmsegmentation/pull/906))
+
+### V0.17 (09/01/2021)
+
+**Highlights**
+
+- Support SegFormer
+- Support DPT
+- Support Dark Zurich and Nighttime Driving datasets
+- Support progressive evaluation
+
+**New Features**
+
+- Support SegFormer ([#599](https://github.com/open-mmlab/mmsegmentation/pull/599))
+- Support DPT ([#605](https://github.com/open-mmlab/mmsegmentation/pull/605))
+- Support Dark Zurich and Nighttime Driving datasets ([#815](https://github.com/open-mmlab/mmsegmentation/pull/815))
+- Support progressive evaluation ([#709](https://github.com/open-mmlab/mmsegmentation/pull/709))
+
+**Improvements**
+
+- Add multiscale_output interface and unittests for HRNet ([#830](https://github.com/open-mmlab/mmsegmentation/pull/830))
+- Support inheriting the Cityscapes dataset ([#750](https://github.com/open-mmlab/mmsegmentation/pull/750))
+- Fix some typos in README.md ([#824](https://github.com/open-mmlab/mmsegmentation/pull/824))
+- Delete convert function and add instruction to ViT/Swin README.md ([#791](https://github.com/open-mmlab/mmsegmentation/pull/791))
+- Add vit/swin/mit convert weight scripts ([#783](https://github.com/open-mmlab/mmsegmentation/pull/783))
+- Add copyright files ([#796](https://github.com/open-mmlab/mmsegmentation/pull/796))
+
+**Bug Fixes**
+
+- Fix invalid checkpoint link in inference_demo.ipynb ([#814](https://github.com/open-mmlab/mmsegmentation/pull/814))
+- Ensure that items in the dataset have the same order across multiple machines 
([#780](https://github.com/open-mmlab/mmsegmentation/pull/780))
+- Fix the log error ([#766](https://github.com/open-mmlab/mmsegmentation/pull/766))
+
+### V0.16 (08/04/2021)
+
+**Highlights**
+
+- Support PyTorch 1.9
+- Support SegFormer backbone MiT
+- Support md2yml pre-commit hook
+- Support frozen stage for HRNet
+
+**New Features**
+
+- Support SegFormer backbone MiT ([#594](https://github.com/open-mmlab/mmsegmentation/pull/594))
+- Support md2yml pre-commit hook ([#732](https://github.com/open-mmlab/mmsegmentation/pull/732))
+- Support mim ([#717](https://github.com/open-mmlab/mmsegmentation/pull/717))
+- Add mmseg2torchserve tool ([#552](https://github.com/open-mmlab/mmsegmentation/pull/552))
+
+**Improvements**
+
+- Support hrnet frozen stage ([#743](https://github.com/open-mmlab/mmsegmentation/pull/743))
+- Add template of reimplementation questions ([#741](https://github.com/open-mmlab/mmsegmentation/pull/741))
+- Output pdf and epub formats for readthedocs ([#742](https://github.com/open-mmlab/mmsegmentation/pull/742))
+- Refine the docstring of ResNet ([#723](https://github.com/open-mmlab/mmsegmentation/pull/723))
+- Replace interpolate with resize ([#731](https://github.com/open-mmlab/mmsegmentation/pull/731))
+- Update resource limit ([#700](https://github.com/open-mmlab/mmsegmentation/pull/700))
+- Update config.md ([#678](https://github.com/open-mmlab/mmsegmentation/pull/678))
+
+**Bug Fixes**
+
+- Fix ATTENTION registry ([#729](https://github.com/open-mmlab/mmsegmentation/pull/729))
+- Fix analyze log script ([#716](https://github.com/open-mmlab/mmsegmentation/pull/716))
+- Fix doc api display ([#725](https://github.com/open-mmlab/mmsegmentation/pull/725))
+- Fix patch_embed and pos_embed mismatch error ([#685](https://github.com/open-mmlab/mmsegmentation/pull/685))
+- Fix efficient test for multi-node ([#707](https://github.com/open-mmlab/mmsegmentation/pull/707))
+- Fix init_cfg in resnet backbone ([#697](https://github.com/open-mmlab/mmsegmentation/pull/697))
+- Fix efficient test bug ([#702](https://github.com/open-mmlab/mmsegmentation/pull/702))
+- Fix url error in config docs ([#680](https://github.com/open-mmlab/mmsegmentation/pull/680))
+- Fix mmcv installation ([#676](https://github.com/open-mmlab/mmsegmentation/pull/676))
+- Fix torch version ([#670](https://github.com/open-mmlab/mmsegmentation/pull/670))
+
+**Contributors**
+
+@sshuair @xiexinch @Junjun2016 @mmeendez8 @xvjiarui @sennnnn @puhsu @BIGWangYuDong @keke1u @daavoo
+
+### V0.15 (07/04/2021)
+
+**Highlights**
+
+- Support ViT, SETR, and Swin-Transformer
+- Add Chinese documentation
+- Unified parameter initialization
+
+**Bug Fixes**
+
+- Fix typo and links ([#608](https://github.com/open-mmlab/mmsegmentation/pull/608))
+- Fix Dockerfile ([#607](https://github.com/open-mmlab/mmsegmentation/pull/607))
+- Fix ViT init ([#609](https://github.com/open-mmlab/mmsegmentation/pull/609))
+- Fix mmcv version compatibility table ([#658](https://github.com/open-mmlab/mmsegmentation/pull/658))
+- Fix model links of DMNet ([#660](https://github.com/open-mmlab/mmsegmentation/pull/660))
+
+**New Features**
+
+- Support loading DeiT weights ([#538](https://github.com/open-mmlab/mmsegmentation/pull/538))
+- Support SETR ([#531](https://github.com/open-mmlab/mmsegmentation/pull/531), [#635](https://github.com/open-mmlab/mmsegmentation/pull/635))
+- Add config and models for ViT backbone with UperHead ([#520](https://github.com/open-mmlab/mmsegmentation/pull/520), [#635](https://github.com/open-mmlab/mmsegmentation/pull/635)) 
+- Support Swin-Transformer ([#511](https://github.com/open-mmlab/mmsegmentation/pull/511))
+- Add higher accuracy FastSCNN ([#606](https://github.com/open-mmlab/mmsegmentation/pull/606))
+- Add Chinese documentation ([#666](https://github.com/open-mmlab/mmsegmentation/pull/666))
+
+**Improvements**
+
+- Unified parameter initialization ([#567](https://github.com/open-mmlab/mmsegmentation/pull/567))
+- Separate CUDA and CPU in GitHub Actions CI ([#602](https://github.com/open-mmlab/mmsegmentation/pull/602))
+- Support persistent dataloader worker ([#646](https://github.com/open-mmlab/mmsegmentation/pull/646))
+- Update meta file fields ([#661](https://github.com/open-mmlab/mmsegmentation/pull/661), [#664](https://github.com/open-mmlab/mmsegmentation/pull/664))
+
+### V0.14 (06/02/2021)
+
+**Highlights**
+
+- Support ONNX to TensorRT
+- Support MIM
+
+**Bug Fixes**
+
+- Fix ONNX to TensorRT verification ([#547](https://github.com/open-mmlab/mmsegmentation/pull/547))
+- Fix save best for EvalHook ([#575](https://github.com/open-mmlab/mmsegmentation/pull/575))
+
+**New Features**
+
+- Support loading DeiT weights ([#538](https://github.com/open-mmlab/mmsegmentation/pull/538))
+- Support ONNX to TensorRT ([#542](https://github.com/open-mmlab/mmsegmentation/pull/542))
+- Support output results for ADE20k ([#544](https://github.com/open-mmlab/mmsegmentation/pull/544))
+- Support MIM ([#549](https://github.com/open-mmlab/mmsegmentation/pull/549))
+
+**Improvements**
+
+- Add option for ViT output shape ([#530](https://github.com/open-mmlab/mmsegmentation/pull/530))
+- Infer batch size using len(result) ([#532](https://github.com/open-mmlab/mmsegmentation/pull/532))
+- Add compatibility table between MMSeg and MMCV ([#558](https://github.com/open-mmlab/mmsegmentation/pull/558))
+
+### V0.13 (05/05/2021)
+
+**Highlights**
+
+- Support Pascal Context Class-59 dataset.
+- Support Visual Transformer Backbone.
+- Support mFscore metric. 
+
+**Bug Fixes**
+
+- Fixed Colaboratory tutorial ([#451](https://github.com/open-mmlab/mmsegmentation/pull/451))
+- Fixed mIoU calculation range ([#471](https://github.com/open-mmlab/mmsegmentation/pull/471))
+- Fixed sem_fpn, unet README.md ([#492](https://github.com/open-mmlab/mmsegmentation/pull/492))
+- Fixed `num_classes` in FCN for Pascal Context 60-class dataset ([#488](https://github.com/open-mmlab/mmsegmentation/pull/488))
+- Fixed FP16 inference ([#497](https://github.com/open-mmlab/mmsegmentation/pull/497))
+
+**New Features**
+
+- Support dynamic export and visualization in pytorch2onnx ([#463](https://github.com/open-mmlab/mmsegmentation/pull/463))
+- Support export to torchscript ([#469](https://github.com/open-mmlab/mmsegmentation/pull/469), [#499](https://github.com/open-mmlab/mmsegmentation/pull/499))
+- Support Pascal Context Class-59 dataset ([#459](https://github.com/open-mmlab/mmsegmentation/pull/459))
+- Support Visual Transformer backbone ([#465](https://github.com/open-mmlab/mmsegmentation/pull/465))
+- Support UpSample Neck ([#512](https://github.com/open-mmlab/mmsegmentation/pull/512))
+- Support mFscore metric ([#509](https://github.com/open-mmlab/mmsegmentation/pull/509))
+
+**Improvements**
+
+- Add more CI for PyTorch ([#460](https://github.com/open-mmlab/mmsegmentation/pull/460))
+- Add print model graph args for tools/print_config.py ([#451](https://github.com/open-mmlab/mmsegmentation/pull/451))
+- Add cfg links in modelzoo README.md ([#468](https://github.com/open-mmlab/mmsegmentation/pull/469))
+- Add BaseSegmentor import to segmentors/__init__.py ([#495](https://github.com/open-mmlab/mmsegmentation/pull/495))
+- Add MMOCR, MMGeneration links ([#501](https://github.com/open-mmlab/mmsegmentation/pull/501), [#506](https://github.com/open-mmlab/mmsegmentation/pull/506))
+- Add Chinese QR code ([#506](https://github.com/open-mmlab/mmsegmentation/pull/506))
+- Use MMCV MODEL_REGISTRY ([#515](https://github.com/open-mmlab/mmsegmentation/pull/515))
+- Add ONNX testing tools ([#498](https://github.com/open-mmlab/mmsegmentation/pull/498))
+- Replace data_dict 'img' key access to support MMDet3D ([#514](https://github.com/open-mmlab/mmsegmentation/pull/514))
+- Support reading class_weight from file in loss function ([#513](https://github.com/open-mmlab/mmsegmentation/pull/513))
+- Make tags as comment ([#505](https://github.com/open-mmlab/mmsegmentation/pull/505))
+- Use MMCV EvalHook ([#438](https://github.com/open-mmlab/mmsegmentation/pull/438))
+
+### V0.12 (04/03/2021)
+
+**Highlights**
+
+- Support FCN-Dilate 6 model.
+- Support Dice Loss. 
+
+**Bug Fixes**
+
+- Fixed PhotoMetricDistortion Doc ([#388](https://github.com/open-mmlab/mmsegmentation/pull/388))
+- Fixed install scripts ([#399](https://github.com/open-mmlab/mmsegmentation/pull/399))
+- Fixed Dice Loss multi-class ([#417](https://github.com/open-mmlab/mmsegmentation/pull/417))
+
+**New Features**
+
+- Support Dice Loss ([#396](https://github.com/open-mmlab/mmsegmentation/pull/396))
+- Add plot logs tool ([#426](https://github.com/open-mmlab/mmsegmentation/pull/426))
+- Add opacity option to show_result ([#425](https://github.com/open-mmlab/mmsegmentation/pull/425))
+- Speed up mIoU metric ([#430](https://github.com/open-mmlab/mmsegmentation/pull/430))
+
+**Improvements**
+
+- Refactor unittest file structure ([#440](https://github.com/open-mmlab/mmsegmentation/pull/440))
+- Fix typos in the repo ([#449](https://github.com/open-mmlab/mmsegmentation/pull/449))
+- Include class-level metrics in the log ([#445](https://github.com/open-mmlab/mmsegmentation/pull/445))
+
+### V0.11 (02/02/2021)
+
+**Highlights**
+
+- Support memory efficient test, add more UNet models.
+
+**Bug Fixes**
+
+- Fixed TTA resize scale ([#334](https://github.com/open-mmlab/mmsegmentation/pull/334))
+- Fixed CI for pip 20.3 ([#307](https://github.com/open-mmlab/mmsegmentation/pull/307))
+- Fixed ADE20k test ([#359](https://github.com/open-mmlab/mmsegmentation/pull/359))
+
+**New Features**
+
+- Support memory efficient test ([#330](https://github.com/open-mmlab/mmsegmentation/pull/330))
+- Add more UNet benchmarks ([#324](https://github.com/open-mmlab/mmsegmentation/pull/324))
+- Support Lovasz Loss ([#351](https://github.com/open-mmlab/mmsegmentation/pull/351))
+
+**Improvements**
+
+- Move train_cfg/test_cfg inside model ([#341](https://github.com/open-mmlab/mmsegmentation/pull/341))
+
+### V0.10 (01/01/2021)
+
+**Highlights**
+
+- Support MobileNetV3, DMNet, APCNet. Add models of ResNet18V1b, ResNet18V1c, ResNet50V1b.
+
+**Bug Fixes**
+
+- Fixed CPU TTA ([#276](https://github.com/open-mmlab/mmsegmentation/pull/276))
+- Fixed CI for pip 20.3 ([#307](https://github.com/open-mmlab/mmsegmentation/pull/307))
+
+**New Features**
+
+- Add ResNet18V1b, ResNet18V1c, ResNet50V1b, ResNet101V1b models ([#316](https://github.com/open-mmlab/mmsegmentation/pull/316))
+- Support MobileNetV3 ([#268](https://github.com/open-mmlab/mmsegmentation/pull/268))
+- Add 4 retinal vessel segmentation benchmarks ([#315](https://github.com/open-mmlab/mmsegmentation/pull/315))
+- Support DMNet ([#313](https://github.com/open-mmlab/mmsegmentation/pull/313))
+- Support APCNet ([#299](https://github.com/open-mmlab/mmsegmentation/pull/299))
+
+**Improvements**
+
+- Refactor Documentation page ([#311](https://github.com/open-mmlab/mmsegmentation/pull/311))
+- Support resize data augmentation according to original image size ([#291](https://github.com/open-mmlab/mmsegmentation/pull/291))
+
+### V0.9 (30/11/2020)
+
+**Highlights**
+
+- Support 4 medical datasets, UNet and CGNet. 
+
+**New Features**
+
+- Support RandomRotate transform ([#215](https://github.com/open-mmlab/mmsegmentation/pull/215), [#260](https://github.com/open-mmlab/mmsegmentation/pull/260))
+- Support RGB2Gray transform ([#227](https://github.com/open-mmlab/mmsegmentation/pull/227))
+- Support Rerange transform ([#228](https://github.com/open-mmlab/mmsegmentation/pull/228))
+- Support ignore_index for BCE loss ([#210](https://github.com/open-mmlab/mmsegmentation/pull/210))
+- Add modelzoo statistics ([#263](https://github.com/open-mmlab/mmsegmentation/pull/263))
+- Support Dice evaluation metric ([#225](https://github.com/open-mmlab/mmsegmentation/pull/225))
+- Support Adjust Gamma transform ([#232](https://github.com/open-mmlab/mmsegmentation/pull/232))
+- Support CLAHE transform ([#229](https://github.com/open-mmlab/mmsegmentation/pull/229))
+
+**Bug Fixes**
+
+- Fixed detail API link ([#267](https://github.com/open-mmlab/mmsegmentation/pull/267))
+
+### V0.8 (03/11/2020)
+
+**Highlights**
+
+- Support 4 medical datasets, UNet and CGNet.
+
+**New Features**
+
+- Support customizing runner ([#118](https://github.com/open-mmlab/mmsegmentation/pull/118))
+- Support UNet ([#161](https://github.com/open-mmlab/mmsegmentation/pull/162))
+- Support CHASE_DB1, DRIVE, STARE, HRF ([#203](https://github.com/open-mmlab/mmsegmentation/pull/203))
+- Support CGNet ([#223](https://github.com/open-mmlab/mmsegmentation/pull/223))
+
+### V0.7 (07/10/2020)
+
+**Highlights**
+
+- Support Pascal Context dataset and customizing dataset classes.
+
+**Bug Fixes**
+
+- Fixed CPU inference ([#153](https://github.com/open-mmlab/mmsegmentation/pull/153))
+
+**New Features**
+
+- Add DeepLab OS16 models ([#154](https://github.com/open-mmlab/mmsegmentation/pull/154))
+- Support Pascal Context dataset ([#133](https://github.com/open-mmlab/mmsegmentation/pull/133))
+- Support customizing dataset classes ([#71](https://github.com/open-mmlab/mmsegmentation/pull/71))
+- Support customizing dataset palette ([#157](https://github.com/open-mmlab/mmsegmentation/pull/157))
+
+**Improvements**
+
+- Support 4D tensor output in ONNX ([#150](https://github.com/open-mmlab/mmsegmentation/pull/150))
+- Remove redundancies in ONNX export ([#160](https://github.com/open-mmlab/mmsegmentation/pull/160))
+- Migrate to MMCV DepthwiseSeparableConv ([#158](https://github.com/open-mmlab/mmsegmentation/pull/158))
+- Migrate to MMCV collect_env ([#137](https://github.com/open-mmlab/mmsegmentation/pull/137))
+- Use img_prefix and seg_prefix for loading ([#153](https://github.com/open-mmlab/mmsegmentation/pull/153))
+
+### V0.6 (10/09/2020)
+
+**Highlights**
+
+- Support new methods, i.e., MobileNetV2, EMANet, DNL, PointRend, Semantic FPN, Fast-SCNN, ResNeSt. 
+ +**Bug Fixes** + +- Fixed sliding inference ONNX export ([#90](https://github.com/open-mmlab/mmsegmentation/pull/90)) + +**New Features** + +- Support MobileNet v2 ([#86](https://github.com/open-mmlab/mmsegmentation/pull/86)) +- Support EMANet ([#34](https://github.com/open-mmlab/mmsegmentation/pull/34)) +- Support DNL ([#37](https://github.com/open-mmlab/mmsegmentation/pull/37)) +- Support PointRend ([#109](https://github.com/open-mmlab/mmsegmentation/pull/109)) +- Support Semantic FPN ([#94](https://github.com/open-mmlab/mmsegmentation/pull/94)) +- Support Fast-SCNN ([#58](https://github.com/open-mmlab/mmsegmentation/pull/58)) +- Support ResNeSt backbone ([#47](https://github.com/open-mmlab/mmsegmentation/pull/47)) +- Support ONNX export (experimental) ([#12](https://github.com/open-mmlab/mmsegmentation/pull/12)) + +**Improvements** + +- Support Upsample in ONNX ([#100](https://github.com/open-mmlab/mmsegmentation/pull/100)) +- Support Windows install (experimental) ([#75](https://github.com/open-mmlab/mmsegmentation/pull/75)) +- Add more OCRNet results ([#20](https://github.com/open-mmlab/mmsegmentation/pull/20)) +- Add PyTorch 1.6 CI ([#64](https://github.com/open-mmlab/mmsegmentation/pull/64)) +- Get version and githash automatically ([#55](https://github.com/open-mmlab/mmsegmentation/pull/55)) + +### v0.5.1 (11/08/2020) + +**Highlights** + +- Support FP16 and more generalized OHEM + +**Bug Fixes** + +- Fixed Pascal VOC conversion script (#19) +- Fixed OHEM weight assign bug (#54) +- Fixed palette type when palette is not given (#27) + +**New Features** + +- Support FP16 (#21) +- Generalized OHEM (#54) + +**Improvements** + +- Add load-from flag (#33) +- Fixed training tricks doc about different learning rates of model (#26) diff --git a/docs/en/notes/faq.md b/docs/en/notes/faq.md new file mode 100644 index 0000000000..a3f8099685 --- /dev/null +++ b/docs/en/notes/faq.md @@ -0,0 +1,132 @@ +# Frequently Asked Questions (FAQ) + +We list some common troubles faced by many users and their corresponding solutions here. Feel free to enrich the list if you find any frequent issues and have ways to help others to solve them. If the contents here do not cover your issue, please create an issue using the [provided templates](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/.github/ISSUE_TEMPLATE/error-report.md/) and make sure you fill in all required information in the template. + +## Installation + +The compatible MMSegmentation, MMCV and MMEngine versions are as below. Please install the correct versions of them to avoid installation issues. 
+
+| MMSegmentation version |          MMCV version          | MMEngine version  | MMClassification (optional) version | MMDetection (optional) version |
+| :--------------------: | :----------------------------: | :---------------: | :---------------------------------: | :----------------------------: |
+|     dev-1.x branch     |         mmcv >= 2.0.0          | MMEngine >= 0.7.4 |        mmpretrain>=1.0.0rc7         |         mmdet >= 3.0.0         |
+|      main branch       |         mmcv >= 2.0.0          | MMEngine >= 0.7.4 |        mmpretrain>=1.0.0rc7         |         mmdet >= 3.0.0         |
+|         1.2.2          |         mmcv >= 2.0.0          | MMEngine >= 0.7.4 |        mmpretrain>=1.0.0rc7         |         mmdet >= 3.0.0         |
+|         1.2.1          |         mmcv >= 2.0.0          | MMEngine >= 0.7.4 |        mmpretrain>=1.0.0rc7         |         mmdet >= 3.0.0         |
+|         1.2.0          |         mmcv >= 2.0.0          | MMEngine >= 0.7.4 |        mmpretrain>=1.0.0rc7         |         mmdet >= 3.0.0         |
+|         1.1.2          |         mmcv >= 2.0.0          | MMEngine >= 0.7.4 |        mmpretrain>=1.0.0rc7         |         mmdet >= 3.0.0         |
+|         1.1.1          |         mmcv >= 2.0.0          | MMEngine >= 0.7.4 |        mmpretrain>=1.0.0rc7         |         mmdet >= 3.0.0         |
+|         1.1.0          |         mmcv >= 2.0.0          | MMEngine >= 0.7.4 |        mmpretrain>=1.0.0rc7         |         mmdet >= 3.0.0         |
+|         1.0.0          |        mmcv >= 2.0.0rc4        | MMEngine >= 0.7.1 |           mmcls==1.0.0rc6           |         mmdet >= 3.0.0         |
+|        1.0.0rc6        |        mmcv >= 2.0.0rc4        | MMEngine >= 0.5.0 |           mmcls>=1.0.0rc0           |       mmdet >= 3.0.0rc6        |
+|        1.0.0rc5        |        mmcv >= 2.0.0rc4        | MMEngine >= 0.2.0 |           mmcls>=1.0.0rc0           |        mmdet>=3.0.0rc6         |
+|        1.0.0rc4        |        mmcv == 2.0.0rc3        | MMEngine >= 0.1.0 |           mmcls>=1.0.0rc0           |  mmdet>=3.0.0rc4, \<=3.0.0rc5  |
+|        1.0.0rc3        |        mmcv == 2.0.0rc3        | MMEngine >= 0.1.0 |           mmcls>=1.0.0rc0           |  mmdet>=3.0.0rc4, \<=3.0.0rc5  |
+|        1.0.0rc2        |        mmcv == 2.0.0rc3        | MMEngine >= 0.1.0 |           mmcls>=1.0.0rc0           |  mmdet>=3.0.0rc4, \<=3.0.0rc5  |
+|        1.0.0rc1        | mmcv >= 2.0.0rc1, \<=2.0.0rc3  | MMEngine >= 0.1.0 |           mmcls>=1.0.0rc0           |          Not required          |
+|        1.0.0rc0        | mmcv >= 2.0.0rc1, \<=2.0.0rc3  | MMEngine >= 0.1.0 |           mmcls>=1.0.0rc0           |          Not required          |
+
+Notes:
+
+- MMClassification and MMDetection are optional for MMSegmentation. If they are not installed, `ConvNeXt` (which requires MMClassification) and MaskFormer and Mask2Former (which require MMDetection) cannot be used. We recommend installing them from source. Please refer to [MMClassification](https://github.com/open-mmlab/mmclassification) and [MMDetection](https://github.com/open-mmlab/mmdetection) for more details about their installation.
+
+- To install MMSegmentation 0.x and master branch, please refer to [the faq 0.x document](https://mmsegmentation.readthedocs.io/en/latest/faq.html#installation) to check compatible versions of MMCV.
+
+- If you have installed an incompatible version of mmcv, please run `pip uninstall mmcv` to uninstall the installed mmcv first. If you have previously installed mmcv-full (which exists in OpenMMLab 1.x), please run `pip uninstall mmcv-full` to uninstall it.
+
+- If "No module named 'mmcv'" appears, please follow the steps below:
+
+  1. Use `pip uninstall mmcv` to uninstall the existing mmcv in the environment.
+  2. Install the corresponding mmcv according to the [installation instructions](https://mmsegmentation.readthedocs.io/en/dev-1.x/get_started.html#best-practices), for example as shown below.
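+
+  A minimal recovery sequence might look like the following sketch (it assumes a Linux shell and uses [MIM](https://github.com/open-mmlab/mim) to pick a pre-built mmcv wheel matching the local PyTorch/CUDA setup; choose the mmcv version from the table above that matches your MMSegmentation version):
+
+  ```shell
+  # Remove any incompatible installation first (mmcv-full is the OpenMMLab 1.x package name).
+  pip uninstall -y mmcv mmcv-full
+
+  # Install MIM, then let it resolve a compatible pre-built mmcv wheel.
+  pip install -U openmim
+  mim install "mmcv>=2.0.0"
+  ```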
+
+## How to know the number of GPUs needed to train the model
+
+- Infer from the name of the config file of the model. You can refer to the `Config Name Style` part of [Learn about Configs](../user_guides/1_config.md). For example, for the config file named `segformer_mit-b0_8xb1-160k_cityscapes-1024x1024.py`, `8xb1` means training the corresponding model needs 8 GPUs with a batch size of 1 per GPU.
+- Infer from the log file. Open the log file of the model and search for `nGPU` in the file. The number of GPU ids following `nGPU` is the number of GPUs needed to train the model. For instance, searching for `nGPU` in the log file and finding the record `nGPU 0,1,2,3,4,5,6,7` indicates that eight GPUs are needed to train the model.
+
+## What does the auxiliary head mean
+
+Briefly, it is a deep supervision trick to improve accuracy. In the training phase, `decode_head` decodes the semantic segmentation output, while `auxiliary_head` just adds an auxiliary loss; the segmentation result it produces has no impact on your model's predictions, since it only works during training. You may read this [paper](https://arxiv.org/pdf/1612.01105.pdf) for more information.
+
+## How to output the segmentation mask image when running the test script
+
+In the test script, we provide the `--out` argument to control whether to output the painted images. Users might run the following command:
+
+```shell
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} --out ${OUTPUT_DIR}
+```
+
+## How to handle binary segmentation task
+
+MMSegmentation uses `num_classes` and `out_channels` to control the output of the last layer `self.conv_seg`. More details can be found [here](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/mmseg/models/decode_heads/decode_head.py).
+
+`num_classes` should be the same as the number of label types. In a binary segmentation task, the dataset only has two label types: foreground and background, so `num_classes=2`. `out_channels` controls the output channels of the model's last layer; it usually equals `num_classes`.
+But in a binary segmentation task, there are two solutions:
+
+- Set `out_channels=2`, use Cross Entropy Loss in training, and use `F.softmax()` and `argmax()` to get the prediction of each pixel at inference.
+
+- Set `out_channels=1`, use Binary Cross Entropy Loss in training, and use `F.sigmoid()` and `threshold` to get the prediction of each pixel at inference. `threshold` is set to 0.3 by default. (A decoding sketch follows the config examples below.)
+
+In summary, to implement binary segmentation methods users should modify the parameters below in the `decode_head` and `auxiliary_head` configs. Here is a modification example of [pspnet_unet_s5-d16.py](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/_base_/models/pspnet_unet_s5-d16.py):
+
+- (1) `num_classes=2`, `out_channels=2` and `use_sigmoid=False` in `CrossEntropyLoss`.
+
+```python
+decode_head=dict(
+    type='PSPHead',
+    in_channels=64,
+    in_index=4,
+    num_classes=2,
+    out_channels=2,
+    loss_decode=dict(
+        type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)),
+auxiliary_head=dict(
+    type='FCNHead',
+    in_channels=128,
+    in_index=3,
+    num_classes=2,
+    out_channels=2,
+    loss_decode=dict(
+        type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)),
+```
+
+- (2) `num_classes=2`, `out_channels=1` and `use_sigmoid=True` in `CrossEntropyLoss`.
+
+```python
+decode_head=dict(
+    type='PSPHead',
+    in_channels=64,
+    in_index=4,
+    num_classes=2,
+    out_channels=1,
+    loss_decode=dict(
+        type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)),
+auxiliary_head=dict(
+    type='FCNHead',
+    in_channels=128,
+    in_index=3,
+    num_classes=2,
+    out_channels=1,
+    loss_decode=dict(
+        type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)),
+```
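+
+To make the two decoding strategies concrete, here is a minimal sketch (not part of the MMSegmentation API; `seg_logits` is assumed to be the raw output of `self.conv_seg` with shape `(N, out_channels, H, W)`) of how each solution turns logits into a per-pixel prediction:
+
+```python
+import torch
+import torch.nn.functional as F
+
+# Solution (1): out_channels=2, one channel per class.
+seg_logits = torch.randn(1, 2, 64, 64)
+# Softmax over the channel dim, then argmax picks the class of each pixel.
+pred = F.softmax(seg_logits, dim=1).argmax(dim=1)  # shape (1, 64, 64), values in {0, 1}
+
+# Solution (2): out_channels=1, a single foreground logit.
+seg_logits = torch.randn(1, 1, 64, 64)
+# Sigmoid gives the foreground probability, thresholded at the default 0.3.
+pred = (torch.sigmoid(seg_logits) > 0.3).squeeze(1).long()  # shape (1, 64, 64)
+```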
## Functionality of `reduce_zero_label`

The `reduce_zero_label` parameter of the dataset is a Boolean, which defaults to `False`. It is used to ignore label 0 in the dataset: label 0 is changed to 255, and 1 is subtracted from all remaining labels. Meanwhile, 255 is set as the ignore index in the decode head, which means it does not participate in the loss calculation.

Following is the specific implementation logic of `reduce_zero_label`:

```python
if self.reduce_zero_label:
    # avoid using underflow conversion
    gt_semantic_seg[gt_semantic_seg == 0] = 255
    gt_semantic_seg = gt_semantic_seg - 1
    gt_semantic_seg[gt_semantic_seg == 254] = 255
```

Whether your dataset needs `reduce_zero_label` depends on the situation. There are two typical cases:

- On the [Potsdam](https://github.com/open-mmlab/mmsegmentation/blob/1.x/docs/en/user_guides/2_dataset_prepare.md#isprs-potsdam) dataset, there are six classes: 0-Impervious surfaces, 1-Building, 2-Low vegetation, 3-Tree, 4-Car, 5-Clutter/background. However, this dataset provides two types of RGB labels: one with black pixels at the edges of the images, and one without. For labels with black edges, [dataset_converters.py](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/tools/dataset_converters/potsdam.py) converts the black edges to label 0, and the other labels become 1-Impervious surfaces, 2-Building, 3-Low vegetation, 4-Tree, 5-Car, 6-Clutter/background. Therefore, the dataset config [potsdam.py](https://github.com/open-mmlab/mmsegmentation/blob/ff95416c3b5ce8d62b9289f743531398efce534f/mmseg/datasets/potsdam.py#L23) sets `reduce_zero_label=True`. If you are using the labels without black edges, the mask labels only contain classes 0-5, and you should use `reduce_zero_label=False`. Whether to use `reduce_zero_label` must be decided according to your actual situation.
- On a dataset with class 0 as the background class, if you ultimately need to separate the background from the rest of your classes, you do not need `reduce_zero_label`, and the dataset config should set `reduce_zero_label=False`.

**Note:** Please confirm the number of original classes in the dataset. If there are only two classes, you should not use `reduce_zero_label`, i.e. set `reduce_zero_label=False`.
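As a concrete illustration of the mapping implemented above, here is a small standalone sketch in plain numpy (not MMSegmentation code) showing how a label map is transformed when `reduce_zero_label` is enabled:

```python
import numpy as np

# A toy 2x3 label map with classes {0, 1, 2}; dtype matches typical uint8 masks.
gt = np.array([[0, 1, 2],
               [2, 1, 0]], dtype=np.uint8)

gt[gt == 0] = 255   # class 0 becomes the ignore value
gt = gt - 1         # shift the remaining classes down by one
gt[gt == 254] = 255 # uint8 wrap-around turned 255 into 254; restore it to 255

print(gt)
# [[255   0   1]
#  [  1   0 255]]
```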
diff --git a/docs/en/overview.md b/docs/en/overview.md
new file mode 100644
index 0000000000..bbc0b8e32c
--- /dev/null
+++ b/docs/en/overview.md
@@ -0,0 +1,85 @@
# Overview

This chapter introduces you to the framework of MMSegmentation and the basic concepts of semantic segmentation. It also provides links to detailed tutorials about MMSegmentation.

## What is semantic segmentation?

Semantic segmentation is the task of clustering together the parts of an image that belong to the same object class.
It is a form of pixel-level prediction because each pixel in an image is classified according to a category.
Some example benchmarks for this task are [Cityscapes](https://www.cityscapes-dataset.com/benchmarks/), [PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/) and [ADE20K](https://groups.csail.mit.edu/vision/datasets/ADE20K/).
Models are usually evaluated with the Mean Intersection over Union (Mean IoU) and Pixel Accuracy metrics.

## What is MMSegmentation?

MMSegmentation is a toolbox that provides a framework for the unified implementation and evaluation of semantic segmentation methods,
and contains high-quality implementations of popular semantic segmentation methods and datasets.

MMSeg consists of 7 main parts: apis, structures, datasets, models, engine, evaluation and visualization.

- **apis** provides high-level APIs for model inference.

- **structures** provides the segmentation data structure `SegDataSample`.

- **datasets** supports various datasets for semantic segmentation.

  - **transforms** contains a lot of useful data augmentation transforms.

- **models** is the most vital part for segmentors and contains the different components of a segmentor.

  - **segmentors** defines all of the segmentation model classes.
  - **data_preprocessors** preprocesses the input data of the model.
  - **backbones** contains various backbone networks that transform an image into feature maps.
  - **necks** contains various neck components that connect the backbone and heads.
  - **decode_heads** contains various head components that take feature maps as input and predict segmentation results.
  - **losses** contains various loss functions.

- **engine** is a part for runtime components that extends the functions of [MMEngine](https://github.com/open-mmlab/mmengine).

  - **optimizers** provides optimizers and optimizer wrappers.
  - **hooks** provides various hooks of the runner.

- **evaluation** provides different metrics for evaluating model performance.

- **visualization** is for visualizing segmentation results.

## How to use this documentation

Here is a detailed step-by-step guide to learn more about MMSegmentation:

1. For installation instructions, please see [get_started](getting_started.md).

2. For beginners, MMSegmentation is the best place to start the journey of semantic segmentation,
   as there are many SOTA and classic segmentation [models](model_zoo.md),
   and it is easy to carry out a segmentation task by plugging together building blocks and convenient high-level APIs.
   Refer to the tutorials below for the basic usage of MMSegmentation:

   - [Config](user_guides/1_config.md)
   - [Dataset Preparation](user_guides/2_dataset_prepare.md)
   - [Inference](user_guides/3_inference.md)
   - [Train and Test](user_guides/4_train_test.md)

3. If you would like to learn about the fundamental classes and features that make MMSegmentation work,
   please refer to the tutorials below to dive deeper:

   - [Data flow](advanced_guides/data_flow.md)
   - [Structures](advanced_guides/structures.md)
   - [Models](advanced_guides/models.md)
   - [Datasets](advanced_guides/datasets.md)
   - [Evaluation](advanced_guides/evaluation.md)

4. MMSegmentation also provides tutorials for customization and advanced research;
   please refer to the guides below to build your own segmentation project:

   - [Add new models](advanced_guides/add_models.md)
   - [Add new datasets](advanced_guides/add_datasets.md)
   - [Add new transforms](advanced_guides/add_transforms.md)
   - [Customize runtime](advanced_guides/customize_runtime.md)

5.
If you are more familiar with MMSegmentation v0.x, there is documentation about migration from MMSegmentation v0.x to v1.x + + - [migration](migration/index.rst) + +## References + +- [Paper with code](https://paperswithcode.com/task/semantic-segmentation/codeless#task-home) diff --git a/docs/en/stat.py b/docs/en/stat.py index 1398a706f3..c458ee3c1e 100755 --- a/docs/en/stat.py +++ b/docs/en/stat.py @@ -18,13 +18,15 @@ for f in files: url = osp.dirname(f.replace('../../', url_prefix)) - with open(f, 'r') as content_file: + with open(f) as content_file: content = content_file.read() title = content.split('\n')[0].replace('#', '').strip() - ckpts = set(x.lower().strip() - for x in re.findall(r'https?://download.*\.pth', content) - if 'mmsegmentation' in x) + ckpts = { + x.lower().strip() + for x in re.findall(r'https?://download.*\.pth', content) + if 'mmsegmentation' in x + } if len(ckpts) == 0: continue @@ -34,7 +36,7 @@ assert len(_papertype) > 0 papertype = _papertype[0] - paper = set([(papertype, title)]) + paper = {(papertype, title)} titles.append(title) num_ckpts += len(ckpts) diff --git a/docs/en/train.md b/docs/en/train.md deleted file mode 100644 index 7c1c411848..0000000000 --- a/docs/en/train.md +++ /dev/null @@ -1,169 +0,0 @@ -## Train a model - -MMSegmentation implements distributed training and non-distributed training, -which uses `MMDistributedDataParallel` and `MMDataParallel` respectively. - -All outputs (log files and checkpoints) will be saved to the working directory, -which is specified by `work_dir` in the config file. - -By default we evaluate the model on the validation set after some iterations, you can change the evaluation interval by adding the interval argument in the training config. - -```python -evaluation = dict(interval=4000) # This evaluate the model per 4000 iterations. -``` - -**\*Important\***: The default learning rate in config files is for 4 GPUs and 2 img/gpu (batch size = 4x2 = 8). -Equivalently, you may also use 8 GPUs and 1 imgs/gpu since all models using cross-GPU SyncBN. - -To trade speed with GPU memory, you may pass in `--cfg-options model.backbone.with_cp=True` to enable checkpoint in backbone. - -### Train on a single machine - -#### Train with a single GPU - -official support: - -```shell -sh tools/dist_train.sh ${CONFIG_FILE} 1 [optional arguments] -``` - -experimental support (Convert SyncBN to BN): - -```shell -python tools/train.py ${CONFIG_FILE} [optional arguments] -``` - -If you want to specify the working directory in the command, you can add an argument `--work-dir ${YOUR_WORK_DIR}`. - -#### Train with CPU - -The process of training on the CPU is consistent with single GPU training if machine does not have GPU. If it has GPUs but not wanting to use it, we just need to disable GPUs before the training process. - -```shell -export CUDA_VISIBLE_DEVICES=-1 -``` - -And then run the script [above](#train-with-a-single-gpu). - -```{warning} -The process of training on the CPU is consistent with single GPU training. We just need to disable GPUs before the training process. -``` - -#### Train with multiple GPUs - -```shell -sh tools/dist_train.sh ${CONFIG_FILE} ${GPU_NUM} [optional arguments] -``` - -Optional arguments are: - -- `--no-validate` (**not suggested**): By default, the codebase will perform evaluation at every k iterations during the training. To disable this behavior, use `--no-validate`. -- `--work-dir ${WORK_DIR}`: Override the working directory specified in the config file. 
-- `--resume-from ${CHECKPOINT_FILE}`: Resume from a previous checkpoint file (to continue the training process). -- `--load-from ${CHECKPOINT_FILE}`: Load weights from a checkpoint file (to start finetuning for another task). -- `--deterministic`: Switch on "deterministic" mode which slows down training but the results are reproducible. - -Difference between `resume-from` and `load-from`: - -- `resume-from` loads both the model weights and optimizer state including the iteration number. -- `load-from` loads only the model weights, starts the training from iteration 0. - -An example: - -```shell -# checkpoints and logs saved in WORK_DIR=work_dirs/pspnet_r50-d8_512x512_80k_ade20k/ -# If work_dir is not set, it will be generated automatically. -sh tools/dist_train.sh configs/pspnet/pspnet_r50-d8_512x512_80k_ade20k.py 8 --work_dir work_dirs/pspnet_r50-d8_512x512_80k_ade20k/ --deterministic -``` - -**Note**: During training, checkpoints and logs are saved in the same folder structure as the config file under `work_dirs/`. Custom work directory is not recommended since evaluation scripts infer work directories from the config file name. If you want to save your weights somewhere else, please use symlink, for example: - -```shell -ln -s ${YOUR_WORK_DIRS} ${MMSEG}/work_dirs -``` - -#### Launch multiple jobs on a single machine - -If you launch multiple jobs on a single machine, e.g., 2 jobs of 4-GPU training on a machine with 8 GPUs, you need to specify different ports (29500 by default) for each job to avoid communication conflict. Otherwise, there will be error message saying `RuntimeError: Address already in use`. - -If you use `dist_train.sh` to launch training jobs, you can set the port in commands with environment variable `PORT`. - -```shell -CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 sh tools/dist_train.sh ${CONFIG_FILE} 4 -CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 sh tools/dist_train.sh ${CONFIG_FILE} 4 -``` - -### Train with multiple machines - -If you launch with multiple machines simply connected with ethernet, you can simply run following commands: - -On the first machine: - -```shell -NNODES=2 NODE_RANK=0 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR sh tools/dist_train.sh $CONFIG $GPUS -``` - -On the second machine: - -```shell -NNODES=2 NODE_RANK=1 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR sh tools/dist_train.sh $CONFIG $GPUS -``` - -Usually it is slow if you do not have high speed networking like InfiniBand. - -### Manage jobs with Slurm - -Slurm is a good job scheduling system for computing clusters. On a cluster managed by Slurm, you can use slurm_train.sh to spawn training jobs. It supports both single-node and multi-node training. - -Train with multiple machines: - -```shell -[GPUS=${GPUS}] sh tools/slurm_train.sh ${PARTITION} ${JOB_NAME} ${CONFIG_FILE} --work-dir ${WORK_DIR} -``` - -Here is an example of using 16 GPUs to train PSPNet on the dev partition. - -```shell -GPUS=16 sh tools/slurm_train.sh dev pspr50 configs/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes.py work_dirs/pspnet_r50-d8_512x1024_40k_cityscapes/ -``` - -When using 'slurm_train.sh' to start multiple tasks on a node, different ports need to be specified. Three settings are provided: - -Option 1: - -In `config1.py`: - -```python -dist_params = dict(backend='nccl', port=29500) -``` - -In `config2.py`: - -```python -dist_params = dict(backend='nccl', port=29501) -``` - -Then you can launch two jobs with config1.py and config2.py. 
- -```shell -CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS=4 sh tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config1.py tmp_work_dir_1 -CUDA_VISIBLE_DEVICES=4,5,6,7 GPUS=4 sh tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config2.py tmp_work_dir_2 -``` - -Option 2: - -You can set different communication ports without the need to modify the configuration file, but have to set the `cfg-options` to overwrite the default port in configuration file. - -```shell -CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS=4 sh tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config1.py tmp_work_dir_1 --cfg-options dist_params.port=29500 -CUDA_VISIBLE_DEVICES=4,5,6,7 GPUS=4 sh tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config2.py tmp_work_dir_2 --cfg-options dist_params.port=29501 -``` - -Option 3: - -You can set the port in the command using the environment variable 'MASTER_PORT': - -```shell -CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS=4 MASTER_PORT=29500 sh tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config1.py tmp_work_dir_1 -CUDA_VISIBLE_DEVICES=4,5,6,7 GPUS=4 MASTER_PORT=29501 sh tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config2.py tmp_work_dir_2 -``` diff --git a/docs/en/tutorials/config.md b/docs/en/tutorials/config.md deleted file mode 100644 index 2db5469609..0000000000 --- a/docs/en/tutorials/config.md +++ /dev/null @@ -1,381 +0,0 @@ -# Tutorial 1: Learn about Configs - -We incorporate modular and inheritance design into our config system, which is convenient to conduct various experiments. -If you wish to inspect the config file, you may run `python tools/print_config.py /PATH/TO/CONFIG` to see the complete config. -You may also pass `--cfg-options xxx.yyy=zzz` to see updated config. - -## Config File Structure - -There are 4 basic component types under `config/_base_`, dataset, model, schedule, default_runtime. -Many methods could be easily constructed with one of each like DeepLabV3, PSPNet. -The configs that are composed by components from `_base_` are called _primitive_. - -For all configs under the same folder, it is recommended to have only **one** _primitive_ config. All other configs should inherit from the _primitive_ config. In this way, the maximum of inheritance level is 3. - -For easy understanding, we recommend contributors to inherit from existing methods. -For example, if some modification is made base on DeepLabV3, user may first inherit the basic DeepLabV3 structure by specifying `_base_ = ../deeplabv3/deeplabv3_r50_512x1024_40ki_cityscapes.py`, then modify the necessary fields in the config files. - -If you are building an entirely new method that does not share the structure with any of the existing methods, you may create a folder `xxxnet` under `configs`, - -Please refer to [mmcv](https://mmcv.readthedocs.io/en/latest/understand_mmcv/config.html) for detailed documentation. - -## Config Name Style - -We follow the below style to name config files. Contributors are advised to follow the same style. - -``` -{model}_{backbone}_[misc]_[gpu x batch_per_gpu]_{resolution}_{iterations}_{dataset} -``` - -`{xxx}` is required field and `[yyy]` is optional. - -- `{model}`: model type like `psp`, `deeplabv3`, etc. -- `{backbone}`: backbone type like `r50` (ResNet-50), `x101` (ResNeXt-101). -- `[misc]`: miscellaneous setting/plugins of model, e.g. `dconv`, `gcb`, `attention`, `mstrain`. -- `[gpu x batch_per_gpu]`: GPUs and samples per GPU, `8x2` is used by default. -- `{iterations}`: number of training iterations like `160k`. -- `{dataset}`: dataset like `cityscapes`, `voc12aug`, `ade`. 
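As a quick illustration of the naming convention above, the following toy snippet (not part of the codebase) splits a real config name from this repository into its required fields; the optional `[misc]` and `[gpu x batch_per_gpu]` fields happen to be absent from this name:

```python
# Toy illustration: decompose a config file name that follows the
# legacy convention above. Optional fields are absent in this example.
name = 'pspnet_r50-d8_512x1024_40k_cityscapes'
model, backbone, resolution, iterations, dataset = name.split('_')
print(model, backbone, resolution, iterations, dataset)
# -> pspnet r50-d8 512x1024 40k cityscapes
```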
- -## An Example of PSPNet - -To help the users have a basic idea of a complete config and the modules in a modern semantic segmentation system, -we make brief comments on the config of PSPNet using ResNet50V1c as the following. -For more detailed usage and the corresponding alternative for each module, please refer to the API documentation. - -```python -norm_cfg = dict(type='SyncBN', requires_grad=True) # Segmentation usually uses SyncBN -model = dict( - type='EncoderDecoder', # Name of segmentor - pretrained='open-mmlab://resnet50_v1c', # The ImageNet pretrained backbone to be loaded - backbone=dict( - type='ResNetV1c', # The type of backbone. Please refer to mmseg/models/backbones/resnet.py for details. - depth=50, # Depth of backbone. Normally 50, 101 are used. - num_stages=4, # Number of stages of backbone. - out_indices=(0, 1, 2, 3), # The index of output feature maps produced in each stages. - dilations=(1, 1, 2, 4), # The dilation rate of each layer. - strides=(1, 2, 1, 1), # The stride of each layer. - norm_cfg=dict( # The configuration of norm layer. - type='SyncBN', # Type of norm layer. Usually it is SyncBN. - requires_grad=True), # Whether to train the gamma and beta in norm - norm_eval=False, # Whether to freeze the statistics in BN - style='pytorch', # The style of backbone, 'pytorch' means that stride 2 layers are in 3x3 conv, 'caffe' means stride 2 layers are in 1x1 convs. - contract_dilation=True), # When dilation > 1, whether contract first layer of dilation. - decode_head=dict( - type='PSPHead', # Type of decode head. Please refer to mmseg/models/decode_heads for available options. - in_channels=2048, # Input channel of decode head. - in_index=3, # The index of feature map to select. - channels=512, # The intermediate channels of decode head. - pool_scales=(1, 2, 3, 6), # The avg pooling scales of PSPHead. Please refer to paper for details. - dropout_ratio=0.1, # The dropout ratio before final classification layer. - num_classes=19, # Number of segmentation class. Usually 19 for cityscapes, 21 for VOC, 150 for ADE20k. - norm_cfg=dict(type='SyncBN', requires_grad=True), # The configuration of norm layer. - align_corners=False, # The align_corners argument for resize in decoding. - loss_decode=dict( # Config of loss function for the decode_head. - type='CrossEntropyLoss', # Type of loss used for segmentation. - use_sigmoid=False, # Whether use sigmoid activation for segmentation. - loss_weight=1.0)), # Loss weight of decode head. - auxiliary_head=dict( - type='FCNHead', # Type of auxiliary head. Please refer to mmseg/models/decode_heads for available options. - in_channels=1024, # Input channel of auxiliary head. - in_index=2, # The index of feature map to select. - channels=256, # The intermediate channels of decode head. - num_convs=1, # Number of convs in FCNHead. It is usually 1 in auxiliary head. - concat_input=False, # Whether concat output of convs with input before classification layer. - dropout_ratio=0.1, # The dropout ratio before final classification layer. - num_classes=19, # Number of segmentation class. Usually 19 for cityscapes, 21 for VOC, 150 for ADE20k. - norm_cfg=dict(type='SyncBN', requires_grad=True), # The configuration of norm layer. - align_corners=False, # The align_corners argument for resize in decoding. - loss_decode=dict( # Config of loss function for the decode_head. - type='CrossEntropyLoss', # Type of loss used for segmentation. - use_sigmoid=False, # Whether use sigmoid activation for segmentation. 
- loss_weight=0.4))) # Loss weight of auxiliary head, which is usually 0.4 of decode head. -train_cfg = dict() # train_cfg is just a place holder for now. -test_cfg = dict(mode='whole') # The test mode, options are 'whole' and 'sliding'. 'whole': whole image fully-convolutional test. 'sliding': sliding crop window on the image. -dataset_type = 'CityscapesDataset' # Dataset type, this will be used to define the dataset. -data_root = 'data/cityscapes/' # Root path of data. -img_norm_cfg = dict( # Image normalization config to normalize the input images. - mean=[123.675, 116.28, 103.53], # Mean values used to pre-training the pre-trained backbone models. - std=[58.395, 57.12, 57.375], # Standard variance used to pre-training the pre-trained backbone models. - to_rgb=True) # The channel orders of image used to pre-training the pre-trained backbone models. -crop_size = (512, 1024) # The crop size during training. -train_pipeline = [ # Training pipeline. - dict(type='LoadImageFromFile'), # First pipeline to load images from file path. - dict(type='LoadAnnotations'), # Second pipeline to load annotations for current image. - dict(type='Resize', # Augmentation pipeline that resize the images and their annotations. - img_scale=(2048, 1024), # The largest scale of image. - ratio_range=(0.5, 2.0)), # The augmented scale range as ratio. - dict(type='RandomCrop', # Augmentation pipeline that randomly crop a patch from current image. - crop_size=(512, 1024), # The crop size of patch. - cat_max_ratio=0.75), # The max area ratio that could be occupied by single category. - dict( - type='RandomFlip', # Augmentation pipeline that flip the images and their annotations - flip_ratio=0.5), # The ratio or probability to flip - dict(type='PhotoMetricDistortion'), # Augmentation pipeline that distort current image with several photo metric methods. - dict( - type='Normalize', # Augmentation pipeline that normalize the input images - mean=[123.675, 116.28, 103.53], # These keys are the same of img_norm_cfg since the - std=[58.395, 57.12, 57.375], # keys of img_norm_cfg are used here as arguments - to_rgb=True), - dict(type='Pad', # Augmentation pipeline that pad the image to specified size. - size=(512, 1024), # The output size of padding. - pad_val=0, # The padding value for image. - seg_pad_val=255), # The padding value of 'gt_semantic_seg'. - dict(type='DefaultFormatBundle'), # Default format bundle to gather data in the pipeline - dict(type='Collect', # Pipeline that decides which keys in the data should be passed to the segmentor - keys=['img', 'gt_semantic_seg']) -] -test_pipeline = [ - dict(type='LoadImageFromFile'), # First pipeline to load images from file path - dict( - type='MultiScaleFlipAug', # An encapsulation that encapsulates the test time augmentations - img_scale=(2048, 1024), # Decides the largest scale for testing, used for the Resize pipeline - flip=False, # Whether to flip images during testing - transforms=[ - dict(type='Resize', # Use resize augmentation - keep_ratio=True), # Whether to keep the ratio between height and width, the img_scale set here will be suppressed by the img_scale set above. 
- dict(type='RandomFlip'), # Thought RandomFlip is added in pipeline, it is not used when flip=False - dict( - type='Normalize', # Normalization config, the values are from img_norm_cfg - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - to_rgb=True), - dict(type='ImageToTensor', # Convert image to tensor - keys=['img']), - dict(type='Collect', # Collect pipeline that collect necessary keys for testing. - keys=['img']) - ]) -] -data = dict( - samples_per_gpu=2, # Batch size of a single GPU - workers_per_gpu=2, # Worker to pre-fetch data for each single GPU - train=dict( # Train dataset config - type='CityscapesDataset', # Type of dataset, refer to mmseg/datasets/ for details. - data_root='data/cityscapes/', # The root of dataset. - img_dir='leftImg8bit/train', # The image directory of dataset. - ann_dir='gtFine/train', # The annotation directory of dataset. - pipeline=[ # pipeline, this is passed by the train_pipeline created before. - dict(type='LoadImageFromFile'), - dict(type='LoadAnnotations'), - dict( - type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)), - dict(type='RandomCrop', crop_size=(512, 1024), cat_max_ratio=0.75), - dict(type='RandomFlip', flip_ratio=0.5), - dict(type='PhotoMetricDistortion'), - dict( - type='Normalize', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - to_rgb=True), - dict(type='Pad', size=(512, 1024), pad_val=0, seg_pad_val=255), - dict(type='DefaultFormatBundle'), - dict(type='Collect', keys=['img', 'gt_semantic_seg']) - ]), - val=dict( # Validation dataset config - type='CityscapesDataset', - data_root='data/cityscapes/', - img_dir='leftImg8bit/val', - ann_dir='gtFine/val', - pipeline=[ # Pipeline is passed by test_pipeline created before - dict(type='LoadImageFromFile'), - dict( - type='MultiScaleFlipAug', - img_scale=(2048, 1024), - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - dict(type='RandomFlip'), - dict( - type='Normalize', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - to_rgb=True), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img']) - ]) - ]), - test=dict( - type='CityscapesDataset', - data_root='data/cityscapes/', - img_dir='leftImg8bit/val', - ann_dir='gtFine/val', - pipeline=[ - dict(type='LoadImageFromFile'), - dict( - type='MultiScaleFlipAug', - img_scale=(2048, 1024), - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - dict(type='RandomFlip'), - dict( - type='Normalize', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - to_rgb=True), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img']) - ]) - ])) -log_config = dict( # config to register logger hook - interval=50, # Interval to print the log - hooks=[ - # dict(type='TensorboardLoggerHook') # The Tensorboard logger is also supported - dict(type='TextLoggerHook', by_epoch=False) - ]) -dist_params = dict(backend='nccl') # Parameters to setup distributed training, the port can also be set. -log_level = 'INFO' # The level of logging. -load_from = None # load models as a pre-trained model from a given path. This will not resume training. -resume_from = None # Resume checkpoints from a given path, the training will be resumed from the iteration when the checkpoint's is saved. -workflow = [('train', 1)] # Workflow for runner. [('train', 1)] means there is only one workflow and the workflow named 'train' is executed once. The workflow trains the model by 40000 iterations according to the `runner.max_iters`. 
-cudnn_benchmark = True # Whether use cudnn_benchmark to speed up, which is fast for fixed input size. -optimizer = dict( # Config used to build optimizer, support all the optimizers in PyTorch whose arguments are also the same as those in PyTorch - type='SGD', # Type of optimizers, refer to https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/optimizer/default_constructor.py#L13 for more details - lr=0.01, # Learning rate of optimizers, see detail usages of the parameters in the documentation of PyTorch - momentum=0.9, # Momentum - weight_decay=0.0005) # Weight decay of SGD -optimizer_config = dict() # Config used to build the optimizer hook, refer to https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/hooks/optimizer.py#L8 for implementation details. -lr_config = dict( - policy='poly', # The policy of scheduler, also support Step, CosineAnnealing, Cyclic, etc. Refer to details of supported LrUpdater from https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/hooks/lr_updater.py#L9. - power=0.9, # The power of polynomial decay. - min_lr=0.0001, # The minimum learning rate to stable the training. - by_epoch=False) # Whether count by epoch or not. -runner = dict( - type='IterBasedRunner', # Type of runner to use (i.e. IterBasedRunner or EpochBasedRunner) - max_iters=40000) # Total number of iterations. For EpochBasedRunner use `max_epochs` -checkpoint_config = dict( # Config to set the checkpoint hook, Refer to https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/hooks/checkpoint.py for implementation. - by_epoch=False, # Whether count by epoch or not. - interval=4000) # The save interval. -evaluation = dict( # The config to build the evaluation hook. Please refer to mmseg/core/evaluation/eval_hook.py for details. - interval=4000, # The interval of evaluation. - metric='mIoU') # The evaluation metric. - - -``` - -## FAQ - -### Ignore some fields in the base configs - -Sometimes, you may set `_delete_=True` to ignore some of the fields in base configs. -You may refer to [mmcv](https://mmcv.readthedocs.io/en/latest/understand_mmcv/config.html#inherit-from-base-config-with-ignored-fields) for simple illustration. - -In MMSegmentation, for example, to change the backbone of PSPNet with the following config. - -```python -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='MaskRCNN', - pretrained='torchvision://resnet50', - backbone=dict( - type='ResNetV1c', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - dilations=(1, 1, 2, 4), - strides=(1, 2, 1, 1), - norm_cfg=norm_cfg, - norm_eval=False, - style='pytorch', - contract_dilation=True), - decode_head=dict(...), - auxiliary_head=dict(...)) -``` - -`ResNet` and `HRNet` use different keywords to construct. 
- -```python -_base_ = '../pspnet/psp_r50_512x1024_40ki_cityscpaes.py' -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w32', - backbone=dict( - _delete_=True, - type='HRNet', - norm_cfg=norm_cfg, - extra=dict( - stage1=dict( - num_modules=1, - num_branches=1, - block='BOTTLENECK', - num_blocks=(4, ), - num_channels=(64, )), - stage2=dict( - num_modules=1, - num_branches=2, - block='BASIC', - num_blocks=(4, 4), - num_channels=(32, 64)), - stage3=dict( - num_modules=4, - num_branches=3, - block='BASIC', - num_blocks=(4, 4, 4), - num_channels=(32, 64, 128)), - stage4=dict( - num_modules=3, - num_branches=4, - block='BASIC', - num_blocks=(4, 4, 4, 4), - num_channels=(32, 64, 128, 256)))), - decode_head=dict(...), - auxiliary_head=dict(...)) -``` - -The `_delete_=True` would replace all old keys in `backbone` field with new keys. - -### Use intermediate variables in configs - -Some intermediate variables are used in the configs files, like `train_pipeline`/`test_pipeline` in datasets. -It's worth noting that when modifying intermediate variables in the children configs, user need to pass the intermediate variables into corresponding fields again. -For example, we would like to change multi scale strategy to train/test a PSPNet. `train_pipeline`/`test_pipeline` are intermediate variable we would like to modify. - -```python -_base_ = '../pspnet/psp_r50_512x1024_40ki_cityscapes.py' -crop_size = (512, 1024) -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) -train_pipeline = [ - dict(type='LoadImageFromFile'), - dict(type='LoadAnnotations'), - dict(type='Resize', img_scale=(2048, 1024), ratio_range=(1.0, 2.0)), # change to [1., 2.] - dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), - dict(type='RandomFlip', flip_ratio=0.5), - dict(type='PhotoMetricDistortion'), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), - dict(type='DefaultFormatBundle'), - dict(type='Collect', keys=['img', 'gt_semantic_seg']), -] -test_pipeline = [ - dict(type='LoadImageFromFile'), - dict( - type='MultiScaleFlipAug', - img_scale=(2048, 1024), - img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], # change to multi scale testing - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - dict(type='RandomFlip'), - dict(type='Normalize', **img_norm_cfg), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img']), - ]) -] -data = dict( - train=dict(pipeline=train_pipeline), - val=dict(pipeline=test_pipeline), - test=dict(pipeline=test_pipeline)) -``` - -We first define the new `train_pipeline`/`test_pipeline` and pass them into `data`. - -Similarly, if we would like to switch from `SyncBN` to `BN` or `MMSyncBN`, we need to substitute every `norm_cfg` in the config. 
- -```python -_base_ = '../pspnet/psp_r50_512x1024_40ki_cityscpaes.py' -norm_cfg = dict(type='BN', requires_grad=True) -model = dict( - backbone=dict(norm_cfg=norm_cfg), - decode_head=dict(norm_cfg=norm_cfg), - auxiliary_head=dict(norm_cfg=norm_cfg)) -``` diff --git a/docs/en/tutorials/customize_datasets.md b/docs/en/tutorials/customize_datasets.md deleted file mode 100644 index de906d5fd1..0000000000 --- a/docs/en/tutorials/customize_datasets.md +++ /dev/null @@ -1,290 +0,0 @@ -# Tutorial 2: Customize Datasets - -## Data configuration - -`data` in config file is the variable for data configuration, to define the arguments that are used in datasets and dataloaders. - -Here is an example of data configuration: - -```python -data = dict( - samples_per_gpu=4, - workers_per_gpu=4, - train=dict( - type='ADE20KDataset', - data_root='data/ade/ADEChallengeData2016', - img_dir='images/training', - ann_dir='annotations/training', - pipeline=train_pipeline), - val=dict( - type='ADE20KDataset', - data_root='data/ade/ADEChallengeData2016', - img_dir='images/validation', - ann_dir='annotations/validation', - pipeline=test_pipeline), - test=dict( - type='ADE20KDataset', - data_root='data/ade/ADEChallengeData2016', - img_dir='images/validation', - ann_dir='annotations/validation', - pipeline=test_pipeline)) -``` - -- `train`, `val` and `test`: The [`config`](https://github.com/open-mmlab/mmcv/blob/master/docs/en/understand_mmcv/config.md)s to build dataset instances for model training, validation and testing by - using [`build and registry`](https://github.com/open-mmlab/mmcv/blob/master/docs/en/understand_mmcv/registry.md) mechanism. - -- `samples_per_gpu`: How many samples per batch and per gpu to load during model training, and the `batch_size` of training is equal to `samples_per_gpu` times gpu number, e.g. when using 8 gpus for distributed data parallel trainig and `samples_per_gpu=4`, the `batch_size` is `8*4=16`. - If you would like to define `batch_size` for testing and validation, please use `test_dataloaser` and - `val_dataloader` with mmseg >=0.24.1. - -- `workers_per_gpu`: How many subprocesses per gpu to use for data loading. `0` means that the data will be loaded in the main process. - -**Note:** `samples_per_gpu` only works for model training, and the default setting of `samples_per_gpu` is 1 in mmseg when model testing and validation (DO NOT support batch inference yet). - -**Note:** before v0.24.1, except `train`, `val` `test`, `samples_per_gpu` and `workers_per_gpu`, the other keys in `data` must be the -input keyword arguments for `dataloader` in pytorch, and the dataloaders used for model training, validation and testing have the same input arguments. -In v0.24.1, mmseg supports to use `train_dataloader`, `test_dataloaser` and `val_dataloader` to specify different keyword arguments, and still supports the overall arguments definition but the specific dataloader setting has a higher priority. - -Here is an example for specific dataloader: - -```python -data = dict( - samples_per_gpu=4, - workers_per_gpu=4, - shuffle=True, - train=dict(type='xxx', ...), - val=dict(type='xxx', ...), - test=dict(type='xxx', ...), - # Use different batch size during validation and testing. 
- val_dataloader=dict(samples_per_gpu=1, workers_per_gpu=4, shuffle=False), - test_dataloader=dict(samples_per_gpu=1, workers_per_gpu=4, shuffle=False)) -``` - -Assume only one gpu used for model training and testing, as the priority of the overall arguments definition is low, the batch_size -for training is `4` and dataset will be shuffled, and batch_size for testing and validation is `1`, and dataset will not be shuffled. - -To make data configuration much clearer, we recommend use specific dataloader setting instead of overall dataloader setting after v0.24.1, just like: - -```python -data = dict( - train=dict(type='xxx', ...), - val=dict(type='xxx', ...), - test=dict(type='xxx', ...), - # Use specific dataloader setting - train_dataloader=dict(samples_per_gpu=4, workers_per_gpu=4, shuffle=True), - val_dataloader=dict(samples_per_gpu=1, workers_per_gpu=4, shuffle=False), - test_dataloader=dict(samples_per_gpu=1, workers_per_gpu=4, shuffle=False)) -``` - -**Note:** in model training, default values in the script of mmseg for dataloader are `shuffle=True, and drop_last=True`, -in model validation and testing, default values are `shuffle=False, and drop_last=False` - -## Customize datasets by reorganizing data - -The simplest way is to convert your dataset to organize your data into folders. - -An example of file structure is as followed. - -```none -├── data -│ ├── my_dataset -│ │ ├── img_dir -│ │ │ ├── train -│ │ │ │ ├── xxx{img_suffix} -│ │ │ │ ├── yyy{img_suffix} -│ │ │ │ ├── zzz{img_suffix} -│ │ │ ├── val -│ │ ├── ann_dir -│ │ │ ├── train -│ │ │ │ ├── xxx{seg_map_suffix} -│ │ │ │ ├── yyy{seg_map_suffix} -│ │ │ │ ├── zzz{seg_map_suffix} -│ │ │ ├── val - -``` - -A training pair will consist of the files with same suffix in img_dir/ann_dir. - -If `split` argument is given, only part of the files in img_dir/ann_dir will be loaded. -We may specify the prefix of files we would like to be included in the split txt. - -More specifically, for a split txt like following, - -```none -xxx -zzz -``` - -Only -`data/my_dataset/img_dir/train/xxx{img_suffix}`, -`data/my_dataset/img_dir/train/zzz{img_suffix}`, -`data/my_dataset/ann_dir/train/xxx{seg_map_suffix}`, -`data/my_dataset/ann_dir/train/zzz{seg_map_suffix}` will be loaded. - -:::{note} -The annotations are images of shape (H, W), the value pixel should fall in range `[0, num_classes - 1]`. -You may use `'P'` mode of [pillow](https://pillow.readthedocs.io/en/stable/handbook/concepts.html#palette) to create your annotation image with color. -::: - -## Customize datasets by mixing dataset - -MMSegmentation also supports to mix dataset for training. -Currently it supports to concat, repeat and multi-image mix datasets. - -### Repeat dataset - -We use `RepeatDataset` as wrapper to repeat the dataset. -For example, suppose the original dataset is `Dataset_A`, to repeat it, the config looks like the following - -```python -dataset_A_train = dict( - type='RepeatDataset', - times=N, - dataset=dict( # This is the original config of Dataset_A - type='Dataset_A', - ... - pipeline=train_pipeline - ) - ) -``` - -### Concatenate dataset - -There 2 ways to concatenate the dataset. - -1. If the datasets you want to concatenate are in the same type with different annotation files, - you can concatenate the dataset configs like the following. - - 1. You may concatenate two `ann_dir`. - - ```python - dataset_A_train = dict( - type='Dataset_A', - img_dir = 'img_dir', - ann_dir = ['anno_dir_1', 'anno_dir_2'], - pipeline=train_pipeline - ) - ``` - - 2. 
You may concatenate two `split`. - - ```python - dataset_A_train = dict( - type='Dataset_A', - img_dir = 'img_dir', - ann_dir = 'anno_dir', - split = ['split_1.txt', 'split_2.txt'], - pipeline=train_pipeline - ) - ``` - - 3. You may concatenate two `ann_dir` and `split` simultaneously. - - ```python - dataset_A_train = dict( - type='Dataset_A', - img_dir = 'img_dir', - ann_dir = ['anno_dir_1', 'anno_dir_2'], - split = ['split_1.txt', 'split_2.txt'], - pipeline=train_pipeline - ) - ``` - - In this case, `ann_dir_1` and `ann_dir_2` are corresponding to `split_1.txt` and `split_2.txt`. - -2. In case the dataset you want to concatenate is different, you can concatenate the dataset configs like the following. - - ```python - dataset_A_train = dict() - dataset_B_train = dict() - - data = dict( - imgs_per_gpu=2, - workers_per_gpu=2, - train = [ - dataset_A_train, - dataset_B_train - ], - val = dataset_A_val, - test = dataset_A_test - ) - ``` - -A more complex example that repeats `Dataset_A` and `Dataset_B` by N and M times, respectively, and then concatenates the repeated datasets is as the following. - -```python -dataset_A_train = dict( - type='RepeatDataset', - times=N, - dataset=dict( - type='Dataset_A', - ... - pipeline=train_pipeline - ) -) -dataset_A_val = dict( - ... - pipeline=test_pipeline -) -dataset_A_test = dict( - ... - pipeline=test_pipeline -) -dataset_B_train = dict( - type='RepeatDataset', - times=M, - dataset=dict( - type='Dataset_B', - ... - pipeline=train_pipeline - ) -) -data = dict( - imgs_per_gpu=2, - workers_per_gpu=2, - train = [ - dataset_A_train, - dataset_B_train - ], - val = dataset_A_val, - test = dataset_A_test -) - -``` - -### Multi-image Mix Dataset - -We use `MultiImageMixDataset` as a wrapper to mix images from multiple datasets. -`MultiImageMixDataset` can be used by multiple images mixed data augmentation -like mosaic and mixup. - -An example of using `MultiImageMixDataset` with `Mosaic` data augmentation: - -```python -train_pipeline = [ - dict(type='RandomMosaic', prob=1), - dict(type='Resize', img_scale=(1024, 512), keep_ratio=True), - dict(type='RandomFlip', prob=0.5), - dict(type='Normalize', **img_norm_cfg), - dict(type='DefaultFormatBundle'), - dict(type='Collect', keys=['img', 'gt_semantic_seg']), -] - -train_dataset = dict( - type='MultiImageMixDataset', - dataset=dict( - classes=classes, - palette=palette, - type=dataset_type, - reduce_zero_label=False, - img_dir=data_root + "images/train", - ann_dir=data_root + "annotations/train", - pipeline=[ - dict(type='LoadImageFromFile'), - dict(type='LoadAnnotations'), - ] - ), - pipeline=train_pipeline -) - -``` diff --git a/docs/en/tutorials/customize_models.md b/docs/en/tutorials/customize_models.md deleted file mode 100644 index 3fc4d08f48..0000000000 --- a/docs/en/tutorials/customize_models.md +++ /dev/null @@ -1,234 +0,0 @@ -# Tutorial 4: Customize Models - -## Customize optimizer - -Assume you want to add a optimizer named as `MyOptimizer`, which has arguments `a`, `b`, and `c`. 
-You need to first implement the new optimizer in a file, e.g., in `mmseg/core/optimizer/my_optimizer.py`: - -```python -from mmcv.runner import OPTIMIZERS -from torch.optim import Optimizer - - -@OPTIMIZERS.register_module -class MyOptimizer(Optimizer): - - def __init__(self, a, b, c) - -``` - -Then add this module in `mmseg/core/optimizer/__init__.py` thus the registry will -find the new module and add it: - -```python -from .my_optimizer import MyOptimizer -``` - -Then you can use `MyOptimizer` in `optimizer` field of config files. -In the configs, the optimizers are defined by the field `optimizer` like the following: - -```python -optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) -``` - -To use your own optimizer, the field can be changed as - -```python -optimizer = dict(type='MyOptimizer', a=a_value, b=b_value, c=c_value) -``` - -We already support to use all the optimizers implemented by PyTorch, and the only modification is to change the `optimizer` field of config files. -For example, if you want to use `ADAM`, though the performance will drop a lot, the modification could be as the following. - -```python -optimizer = dict(type='Adam', lr=0.0003, weight_decay=0.0001) -``` - -The users can directly set arguments following the [API doc](https://pytorch.org/docs/stable/optim.html?highlight=optim#module-torch.optim) of PyTorch. - -## Customize optimizer constructor - -Some models may have some parameter-specific settings for optimization, e.g. weight decay for BatchNoarm layers. -The users can do those fine-grained parameter tuning through customizing optimizer constructor. - -``` -from mmcv.utils import build_from_cfg - -from mmcv.runner import OPTIMIZER_BUILDERS -from .cocktail_optimizer import CocktailOptimizer - - -@OPTIMIZER_BUILDERS.register_module -class CocktailOptimizerConstructor(object): - - def __init__(self, optim_wrapper_cfg, paramwise_cfg=None): - - def __call__(self, model): - - return my_optimizer - -``` - -## Develop new components - -There are mainly 2 types of components in MMSegmentation. - -- backbone: usually stacks of convolutional network to extract feature maps, e.g., ResNet, HRNet. -- head: the component for semantic segmentation map decoding. - -### Add new backbones - -Here we show how to develop new components with an example of MobileNet. - -1. Create a new file `mmseg/models/backbones/mobilenet.py`. - -```python -import torch.nn as nn - -from ..registry import BACKBONES - - -@BACKBONES.register_module -class MobileNet(nn.Module): - - def __init__(self, arg1, arg2): - pass - - def forward(self, x): # should return a tuple - pass - - def init_weights(self, pretrained=None): - pass -``` - -2. Import the module in `mmseg/models/backbones/__init__.py`. - -```python -from .mobilenet import MobileNet -``` - -3. Use it in your config file. - -```python -model = dict( - ... - backbone=dict( - type='MobileNet', - arg1=xxx, - arg2=xxx), - ... -``` - -### Add new heads - -In MMSegmentation, we provide a base [BaseDecodeHead](https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/decode_heads/decode_head.py) for all segmentation head. -All newly implemented decode heads should be derived from it. -Here we show how to develop a new head with the example of [PSPNet](https://arxiv.org/abs/1612.01105) as the following. - -First, add a new decode head in `mmseg/models/decode_heads/psp_head.py`. -PSPNet implements a decode head for segmentation decode. 
-To implement a decode head, basically we need to implement three functions of the new module as the following. - -```python -@HEADS.register_module() -class PSPHead(BaseDecodeHead): - - def __init__(self, pool_scales=(1, 2, 3, 6), **kwargs): - super(PSPHead, self).__init__(**kwargs) - - def init_weights(self): - - def forward(self, inputs): - -``` - -Next, the users need to add the module in the `mmseg/models/decode_heads/__init__.py` thus the corresponding registry could find and load them. - -To config file of PSPNet is as the following - -```python -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='EncoderDecoder', - pretrained='pretrain_model/resnet50_v1c_trick-2cccc1ad.pth', - backbone=dict( - type='ResNetV1c', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - dilations=(1, 1, 2, 4), - strides=(1, 2, 1, 1), - norm_cfg=norm_cfg, - norm_eval=False, - style='pytorch', - contract_dilation=True), - decode_head=dict( - type='PSPHead', - in_channels=2048, - in_index=3, - channels=512, - pool_scales=(1, 2, 3, 6), - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0))) - -``` - -### Add new loss - -Assume you want to add a new loss as `MyLoss` for segmentation decode. -To add a new loss function, the users need implement it in `mmseg/models/losses/my_loss.py`. -The decorator `weighted_loss` enable the loss to be weighted for each element. - -```python -import torch -import torch.nn as nn - -from ..builder import LOSSES -from .utils import weighted_loss - -@weighted_loss -def my_loss(pred, target): - assert pred.size() == target.size() and target.numel() > 0 - loss = torch.abs(pred - target) - return loss - -@LOSSES.register_module -class MyLoss(nn.Module): - - def __init__(self, reduction='mean', loss_weight=1.0): - super(MyLoss, self).__init__() - self.reduction = reduction - self.loss_weight = loss_weight - - def forward(self, - pred, - target, - weight=None, - avg_factor=None, - reduction_override=None): - assert reduction_override in (None, 'none', 'mean', 'sum') - reduction = ( - reduction_override if reduction_override else self.reduction) - loss = self.loss_weight * my_loss( - pred, target, weight, reduction=reduction, avg_factor=avg_factor) - return loss -``` - -Then the users need to add it in the `mmseg/models/losses/__init__.py`. - -```python -from .my_loss import MyLoss, my_loss - -``` - -To use it, modify the `loss_xxx` field. -Then you need to modify the `loss_decode` field in the head. -`loss_weight` could be used to balance multiple losses. - -```python -loss_decode=dict(type='MyLoss', loss_weight=1.0)) -``` diff --git a/docs/en/tutorials/customize_runtime.md b/docs/en/tutorials/customize_runtime.md deleted file mode 100644 index 87fda76f6f..0000000000 --- a/docs/en/tutorials/customize_runtime.md +++ /dev/null @@ -1,245 +0,0 @@ -# Tutorial 6: Customize Runtime Settings - -## Customize optimization settings - -### Customize optimizer supported by Pytorch - -We already support to use all the optimizers implemented by PyTorch, and the only modification is to change the `optimizer` field of config files. -For example, if you want to use `ADAM` (note that the performance could drop a lot), the modification could be as the following. - -```python -optimizer = dict(type='Adam', lr=0.0003, weight_decay=0.0001) -``` - -To modify the learning rate of the model, the users only need to modify the `lr` in the config of optimizer. 
The users can directly set arguments following the [API doc](https://pytorch.org/docs/stable/optim.html?highlight=optim#module-torch.optim) of PyTorch. - -### Customize self-implemented optimizer - -#### 1. Define a new optimizer - -A customized optimizer could be defined as following. - -Assume you want to add a optimizer named `MyOptimizer`, which has arguments `a`, `b`, and `c`. -You need to create a new directory named `mmseg/core/optimizer`. -And then implement the new optimizer in a file, e.g., in `mmseg/core/optimizer/my_optimizer.py`: - -```python -from .registry import OPTIMIZERS -from torch.optim import Optimizer - - -@OPTIMIZERS.register_module() -class MyOptimizer(Optimizer): - - def __init__(self, a, b, c) - -``` - -#### 2. Add the optimizer to registry - -To find the above module defined above, this module should be imported into the main namespace at first. There are two options to achieve it. - -- Modify `mmseg/core/optimizer/__init__.py` to import it. - - The newly defined module should be imported in `mmseg/core/optimizer/__init__.py` so that the registry will - find the new module and add it: - -```python -from .my_optimizer import MyOptimizer -``` - -- Use `custom_imports` in the config to manually import it - -```python -custom_imports = dict(imports=['mmseg.core.optimizer.my_optimizer'], allow_failed_imports=False) -``` - -The module `mmseg.core.optimizer.my_optimizer` will be imported at the beginning of the program and the class `MyOptimizer` is then automatically registered. -Note that only the package containing the class `MyOptimizer` should be imported. -`mmseg.core.optimizer.my_optimizer.MyOptimizer` **cannot** be imported directly. - -Actually users can use a totally different file directory structure using this importing method, as long as the module root can be located in `PYTHONPATH`. - -#### 3. Specify the optimizer in the config file - -Then you can use `MyOptimizer` in `optimizer` field of config files. -In the configs, the optimizers are defined by the field `optimizer` like the following: - -```python -optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) -``` - -To use your own optimizer, the field can be changed to - -```python -optimizer = dict(type='MyOptimizer', a=a_value, b=b_value, c=c_value) -``` - -### Customize optimizer constructor - -Some models may have some parameter-specific settings for optimization, e.g. weight decay for BatchNorm layers. -The users can do those fine-grained parameter tuning through customizing optimizer constructor. - -```python -from mmcv.utils import build_from_cfg - -from mmcv.runner.optimizer import OPTIMIZER_BUILDERS, OPTIMIZERS -from mmseg.utils import get_root_logger -from .my_optimizer import MyOptimizer - - -@OPTIMIZER_BUILDERS.register_module() -class MyOptimizerConstructor(object): - - def __init__(self, optim_wrapper_cfg, paramwise_cfg=None): - - def __call__(self, model): - - return my_optimizer - -``` - -The default optimizer constructor is implemented [here](https://github.com/open-mmlab/mmcv/blob/9ecd6b0d5ff9d2172c49a182eaa669e9f27bb8e7/mmcv/runner/optimizer/default_constructor.py#L11), which could also serve as a template for new optimizer constructor. - -### Additional settings - -Tricks not implemented by the optimizer should be implemented through optimizer constructor (e.g., set parameter-wise learning rates) or hooks. We list some common settings that could stabilize the training or accelerate the training. Feel free to create PR, issue for more settings. 
- -- __Use gradient clip to stabilize training__: - Some models need gradient clip to clip the gradients to stabilize the training process. An example is as below: - - ```python - optimizer_config = dict( - _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) - ``` - - If your config inherits the base config which already sets the `optimizer_config`, you might need `_delete_=True` to override the unnecessary settings. See the [config documentation](https://mmsegmentation.readthedocs.io/en/latest/config.html) for more details. - -- __Use momentum schedule to accelerate model convergence__: - We support momentum scheduler to modify model's momentum according to learning rate, which could make the model converge in a faster way. - Momentum scheduler is usually used with LR scheduler, for example, the following config is used in 3D detection to accelerate convergence. - For more details, please refer to the implementation of [CyclicLrUpdater](https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/lr_updater.py#L327) and [CyclicMomentumUpdater](https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/momentum_updater.py#L130). - - ```python - lr_config = dict( - policy='cyclic', - target_ratio=(10, 1e-4), - cyclic_times=1, - step_ratio_up=0.4, - ) - momentum_config = dict( - policy='cyclic', - target_ratio=(0.85 / 0.95, 1), - cyclic_times=1, - step_ratio_up=0.4, - ) - ``` - -## Customize training schedules - -By default we use step learning rate with 40k/80k schedule, this calls [`PolyLrUpdaterHook`](https://github.com/open-mmlab/mmcv/blob/826d3a7b68596c824fa1e2cb89b6ac274f52179c/mmcv/runner/hooks/lr_updater.py#L196) in MMCV. -We support many other learning rate schedule [here](https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/hooks/lr_updater.py), such as `CosineAnnealing` and `Poly` schedule. Here are some examples - -- Step schedule: - - ```python - lr_config = dict(policy='step', step=[9, 10]) - ``` - -- ConsineAnnealing schedule: - - ```python - lr_config = dict( - policy='CosineAnnealing', - warmup='linear', - warmup_iters=1000, - warmup_ratio=1.0 / 10, - min_lr_ratio=1e-5) - ``` - -## Customize workflow - -Workflow is a list of (phase, epochs) to specify the running order and epochs. -By default it is set to be - -```python -workflow = [('train', 1)] -``` - -which means running 1 epoch for training. -Sometimes user may want to check some metrics (e.g. loss, accuracy) about the model on the validate set. -In such case, we can set the workflow as - -```python -[('train', 1), ('val', 1)] -``` - -so that 1 epoch for training and 1 epoch for validation will be run iteratively. - -:::{note} - -1. The parameters of model will not be updated during val epoch. -2. Keyword `total_epochs` in the config only controls the number of training epochs and will not affect the validation workflow. -3. Workflows `[('train', 1), ('val', 1)]` and `[('train', 1)]` will not change the behavior of `EvalHook` because `EvalHook` is called by `after_train_epoch` and validation workflow only affect hooks that are called through `after_val_epoch`. Therefore, the only difference between `[('train', 1), ('val', 1)]` and `[('train', 1)]` is that the runner will calculate losses on validation set after each training epoch. 
- -::: - -## Customize hooks - -### Use hooks implemented in MMCV - -If the hook is already implemented in MMCV, you can directly modify the config to use the hook as below - -```python -custom_hooks = [ - dict(type='MyHook', a=a_value, b=b_value, priority='NORMAL') -] -``` - -### Modify default runtime hooks - -There are some common hooks that are not registered through `custom_hooks`, they are - -- log_config -- checkpoint_config -- evaluation -- lr_config -- optimizer_config -- momentum_config - -In those hooks, only the logger hook has the `VERY_LOW` priority, others' priority are `NORMAL`. -The above-mentioned tutorials already covers how to modify `optimizer_config`, `momentum_config`, and `lr_config`. -Here we reveals how what we can do with `log_config`, `checkpoint_config`, and `evaluation`. - -#### Checkpoint config - -The MMCV runner will use `checkpoint_config` to initialize [`CheckpointHook`](https://github.com/open-mmlab/mmcv/blob/9ecd6b0d5ff9d2172c49a182eaa669e9f27bb8e7/mmcv/runner/hooks/checkpoint.py#L9). - -```python -checkpoint_config = dict(interval=1) -``` - -The users could set `max_keep_ckpts` to only save only small number of checkpoints or decide whether to store state dict of optimizer by `save_optimizer`. More details of the arguments are [here](https://mmcv.readthedocs.io/en/latest/api.html#mmcv.runner.CheckpointHook) - -#### Log config - -The `log_config` wraps multiple logger hooks and enables to set intervals. Now MMCV supports `WandbLoggerHook`, `MlflowLoggerHook`, and `TensorboardLoggerHook`. -The detail usages can be found in the [doc](https://mmcv.readthedocs.io/en/latest/api.html#mmcv.runner.LoggerHook). - -```python -log_config = dict( - interval=50, - hooks=[ - dict(type='TextLoggerHook'), - dict(type='TensorboardLoggerHook') - ]) -``` - -#### Evaluation config - -The config of `evaluation` will be used to initialize the [`EvalHook`](https://github.com/open-mmlab/mmsegmentation/blob/e3f6f655d69b777341aec2fe8829871cc0beadcb/mmseg/core/evaluation/eval_hooks.py#L7). -Except the key `interval`, other arguments such as `metric` will be passed to the `dataset.evaluate()` - -```python -evaluation = dict(interval=1, metric='mIoU') -``` diff --git a/docs/en/tutorials/data_pipeline.md b/docs/en/tutorials/data_pipeline.md deleted file mode 100644 index ffa5855039..0000000000 --- a/docs/en/tutorials/data_pipeline.md +++ /dev/null @@ -1,171 +0,0 @@ -# Tutorial 3: Customize Data Pipelines - -## Design of Data pipelines - -Following typical conventions, we use `Dataset` and `DataLoader` for data loading -with multiple workers. `Dataset` returns a dict of data items corresponding -the arguments of models' forward method. -Since the data in semantic segmentation may not be the same size, -we introduce a new `DataContainer` type in MMCV to help collect and distribute -data of different size. -See [here](https://github.com/open-mmlab/mmcv/blob/master/mmcv/parallel/data_container.py) for more details. - -The data preparation pipeline and the dataset is decomposed. Usually a dataset -defines how to process the annotations and a data pipeline defines all the steps to prepare a data dict. -A pipeline consists of a sequence of operations. Each operation takes a dict as input and also output a dict for the next transform. - -The operations are categorized into data loading, pre-processing, formatting and test-time augmentation. - -Here is an pipeline example for PSPNet. 
- -```python -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) -crop_size = (512, 1024) -train_pipeline = [ - dict(type='LoadImageFromFile'), - dict(type='LoadAnnotations'), - dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)), - dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), - dict(type='RandomFlip', flip_ratio=0.5), - dict(type='PhotoMetricDistortion'), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), - dict(type='DefaultFormatBundle'), - dict(type='Collect', keys=['img', 'gt_semantic_seg']), -] -test_pipeline = [ - dict(type='LoadImageFromFile'), - dict( - type='MultiScaleFlipAug', - img_scale=(2048, 1024), - # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - dict(type='RandomFlip'), - dict(type='Normalize', **img_norm_cfg), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img']), - ]) -] -``` - -For each operation, we list the related dict fields that are added/updated/removed. - -### Data loading - -`LoadImageFromFile` - -- add: img, img_shape, ori_shape - -`LoadAnnotations` - -- add: gt_semantic_seg, seg_fields - -### Pre-processing - -`Resize` - -- add: scale, scale_idx, pad_shape, scale_factor, keep_ratio -- update: img, img_shape, \*seg_fields - -`RandomFlip` - -- add: flip -- update: img, \*seg_fields - -`Pad` - -- add: pad_fixed_size, pad_size_divisor -- update: img, pad_shape, \*seg_fields - -`RandomCrop` - -- update: img, pad_shape, \*seg_fields - -`Normalize` - -- add: img_norm_cfg -- update: img - -`SegRescale` - -- update: gt_semantic_seg - -`PhotoMetricDistortion` - -- update: img - -### Formatting - -`ToTensor` - -- update: specified by `keys`. - -`ImageToTensor` - -- update: specified by `keys`. - -`Transpose` - -- update: specified by `keys`. - -`ToDataContainer` - -- update: specified by `fields`. - -`DefaultFormatBundle` - -- update: img, gt_semantic_seg - -`Collect` - -- add: img_meta (the keys of img_meta is specified by `meta_keys`) -- remove: all other keys except for those specified by `keys` - -### Test time augmentation - -`MultiScaleFlipAug` - -## Extend and use custom pipelines - -1. Write a new pipeline in any file, e.g., `my_pipeline.py`. It takes a dict as input and return a dict. - - ```python - from mmseg.datasets import PIPELINES - - @PIPELINES.register_module() - class MyTransform: - - def __call__(self, results): - results['dummy'] = True - return results - ``` - -2. Import the new class. - - ```python - from .my_pipeline import MyTransform - ``` - -3. Use it in config files. 
- - ```python - img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) - crop_size = (512, 1024) - train_pipeline = [ - dict(type='LoadImageFromFile'), - dict(type='LoadAnnotations'), - dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)), - dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), - dict(type='RandomFlip', flip_ratio=0.5), - dict(type='PhotoMetricDistortion'), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), - dict(type='MyTransform'), - dict(type='DefaultFormatBundle'), - dict(type='Collect', keys=['img', 'gt_semantic_seg']), - ] - ``` diff --git a/docs/en/tutorials/index.rst b/docs/en/tutorials/index.rst deleted file mode 100644 index e1a67a8b44..0000000000 --- a/docs/en/tutorials/index.rst +++ /dev/null @@ -1,9 +0,0 @@ -.. toctree:: - :maxdepth: 2 - - config.md - customize_datasets.md - data_pipeline.md - customize_models.md - training_tricks.md - customize_runtime.md diff --git a/docs/en/tutorials/training_tricks.md b/docs/en/tutorials/training_tricks.md deleted file mode 100644 index d40de3d751..0000000000 --- a/docs/en/tutorials/training_tricks.md +++ /dev/null @@ -1,90 +0,0 @@ -# Tutorial 5: Training Tricks - -MMSegmentation support following training tricks out of box. - -## Different Learning Rate(LR) for Backbone and Heads - -In semantic segmentation, some methods make the LR of heads larger than backbone to achieve better performance or faster convergence. - -In MMSegmentation, you may add following lines to config to make the LR of heads 10 times of backbone. - -```python -optimizer=dict( - paramwise_cfg = dict( - custom_keys={ - 'head': dict(lr_mult=10.)})) -``` - -With this modification, the LR of any parameter group with `'head'` in name will be multiplied by 10. -You may refer to [MMCV doc](https://mmcv.readthedocs.io/en/latest/api.html#mmcv.runner.DefaultOptimizerConstructor) for further details. - -## Online Hard Example Mining (OHEM) - -We implement pixel sampler [here](https://github.com/open-mmlab/mmsegmentation/tree/master/mmseg/core/seg/sampler) for training sampling. -Here is an example config of training PSPNet with OHEM enabled. - -```python -_base_ = './pspnet_r50-d8_512x1024_40k_cityscapes.py' -model=dict( - decode_head=dict( - sampler=dict(type='OHEMPixelSampler', thresh=0.7, min_kept=100000)) ) -``` - -In this way, only pixels with confidence score under 0.7 are used to train. And we keep at least 100000 pixels during training. If `thresh` is not specified, pixels of top `min_kept` loss will be selected. - -## Class Balanced Loss - -For dataset that is not balanced in classes distribution, you may change the loss weight of each class. -Here is an example for cityscapes dataset. - -```python -_base_ = './pspnet_r50-d8_512x1024_40k_cityscapes.py' -model=dict( - decode_head=dict( - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0, - # DeepLab used this class weight for cityscapes - class_weight=[0.8373, 0.9180, 0.8660, 1.0345, 1.0166, 0.9969, 0.9754, - 1.0489, 0.8786, 1.0023, 0.9539, 0.9843, 1.1116, 0.9037, - 1.0865, 1.0955, 1.0865, 1.1529, 1.0507]))) -``` - -`class_weight` will be passed into `CrossEntropyLoss` as `weight` argument. Please refer to [PyTorch Doc](https://pytorch.org/docs/stable/nn.html?highlight=crossentropy#torch.nn.CrossEntropyLoss) for details. - -## Multiple Losses - -For loss calculation, we support multiple losses training concurrently. 
Here is an example config of training `unet` on `DRIVE` dataset, whose loss function is `1:3` weighted sum of `CrossEntropyLoss` and `DiceLoss`: - -```python -_base_ = './fcn_unet_s5-d16_64x64_40k_drive.py' -model = dict( - decode_head=dict(loss_decode=[dict(type='CrossEntropyLoss', loss_name='loss_ce', loss_weight=1.0), - dict(type='DiceLoss', loss_name='loss_dice', loss_weight=3.0)]), - auxiliary_head=dict(loss_decode=[dict(type='CrossEntropyLoss', loss_name='loss_ce',loss_weight=1.0), - dict(type='DiceLoss', loss_name='loss_dice', loss_weight=3.0)]), - ) -``` - -In this way, `loss_weight` and `loss_name` will be weight and name in training log of corresponding loss, respectively. - -Note: If you want this loss item to be included into the backward graph, `loss_` must be the prefix of the name. - -## Ignore specified label index in loss calculation - -In default setting, `avg_non_ignore=False` which means each pixel counts for loss calculation although some of them belong to ignore-index labels. - -For loss calculation, we support ignore index of certain label by `avg_non_ignore` and `ignore_index`. In this way, the average loss would only be calculated in non-ignored labels which may achieve better performance, and here is the [reference](https://github.com/open-mmlab/mmsegmentation/pull/1409). Here is an example config of training `unet` on `Cityscapes` dataset: in loss calculation it would ignore label 0 which is background and loss average is only calculated on non-ignore labels: - -```python -_base_ = './fcn_unet_s5-d16_4x4_512x1024_160k_cityscapes.py' -model = dict( - decode_head=dict( - ignore_index=0, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0, avg_non_ignore=True), - auxiliary_head=dict( - ignore_index=0, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0, avg_non_ignore=True)), - )) -``` diff --git a/docs/en/useful_tools.md b/docs/en/useful_tools.md deleted file mode 100644 index 6da2de5117..0000000000 --- a/docs/en/useful_tools.md +++ /dev/null @@ -1,426 +0,0 @@ -## Useful tools - -Apart from training/testing scripts, We provide lots of useful tools under the -`tools/` directory. - -### Get the FLOPs and params (experimental) - -We provide a script adapted from [flops-counter.pytorch](https://github.com/sovrasov/flops-counter.pytorch) to compute the FLOPs and params of a given model. - -```shell -python tools/get_flops.py ${CONFIG_FILE} [--shape ${INPUT_SHAPE}] -``` - -You will get the result like this. - -```none -============================== -Input shape: (3, 2048, 1024) -Flops: 1429.68 GMac -Params: 48.98 M -============================== -``` - -:::{note} -This tool is still experimental and we do not guarantee that the number is correct. You may well use the result for simple comparisons, but double check it before you adopt it in technical reports or papers. -::: - -(1) FLOPs are related to the input shape while parameters are not. The default input shape is (1, 3, 1280, 800). -(2) Some operators are not counted into FLOPs like GN and custom operators. - -### Publish a model - -Before you upload a model to AWS, you may want to -(1) convert model weights to CPU tensors, (2) delete the optimizer states and -(3) compute the hash of the checkpoint file and append the hash id to the filename. 
- -```shell -python tools/publish_model.py ${INPUT_FILENAME} ${OUTPUT_FILENAME} -``` - -E.g., - -```shell -python tools/publish_model.py work_dirs/pspnet/latest.pth psp_r50_hszhao_200ep.pth -``` - -The final output filename will be `psp_r50_512x1024_40ki_cityscapes-{hash id}.pth`. - -### Convert to ONNX (experimental) - -We provide a script to convert model to [ONNX](https://github.com/onnx/onnx) format. The converted model could be visualized by tools like [Netron](https://github.com/lutzroeder/netron). Besides, we also support comparing the output results between PyTorch and ONNX model. - -```bash -python tools/pytorch2onnx.py \ - ${CONFIG_FILE} \ - --checkpoint ${CHECKPOINT_FILE} \ - --output-file ${ONNX_FILE} \ - --input-img ${INPUT_IMG} \ - --shape ${INPUT_SHAPE} \ - --rescale-shape ${RESCALE_SHAPE} \ - --show \ - --verify \ - --dynamic-export \ - --cfg-options \ - model.test_cfg.mode="whole" -``` - -Description of arguments: - -- `config` : The path of a model config file. -- `--checkpoint` : The path of a model checkpoint file. -- `--output-file`: The path of output ONNX model. If not specified, it will be set to `tmp.onnx`. -- `--input-img` : The path of an input image for conversion and visualize. -- `--shape`: The height and width of input tensor to the model. If not specified, it will be set to img_scale of test_pipeline. -- `--rescale-shape`: rescale shape of output, set this value to avoid OOM, only work on `slide` mode. -- `--show`: Determines whether to print the architecture of the exported model. If not specified, it will be set to `False`. -- `--verify`: Determines whether to verify the correctness of an exported model. If not specified, it will be set to `False`. -- `--dynamic-export`: Determines whether to export ONNX model with dynamic input and output shapes. If not specified, it will be set to `False`. -- `--cfg-options`:Update config options. - -:::{note} -This tool is still experimental. Some customized operators are not supported for now. -::: - -### Evaluate ONNX model - -We provide `tools/deploy_test.py` to evaluate ONNX model with different backend. - -#### Prerequisite - -- Install onnx and onnxruntime-gpu - - ```shell - pip install onnx onnxruntime-gpu - ``` - -- Install TensorRT following [how-to-build-tensorrt-plugins-in-mmcv](https://mmcv.readthedocs.io/en/latest/tensorrt_plugin.html#how-to-build-tensorrt-plugins-in-mmcv)(optional) - -#### Usage - -```bash -python tools/deploy_test.py \ - ${CONFIG_FILE} \ - ${MODEL_FILE} \ - ${BACKEND} \ - --out ${OUTPUT_FILE} \ - --eval ${EVALUATION_METRICS} \ - --show \ - --show-dir ${SHOW_DIRECTORY} \ - --cfg-options ${CFG_OPTIONS} \ - --eval-options ${EVALUATION_OPTIONS} \ - --opacity ${OPACITY} \ -``` - -Description of all arguments - -- `config`: The path of a model config file. -- `model`: The path of a converted model file. -- `backend`: Backend of the inference, options: `onnxruntime`, `tensorrt`. -- `--out`: The path of output result file in pickle format. -- `--format-only` : Format the output results without perform evaluation. It is useful when you want to format the result to a specific format and submit it to the test server. If not specified, it will be set to `False`. Note that this argument is **mutually exclusive** with `--eval`. -- `--eval`: Evaluation metrics, which depends on the dataset, e.g., "mIoU" for generic datasets, and "cityscapes" for Cityscapes. Note that this argument is **mutually exclusive** with `--format-only`. -- `--show`: Show results flag. 
-- `--show-dir`: Directory where painted images will be saved -- `--cfg-options`: Override some settings in the used config file, the key-value pair in `xxx=yyy` format will be merged into config file. -- `--eval-options`: Custom options for evaluation, the key-value pair in `xxx=yyy` format will be kwargs for `dataset.evaluate()` function -- `--opacity`: Opacity of painted segmentation map. In (0, 1\] range. - -#### Results and Models - -| Model | Config | Dataset | Metric | PyTorch | ONNXRuntime | TensorRT-fp32 | TensorRT-fp16 | -| :--------: | :---------------------------------------------: | :--------: | :----: | :-----: | :---------: | :-----------: | :-----------: | -| FCN | fcn_r50-d8_512x1024_40k_cityscapes.py | cityscapes | mIoU | 72.2 | 72.2 | 72.2 | 72.2 | -| PSPNet | pspnet_r50-d8_512x1024_40k_cityscapes.py | cityscapes | mIoU | 77.8 | 77.8 | 77.8 | 77.8 | -| deeplabv3 | deeplabv3_r50-d8_512x1024_40k_cityscapes.py | cityscapes | mIoU | 79.0 | 79.0 | 79.0 | 79.0 | -| deeplabv3+ | deeplabv3plus_r50-d8_512x1024_40k_cityscapes.py | cityscapes | mIoU | 79.6 | 79.5 | 79.5 | 79.5 | -| PSPNet | pspnet_r50-d8_769x769_40k_cityscapes.py | cityscapes | mIoU | 78.2 | 78.1 | | | -| deeplabv3 | deeplabv3_r50-d8_769x769_40k_cityscapes.py | cityscapes | mIoU | 78.5 | 78.3 | | | -| deeplabv3+ | deeplabv3plus_r50-d8_769x769_40k_cityscapes.py | cityscapes | mIoU | 78.9 | 78.7 | | | - -:::{note} -TensorRT is only available on configs with `whole mode`. -::: - -### Convert to TorchScript (experimental) - -We also provide a script to convert model to [TorchScript](https://pytorch.org/docs/stable/jit.html) format. You can use the pytorch C++ API [LibTorch](https://pytorch.org/docs/stable/cpp_index.html) inference the trained model. The converted model could be visualized by tools like [Netron](https://github.com/lutzroeder/netron). Besides, we also support comparing the output results between PyTorch and TorchScript model. - -```shell -python tools/pytorch2torchscript.py \ - ${CONFIG_FILE} \ - --checkpoint ${CHECKPOINT_FILE} \ - --output-file ${ONNX_FILE} - --shape ${INPUT_SHAPE} - --verify \ - --show -``` - -Description of arguments: - -- `config` : The path of a pytorch model config file. -- `--checkpoint` : The path of a pytorch model checkpoint file. -- `--output-file`: The path of output TorchScript model. If not specified, it will be set to `tmp.pt`. -- `--input-img` : The path of an input image for conversion and visualize. -- `--shape`: The height and width of input tensor to the model. If not specified, it will be set to `512 512`. -- `--show`: Determines whether to print the traced graph of the exported model. If not specified, it will be set to `False`. -- `--verify`: Determines whether to verify the correctness of an exported model. If not specified, it will be set to `False`. - -:::{note} -It's only support PyTorch>=1.8.0 for now. -::: - -:::{note} -This tool is still experimental. Some customized operators are not supported for now. -::: - -Examples: - -- Convert the cityscapes PSPNet pytorch model. - - ```shell - python tools/pytorch2torchscript.py configs/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes.py \ - --checkpoint checkpoints/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth \ - --output-file checkpoints/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pt \ - --shape 512 1024 - ``` - -### Convert to TensorRT (experimental) - -A script to convert [ONNX](https://github.com/onnx/onnx) model to [TensorRT](https://developer.nvidia.com/tensorrt) format. 
- -Prerequisite - -- install `mmcv-full` with ONNXRuntime custom ops and TensorRT plugins follow [ONNXRuntime in mmcv](https://mmcv.readthedocs.io/en/latest/deployment/onnxruntime_op.html) and [TensorRT plugin in mmcv](https://github.com/open-mmlab/mmcv/blob/master/docs/en/deployment/tensorrt_plugin.md). -- Use [pytorch2onnx](#convert-to-onnx-experimental) to convert the model from PyTorch to ONNX. - -Usage - -```bash -python ${MMSEG_PATH}/tools/onnx2tensorrt.py \ - ${CFG_PATH} \ - ${ONNX_PATH} \ - --trt-file ${OUTPUT_TRT_PATH} \ - --min-shape ${MIN_SHAPE} \ - --max-shape ${MAX_SHAPE} \ - --input-img ${INPUT_IMG} \ - --show \ - --verify -``` - -Description of all arguments - -- `config` : Config file of the model. -- `model` : Path to the input ONNX model. -- `--trt-file` : Path to the output TensorRT engine. -- `--max-shape` : Maximum shape of model input. -- `--min-shape` : Minimum shape of model input. -- `--fp16` : Enable fp16 model conversion. -- `--workspace-size` : Max workspace size in GiB. -- `--input-img` : Image for visualize. -- `--show` : Enable result visualize. -- `--dataset` : Palette provider, `CityscapesDataset` as default. -- `--verify` : Verify the outputs of ONNXRuntime and TensorRT. -- `--verbose` : Whether to verbose logging messages while creating TensorRT engine. Defaults to False. - -:::{note} -Only tested on whole mode. -::: - -## Miscellaneous - -### Print the entire config - -`tools/print_config.py` prints the whole config verbatim, expanding all its -imports. - -```shell -python tools/print_config.py \ - ${CONFIG} \ - --graph \ - --cfg-options ${OPTIONS [OPTIONS...]} \ -``` - -Description of arguments: - -- `config` : The path of a pytorch model config file. -- `--graph` : Determines whether to print the models graph. -- `--cfg-options`: Custom options to replace the config file. - -### Plot training logs - -`tools/analyze_logs.py` plots loss/mIoU curves given a training log file. `pip install seaborn` first to install the dependency. - -```shell -python tools/analyze_logs.py xxx.log.json [--keys ${KEYS}] [--legend ${LEGEND}] [--backend ${BACKEND}] [--style ${STYLE}] [--out ${OUT_FILE}] -``` - -Examples: - -- Plot the mIoU, mAcc, aAcc metrics. - - ```shell - python tools/analyze_logs.py log.json --keys mIoU mAcc aAcc --legend mIoU mAcc aAcc - ``` - -- Plot loss metric. - - ```shell - python tools/analyze_logs.py log.json --keys loss --legend loss - ``` - -### Model conversion - -`tools/model_converters/` provide several scripts to convert pretrain models released by other repos to MMSegmentation style. - -#### ViT Swin MiT Transformer Models - -- ViT - - `tools/model_converters/vit2mmseg.py` convert keys in timm pretrained vit models to MMSegmentation style. - - ```shell - python tools/model_converters/vit2mmseg.py ${SRC} ${DST} - ``` - -- Swin - - `tools/model_converters/swin2mmseg.py` convert keys in official pretrained swin models to MMSegmentation style. - - ```shell - python tools/model_converters/swin2mmseg.py ${SRC} ${DST} - ``` - -- SegFormer - - `tools/model_converters/mit2mmseg.py` convert keys in official pretrained mit models to MMSegmentation style. - - ```shell - python tools/model_converters/mit2mmseg.py ${SRC} ${DST} - ``` - -## Model Serving - -In order to serve an `MMSegmentation` model with [`TorchServe`](https://pytorch.org/serve/), you can follow the steps: - -### 1. 
Convert model from MMSegmentation to TorchServe - -```shell -python tools/torchserve/mmseg2torchserve.py ${CONFIG_FILE} ${CHECKPOINT_FILE} \ ---output-folder ${MODEL_STORE} \ ---model-name ${MODEL_NAME} -``` - -:::{note} -${MODEL_STORE} needs to be an absolute path to a folder. -::: - -### 2. Build `mmseg-serve` docker image - -```shell -docker build -t mmseg-serve:latest docker/serve/ -``` - -### 3. Run `mmseg-serve` - -Check the official docs for [running TorchServe with docker](https://github.com/pytorch/serve/blob/master/docker/README.md#running-torchserve-in-a-production-docker-environment). - -In order to run in GPU, you need to install [nvidia-docker](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). You can omit the `--gpus` argument in order to run in CPU. - -Example: - -```shell -docker run --rm \ ---cpus 8 \ ---gpus device=0 \ --p8080:8080 -p8081:8081 -p8082:8082 \ ---mount type=bind,source=$MODEL_STORE,target=/home/model-server/model-store \ -mmseg-serve:latest -``` - -[Read the docs](https://github.com/pytorch/serve/blob/072f5d088cce9bb64b2a18af065886c9b01b317b/docs/rest_api.md) about the Inference (8080), Management (8081) and Metrics (8082) APIs - -### 4. Test deployment - -```shell -curl -O https://raw.githubusercontent.com/open-mmlab/mmsegmentation/master/resources/3dogs.jpg -curl http://127.0.0.1:8080/predictions/${MODEL_NAME} -T 3dogs.jpg -o 3dogs_mask.png -``` - -The response will be a ".png" mask. - -You can visualize the output as follows: - -```python -import matplotlib.pyplot as plt -import mmcv -plt.imshow(mmcv.imread("3dogs_mask.png", "grayscale")) -plt.show() -``` - -You should see something similar to: - -![3dogs_mask](../../resources/3dogs_mask.png) - -And you can use `test_torchserve.py` to compare result of torchserve and pytorch, and visualize them. - -```shell -python tools/torchserve/test_torchserve.py ${IMAGE_FILE} ${CONFIG_FILE} ${CHECKPOINT_FILE} ${MODEL_NAME} -[--inference-addr ${INFERENCE_ADDR}] [--result-image ${RESULT_IMAGE}] [--device ${DEVICE}] -``` - -Example: - -```shell -python tools/torchserve/test_torchserve.py \ -demo/demo.png \ -configs/fcn/fcn_r50-d8_512x1024_40k_cityscapes.py \ -checkpoint/fcn_r50-d8_512x1024_40k_cityscapes_20200604_192608-efe53f0d.pth \ -fcn -``` - -## Confusion Matrix - -In order to generate and plot a `nxn` confusion matrix where `n` is the number of classes, you can follow the steps: - -### 1.Generate a prediction result in pkl format using `test.py` - -```shell -python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${PATH_TO_RESULT_FILE}] -``` - -Note that the argument for `--eval` should be `None` so that the result file contains numpy type of prediction results. The usage for distribution test is just the same. - -Example: - -```shell -python tools/test.py \ -configs/fcn/fcn_r50-d8_512x1024_40k_cityscapes.py \ -checkpoint/fcn_r50-d8_512x1024_40k_cityscapes_20200604_192608-efe53f0d.pth \ ---out result/pred_result.pkl -``` - -### 2. Use `confusion_matrix.py` to generate and plot a confusion matrix - -```shell -python tools/confusion_matrix.py ${CONFIG_FILE} ${PATH_TO_RESULT_FILE} ${SAVE_DIR} --show -``` - -Description of arguments: - -- `config`: Path to the test config file. -- `prediction_path`: Path to the prediction .pkl result. -- `save_dir`: Directory where confusion matrix will be saved. -- `--show`: Enable result visualize. -- `--color-theme`: Theme of the matrix color map. -- `--cfg_options`: Custom options to replace the config file. 
-
-Example:
-
-```shell
-python tools/confusion_matrix.py \
-configs/fcn/fcn_r50-d8_512x1024_40k_cityscapes.py \
-result/pred_result.pkl \
-result/confusion_matrix \
---show
-```
diff --git a/docs/en/user_guides/1_config.md b/docs/en/user_guides/1_config.md
new file mode 100644
index 0000000000..291c488e8e
--- /dev/null
+++ b/docs/en/user_guides/1_config.md
@@ -0,0 +1,588 @@
+# Tutorial 1: Learn about Configs
+
+Our config system incorporates a modular and inheritance-based design, which makes it convenient to conduct various experiments.
+If you wish to inspect the config file, you may run `python tools/misc/print_config.py /PATH/TO/CONFIG` to see the complete config.
+You may also pass `--cfg-options xxx.yyy=zzz` to see the updated config.
+
+## Config File Structure
+
+There are 4 basic component types under `config/_base_`: datasets, models, schedules, and default_runtime.
+Many methods, such as DeepLabV3 and PSPNet, can be easily constructed with one component of each type.
+The configs that are composed of components from `_base_` are called _primitive_.
+
+For all configs under the same folder, it is recommended to have only **one** _primitive_ config. All other configs should inherit from the _primitive_ config. In this way, the maximum inheritance level is 3.
+
+For ease of understanding, we recommend that contributors inherit from existing methods.
+For example, if some modification is made based on DeepLabV3, you may first inherit the basic DeepLabV3 structure by specifying `_base_ = ../deeplabv3/deeplabv3_r50-d8_4xb2-40k_cityscapes-512x1024.py`, then modify the necessary fields in the config file.
+
+If you are building an entirely new method that does not share the structure with any of the existing methods, you may create a folder `xxxnet` under `configs`.
+
+Please refer to [mmengine](https://mmengine.readthedocs.io/en/latest/tutorials/config.html) for detailed documentation.
+
+## Config Name Style
+
+We follow the style below to name config files. Contributors are advised to follow the same style.
+
+```text
+{algorithm name}_{model component names [component1]_[component2]_[...]}_{training settings}_{training dataset information}_{testing dataset information}
+```
+
+The file name is divided into five parts. All parts and components are connected with `_`, and words within each part or component are connected with `-`.
+
+- `{algorithm name}`: The name of the algorithm, such as `deeplabv3`, `pspnet`, etc.
+- `{model component names}`: Names of the components used in the algorithm, such as backbone, head, etc. For example, `r50-d8` means using a ResNet50 backbone whose output feature map is downsampled 8 times relative to the input.
+- `{training settings}`: Information of training settings such as batch size, augmentations, loss, learning rate scheduler, and epochs/iterations. For example: `4xb4-ce-linearlr-40K` means using 4 GPUs x 4 images per GPU, CrossEntropy loss, a linear learning rate scheduler, and training for 40K iterations.
+  Some abbreviations:
+  - `{gpu x batch_per_gpu}`: GPUs and samples per GPU. `bN` indicates a batch size of N per GPU. E.g. `8xb2` is short for 8 GPUs x 2 images per GPU. `4xb4` is used by default if not mentioned.
+  - `{schedule}`: training schedule, options are `20k`, `40k`, etc. `20k` and `40k` mean 20000 iterations and 40000 iterations respectively.
+- `{training dataset information}`: Training dataset names like `cityscapes`, `ade20k`, etc., and input resolutions. For example: `cityscapes-768x768` means training on the `cityscapes` dataset with an input shape of `768x768`.
+- `{testing dataset information}` (optional): Testing dataset name for models trained on one dataset but tested on another. If not mentioned, it means the model was trained and tested on the same dataset type. + +## An Example of PSPNet + +To help the users have a basic idea of a complete config and the modules in a modern semantic segmentation system, +we make brief comments on the config of PSPNet using ResNet50V1c as the following. +For more detailed usage and the corresponding alternative for each module, please refer to the API documentation. + +```python +_base_ = [ + '../_base_/models/pspnet_r50-d8.py', '../_base_/datasets/cityscapes.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_40k.py' +] # base config file which we build new config file on. +crop_size = (512, 1024) +data_preprocessor = dict(size=crop_size) +model = dict(data_preprocessor=data_preprocessor) +``` + +`_base_/models/pspnet_r50-d8.py` is a basic model cfg file for PSPNet using ResNet50V1c + +```python +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) # Segmentation usually uses SyncBN +data_preprocessor = dict( # The config of data preprocessor, usually includes image normalization and augmentation. + type='SegDataPreProcessor', # The type of data preprocessor. + mean=[123.675, 116.28, 103.53], # Mean values used for normalizing the input images. + std=[58.395, 57.12, 57.375], # Standard variance used for normalizing the input images. + bgr_to_rgb=True, # Whether to convert image from BGR to RGB. + pad_val=0, # Padding value of image. + seg_pad_val=255) # Padding value of segmentation map. +model = dict( + type='EncoderDecoder', # Name of segmentor + data_preprocessor=data_preprocessor, + pretrained='open-mmlab://resnet50_v1c', # The ImageNet pretrained backbone to be loaded + backbone=dict( + type='ResNetV1c', # The type of backbone. Please refer to mmseg/models/backbones/resnet.py for details. + depth=50, # Depth of backbone. Normally 50, 101 are used. + num_stages=4, # Number of stages of backbone. + out_indices=(0, 1, 2, 3), # The index of output feature maps produced in each stages. + dilations=(1, 1, 2, 4), # The dilation rate of each layer. + strides=(1, 2, 1, 1), # The stride of each layer. + norm_cfg=norm_cfg, # The configuration of norm layer. + norm_eval=False, # Whether to freeze the statistics in BN + style='pytorch', # The style of backbone, 'pytorch' means that stride 2 layers are in 3x3 conv, 'caffe' means stride 2 layers are in 1x1 convs. + contract_dilation=True), # When dilation > 1, whether contract first layer of dilation. + decode_head=dict( + type='PSPHead', # Type of decode head. Please refer to mmseg/models/decode_heads for available options. + in_channels=2048, # Input channel of decode head. + in_index=3, # The index of feature map to select. + channels=512, # The intermediate channels of decode head. + pool_scales=(1, 2, 3, 6), # The avg pooling scales of PSPHead. Please refer to paper for details. + dropout_ratio=0.1, # The dropout ratio before final classification layer. + num_classes=19, # Number of segmentation class. Usually 19 for cityscapes, 21 for VOC, 150 for ADE20k. + norm_cfg=norm_cfg, # The configuration of norm layer. + align_corners=False, # The align_corners argument for resize in decoding. + loss_decode=dict( # Config of loss function for the decode_head. + type='CrossEntropyLoss', # Type of loss used for segmentation. + use_sigmoid=False, # Whether use sigmoid activation for segmentation. 
+ loss_weight=1.0)), # Loss weight of decode_head. + auxiliary_head=dict( + type='FCNHead', # Type of auxiliary head. Please refer to mmseg/models/decode_heads for available options. + in_channels=1024, # Input channel of auxiliary head. + in_index=2, # The index of feature map to select. + channels=256, # The intermediate channels of decode head. + num_convs=1, # Number of convs in FCNHead. It is usually 1 in auxiliary head. + concat_input=False, # Whether concat output of convs with input before classification layer. + dropout_ratio=0.1, # The dropout ratio before final classification layer. + num_classes=19, # Number of segmentation class. Usually 19 for cityscapes, 21 for VOC, 150 for ADE20k. + norm_cfg=norm_cfg, # The configuration of norm layer. + align_corners=False, # The align_corners argument for resize in decoding. + loss_decode=dict( # Config of loss function for the auxiliary_head. + type='CrossEntropyLoss', # Type of loss used for segmentation. + use_sigmoid=False, # Whether use sigmoid activation for segmentation. + loss_weight=0.4)), # Loss weight of auxiliary_head. + # model training and testing settings + train_cfg=dict(), # train_cfg is just a place holder for now. + test_cfg=dict(mode='whole')) # The test mode, options are 'whole' and 'slide'. 'whole': whole image fully-convolutional test. 'slide': sliding crop window on the image. +``` + +`_base_/datasets/cityscapes.py` is the configuration file of the dataset + +```python +# dataset settings +dataset_type = 'CityscapesDataset' # Dataset type, this will be used to define the dataset. +data_root = 'data/cityscapes/' # Root path of data. +crop_size = (512, 1024) # The crop size during training. +train_pipeline = [ # Training pipeline. + dict(type='LoadImageFromFile'), # First pipeline to load images from file path. + dict(type='LoadAnnotations'), # Second pipeline to load annotations for current image. + dict(type='RandomResize', # Augmentation pipeline that resize the images and their annotations. + scale=(2048, 1024), # The scale of image. + ratio_range=(0.5, 2.0), # The augmented scale range as ratio. + keep_ratio=True), # Whether to keep the aspect ratio when resizing the image. + dict(type='RandomCrop', # Augmentation pipeline that randomly crop a patch from current image. + crop_size=crop_size, # The crop size of patch. + cat_max_ratio=0.75), # The max area ratio that could be occupied by single category. + dict(type='RandomFlip', # Augmentation pipeline that flip the images and their annotations + prob=0.5), # The ratio or probability to flip + dict(type='PhotoMetricDistortion'), # Augmentation pipeline that distort current image with several photo metric methods. + dict(type='PackSegInputs') # Pack the inputs data for the semantic segmentation. +] +test_pipeline = [ + dict(type='LoadImageFromFile'), # First pipeline to load images from file path + dict(type='Resize', # Use resize augmentation + scale=(2048, 1024), # Images scales for resizing. + keep_ratio=True), # Whether to keep the aspect ratio when resizing the image. + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type='LoadAnnotations'), # Load annotations for semantic segmentation provided by dataset. + dict(type='PackSegInputs') # Pack the inputs data for the semantic segmentation. 
+] +train_dataloader = dict( # Train dataloader config + batch_size=2, # Batch size of a single GPU + num_workers=2, # Worker to pre-fetch data for each single GPU + persistent_workers=True, # Shut down the worker processes after an epoch end, which can accelerate training speed. + sampler=dict(type='InfiniteSampler', shuffle=True), # Randomly shuffle during training. + dataset=dict( # Train dataset config + type=dataset_type, # Type of dataset, refer to mmseg/datasets/ for details. + data_root=data_root, # The root of dataset. + data_prefix=dict( + img_path='leftImg8bit/train', seg_map_path='gtFine/train'), # Prefix for training data. + pipeline=train_pipeline)) # Processing pipeline. This is passed by the train_pipeline created before. +val_dataloader = dict( + batch_size=1, # Batch size of a single GPU + num_workers=4, # Worker to pre-fetch data for each single GPU + persistent_workers=True, # Shut down the worker processes after an epoch end, which can accelerate testing speed. + sampler=dict(type='DefaultSampler', shuffle=False), # Not shuffle during validation and testing. + dataset=dict( # Test dataset config + type=dataset_type, # Type of dataset, refer to mmseg/datasets/ for details. + data_root=data_root, # The root of dataset. + data_prefix=dict( + img_path='leftImg8bit/val', seg_map_path='gtFine/val'), # Prefix for testing data. + pipeline=test_pipeline)) # Processing pipeline. This is passed by the test_pipeline created before. +test_dataloader = val_dataloader +# The metric to measure the accuracy. Here, we use IoUMetric. +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) +test_evaluator = val_evaluator +``` + +`_base_/schedules/schedule_40k.py` + +```python +# optimizer +optimizer = dict(type='SGD', # Type of optimizers, refer to https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/default_constructor.py for more details + lr=0.01, # Learning rate of optimizers, see detail usages of the parameters in the documentation of PyTorch + momentum=0.9, # Momentum + weight_decay=0.0005) # Weight decay of SGD +optim_wrapper = dict(type='OptimWrapper', # Optimizer wrapper provides a common interface for updating parameters. + optimizer=optimizer, # Optimizer used to update model parameters. + clip_grad=None) # If ``clip_grad`` is not None, it will be the arguments of ``torch.nn.utils.clip_grad``. +# learning policy +param_scheduler = [ + dict( + type='PolyLR', # The policy of scheduler, also support Step, CosineAnnealing, Cyclic, etc. Refer to details of supported LrUpdater from https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py + eta_min=1e-4, # Minimum learning rate at the end of scheduling. + power=0.9, # The power of polynomial decay. + begin=0, # Step at which to start updating the parameters. + end=40000, # Step at which to stop updating the parameters. + by_epoch=False) # Whether count by epoch or not. +] +# training schedule for 40k iteration +train_cfg = dict(type='IterBasedTrainLoop', max_iters=40000, val_interval=4000) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +# default hooks +default_hooks = dict( + timer=dict(type='IterTimerHook'), # Log the time spent during iteration. + logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False), # Collect and write logs from different components of ``Runner``. + param_scheduler=dict(type='ParamSchedulerHook'), # update some hyper-parameters in optimizer, e.g., learning rate. 
+    checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=4000), # Save checkpoints periodically.
+    sampler_seed=dict(type='DistSamplerSeedHook')) # Set the seed of the data-loading sampler for distributed training.
+```
+
+In `_base_/default_runtime.py`:
+
+```python
+# Set the default scope of the registry to mmseg.
+default_scope = 'mmseg'
+# environment
+env_cfg = dict(
+    cudnn_benchmark=True,
+    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
+    dist_cfg=dict(backend='nccl'),
+)
+log_level = 'INFO'
+log_processor = dict(by_epoch=False)
+load_from = None # Load a checkpoint from file.
+resume = False # Whether to resume from an existing model.
+```
+
+These are all the configs for training and testing PSPNet. To load and parse them, we can use [Config](https://mmengine.readthedocs.io/en/latest/tutorials/config.html) implemented in [MMEngine](https://github.com/open-mmlab/mmengine).
+
+```python
+from mmengine.config import Config
+
+cfg = Config.fromfile('configs/pspnet/pspnet_r50-d8_4xb2-40k_cityscapes-512x1024.py')
+print(cfg.train_dataloader)
+```
+
+```shell
+{'batch_size': 2,
+ 'num_workers': 2,
+ 'persistent_workers': True,
+ 'sampler': {'type': 'InfiniteSampler', 'shuffle': True},
+ 'dataset': {'type': 'CityscapesDataset',
+ 'data_root': 'data/cityscapes/',
+ 'data_prefix': {'img_path': 'leftImg8bit/train',
+ 'seg_map_path': 'gtFine/train'},
+ 'pipeline': [{'type': 'LoadImageFromFile'},
+ {'type': 'LoadAnnotations'},
+ {'type': 'RandomResize',
+ 'scale': (2048, 1024),
+ 'ratio_range': (0.5, 2.0),
+ 'keep_ratio': True},
+ {'type': 'RandomCrop', 'crop_size': (512, 1024), 'cat_max_ratio': 0.75},
+ {'type': 'RandomFlip', 'prob': 0.5},
+ {'type': 'PhotoMetricDistortion'},
+ {'type': 'PackSegInputs'}]}}
+```
+
+`cfg` is an instance of `mmengine.config.Config`; its interface is the same as a dict object, and it also allows accessing config values as attributes. See the [config tutorial](https://mmengine.readthedocs.io/en/latest/tutorials/config.html) in [MMEngine](https://github.com/open-mmlab/mmengine) for more information.
+
+## FAQ
+
+### Ignore some fields in the base configs
+
+Sometimes, you may set `_delete_=True` to ignore some of the fields in the base configs.
+See the [config tutorial](https://mmengine.readthedocs.io/en/latest/tutorials/config.html) in [MMEngine](https://github.com/open-mmlab/mmengine) for a simple illustration.
+ +In MMSegmentation, for example, if you would like to modify the backbone of PSPNet with the following config file `pspnet.py`: + +```python +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained='torchvision://resnet50', + backbone=dict( + type='ResNetV1c', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + dilations=(1, 1, 2, 4), + strides=(1, 2, 1, 1), + norm_cfg=norm_cfg, + norm_eval=False, + style='pytorch', + contract_dilation=True), + decode_head=dict( + type='PSPHead', + in_channels=2048, + in_index=3, + channels=512, + pool_scales=(1, 2, 3, 6), + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0))) +``` + +Load and parse the config file `pspnet.py` in the code as follows: + +```python +from mmengine.config import Config + +cfg = Config.fromfile('pspnet.py') +print(cfg.model) +``` + +```shell +{'type': 'EncoderDecoder', + 'pretrained': 'torchvision://resnet50', + 'backbone': {'type': 'ResNetV1c', + 'depth': 50, + 'num_stages': 4, + 'out_indices': (0, 1, 2, 3), + 'dilations': (1, 1, 2, 4), + 'strides': (1, 2, 1, 1), + 'norm_cfg': {'type': 'SyncBN', 'requires_grad': True}, + 'norm_eval': False, + 'style': 'pytorch', + 'contract_dilation': True}, + 'decode_head': {'type': 'PSPHead', + 'in_channels': 2048, + 'in_index': 3, + 'channels': 512, + 'pool_scales': (1, 2, 3, 6), + 'dropout_ratio': 0.1, + 'num_classes': 19, + 'norm_cfg': {'type': 'SyncBN', 'requires_grad': True}, + 'align_corners': False, + 'loss_decode': {'type': 'CrossEntropyLoss', + 'use_sigmoid': False, + 'loss_weight': 1.0}}} +``` + +`ResNet` and `HRNet` use different keywords to construct, write a new config file `hrnet.py` as follows: + +```python +_base_ = 'pspnet.py' +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + pretrained='open-mmlab://msra/hrnetv2_w32', + backbone=dict( + _delete_=True, + type='HRNet', + norm_cfg=norm_cfg, + extra=dict( + stage1=dict( + num_modules=1, + num_branches=1, + block='BOTTLENECK', + num_blocks=(4, ), + num_channels=(64, )), + stage2=dict( + num_modules=1, + num_branches=2, + block='BASIC', + num_blocks=(4, 4), + num_channels=(32, 64)), + stage3=dict( + num_modules=4, + num_branches=3, + block='BASIC', + num_blocks=(4, 4, 4), + num_channels=(32, 64, 128)), + stage4=dict( + num_modules=3, + num_branches=4, + block='BASIC', + num_blocks=(4, 4, 4, 4), + num_channels=(32, 64, 128, 256))))) +``` + +Load and parse the config file `hrnet.py` in the code as follows: + +```python +from mmengine.config import Config +cfg = Config.fromfile('hrnet.py') +print(cfg.model) +``` + +```shell +{'type': 'EncoderDecoder', + 'pretrained': 'open-mmlab://msra/hrnetv2_w32', + 'backbone': {'type': 'HRNet', + 'norm_cfg': {'type': 'SyncBN', 'requires_grad': True}, + 'extra': {'stage1': {'num_modules': 1, + 'num_branches': 1, + 'block': 'BOTTLENECK', + 'num_blocks': (4,), + 'num_channels': (64,)}, + 'stage2': {'num_modules': 1, + 'num_branches': 2, + 'block': 'BASIC', + 'num_blocks': (4, 4), + 'num_channels': (32, 64)}, + 'stage3': {'num_modules': 4, + 'num_branches': 3, + 'block': 'BASIC', + 'num_blocks': (4, 4, 4), + 'num_channels': (32, 64, 128)}, + 'stage4': {'num_modules': 3, + 'num_branches': 4, + 'block': 'BASIC', + 'num_blocks': (4, 4, 4, 4), + 'num_channels': (32, 64, 128, 256)}}}, + 'decode_head': {'type': 'PSPHead', + 'in_channels': 2048, + 'in_index': 3, + 'channels': 512, + 'pool_scales': (1, 2, 3, 6), + 
 'dropout_ratio': 0.1,
+ 'num_classes': 19,
+ 'norm_cfg': {'type': 'SyncBN', 'requires_grad': True},
+ 'align_corners': False,
+ 'loss_decode': {'type': 'CrossEntropyLoss',
+ 'use_sigmoid': False,
+ 'loss_weight': 1.0}}}
+```
+
+Setting `_delete_=True` replaces all the old keys in the `backbone` field with the new keys.
+
+### Use intermediate variables in configs
+
+Some intermediate variables are used in the config files, like `train_pipeline`/`test_pipeline` in datasets.
+It's worth noting that when modifying intermediate variables in child configs, users need to pass the intermediate variables into the corresponding fields again.
+For example, suppose we would like to change the multi-scale strategy used to train/test a PSPNet; `train_pipeline`/`test_pipeline` are the intermediate variables we need to modify.
+
+```python
+_base_ = '../pspnet/pspnet_r50-d8_4xb4-40k_cityscapes-512x1024.py'
+crop_size = (512, 1024)
+dataset_type = 'CityscapesDataset'
+data_root = 'data/cityscapes/'
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations'),
+    dict(type='RandomResize',
+         scale=(2048, 1024),
+         ratio_range=(1., 2.),
+         keep_ratio=True),
+    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PhotoMetricDistortion'),
+    dict(type='PackSegInputs'),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='Resize',
+         scale=(2048, 1024),
+         keep_ratio=True),
+    dict(type='LoadAnnotations'),
+    dict(type='PackSegInputs')
+]
+train_dataset = dict(
+    type=dataset_type,
+    data_root=data_root,
+    data_prefix=dict(
+        img_path='leftImg8bit/train', seg_map_path='gtFine/train'),
+    pipeline=train_pipeline)
+test_dataset = dict(
+    type=dataset_type,
+    data_root=data_root,
+    data_prefix=dict(
+        img_path='leftImg8bit/val', seg_map_path='gtFine/val'),
+    pipeline=test_pipeline)
+train_dataloader = dict(dataset=train_dataset)
+val_dataloader = dict(dataset=test_dataset)
+test_dataloader = val_dataloader
+```
+
+We first define the new `train_pipeline`/`test_pipeline` and then pass them into the `dataset` fields.
+
+Similarly, if we would like to switch from `SyncBN` to `BN` or `MMSyncBN`, we need to substitute every `norm_cfg` in the config.
+
+```python
+_base_ = '../pspnet/pspnet_r50-d8_4xb4-40k_cityscapes-512x1024.py'
+norm_cfg = dict(type='BN', requires_grad=True)
+model = dict(
+    backbone=dict(norm_cfg=norm_cfg),
+    decode_head=dict(norm_cfg=norm_cfg),
+    auxiliary_head=dict(norm_cfg=norm_cfg))
+```
+
+## Modify config through script arguments
+
+In the [training script](https://github.com/open-mmlab/mmsegmentation/blob/1.x/tools/train.py) and the [testing script](https://github.com/open-mmlab/mmsegmentation/blob/1.x/tools/test.py), we support the script argument `--cfg-options`, which lets users override some settings in the used config; key-value pairs in `xxx=yyy` format will be merged into the config file.
+
+For example, this is a simplified script `demo_script.py`:
+
+```python
+import argparse
+
+from mmengine.config import Config, DictAction
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Script Example')
+    parser.add_argument('config', help='train config file path')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    args = parser.parse_args()
+    return args
+
+def main():
+    args = parse_args()
+
+    cfg = Config.fromfile(args.config)
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+
+    print(cfg)
+
+if __name__ == '__main__':
+    main()
+```
+
+An example config file `demo_config.py` is as follows:
+
+```python
+backbone = dict(
+    type='ResNetV1c',
+    depth=50,
+    num_stages=4,
+    out_indices=(0, 1, 2, 3),
+    dilations=(1, 1, 2, 4),
+    strides=(1, 2, 1, 1),
+    norm_eval=False,
+    style='pytorch',
+    contract_dilation=True)
+```
+
+Run `demo_script.py`:
+
+```shell
+python demo_script.py demo_config.py
+```
+
+```shell
+Config (path: demo_config.py): {'backbone': {'type': 'ResNetV1c', 'depth': 50, 'num_stages': 4, 'out_indices': (0, 1, 2, 3), 'dilations': (1, 1, 2, 4), 'strides': (1, 2, 1, 1), 'norm_eval': False, 'style': 'pytorch', 'contract_dilation': True}}
+```
+
+Modify the config through script arguments:
+
+```shell
+python demo_script.py demo_config.py --cfg-options backbone.depth=101
+```
+
+```shell
+Config (path: demo_config.py): {'backbone': {'type': 'ResNetV1c', 'depth': 101, 'num_stages': 4, 'out_indices': (0, 1, 2, 3), 'dilations': (1, 1, 2, 4), 'strides': (1, 2, 1, 1), 'norm_eval': False, 'style': 'pytorch', 'contract_dilation': True}}
+```
+
+- Update values of lists/tuples.
+
+  If the value to be updated is a list or a tuple: for example, the config file `demo_config.py` sets `strides=(1, 2, 1, 1)` in `backbone`.
+  If you want to change this key, you may specify it in two ways:
+
+  1. `--cfg-options backbone.strides="(1, 1, 1, 1)"`. Note that the quotation mark " is necessary to support list/tuple data types.
+
+     ```shell
+     python demo_script.py demo_config.py --cfg-options backbone.strides="(1, 1, 1, 1)"
+     ```
+
+     ```shell
+     Config (path: demo_config.py): {'backbone': {'type': 'ResNetV1c', 'depth': 50, 'num_stages': 4, 'out_indices': (0, 1, 2, 3), 'dilations': (1, 1, 2, 4), 'strides': (1, 1, 1, 1), 'norm_eval': False, 'style': 'pytorch', 'contract_dilation': True}}
+     ```
+
+  2. `--cfg-options backbone.strides=1,1,1,1`. Note that **NO** white space is allowed in the specified value.
+     In addition, if the original type is a tuple, it will be automatically converted to a list in this way.
+
+     ```shell
+     python demo_script.py demo_config.py --cfg-options backbone.strides=1,1,1,1
+     ```
+
+     ```shell
+     Config (path: demo_config.py): {'backbone': {'type': 'ResNetV1c', 'depth': 50, 'num_stages': 4, 'out_indices': (0, 1, 2, 3), 'dilations': (1, 1, 2, 4), 'strides': [1, 1, 1, 1], 'norm_eval': False, 'style': 'pytorch', 'contract_dilation': True}}
+     ```
+
+```{note}
+  This modification method only supports modifying configuration items of string, int, float, boolean, None, list and tuple types.
+  More specifically, for list and tuple types, the elements inside them must also be one of the above seven types.
+```
diff --git a/docs/en/user_guides/2_dataset_prepare.md b/docs/en/user_guides/2_dataset_prepare.md
new file mode 100644
index 0000000000..2816a51f0d
--- /dev/null
+++ b/docs/en/user_guides/2_dataset_prepare.md
@@ -0,0 +1,754 @@
+# Tutorial 2: Prepare datasets
+
+It is recommended to symlink the dataset root to `$MMSEGMENTATION/data`.
+If your folder structure is different, you may need to change the corresponding paths in config files.
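+
+For example, a minimal sketch of the recommended symlink setup, where `/data/datasets` is a placeholder for wherever your datasets actually live:
+
+```shell
+# run from the mmsegmentation repository root;
+# /data/datasets is a placeholder for your actual dataset root
+ln -s /data/datasets ./data
+```
+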
+For users in China, we also recommend you get the dsdl dataset from our opensource platform [OpenDataLab](https://opendatalab.com/), for better download and use experience,here is an example: [DSDLReadme](../../../configs/dsdl/README.md), welcome to try. + +```none +mmsegmentation +├── mmseg +├── tools +├── configs +├── data +│ ├── cityscapes +│ │ ├── leftImg8bit +│ │ │ ├── train +│ │ │ ├── val +│ │ ├── gtFine +│ │ │ ├── train +│ │ │ ├── val +│ ├── VOCdevkit +│ │ ├── VOC2012 +│ │ │ ├── JPEGImages +│ │ │ ├── SegmentationClass +│ │ │ ├── ImageSets +│ │ │ │ ├── Segmentation +│ │ ├── VOC2010 +│ │ │ ├── JPEGImages +│ │ │ ├── SegmentationClassContext +│ │ │ ├── ImageSets +│ │ │ │ ├── SegmentationContext +│ │ │ │ │ ├── train.txt +│ │ │ │ │ ├── val.txt +│ │ │ ├── trainval_merged.json +│ │ ├── VOCaug +│ │ │ ├── dataset +│ │ │ │ ├── cls +│ ├── ade +│ │ ├── ADEChallengeData2016 +│ │ │ ├── annotations +│ │ │ │ ├── training +│ │ │ │ ├── validation +│ │ │ ├── images +│ │ │ │ ├── training +│ │ │ │ ├── validation +│ ├── coco_stuff10k +│ │ ├── images +│ │ │ ├── train2014 +│ │ │ ├── test2014 +│ │ ├── annotations +│ │ │ ├── train2014 +│ │ │ ├── test2014 +│ │ ├── imagesLists +│ │ │ ├── train.txt +│ │ │ ├── test.txt +│ │ │ ├── all.txt +│ ├── coco_stuff164k +│ │ ├── images +│ │ │ ├── train2017 +│ │ │ ├── val2017 +│ │ ├── annotations +│ │ │ ├── train2017 +│ │ │ ├── val2017 +│ ├── CHASE_DB1 +│ │ ├── images +│ │ │ ├── training +│ │ │ ├── validation +│ │ ├── annotations +│ │ │ ├── training +│ │ │ ├── validation +│ ├── DRIVE +│ │ ├── images +│ │ │ ├── training +│ │ │ ├── validation +│ │ ├── annotations +│ │ │ ├── training +│ │ │ ├── validation +│ ├── HRF +│ │ ├── images +│ │ │ ├── training +│ │ │ ├── validation +│ │ ├── annotations +│ │ │ ├── training +│ │ │ ├── validation +│ ├── STARE +│ │ ├── images +│ │ │ ├── training +│ │ │ ├── validation +│ │ ├── annotations +│ │ │ ├── training +│ │ │ ├── validation +| ├── dark_zurich +| │   ├── gps +| │   │   ├── val +| │   │   └── val_ref +| │   ├── gt +| │   │   └── val +| │   ├── LICENSE.txt +| │   ├── lists_file_names +| │   │   ├── val_filenames.txt +| │   │   └── val_ref_filenames.txt +| │   ├── README.md +| │   └── rgb_anon +| │   | ├── val +| │   | └── val_ref +| ├── NighttimeDrivingTest +| | ├── gtCoarse_daytime_trainvaltest +| | │   └── test +| | │   └── night +| | └── leftImg8bit +| | | └── test +| | | └── night +│ ├── loveDA +│ │ ├── img_dir +│ │ │ ├── train +│ │ │ ├── val +│ │ │ ├── test +│ │ ├── ann_dir +│ │ │ ├── train +│ │ │ ├── val +│ ├── potsdam +│ │ ├── img_dir +│ │ │ ├── train +│ │ │ ├── val +│ │ ├── ann_dir +│ │ │ ├── train +│ │ │ ├── val +│ ├── vaihingen +│ │ ├── img_dir +│ │ │ ├── train +│ │ │ ├── val +│ │ ├── ann_dir +│ │ │ ├── train +│ │ │ ├── val +│ ├── iSAID +│ │ ├── img_dir +│ │ │ ├── train +│ │ │ ├── val +│ │ │ ├── test +│ │ ├── ann_dir +│ │ │ ├── train +│ │ │ ├── val +│ ├── synapse +│ │ ├── img_dir +│ │ │ ├── train +│ │ │ ├── val +│ │ ├── ann_dir +│ │ │ ├── train +│ │ │ ├── val +│ ├── REFUGE +│ │ ├── images +│ │ │ ├── training +│ │ │ ├── validation +│ │ │ ├── test +│ │ ├── annotations +│ │ │ ├── training +│ │ │ ├── validation +│ │ │ ├── test +│ ├── mapillary +│ │ ├── training +│ │ │ ├── images +│ │ │ ├── v1.2 +| │ │ │ ├── instances +| │ │ │ ├── labels +| │   │   │ └── panoptic +│ │ │ ├── v2.0 +| │ │ │ ├── instances +| │ │ │ ├── labels +| │ │ │ ├── panoptic +| │   │   │ └── polygons +│ │ ├── validation +│ │ │ ├── images +| │ │ ├── v1.2 +| │ │ │ ├── instances +| │ │ │ ├── labels +| │   │   │ └── panoptic +│ │ │ ├── v2.0 +| │ │ │ ├── instances +| │ 
│ │ ├── labels +| │ │ │ ├── panoptic +| │   │   │ └── polygons +│ ├── bdd100k +│ │ ├── images +│ │ │ └── 10k +| │ │ │ ├── test +| │ │ │ ├── train +| │   │   │ └── val +│ │ └── labels +│ │ │ └── sem_seg +| │ │ │ ├── colormaps +| │ │ │ │ ├──train +| │ │ │ │ └──val +| │ │ │ ├── masks +| │ │ │ │ ├──train +| │ │ │ │ └──val +| │ │ │ ├── polygons +| │ │ │ │ ├──sem_seg_train.json +| │ │ │ │ └──sem_seg_val.json +| │   │   │ └── rles +| │ │ │ │ ├──sem_seg_train.json +| │ │ │ │ └──sem_seg_val.json +│ ├── nyu +│ │ ├── images +│ │ │ ├── train +│ │ │ ├── test +│ │ ├── annotations +│ │ │ ├── train +│ │ │ ├── test +``` + +## Download dataset via MIM + +By using [OpenXLab](https://openxlab.org.cn/datasets), you can obtain free formatted datasets in various fields. Through the search function of the platform, you may address the dataset they look for quickly and easily. Using the formatted datasets from the platform, you can efficiently conduct tasks across datasets. + +If you use MIM to download, make sure that the version is greater than v0.3.8. You can use the following command to update, install, login and download the dataset: + +```shell +# upgrade your MIM +pip install -U openmim + +# install OpenXLab CLI tools +pip install -U openxlab +# log in OpenXLab +openxlab login + +# download ADE20K by MIM +mim download mmsegmentation --dataset ade20k +``` + +## Cityscapes + +The data could be found [here](https://www.cityscapes-dataset.com/downloads/) after registration. + +By convention, `**labelTrainIds.png` are used for cityscapes training. +We provided a [script](https://github.com/open-mmlab/mmsegmentation/blob/1.x/tools/dataset_converters/cityscapes.py) based on [cityscapesscripts](https://github.com/mcordts/cityscapesScripts)to generate `**labelTrainIds.png`. + +```shell +# --nproc means 8 process for conversion, which could be omitted as well. +python tools/dataset_converters/cityscapes.py data/cityscapes --nproc 8 +``` + +## Pascal VOC + +Pascal VOC 2012 could be downloaded from [here](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar). +Beside, most recent works on Pascal VOC dataset usually exploit extra augmentation data, which could be found [here](http://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/semantic_contours/benchmark.tgz). + +If you would like to use augmented VOC dataset, please run following command to convert augmentation annotations into proper format. + +```shell +# --nproc means 8 process for conversion, which could be omitted as well. +python tools/dataset_converters/voc_aug.py data/VOCdevkit data/VOCdevkit/VOCaug --nproc 8 +``` + +Please refer to [concat dataset](../advanced_guides/add_datasets.md#concatenate-dataset) and [voc_aug config example](../../../configs/_base_/datasets/pascal_voc12_aug.py) for details about how to concatenate them and train them together. + +## ADE20K + +The training and validation set of ADE20K could be download from this [link](http://data.csail.mit.edu/places/ADEchallenge/ADEChallengeData2016.zip). +We may also download test set from [here](http://data.csail.mit.edu/places/ADEchallenge/release_test.zip). + +## Pascal Context + +The training and validation set of Pascal Context could be download from [here](http://host.robots.ox.ac.uk/pascal/VOC/voc2010/VOCtrainval_03-May-2010.tar). You may also download test set from [here](http://host.robots.ox.ac.uk:8080/eval/downloads/VOC2010test.tar) after registration. 
+
+To split the training and validation set from the original dataset, you may download `trainval_merged.json` from [here](https://codalabuser.blob.core.windows.net/public/trainval_merged.json).
+
+If you would like to use the Pascal Context dataset, please install [Detail](https://github.com/zhanghang1989/detail-api) and then run the following command to convert the annotations into the proper format.
+
+```shell
+python tools/dataset_converters/pascal_context.py data/VOCdevkit data/VOCdevkit/VOC2010/trainval_merged.json
+```
+
+## COCO Stuff 10k
+
+The data can be downloaded from [here](http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/cocostuff-10k-v1.1.zip) with `wget`.
+
+For the COCO Stuff 10k dataset, please run the following commands to download and convert the dataset.
+
+```shell
+# download
+mkdir coco_stuff10k && cd coco_stuff10k
+wget http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/cocostuff-10k-v1.1.zip
+
+# unzip
+unzip cocostuff-10k-v1.1.zip
+
+# --nproc 8 means using 8 processes for conversion; it can be omitted.
+python tools/dataset_converters/coco_stuff10k.py /path/to/coco_stuff10k --nproc 8
+```
+
+By convention, mask labels in `/path/to/coco_stuff10k/annotations/*2014/*_labelTrainIds.png` are used for COCO Stuff 10k training and testing.
+
+## COCO Stuff 164k
+
+For the COCO Stuff 164k dataset, please run the following commands to download and convert the augmented dataset.
+
+```shell
+# download
+mkdir coco_stuff164k && cd coco_stuff164k
+wget http://images.cocodataset.org/zips/train2017.zip
+wget http://images.cocodataset.org/zips/val2017.zip
+wget http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/stuffthingmaps_trainval2017.zip
+
+# unzip
+unzip train2017.zip -d images/
+unzip val2017.zip -d images/
+unzip stuffthingmaps_trainval2017.zip -d annotations/
+
+# --nproc 8 means using 8 processes for conversion; it can be omitted.
+python tools/dataset_converters/coco_stuff164k.py /path/to/coco_stuff164k --nproc 8
+```
+
+By convention, mask labels in `/path/to/coco_stuff164k/annotations/*2017/*_labelTrainIds.png` are used for COCO Stuff 164k training and testing.
+
+The details of this dataset can be found [here](https://github.com/nightrome/cocostuff#downloads).
+
+## CHASE DB1
+
+The training and validation set of CHASE DB1 can be downloaded from [here](https://staffnet.kingston.ac.uk/~ku15565/CHASE_DB1/assets/CHASEDB1.zip).
+
+To convert the CHASE DB1 dataset to MMSegmentation format, run the following command:
+
+```shell
+python tools/dataset_converters/chase_db1.py /path/to/CHASEDB1.zip
+```
+
+The script will generate the directory structure automatically.
+
+## DRIVE
+
+The training and validation set of DRIVE can be downloaded from [here](https://drive.grand-challenge.org/). Before that, you should register an account. Currently, '1st_manual' is not provided officially.
+
+To convert the DRIVE dataset to MMSegmentation format, run the following command:
+
+```shell
+python tools/dataset_converters/drive.py /path/to/training.zip /path/to/test.zip
+```
+
+The script will generate the directory structure automatically.
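+
+To double-check that a converter produced the expected layout before training, a few lines of Python are enough. The following is only a sketch based on the directory tree at the top of this page; `DATA_ROOT` and the dataset list are our placeholders, not part of MMSegmentation.
+
+```python
+import os
+
+DATA_ROOT = 'data'  # assumed data root; adjust if yours differs
+
+# CHASE DB1 and DRIVE converters both create images/annotations with
+# training/validation splits, as shown in the tree above.
+for dataset in ('CHASE_DB1', 'DRIVE'):
+    for sub in ('images/training', 'images/validation',
+                'annotations/training', 'annotations/validation'):
+        path = os.path.join(DATA_ROOT, dataset, sub)
+        print(f'{path}: {"ok" if os.path.isdir(path) else "missing"}')
+```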
+
+## HRF
+
+First, download [healthy.zip](https://www5.cs.fau.de/fileadmin/research/datasets/fundus-images/healthy.zip), [glaucoma.zip](https://www5.cs.fau.de/fileadmin/research/datasets/fundus-images/glaucoma.zip), [diabetic_retinopathy.zip](https://www5.cs.fau.de/fileadmin/research/datasets/fundus-images/diabetic_retinopathy.zip), [healthy_manualsegm.zip](https://www5.cs.fau.de/fileadmin/research/datasets/fundus-images/healthy_manualsegm.zip), [glaucoma_manualsegm.zip](https://www5.cs.fau.de/fileadmin/research/datasets/fundus-images/glaucoma_manualsegm.zip) and [diabetic_retinopathy_manualsegm.zip](https://www5.cs.fau.de/fileadmin/research/datasets/fundus-images/diabetic_retinopathy_manualsegm.zip).
+
+To convert the HRF dataset to MMSegmentation format, run the following command:
+
+```shell
+python tools/dataset_converters/hrf.py /path/to/healthy.zip /path/to/healthy_manualsegm.zip /path/to/glaucoma.zip /path/to/glaucoma_manualsegm.zip /path/to/diabetic_retinopathy.zip /path/to/diabetic_retinopathy_manualsegm.zip
+```
+
+The script will generate the directory structure automatically.
+
+## STARE
+
+First, download [stare-images.tar](http://cecas.clemson.edu/~ahoover/stare/probing/stare-images.tar), [labels-ah.tar](http://cecas.clemson.edu/~ahoover/stare/probing/labels-ah.tar) and [labels-vk.tar](http://cecas.clemson.edu/~ahoover/stare/probing/labels-vk.tar).
+
+To convert the STARE dataset to MMSegmentation format, run the following command:
+
+```shell
+python tools/dataset_converters/stare.py /path/to/stare-images.tar /path/to/labels-ah.tar /path/to/labels-vk.tar
+```
+
+The script will generate the directory structure automatically.
+
+## Dark Zurich
+
+Since we only support testing models on this dataset, you only need to download [the validation set](https://data.vision.ee.ethz.ch/csakarid/shared/GCMA_UIoU/Dark_Zurich_val_anon.zip).
+
+## Nighttime Driving
+
+Since we only support testing models on this dataset, you only need to download [the test set](http://data.vision.ee.ethz.ch/daid/NighttimeDriving/NighttimeDrivingTest.zip).
+
+## LoveDA
+
+The data can be downloaded from Google Drive [here](https://drive.google.com/drive/folders/1ibYV0qwn4yuuh068Rnc-w4tPi0U0c-ti?usp=sharing).
+
+Alternatively, it can be downloaded from [zenodo](https://zenodo.org/record/5706578#.YZvN7SYRXdF) by running the following commands:
+
+```shell
+# Download Train.zip
+wget https://zenodo.org/record/5706578/files/Train.zip
+# Download Val.zip
+wget https://zenodo.org/record/5706578/files/Val.zip
+# Download Test.zip
+wget https://zenodo.org/record/5706578/files/Test.zip
+```
+
+For the LoveDA dataset, please run the following command to re-organize it.
+
+```shell
+python tools/dataset_converters/loveda.py /path/to/loveDA
+```
+
+Instructions for using a trained model to predict on the LoveDA test set and submitting the results to the evaluation server can be found [here](https://codalab.lisn.upsaclay.fr/competitions/421).
+
+More details about LoveDA can be found [here](https://github.com/Junjue-Wang/LoveDA).
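+
+If you just need raw prediction masks for the test images, one possible starting point is the `MMSegInferencer` API described later in these docs. The snippet below is only a sketch: check the model name against `MMSegInferencer.list_models('mmseg')` before using it, adjust the paths to your own layout, and note that the exact submission format is defined by the evaluation server.
+
+```python
+from mmseg.apis import MMSegInferencer
+
+# Sketch only: the model name below is a placeholder for any LoveDA model
+# listed by MMSegInferencer.list_models('mmseg').
+inferencer = MMSegInferencer(model='pspnet_r50-d8_4xb4-80k_loveda-512x512')
+
+# Run on the re-organized test images; raw label masks are written to
+# outputs/pred (out_dir plus the pred_out_dir subdirectory).
+inferencer('data/loveDA/img_dir/test', out_dir='outputs', pred_out_dir='pred')
+```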
+## ISPRS Potsdam
+
+The [Potsdam](https://www.isprs.org/education/benchmarks/UrbanSemLab/2d-sem-label-potsdam.aspx) dataset is for urban semantic segmentation, used in the 2D Semantic Labeling Contest - Potsdam.
+
+The dataset can be requested at the challenge [homepage](https://www.isprs.org/education/benchmarks/UrbanSemLab/default.aspx).
+Alternatively, it can be downloaded from [BaiduNetdisk](https://pan.baidu.com/s/1K-cLVZnd1X7d8c26FQ-nGg?pwd=mseg) (password: mseg), [Google Drive](https://drive.google.com/drive/folders/1w3EJuyUGet6_qmLwGAWZ9vw5ogeG0zLz?usp=sharing) or [OpenDataLab](https://opendatalab.com/ISPRS_Potsdam/download).
+The '2_Ortho_RGB.zip' and '5_Labels_all_noBoundary.zip' files are required.
+
+For the Potsdam dataset, please run the following command to re-organize it.
+
+```shell
+python tools/dataset_converters/potsdam.py /path/to/potsdam
+```
+
+In our default setting, it will generate 3456 images for training and 2016 images for validation.
+
+## ISPRS Vaihingen
+
+The [Vaihingen](https://www2.isprs.org/commissions/comm2/wg4/benchmark/2d-sem-label-vaihingen/) dataset is for urban semantic segmentation, used in the 2D Semantic Labeling Contest - Vaihingen.
+
+The dataset can be requested at the challenge [homepage](https://www2.isprs.org/commissions/comm2/wg4/benchmark/data-request-form/).
+Alternatively, it can be downloaded from [BaiduNetdisk](https://pan.baidu.com/s/109D3WLrLafsuYtLeerLiiA?pwd=mseg) (password: mseg) or [Google Drive](https://drive.google.com/drive/folders/1w3NhvLVA2myVZqOn2pbiDXngNC7NTP_t?usp=sharing).
+The 'ISPRS_semantic_labeling_Vaihingen.zip' and 'ISPRS_semantic_labeling_Vaihingen_ground_truth_eroded_COMPLETE.zip' files are required.
+
+For the Vaihingen dataset, please run the following command to re-organize it.
+
+```shell
+python tools/dataset_converters/vaihingen.py /path/to/vaihingen
+```
+
+In our default setting (`clip_size`=512, `stride_size`=256), it will generate 344 images for training and 398 images for validation.
+
+## iSAID
+
+The images can be downloaded from [DOTA-v1.0](https://captain-whu.github.io/DOTA/dataset.html) (train/val/test).
+
+The annotations can be downloaded from [iSAID](https://captain-whu.github.io/iSAID/dataset.html) (train/val).
+
+iSAID is a large-scale dataset for instance segmentation (and also semantic segmentation) in aerial images.
+
+After downloading the iSAID dataset, you may need to organize it as follows before conversion.
+
+```none
+├── data
+│ ├── iSAID
+│ │ ├── train
+│ │ │ ├── images
+│ │ │ │ ├── part1.zip
+│ │ │ │ ├── part2.zip
+│ │ │ │ ├── part3.zip
+│ │ │ ├── Semantic_masks
+│ │ │ │ ├── images.zip
+│ │ ├── val
+│ │ │ ├── images
+│ │ │ │ ├── part1.zip
+│ │ │ ├── Semantic_masks
+│ │ │ │ ├── images.zip
+│ │ ├── test
+│ │ │ ├── images
+│ │ │ │ ├── part1.zip
+│ │ │ │ ├── part2.zip
+```
+
+```shell
+python tools/dataset_converters/isaid.py /path/to/iSAID
+```
+
+In our default setting (`patch_width`=896, `patch_height`=896, `overlap_area`=384), it will generate 33978 images for training and 11644 images for validation.
+
+## LIP (Look Into Person) dataset
+
+This dataset can be downloaded from [this page](https://lip.sysuhcp.com/overview.php).
+
+Please run the following commands to unzip the dataset.
+
+```shell
+unzip LIP.zip
+cd LIP
+unzip TrainVal_images.zip
+unzip TrainVal_parsing_annotations.zip
+cd TrainVal_parsing_annotations
+unzip TrainVal_parsing_annotations.zip
+mv train_segmentations ../
+mv val_segmentations ../
+cd ..
+```
+
+The contents of the LIP dataset include:
+
+```none
+├── data
+│ ├── LIP
+│ │ ├── train_images
+│   │ │ ├── 1000_1234574.jpg
+│   │ │ ├── ...
+│ │ ├── train_segmentations
+│   │ │ ├── 1000_1234574.png
+│   │ │ ├── ...
+│ │ ├── val_images
+│   │ │ ├── 100034_483681.jpg
+│   │ │ ├── ...
+│ │ ├── val_segmentations
+│   │ │ ├── 100034_483681.png
+│   │ │ ├── ...
+```
+
+## Synapse dataset
+
+This dataset can be downloaded from [this page](https://www.synapse.org/#!Synapse:syn3193805/wiki/).
+
+We follow the data preparation setting of [TransUNet](https://arxiv.org/abs/2102.04306), which splits the original training set (30 scans) into a new training set (18 scans) and a validation set (12 scans). Please run the following commands to prepare the dataset.
+
+```shell
+unzip RawData.zip
+cd ./RawData/Training
+```
+
+Then create `train.txt` and `val.txt` to split the dataset.
+
+According to TransUNet, the dataset is divided as follows:
+
+train.txt
+
+```none
+img0005.nii.gz
+img0006.nii.gz
+img0007.nii.gz
+img0009.nii.gz
+img0010.nii.gz
+img0021.nii.gz
+img0023.nii.gz
+img0024.nii.gz
+img0026.nii.gz
+img0027.nii.gz
+img0028.nii.gz
+img0030.nii.gz
+img0031.nii.gz
+img0033.nii.gz
+img0034.nii.gz
+img0037.nii.gz
+img0039.nii.gz
+img0040.nii.gz
+```
+
+val.txt
+
+```none
+img0008.nii.gz
+img0022.nii.gz
+img0038.nii.gz
+img0036.nii.gz
+img0032.nii.gz
+img0002.nii.gz
+img0029.nii.gz
+img0003.nii.gz
+img0001.nii.gz
+img0004.nii.gz
+img0025.nii.gz
+img0035.nii.gz
+```
+
+The contents of the Synapse dataset include:
+
+```none
+├── Training
+│ ├── img
+│ │ ├── img0001.nii.gz
+│ │ ├── img0002.nii.gz
+│ │ ├── ...
+│ ├── label
+│ │ ├── label0001.nii.gz
+│ │ ├── label0002.nii.gz
+│ │ ├── ...
+│ ├── train.txt
+│ ├── val.txt
+```
+
+Then, use this command to convert the Synapse dataset.
+
+```shell
+python tools/dataset_converters/synapse.py --dataset-path /path/to/synapse
+```
+
+Note that MMSegmentation's default evaluation metrics (such as the mean Dice value) are calculated on 2D slices, so the results are not directly comparable to the 3D-scan results reported in papers such as [TransUNet](https://arxiv.org/abs/2102.04306).
+
+## REFUGE
+
+Register at the [REFUGE Challenge](https://refuge.grand-challenge.org) and download the [REFUGE dataset](https://refuge.grand-challenge.org/REFUGE2Download).
+
+Then, unzip `REFUGE2.zip`; the contents of the original dataset are:
+
+```none
+├── REFUGE2
+│ ├── REFUGE2
+│ │ ├── Annotation-Training400.zip
+│ │ ├── REFUGE-Test400.zip
+│ │ ├── REFUGE-Test-GT.zip
+│ │ ├── REFUGE-Training400.zip
+│ │ ├── REFUGE-Validation400.zip
+│ │ ├── REFUGE-Validation400-GT.zip
+│ ├── __MACOSX
+```
+
+Please run the following command to convert the REFUGE dataset:
+
+```shell
+python tools/convert_datasets/refuge.py --raw_data_root=/path/to/refuge/REFUGE2/REFUGE2
+```
+
+The script will generate the directory structure below:
+
+```none
+│ ├── REFUGE
+│ │ ├── images
+│ │ │ ├── training
+│ │ │ ├── validation
+│ │ │ ├── test
+│ │ ├── annotations
+│ │ │ ├── training
+│ │ │ ├── validation
+│ │ │ ├── test
+```
+
+It includes 400 images for training, 400 for validation and 400 for testing, the same split as the REFUGE 2018 dataset.
+
+## Mapillary Vistas Datasets
+
+- The dataset can be downloaded [here](https://www.mapillary.com/dataset/vistas) after registration.
+
+- The Mapillary Vistas Dataset uses 8-bit images with a color palette to store labels, so no conversion is required.
+
+- Assuming you have put the dataset zip file in `mmsegmentation/data/mapillary`,
+
+- please run the following commands to unzip the dataset.
+
+  ```bash
+  cd data/mapillary
+  unzip An-ZjB1Zm61yAZG0ozTymz8I8NqI4x0MrYrh26dq7kPgfu8vf9ImrdaOAVOFYbJ2pNAgUnVGBmbue9lTgdBOb5BbKXIpFs0fpYWqACbrQDChAA2fdX0zS9PcHu7fY8c-FOvyBVxPNYNFQuM.zip
+  ```
+
+- After unzipping, you will get the Mapillary Vistas Dataset in the structure below; the semantic segmentation mask labels are in the `labels` folder.
+
+  ```none
+  mmsegmentation
+  ├── mmseg
+  ├── tools
+  ├── configs
+  ├── data
+  │ ├── mapillary
+  │ │ ├── training
+  │ │ │ ├── images
+  │ │ │ ├── v1.2
+  | │ │ │ ├── instances
+  | │ │ │ ├── labels
+  | │   │   │ └── panoptic
+  │ │ │ ├── v2.0
+  | │ │ │ ├── instances
+  | │ │ │ ├── labels
+  | │ │ │ ├── panoptic
+  | │   │   │ └── polygons
+  │ │ ├── validation
+  │ │ │ ├── images
+  | │ │ ├── v1.2
+  | │ │ │ ├── instances
+  | │ │ │ ├── labels
+  | │   │   │ └── panoptic
+  │ │ │ ├── v2.0
+  | │ │ │ ├── instances
+  | │ │ │ ├── labels
+  | │ │ │ ├── panoptic
+  | │   │   │ └── polygons
+  ```
+
+- You can select the dataset version with `MapillaryDataset_v1` or `MapillaryDataset_v2` in your configs.
+  See the Mapillary Vistas Datasets config files for [V1.2](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/_base_/datasets/mapillary_v1.py) and [V2.0](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/_base_/datasets/mapillary_v2.py).
+
+## LEVIR-CD
+
+[LEVIR-CD](https://justchenhao.github.io/LEVIR/) is a large-scale remote sensing change detection dataset for buildings.
+
+Download the dataset from [here](https://justchenhao.github.io/LEVIR/).
+
+The supplementary version of the dataset can be requested on the [homepage](https://github.com/S2Looking/Dataset).
+
+Please download the supplementary version of the dataset, then unzip `LEVIR-CD+.zip`; the contents of the original dataset are:
+
+```none
+│ ├── LEVIR-CD+
+│ │ ├── train
+│ │ │ ├── A
+│ │ │ ├── B
+│ │ │ ├── label
+│ │ ├── test
+│ │ │ ├── A
+│ │ │ ├── B
+│ │ │ ├── label
+```
+
+For the LEVIR-CD dataset, please run the following command to crop the images without overlap:
+
+```shell
+python tools/dataset_converters/levircd.py --dataset-path /path/to/LEVIR-CD+ --out_dir /path/to/LEVIR-CD
+```
+
+The size of the cropped images is 256x256, consistent with the original paper.
+
+## BDD100K
+
+- You can download the BDD100K dataset from [here](https://bdd-data.berkeley.edu/) after registration.
+
+- You can download the images and masks by clicking the `10K Images` and `Segmentation` buttons.
+
+- After downloading, unzip with the following commands:
+
+  ```bash
+  unzip ~/bdd100k_images_10k.zip -d ~/mmsegmentation/data/
+  unzip ~/bdd100k_sem_seg_labels_trainval.zip -d ~/mmsegmentation/data/
+  ```
+
+- This yields the following structure:
+
+```none
+mmsegmentation
+├── mmseg
+├── tools
+├── configs
+├── data
+│ ├── bdd100k
+│ │ ├── images
+│ │ │ └── 10k
+| │ │ │ ├── test
+| │ │ │ ├── train
+| │   │   │ └── val
+│ │ └── labels
+│ │ │ └── sem_seg
+| │ │ │ ├── colormaps
+| │ │ │ │ ├──train
+| │ │ │ │ └──val
+| │ │ │ ├── masks
+| │ │ │ │ ├──train
+| │ │ │ │ └──val
+| │ │ │ ├── polygons
+| │ │ │ │ ├──sem_seg_train.json
+| │ │ │ │ └──sem_seg_val.json
+| │   │   │ └── rles
+| │ │ │ │ ├──sem_seg_train.json
+| │ │ │ │ └──sem_seg_val.json
+```
+
+## NYU
+
+- To access the NYU dataset, you can download it from [this link](https://drive.google.com/file/d/1wC-io-14RCIL4XTUrQLk6lBqU2AexLVp/view?usp=share_link).
+
+- Once the download is complete, you can utilize the [tools/dataset_converters/nyu.py](/tools/dataset_converters/nyu.py) script to extract and organize the data into the required format. Run the following command in your terminal:
+
+  ```bash
+  python tools/dataset_converters/nyu.py nyu.zip
+  ```
diff --git a/docs/en/user_guides/3_inference.md b/docs/en/user_guides/3_inference.md
new file mode 100644
index 0000000000..cacebd2f60
--- /dev/null
+++ b/docs/en/user_guides/3_inference.md
@@ -0,0 +1,244 @@
+# Tutorial 3: Inference with existing models
+
+MMSegmentation provides pre-trained models for semantic segmentation in the [Model Zoo](../model_zoo.md), and supports multiple standard datasets, including Cityscapes, ADE20K, etc.
+This note shows how to use existing models to perform inference on given images.
+For how to test existing models on standard datasets, please see this [guide](./4_train_test.md).
+
+MMSegmentation provides several interfaces for users to easily use pre-trained models for inference.
+
+- [Tutorial 3: Inference with existing models](#tutorial-3-inference-with-existing-models)
+  - [Inferencer](#inferencer)
+    - [Basic Usage](#basic-usage)
+    - [Initialization](#initialization)
+    - [Visualize prediction](#visualize-prediction)
+    - [List model](#list-model)
+  - [Inference API](#inference-api)
+    - [mmseg.apis.init_model](#mmsegapisinit_model)
+    - [mmseg.apis.inference_model](#mmsegapisinference_model)
+    - [mmseg.apis.show_result_pyplot](#mmsegapisshow_result_pyplot)
+
+## Inferencer
+
+`MMSegInferencer` is the most **convenient** way to use models in MMSegmentation. You can get the segmentation mask of an image with only 3 lines of code.
+
+### Basic Usage
+
+The following example shows how to use `MMSegInferencer` to perform inference on a single image.
+
+```
+>>> from mmseg.apis import MMSegInferencer
+>>> # Load models into memory
+>>> inferencer = MMSegInferencer(model='deeplabv3plus_r18-d8_4xb2-80k_cityscapes-512x1024')
+>>> # Inference
+>>> inferencer('demo/demo.png', show=True)
+```
+
+The visualization result should look like:
+
+<!-- example visualization of the predicted segmentation mask for demo/demo.png -->
+
+
+Moreover, you can use `MMSegInferencer` to process a list of images:
+
+```
+# Input a list of images
+>>> images = [image1, image2, ...] # image1 can be a file path or a np.ndarray
+>>> inferencer(images, show=True, wait_time=0.5) # wait_time is the delay time, and 0 means forever
+
+# Or input an image directory
+>>> images = $IMAGESDIR
+>>> inferencer(images, show=True, wait_time=0.5)
+
+# Save visualized rendering color maps and predicted results
+# out_dir is the directory to save the output results, img_out_dir and pred_out_dir are subdirectories of out_dir
+# to save visualized rendering color maps and predicted results
+>>> inferencer(images, out_dir='outputs', img_out_dir='vis', pred_out_dir='pred')
+```
+
+The inferencer has an optional parameter, `return_datasamples`, whose default value is False; by default, the return value of the inferencer is a `dict` with 2 keys, 'visualization' and 'predictions'.
+If `return_datasamples=True`, the inferencer will return a [`SegDataSample`](../advanced_guides/structures.md), or a list of them.
+
+```
+result = inferencer('demo/demo.png')
+# result is a `dict` including 2 keys 'visualization' and 'predictions'
+# 'visualization' includes color segmentation map
+print(result['visualization'].shape)
+# (512, 683, 3)
+
+# 'predictions' includes segmentation mask with label indices
+print(result['predictions'].shape)
+# (512, 683)
+
+result = inferencer('demo/demo.png', return_datasamples=True)
+print(type(result))
+# <class 'mmseg.structures.seg_data_sample.SegDataSample'>
+
+# Input a list of images
+results = inferencer(images)
+# The output is a list
+print(type(results['visualization']), results['visualization'][0].shape)
+# <class 'list'> (512, 683, 3)
+print(type(results['predictions']), results['predictions'][0].shape)
+# <class 'list'> (512, 683)
+
+results = inferencer(images, return_datasamples=True)
+# <class 'list'>
+print(type(results[0]))
+# <class 'mmseg.structures.seg_data_sample.SegDataSample'>
+```
+
+### Initialization
+
+`MMSegInferencer` must be initialized from a `model`, which can be a model name, a `Config` object, or even a path to a config file.
+The model names can be found in the models' metafiles (configs/xxx/metafile.yaml); for example, one model name of MaskFormer is `maskformer_r50-d32_8xb2-160k_ade20k-512x512`. If you pass a model name, the weights of the model will be downloaded automatically. Below are the other input parameters (see the sketch after this list for a usage example):
+
+- weights (str, optional) - Path to the checkpoint. If it is not specified and model is a model name of metafile, the weights will be loaded from the metafile. Defaults to None.
+- classes (list, optional) - Input classes for result rendering. As the prediction of a segmentation model is a segmentation map with label indices, `classes` is a list whose items correspond to the label indices. If classes is not defined, the visualizer will use the `cityscapes` classes by default. Defaults to None.
+- palette (list, optional) - Input palette for result rendering, which is a list of colors corresponding to the classes. If the palette is not defined, the visualizer will use the `cityscapes` palette by default. Defaults to None.
+- dataset_name (str, optional) - [Dataset name or alias](https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/utils/class_names.py#L302-L317). The visualizer will use the dataset's meta information, i.e. classes and palette, but explicit `classes` and `palette` arguments have higher priority. Defaults to None.
+- device (str, optional) - Device to run inference. If None, an available device will be used automatically. Defaults to None.
+- scope (str, optional) - The scope of the model. Defaults to 'mmseg'.
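+
+For instance, here is a minimal initialization sketch combining several of these parameters. The model name is the one used in [Basic Usage](#basic-usage); the `dataset_name` and `device` values are examples, not requirements:
+
+```python
+from mmseg.apis import MMSegInferencer
+
+# Model name from the metafiles; weights are downloaded automatically.
+# 'cityscapes' matches this model, so classes and palette are resolved from it.
+inferencer = MMSegInferencer(
+    model='deeplabv3plus_r18-d8_4xb2-80k_cityscapes-512x1024',
+    dataset_name='cityscapes',
+    device='cuda:0')  # or 'cpu'; None picks an available device
+```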
+
+### Visualize prediction
+
+`MMSegInferencer` supports 4 parameters for visualizing predictions, which you can use when calling the initialized inferencer:
+
+- show (bool) - Whether to display the image in a popup window. Defaults to False.
+- wait_time (float) - The interval of show (s). Defaults to 0.
+- img_out_dir (str) - Subdirectory of `out_dir`, used to save the rendered color segmentation masks, so `out_dir` must be defined if you would like to save the predicted masks. Defaults to 'vis'.
+- opacity (int, float) - The transparency of the segmentation mask. Defaults to 0.8.
+
+Examples of these parameters are in [Basic Usage](#basic-usage).
+
+### List model
+
+It is very easy to list all model names in MMSegmentation:
+
+```
+>>> from mmseg.apis import MMSegInferencer
+# models is a list of model names, and they will be printed automatically
+>>> models = MMSegInferencer.list_models('mmseg')
+```
+
+## Inference API
+
+### mmseg.apis.init_model
+
+Initialize a segmentor from a config file.
+
+Parameters:
+
+- config (str, `Path`, or `mmengine.Config`) - Config file path or the config object.
+- checkpoint (str, optional) - Checkpoint path. If left as None, the model will not load any weights.
+- device (str, optional) - CPU/CUDA device option. Default 'cuda:0'.
+- cfg_options (dict, optional) - Options to override some settings in the used config.
+
+Returns:
+
+- nn.Module: The constructed segmentor.
+
+Example:
+
+```python
+from mmseg.apis import init_model
+
+config_path = 'configs/pspnet/pspnet_r50-d8_4xb2-40k_cityscapes-512x1024.py'
+checkpoint_path = 'checkpoints/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth'
+
+# initialize model without checkpoint
+model = init_model(config_path)
+
+# init model and load checkpoint
+model = init_model(config_path, checkpoint_path)
+
+# init model and load checkpoint on CPU
+model = init_model(config_path, checkpoint_path, 'cpu')
+```
+
+### mmseg.apis.inference_model
+
+Run inference on image(s) with the segmentor.
+
+Parameters:
+
+- model (nn.Module) - The loaded segmentor.
+- imgs (str, np.ndarray, or list\[str/np.ndarray\]) - Either image files or loaded images.
+
+Returns:
+
+- `SegDataSample` or list\[`SegDataSample`\]: If imgs is a list or tuple, a list of results of the same length will be returned; otherwise the segmentation result is returned directly.
+
+**Note:** [SegDataSample](https://github.com/open-mmlab/mmsegmentation/blob/1.x/mmseg/structures/seg_data_sample.py) is a data structure interface of MMSegmentation, used as an interface between different components. `SegDataSample` implements the abstract data element `mmengine.structures.BaseDataElement`; please refer to the data element [documentation](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/data_element.html) in [MMEngine](https://github.com/open-mmlab/mmengine) for more information.
+
+The attributes in `SegDataSample` are divided into several parts (see the access sketch after the note below):
+
+- `gt_sem_seg` (`PixelData`) - Ground truth of semantic segmentation.
+- `pred_sem_seg` (`PixelData`) - Prediction of semantic segmentation.
+- `seg_logits` (`PixelData`) - Predicted logits of semantic segmentation.
+
+**Note:** [PixelData](https://github.com/open-mmlab/mmengine/blob/main/mmengine/structures/pixel_data.py) is the data structure for pixel-level annotations or predictions; please refer to the PixelData [documentation](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/data_element.html) in [MMEngine](https://github.com/open-mmlab/mmengine) for more information.
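+
+Before the full example below, here is a minimal sketch of reading those fields from a returned `SegDataSample`. It assumes, as in current MMEngine, that `PixelData` exposes its tensor via `.data`; the shapes in the comments are indicative:
+
+```python
+from mmseg.apis import init_model, inference_model
+
+config_path = 'configs/pspnet/pspnet_r50-d8_4xb2-40k_cityscapes-512x1024.py'
+checkpoint_path = 'checkpoints/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth'
+
+model = init_model(config_path, checkpoint_path)
+result = inference_model(model, 'demo/demo.png')
+
+# Predicted label map and raw class scores as torch tensors.
+pred = result.pred_sem_seg.data     # label indices, shape (1, H, W)
+logits = result.seg_logits.data     # per-class scores, shape (num_classes, H, W)
+print(pred.shape, logits.shape)
+```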
+
+Example:
+
+```python
+from mmseg.apis import init_model, inference_model
+
+config_path = 'configs/pspnet/pspnet_r50-d8_4xb2-40k_cityscapes-512x1024.py'
+checkpoint_path = 'checkpoints/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth'
+img_path = 'demo/demo.png'
+
+
+model = init_model(config_path, checkpoint_path)
+result = inference_model(model, img_path)
+```
+
+### mmseg.apis.show_result_pyplot
+
+Visualize the segmentation results on the image.
+
+Parameters:
+
+- model (nn.Module) - The loaded segmentor.
+- img (str or np.ndarray) - Image filename or loaded image.
+- result (`SegDataSample`) - The prediction SegDataSample result.
+- opacity (float) - Opacity of the painted segmentation map. Default `0.5`, must be in the `(0, 1]` range.
+- title (str) - The title of the pyplot figure. Default is ''.
+- draw_gt (bool) - Whether to draw the GT SegDataSample. Default to `True`.
+- draw_pred (bool) - Whether to draw the prediction SegDataSample. Default to `True`.
+- wait_time (float) - The interval of show (s); 0 is the special value that means "forever". Default to `0`.
+- show (bool) - Whether to display the drawn image. Default to `True`.
+- save_dir (str, optional) - Save file dir for all storage backends. If it is `None`, the backend storage will not save any data.
+- out_file (str, optional) - Path to the output file. Default to `None`.
+
+Returns:
+
+- np.ndarray: the drawn image, in RGB channel order.
+
+Example:
+
+```python
+from mmseg.apis import init_model, inference_model, show_result_pyplot
+
+config_path = 'configs/pspnet/pspnet_r50-d8_4xb2-40k_cityscapes-512x1024.py'
+checkpoint_path = 'checkpoints/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth'
+img_path = 'demo/demo.png'
+
+
+# build the model from a config file and a checkpoint file
+model = init_model(config_path, checkpoint_path, device='cuda:0')
+
+# inference on given image
+result = inference_model(model, img_path)
+
+# display the segmentation result
+vis_image = show_result_pyplot(model, img_path, result)
+
+# save the visualization result, the output image would be found at the path `work_dirs/result.png`
+vis_image = show_result_pyplot(model, img_path, result, out_file='work_dirs/result.png')
+
+# Modify the time of displaying images, note that 0 is the special value that means "forever"
+vis_image = show_result_pyplot(model, img_path, result, wait_time=5)
+```
+
+**Note:** If your current device doesn't have a graphical user interface, it is recommended to set `show` to `False` and specify `out_file` or `save_dir` to save the results. If you would like to display the result in a window, no special settings are required.
diff --git a/docs/en/user_guides/4_train_test.md b/docs/en/user_guides/4_train_test.md
new file mode 100644
index 0000000000..9b2d17dc46
--- /dev/null
+++ b/docs/en/user_guides/4_train_test.md
@@ -0,0 +1,315 @@
+# Tutorial 4: Train and test with existing models
+
+MMSegmentation supports training and testing models on a variety of devices, which are described below for single-GPU, distributed, and cluster training and testing, respectively. Through this tutorial, you will learn how to train and test using the scripts provided by MMSegmentation.
+
+## Training and testing on a single GPU
+
+### Training on a single GPU
+
+We provide `tools/train.py` to launch training jobs on a single GPU.
+The basic usage is as follows.
+
+```shell
+python tools/train.py ${CONFIG_FILE} [optional arguments]
+```
+
+This tool accepts several optional arguments, including:
+
+- `--work-dir ${WORK_DIR}`: Override the working directory.
+- `--amp`: Use auto mixed precision training.
+- `--resume`: Resume from the latest checkpoint in the work_dir automatically.
+- `--cfg-options ${OVERRIDE_CONFIGS}`: Override some settings in the used config; key-value pairs in xxx=yyy format will be merged into the config file.
+  For example, '--cfg-option model.encoder.in_channels=6'. Please see this [guide](./1_config.md#Modify-config-through-script-arguments) for more details.
+
+Below are the optional arguments for launching distributed jobs:
+
+- `--launcher`: The launcher for distributed job initialization. Allowed choices are `none`, `pytorch`, `slurm`, `mpi`. In particular, if set to `none`, the job will run in non-distributed mode.
+- `--local_rank`: ID for the local rank. If not specified, it will be set to 0.
+
+**Note:** Difference between the argument `--resume` and the field `load_from` in the config file:
+
+`--resume` only determines whether to resume from the latest checkpoint in the work_dir. It is usually used for resuming a training process that was interrupted accidentally.
+
+`load_from` specifies the checkpoint to be loaded, and the training iteration starts from 0. It is usually used for fine-tuning.
+
+If you would like to resume training from a specific checkpoint, you can use:
+
+```shell
+python tools/train.py ${CONFIG_FILE} --resume --cfg-options load_from=${CHECKPOINT}
+```
+
+**Training on CPU**: The process of training on the CPU is consistent with single-GPU training if the machine does not have a GPU. If it has GPUs but you do not want to use them, you just need to disable the GPUs before training:
+
+```shell
+export CUDA_VISIBLE_DEVICES=-1
+```
+
+And then run the script [above](#training-on-a-single-gpu).
+
+### Testing on a single GPU
+
+We provide `tools/test.py` to launch testing jobs on a single GPU.
+The basic usage is as follows.
+
+```shell
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
+```
+
+This tool accepts several optional arguments, including:
+
+- `--work-dir`: If specified, results will be saved in this directory. If not specified, the results will be automatically saved to `work_dirs/{CONFIG_NAME}`.
+- `--show`: Show prediction results at runtime, available when `--show-dir` is not specified.
+- `--show-dir`: Directory where painted images will be saved. If specified, the visualized segmentation masks will be saved to `work_dir/timestamp/show_dir`.
+- `--wait-time`: The interval of show (s), which takes effect when `--show` is activated. Default to 2.
+- `--cfg-options`: If specified, key-value pairs in xxx=yyy format will be merged into the config file.
+- `--tta`: Test-time augmentation option.
+
+**Testing on CPU**: The process of testing on the CPU is consistent with single-GPU testing if the machine does not have a GPU. If it has GPUs but you do not want to use them, you just need to disable the GPUs before testing:
+
+```shell
+export CUDA_VISIBLE_DEVICES=-1
+```
+
+Then run the script [above](#testing-on-a-single-gpu).
+
+## Training and testing on multiple GPUs and multiple machines
+
+### Training on multiple GPUs
+
+OpenMMLab 2.0 implements **distributed** training with `MMDistributedDataParallel`.
+We provide `tools/dist_train.sh` to launch training on multiple GPUs.
+
+The basic usage is as follows:
+
+```shell
+sh tools/dist_train.sh ${CONFIG_FILE} ${GPU_NUM} [optional arguments]
+```
+
+Optional arguments remain the same as stated [above](#training-on-a-single-gpu), with an additional argument to specify the number of GPUs.
+
+An example:
+
+```shell
+# checkpoints and logs saved in WORK_DIR=work_dirs/pspnet_r50-d8_4xb4-80k_ade20k-512x512/
+# If work_dir is not set, it will be generated automatically.
+sh tools/dist_train.sh configs/pspnet/pspnet_r50-d8_4xb4-80k_ade20k-512x512.py 8 --work-dir work_dirs/pspnet_r50-d8_4xb4-80k_ade20k-512x512
+```
+
+**Note**: During training, checkpoints and logs are saved in the same folder structure as the config file under `work_dirs/`. A custom work directory is not recommended since evaluation scripts infer work directories from the config file name. If you want to save your weights somewhere else, please use a symlink, for example:
+
+```shell
+ln -s ${YOUR_WORK_DIRS} ${MMSEG}/work_dirs
+```
+
+### Testing on multiple GPUs
+
+We provide `tools/dist_test.sh` to launch testing on multiple GPUs.
+The basic usage is as follows.
+
+```shell
+sh tools/dist_test.sh ${CONFIG_FILE} ${CHECKPOINT_FILE} ${GPU_NUM} [optional arguments]
+```
+
+Optional arguments remain the same as stated [above](#testing-on-a-single-gpu), with an additional argument to specify the number of GPUs.
+
+An example:
+
+```shell
+./tools/dist_test.sh configs/pspnet/pspnet_r50-d8_4xb2-40k_cityscapes-512x1024.py \
+    checkpoints/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth 4
+```
+
+### Launch multiple jobs on a single machine
+
+If you launch multiple jobs on a single machine, e.g., 2 jobs of 4-GPU training on a machine with 8 GPUs, you need to specify different ports (29500 by default) for each job to avoid communication conflicts. Otherwise, there will be an error message saying `RuntimeError: Address already in use`.
+If you use `dist_train.sh` to launch training jobs, you can set the port in the commands with the environment variable `PORT`.
+
+```shell
+CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 sh tools/dist_train.sh ${CONFIG_FILE} 4
+CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 sh tools/dist_train.sh ${CONFIG_FILE} 4
+```
+
+### Training with multiple machines
+
+MMSegmentation relies on the `torch.distributed` package for distributed training.
+Thus, as a basic usage, one can launch distributed training via PyTorch's [launch utility](https://pytorch.org/docs/stable/distributed.html#launch-utility).
+
+If you launch with multiple machines simply connected via Ethernet, you can run the following commands.
+
+On the first machine:
+
+```shell
+NNODES=2 NODE_RANK=0 PORT=${MASTER_PORT} MASTER_ADDR=${MASTER_ADDR} sh tools/dist_train.sh ${CONFIG_FILE} ${GPUS}
+```
+
+On the second machine:
+
+```shell
+NNODES=2 NODE_RANK=1 PORT=${MASTER_PORT} MASTER_ADDR=${MASTER_ADDR} sh tools/dist_train.sh ${CONFIG_FILE} ${GPUS}
+```
+
+Usually, it is slow if you do not have high-speed networking like InfiniBand.
+
+## Manage jobs with Slurm
+
+[Slurm](https://slurm.schedmd.com/) is a good job scheduling system for computing clusters.
+
+### Training on a cluster with Slurm
+
+On a cluster managed by Slurm, you can use `slurm_train.sh` to spawn training jobs. It supports both single-node and multi-node training.
+
+The basic usage is as follows:
+
+```shell
+[GPUS=${GPUS}] sh tools/slurm_train.sh ${PARTITION} ${JOB_NAME} ${CONFIG_FILE} [optional arguments]
+```
+
+Below is an example of using 4 GPUs to train PSPNet on a Slurm partition named _dev_, setting the work-dir to a shared file system.
+
+```shell
+GPUS=4 sh tools/slurm_train.sh dev pspnet configs/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes.py --work-dir work_dir/pspnet
+```
+
+You can check [the source code](../../../tools/slurm_train.sh) to review the full arguments and environment variables.
+
+### Testing on a cluster with Slurm
+
+Similar to the training task, MMSegmentation provides `slurm_test.sh` to launch testing jobs.
+
+The basic usage is as follows:
+
+```shell
+[GPUS=${GPUS}] sh tools/slurm_test.sh ${PARTITION} ${JOB_NAME} ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional arguments]
+```
+
+You can check [the source code](../../../tools/slurm_test.sh) to review the full arguments and environment variables.
+
+**Note:** When using Slurm, the port option needs to be set in one of the following ways:
+
+1. Set the port through `--cfg-options`. This is more recommended since it does not change the original configs.
+
+   ```shell
+   GPUS=4 GPUS_PER_NODE=4 sh tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config1.py ${WORK_DIR} --cfg-options env_cfg.dist_cfg.port=29500
+   GPUS=4 GPUS_PER_NODE=4 sh tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config2.py ${WORK_DIR} --cfg-options env_cfg.dist_cfg.port=29501
+   ```
+
+2. Modify the config files to set different communication ports.
+   In `config1.py`:
+
+   ```python
+   env_cfg = dict(dist_cfg=dict(backend='nccl', port=29500))
+   ```
+
+   In `config2.py`:
+
+   ```python
+   env_cfg = dict(dist_cfg=dict(backend='nccl', port=29501))
+   ```
+
+   Then you can launch two jobs with config1.py and config2.py.
+
+   ```shell
+   CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS=4 sh tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config1.py ${WORK_DIR}
+   CUDA_VISIBLE_DEVICES=4,5,6,7 GPUS=4 sh tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config2.py ${WORK_DIR}
+   ```
+
+3. Set the port in the command using the environment variable 'MASTER_PORT':
+
+```shell
+CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS=4 MASTER_PORT=29500 sh tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config1.py ${WORK_DIR}
+CUDA_VISIBLE_DEVICES=4,5,6,7 GPUS=4 MASTER_PORT=29501 sh tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config2.py ${WORK_DIR}
+```
+
+## Testing and saving segment files
+
+### Basic Usage
+
+When you want to save the results, you can use `--out` to specify the output directory.
+
+```shell
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} --out ${OUTPUT_DIR}
+```
+
+Here is an example of saving the predicted results of the model `fcn_r50-d8_4xb4-80k_ade20k-512x512` on the ADE20K validation dataset.
+
+```shell
+python tools/test.py configs/fcn/fcn_r50-d8_4xb4-80k_ade20k-512x512.py ckpt/fcn_r50-d8_512x512_80k_ade20k_20200614_144016-f8ac5082.pth --out work_dirs/format_results
+```
+
+You can also modify the config file to define `output_dir`. Taking `fcn_r50-d8_4xb4-80k_ade20k-512x512` as an example again, just add the `test_evaluator` in `configs/fcn/fcn_r50-d8_4xb4-80k_ade20k-512x512.py`:
+
+```python
+test_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU'], output_dir='work_dirs/format_results')
+```
+
+Then run the command without `--out`:
+
+```shell
+python tools/test.py configs/fcn/fcn_r50-d8_4xb4-80k_ade20k-512x512.py ckpt/fcn_r50-d8_512x512_80k_ade20k_20200614_144016-f8ac5082.pth
+```
+
+If you would like to only save the predicted results without evaluation (e.g. because the annotations are not released by the official dataset), you can set `format_only=True` and modify `test_dataloader`.
+As there are no annotations in the dataset, we remove `dict(type='LoadAnnotations')` from `test_dataloader`. Here is the example configuration:
+
+```python
+test_evaluator = dict(
+    type='IoUMetric',
+    iou_metrics=['mIoU'],
+    format_only=True,
+    output_dir='work_dirs/format_results')
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type='ADE20KDataset',
+        data_root='data/ade/release_test',
+        data_prefix=dict(img_path='testing'),
+        # we don't load annotation in test transform pipeline.
+        pipeline=[
+            dict(type='LoadImageFromFile'),
+            dict(type='Resize', scale=(2048, 512), keep_ratio=True),
+            dict(type='PackSegInputs')
+        ]))
+```
+
+Then run the test command:
+
+```shell
+python tools/test.py configs/fcn/fcn_r50-d8_4xb4-80k_ade20k-512x512.py ckpt/fcn_r50-d8_512x512_80k_ade20k_20200614_144016-f8ac5082.pth
+```
+
+### Testing the Cityscapes dataset and saving predicted segment files
+
+We recommend `CityscapesMetric`, a wrapper of the Cityscapes SDK, when you want to
+save the predicted results on the Cityscapes test dataset to submit them to the [Cityscapes test server](https://www.cityscapes-dataset.com/submit/). Here is the example configuration:
+
+```python
+test_evaluator = dict(
+    type='CityscapesMetric',
+    format_only=True,
+    keep_results=True,
+    output_dir='work_dirs/format_results')
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type='CityscapesDataset',
+        data_root='data/cityscapes/',
+        data_prefix=dict(img_path='leftImg8bit/test'),
+        pipeline=[
+            dict(type='LoadImageFromFile'),
+            dict(type='Resize', scale=(2048, 1024), keep_ratio=True),
+            dict(type='PackSegInputs')
+        ]))
+```
+
+Then run the test command, for example:
+
+```shell
+python tools/test.py configs/fcn/fcn_r18-d8_4xb2-80k_cityscapes-512x1024.py ckpt/fcn_r18-d8_512x1024_80k_cityscapes_20201225_021327-6c50f8b4.pth
+```
diff --git a/docs/en/user_guides/5_deployment.md b/docs/en/user_guides/5_deployment.md
new file mode 100644
index 0000000000..b3b8f571d0
--- /dev/null
+++ b/docs/en/user_guides/5_deployment.md
@@ -0,0 +1,255 @@
+# Tutorial 5: Model Deployment
+
+# MMSegmentation Model Deployment
+
+- [Tutorial 5: Model Deployment](#tutorial-5-model-deployment)
+- [MMSegmentation Model Deployment](#mmsegmentation-model-deployment)
+  - [Installation](#installation)
+    - [Install mmseg](#install-mmseg)
+    - [Install mmdeploy](#install-mmdeploy)
+  - [Convert model](#convert-model)
+  - [Model specification](#model-specification)
+  - [Model inference](#model-inference)
+    - [Backend model inference](#backend-model-inference)
+    - [SDK model inference](#sdk-model-inference)
+  - [Supported models](#supported-models)
+  - [Note](#note)
+
+______________________________________________________________________
+
+[MMSegmentation](https://github.com/open-mmlab/mmsegmentation/tree/main), also known as `mmseg`, is an open-source semantic segmentation toolbox based on PyTorch. It is a part of the [OpenMMLab](https://openmmlab.com/) project.
+
+## Installation
+
+### Install mmseg
+
+Please follow the [Installation Guide](https://mmsegmentation.readthedocs.io/en/latest/get_started.html).
+
+### Install mmdeploy
+
+`mmdeploy` can be installed as follows:
+
+**Option 1:** Install the precompiled package
+
+Please follow the [Installation overview](https://mmdeploy.readthedocs.io/zh_CN/latest/get_started.html#mmdeploy).
+
+**Option 2:** Automatic installation script
+
+If the deployment platform is **Ubuntu 18.04+**, please follow the [script installation](../01-how-to-build/build_from_script.md) to install.
+For example, the following commands describe how to install mmdeploy and the `ONNX Runtime` inference engine.
+
+```shell
+git clone --recursive -b main https://github.com/open-mmlab/mmdeploy.git
+cd mmdeploy
+python3 tools/scripts/build_ubuntu_x64_ort.py $(nproc)
+export PYTHONPATH=$(pwd)/build/lib:$PYTHONPATH
+export LD_LIBRARY_PATH=$(pwd)/../mmdeploy-dep/onnxruntime-linux-x64-1.8.1/lib/:$LD_LIBRARY_PATH
+```
+
+**NOTE**:
+
+- Adding `$(pwd)/build/lib` to `PYTHONPATH` allows loading the mmdeploy SDK Python package `mmdeploy_runtime`. See [SDK model inference](#sdk-model-inference) for more information.
+- For [ONNX Runtime model inference](#backend-model-inference), the custom operator library needs to be loadable, so the ONNX Runtime library path is added to `LD_LIBRARY_PATH`.
+
+**Option 3:** Install with mim
+
+1. Use mim to install mmcv
+
+```shell
+pip install -U openmim
+mim install "mmcv>=2.0.0rc2"
+```
+
+2. Install mmdeploy
+
+```shell
+git clone https://github.com/open-mmlab/mmdeploy.git
+cd mmdeploy
+mim install -e .
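+# Optional sanity check (our addition, not part of the official steps):
+# confirm that the editable install is importable.
+python -c 'import mmdeploy; print(mmdeploy.__version__)'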
+```
+
+**Option 4:** Build MMDeploy from source
+
+If the first three methods aren't suitable, please [build MMDeploy from source](../01-how-to-build/build_from_source.md).
+
+## Convert model
+
+[tools/deploy.py](https://github.com/open-mmlab/mmdeploy/tree/main/tools/deploy.py) can conveniently convert an mmseg model to a backend model. See [this](https://github.com/open-mmlab/mmdeploy/tree/main/docs/en/02-how-to-run/convert_model.md#usage) for detailed information.
+
+For example, convert `unet` to an ONNX model as follows:
+
+```shell
+cd mmdeploy
+
+# download unet model from mmseg model zoo
+mim download mmsegmentation --config unet-s5-d16_fcn_4xb4-160k_cityscapes-512x1024 --dest .
+
+# convert mmseg model to onnxruntime model with dynamic shape
+python tools/deploy.py \
+    configs/mmseg/segmentation_onnxruntime_dynamic.py \
+    unet-s5-d16_fcn_4xb4-160k_cityscapes-512x1024.py \
+    fcn_unet_s5-d16_4x4_512x1024_160k_cityscapes_20211210_145204-6860854e.pth \
+    demo/resources/cityscapes.png \
+    --work-dir mmdeploy_models/mmseg/ort \
+    --device cpu \
+    --show \
+    --dump-info
+```
+
+It is crucial to specify the correct deployment config during model conversion. MMDeploy has already provided built-in deployment config [files](https://github.com/open-mmlab/mmdeploy/tree/main/configs/mmseg) for all supported backends of mmsegmentation, whose file paths follow the pattern:
+
+```
+segmentation_{backend}-{precision}_{static | dynamic}_{shape}.py
+```
+
+- **{backend}:** inference backend, such as onnxruntime, tensorrt, pplnn, ncnn, openvino, coreml etc.
+- **{precision}:** fp16, int8. When it's empty, it means fp32
+- **{static | dynamic}:** static shape or dynamic shape
+- **{shape}:** input shape or shape range of a model
+
+Therefore, in the above example, you can also convert `unet` to a TensorRT-fp16 model with `segmentation_tensorrt-fp16_dynamic-512x1024-2048x2048.py`.
+
+```{tip}
+When converting mmsegmentation models to tensorrt models, --device should be set to "cuda"
+```
+
+## Model specification
+
+Before moving on to the model inference chapter, let's learn more about the converted model structure, which is very important for model inference.
+
+The converted model is located in the working directory, e.g. `mmdeploy_models/mmseg/ort` in the previous example. It includes:
+
+```
+mmdeploy_models/mmseg/ort
+├── deploy.json
+├── detail.json
+├── end2end.onnx
+└── pipeline.json
+```
+
+in which,
+
+- **end2end.onnx**: the backend model, which can be inferred by ONNX Runtime
+- ***xxx*.json**: the necessary meta information for the mmdeploy SDK
+
+The whole package **mmdeploy_models/mmseg/ort** is defined as an **mmdeploy SDK model**, i.e., an **mmdeploy SDK model** includes both the backend model and the inference meta information.
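+
+As a quick illustration of that layout, the JSON files can be inspected with nothing more than the standard library. This is only a sketch: the exact keys inside the files are an implementation detail of mmdeploy and may change between versions.
+
+```python
+import json
+from pathlib import Path
+
+# Assumes the conversion above wrote its output to this directory.
+sdk_model = Path('mmdeploy_models/mmseg/ort')
+
+for name in ('deploy.json', 'detail.json', 'pipeline.json'):
+    meta = json.loads((sdk_model / name).read_text())
+    # Print the top-level keys only; the full content is version-dependent.
+    print(name, '->', sorted(meta))
+```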
+
+## Model inference
+
+### Backend model inference
+
+Taking the previously converted `end2end.onnx` model as an example, you can use the following code to run inference with the model and visualize the results:
+
+```python
+from mmdeploy.apis.utils import build_task_processor
+from mmdeploy.utils import get_input_shape, load_config
+import torch
+
+deploy_cfg = 'configs/mmseg/segmentation_onnxruntime_dynamic.py'
+model_cfg = './unet-s5-d16_fcn_4xb4-160k_cityscapes-512x1024.py'
+device = 'cpu'
+backend_model = ['./mmdeploy_models/mmseg/ort/end2end.onnx']
+image = './demo/resources/cityscapes.png'
+
+# read deploy_cfg and model_cfg
+deploy_cfg, model_cfg = load_config(deploy_cfg, model_cfg)
+
+# build task and backend model
+task_processor = build_task_processor(model_cfg, deploy_cfg, device)
+model = task_processor.build_backend_model(backend_model)
+
+# process input image
+input_shape = get_input_shape(deploy_cfg)
+model_inputs, _ = task_processor.create_input(image, input_shape)
+
+# do model inference
+with torch.no_grad():
+    result = model.test_step(model_inputs)
+
+# visualize results
+task_processor.visualize(
+    image=image,
+    model=model,
+    result=result[0],
+    window_name='visualize',
+    output_file='./output_segmentation.png')
+```
+
+### SDK model inference
+
+You can also perform SDK model inference as follows:
+
+```python
+from mmdeploy_runtime import Segmentor
+import cv2
+import numpy as np
+
+img = cv2.imread('./demo/resources/cityscapes.png')
+
+# create a segmentor
+segmentor = Segmentor(model_path='./mmdeploy_models/mmseg/ort', device_name='cpu', device_id=0)
+# perform inference
+seg = segmentor(img)
+
+# visualize inference result
+## randomly generate a palette of size 256x3
+palette = np.random.randint(0, 256, size=(256, 3))
+color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8)
+for label, color in enumerate(palette):
+    color_seg[seg == label, :] = color
+# convert to BGR
+color_seg = color_seg[..., ::-1]
+img = img * 0.5 + color_seg * 0.5
+img = img.astype(np.uint8)
+cv2.imwrite('output_segmentation.png', img)
+```
+
+Besides the Python API, the mmdeploy SDK also provides other FFIs (Foreign Function Interfaces), such as C, C++, C#, Java and so on. You can learn about their usage from the [demo](https://github.com/open-mmlab/mmdeploy/tree/main/demo).
+
+## Supported models
+
+| Model                                                                                                       | TorchScript | OnnxRuntime | TensorRT | ncnn | PPLNN | OpenVino |
+| :-------------------------------------------------------------------------------------------------------- | :---------: | :---------: | :------: | :--: | :---: | :------: |
+| [FCN](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/fcn)                                  | Y           | Y           | Y        | Y    | Y     | Y        |
+| [PSPNet](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/pspnet)[\*](#static_shape)         | Y           | Y           | Y        | Y    | Y     | Y        |
+| [DeepLabV3](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/deeplabv3)                      | Y           | Y           | Y        | Y    | Y     | Y        |
+| [DeepLabV3+](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/deeplabv3plus)                 | Y           | Y           | Y        | Y    | Y     | Y        |
+| [Fast-SCNN](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/fastscnn)[\*](#static_shape)    | Y           | Y           | Y        | N    | Y     | Y        |
+| [UNet](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/unet)                                | Y           | Y           | Y        | Y    | Y     | Y        |
+| [ANN](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/ann)[\*](#static_shape)               | Y           | Y           | Y        | N    | N     | N        |
+| [APCNet](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/apcnet)                            | Y           | Y           | Y        | Y    | N     | N        |
+| [BiSeNetV1](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/bisenetv1)                      | Y           | Y           | Y        | Y    | N     | Y        |
+| [BiSeNetV2](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/bisenetv2)                      | Y           | Y           | Y        | Y    | N     | Y        |
+| [CGNet](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/cgnet)                              | Y           | Y           | Y        | Y    | N     | Y        |
+| [DMNet](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/dmnet)                              | ?           | Y           | N        | N    | N     | N        |
+| [DNLNet](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/dnlnet)                            | ?           | Y           | Y        | Y    | N     | Y        |
+| [EMANet](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/emanet)                            | Y           | Y           | Y        | N    | N     | Y        |
+| [EncNet](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/encnet)                            | Y           | Y           | Y        | N    | N     | Y        |
+| [ERFNet](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/erfnet)                            | Y           | Y           | Y        | Y    | N     | Y        |
+| [FastFCN](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/fastfcn)                          | Y           | Y           | Y        | Y    | N     | Y        |
+| [GCNet](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/gcnet)                              | Y           | Y           | Y        | N    | N     | N        |
+| [ICNet](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/icnet)[\*](#static_shape)           | Y           | Y           | Y        | N    | N     | Y        |
+| [ISANet](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/isanet)[\*](#static_shape)         | N           | Y           | Y        | N    | N     | Y        |
+| [NonLocal Net](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/nonlocal_net)                | ?           | Y           | Y        | Y    | N     | Y        |
+| [OCRNet](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/ocrnet)                            | Y           | Y           | Y        | Y    | N     | Y        |
+| [PointRend](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/point_rend)[\*](#static_shape)  | Y           | Y           | Y        | N    | N     | N        |
+| [Semantic FPN](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/sem_fpn)                     | Y           | Y           | Y        | Y    | N     | Y        |
+| [STDC](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/stdc)                                | Y           | Y           | Y        | Y    | N     | Y        |
+| [UPerNet](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/upernet)[\*](#static_shape)       | N           | Y           | Y        | N    | N     | N        |
+| [DANet](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/danet)                              | ?           | Y           | Y        | N    | N     | Y        |
+| [Segmenter](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/segmenter)[\*](#static_shape)   | N           | Y           | Y        | Y    | N     | Y        |
+| [SegFormer](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/segformer)[\*](#static_shape)   | ?           | Y           | Y        | N    | N     | Y        |
+| [SETR](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/setr)                                | ?           | Y           | N        | N    | N     | Y        |
+| [CCNet](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/ccnet)                              | ?           | N           | N        | N    | N     | N        |
+| [PSANet](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/psanet)                            | ?           | N           | N        | N    | N     | N        |
+| [DPT](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/dpt)                                  | ?           | N           | N        | N    | N     | N        |
+
+## Note
+
+- All mmseg models only support the 'whole' inference mode.
+
+- PSPNet and Fast-SCNN only support static input, because [nn.AdaptiveAvgPool2d](https://github.com/open-mmlab/mmsegmentation/blob/0c87f7a0c9099844eff8e90fa3db5b0d0ca02fee/mmseg/models/decode_heads/psp_head.py#L38) is not supported with dynamic input by most inference frameworks.
+
+- For models that only support static shapes, you should use a static-shape deployment config file, such as `configs/mmseg/segmentation_tensorrt_static-1024x2048.py`.
+
+- To deploy models that output probabilistic feature maps, please add `codebase_config = dict(with_argmax=False)` to the deployment config file.
diff --git a/docs/en/user_guides/index.rst b/docs/en/user_guides/index.rst
new file mode 100644
index 0000000000..1feb1271ae
--- /dev/null
+++ b/docs/en/user_guides/index.rst
@@ -0,0 +1,21 @@
+Train & Test
+**************
+
+.. toctree::
+   :maxdepth: 1
+
+   1_config.md
+   2_dataset_prepare.md
+   3_inference.md
+   4_train_test.md
+
+Useful Tools
+*************
+
+.. toctree::
+   :maxdepth: 2
+
+   visualization.md
+   useful_tools.md
+   deployment.md
+   visualization_feature_map.md
diff --git a/docs/en/user_guides/useful_tools.md b/docs/en/user_guides/useful_tools.md
new file mode 100644
index 0000000000..0d8677854b
--- /dev/null
+++ b/docs/en/user_guides/useful_tools.md
@@ -0,0 +1,245 @@
+# \[WIP\] Useful Tools
+
+Apart from training/testing scripts, we provide lots of useful tools under the
+`tools/` directory.
+
+## Analysis Tools
+
+### Plot training logs
+
+`tools/analysis_tools/analyze_logs.py` plots loss/mIoU curves given a training log file. Run `pip install seaborn` first to install the dependency.
+
+```shell
+python tools/analysis_tools/analyze_logs.py xxx.json [--keys ${KEYS}] [--legend ${LEGEND}] [--backend ${BACKEND}] [--style ${STYLE}] [--out ${OUT_FILE}]
+```
+
+Examples:
+
+- Plot the mIoU, mAcc, aAcc metrics.
+
+  ```shell
+  python tools/analysis_tools/analyze_logs.py log.json --keys mIoU mAcc aAcc --legend mIoU mAcc aAcc
+  ```
+
+- Plot the loss metric.
+
+  ```shell
+  python tools/analysis_tools/analyze_logs.py log.json --keys loss --legend loss
+  ```
+
+### Confusion Matrix (experimental)
+
+In order to generate and plot an `n x n` confusion matrix, where `n` is the number of classes, you can follow these steps:
+
+#### 1. Generate a prediction result in pkl format using `test.py`
+
+```shell
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} [--out ${PATH_TO_RESULT_FILE}]
+```
+
+Example:
+
+```shell
+python tools/test.py \
+configs/fcn/fcn_r50-d8_4xb2-40k_cityscapes-512x1024.py \
+checkpoint/fcn_r50-d8_512x1024_40k_cityscapes_20200604_192608-efe53f0d.pth \
+--out result/pred_result.pkl
+```
+
+#### 2. Use `confusion_matrix.py` to generate and plot a confusion matrix
+
+```shell
+python tools/confusion_matrix.py ${CONFIG_FILE} ${PATH_TO_RESULT_FILE} ${SAVE_DIR} --show
+```
+
+Description of arguments:
+
+- `config`: Path to the test config file.
+- `prediction_path`: Path to the prediction .pkl result.
+- `save_dir`: Directory where the confusion matrix will be saved.
+- `--show`: Enable result visualization.
+- `--color-theme`: Theme of the matrix color map.
+- `--cfg_options`: Custom options to override some settings in the used config file.
+
+Example:
+
+```shell
+python tools/confusion_matrix.py \
+configs/fcn/fcn_r50-d8_512x1024_40k_cityscapes.py \
+result/pred_result.pkl \
+result/confusion_matrix \
+--show
+```
+
+### Get the FLOPs and params (experimental)
+
+We provide a script adapted from [flops-counter.pytorch](https://github.com/sovrasov/flops-counter.pytorch) to compute the FLOPs and params of a given model.
+
+```shell
+python tools/analysis_tools/get_flops.py ${CONFIG_FILE} [--shape ${INPUT_SHAPE}]
+```
+
+You will get a result like this:
+
+```none
+==============================
+Input shape: (3, 2048, 1024)
+Flops: 1429.68 GMac
+Params: 48.98 M
+==============================
+```
+
+:::{note}
+This tool is still experimental and we do not guarantee that the numbers are correct. You may well use the result for simple comparisons, but double check it before you adopt it in technical reports or papers.
+:::
+
+(1) FLOPs are related to the input shape, while parameters are not. The default input shape is (1, 3, 1280, 800).
+(2) Some operators, like GN and custom operators, are not counted in FLOPs.
+
+## Miscellaneous
+
+### Publish a model
+
+Before you upload a model to AWS, you may want to
+(1) convert model weights to CPU tensors, (2) delete the optimizer states and
+(3) compute the hash of the checkpoint file and append the hash id to the filename.
+
+```shell
+python tools/misc/publish_model.py ${INPUT_FILENAME} ${OUTPUT_FILENAME}
+```
+
+E.g.,
+
+```shell
+python tools/misc/publish_model.py work_dirs/pspnet/latest.pth psp_r50_512x1024_40k_cityscapes.pth
+```
+
+The final output filename will be `psp_r50_512x1024_40k_cityscapes-{hash id}.pth`.
+
+### Print the entire config
+
+`tools/misc/print_config.py` prints the whole config verbatim, expanding all its
+imports.
+
+```shell
+python tools/misc/print_config.py \
+    ${CONFIG} \
+    --graph \
+    --cfg-options ${OPTIONS [OPTIONS...]}
+```
+
+Description of arguments:
+
+- `config`: The path of a model config file.
+- `--graph`: Determines whether to print the model's graph.
+- `--cfg-options`: Custom options to override some settings in the used config file.
+
+## Model conversion
+
+`tools/model_converters/` provides several scripts to convert pretrained models released by other repos to MMSegmentation style.
+
+### ViT Swin MiT Transformer Models
+
+- ViT
+
+  `tools/model_converters/vit2mmseg.py` converts keys in timm-pretrained ViT models to MMSegmentation style.
+
+  ```shell
+  python tools/model_converters/vit2mmseg.py ${SRC} ${DST}
+  ```
+
+- Swin
+
+  `tools/model_converters/swin2mmseg.py` converts keys in officially pretrained Swin models to MMSegmentation style.
+
+  ```shell
+  python tools/model_converters/swin2mmseg.py ${SRC} ${DST}
+  ```
+
+- SegFormer
+
+  `tools/model_converters/mit2mmseg.py` converts keys in officially pretrained MiT models to MMSegmentation style.
+### Print the entire config
+
+`tools/misc/print_config.py` prints the whole config verbatim, expanding all its
+imports.
+
+```shell
+python tools/misc/print_config.py \
+  ${CONFIG} \
+  --graph \
+  --cfg-options ${OPTIONS [OPTIONS...]}
+```
+
+Description of arguments:
+
+- `config`: The path of a PyTorch model config file.
+- `--graph`: Determines whether to print the model's graph.
+- `--cfg-options`: Custom options to override settings in the config file.
+
+## Model conversion
+
+`tools/model_converters/` provides several scripts to convert pretrained models released by other repos to MMSegmentation style.
+
+### ViT, Swin and MiT Transformer Models
+
+- ViT
+
+  `tools/model_converters/vit2mmseg.py` converts keys in timm pretrained ViT models to MMSegmentation style.
+
+  ```shell
+  python tools/model_converters/vit2mmseg.py ${SRC} ${DST}
+  ```
+
+- Swin
+
+  `tools/model_converters/swin2mmseg.py` converts keys in official pretrained Swin models to MMSegmentation style.
+
+  ```shell
+  python tools/model_converters/swin2mmseg.py ${SRC} ${DST}
+  ```
+
+- SegFormer
+
+  `tools/model_converters/mit2mmseg.py` converts keys in official pretrained MiT models to MMSegmentation style.
+
+  ```shell
+  python tools/model_converters/mit2mmseg.py ${SRC} ${DST}
+  ```
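+
+All of these converters follow the same basic pattern: load the source checkpoint, rename the keys of its `state_dict` to the names the corresponding MMSegmentation backbone expects, and save the result. The sketch below shows only the pattern; the rename rule is a placeholder, not one of the real model-specific mappings:
+
+```python
+from collections import OrderedDict
+
+import torch
+
+src = torch.load('src_checkpoint.pth', map_location='cpu')
+state_dict = src.get('state_dict', src)  # some checkpoints nest the weights
+
+new_state_dict = OrderedDict()
+for key, value in state_dict.items():
+    # placeholder rule; each converter applies its own model-specific renames
+    new_state_dict[key.replace('blocks.', 'layers.')] = value
+
+torch.save(dict(state_dict=new_state_dict), 'dst_checkpoint.pth')
+```
+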
+## Model Serving
+
+To serve an `MMSegmentation` model with [`TorchServe`](https://pytorch.org/serve/), you can follow these steps:
+
+### 1. Convert model from MMSegmentation to TorchServe
+
+```shell
+python tools/torchserve/mmseg2torchserve.py ${CONFIG_FILE} ${CHECKPOINT_FILE} \
+--output-folder ${MODEL_STORE} \
+--model-name ${MODEL_NAME}
+```
+
+:::{note}
+${MODEL_STORE} needs to be an absolute path to a folder.
+:::
+
+### 2. Build `mmseg-serve` docker image
+
+```shell
+docker build -t mmseg-serve:latest docker/serve/
+```
+
+### 3. Run `mmseg-serve`
+
+Check the official docs for [running TorchServe with docker](https://github.com/pytorch/serve/blob/master/docker/README.md#running-torchserve-in-a-production-docker-environment).
+
+To run on GPU, you need to install [nvidia-docker](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). You can omit the `--gpus` argument to run on CPU.
+
+Example:
+
+```shell
+docker run --rm \
+--cpus 8 \
+--gpus device=0 \
+-p8080:8080 -p8081:8081 -p8082:8082 \
+--mount type=bind,source=$MODEL_STORE,target=/home/model-server/model-store \
+mmseg-serve:latest
+```
+
+[Read the docs](https://github.com/pytorch/serve/blob/072f5d088cce9bb64b2a18af065886c9b01b317b/docs/rest_api.md) about the Inference (8080), Management (8081) and Metrics (8082) APIs.
+
+### 4. Test deployment
+
+```shell
+curl -O https://raw.githubusercontent.com/open-mmlab/mmsegmentation/master/resources/3dogs.jpg
+curl http://127.0.0.1:8080/predictions/${MODEL_NAME} -T 3dogs.jpg -o 3dogs_mask.png
+```
+
+The response will be a `.png` mask.
+
+You can visualize the output as follows:
+
+```python
+import matplotlib.pyplot as plt
+import mmcv
+plt.imshow(mmcv.imread("3dogs_mask.png", "grayscale"))
+plt.show()
+```
+
+You should see something similar to:
+
+![3dogs_mask](../../resources/3dogs_mask.png)
+
+You can also use `test_torchserve.py` to compare the results of TorchServe and PyTorch, and visualize them.
+
+```shell
+python tools/torchserve/test_torchserve.py ${IMAGE_FILE} ${CONFIG_FILE} ${CHECKPOINT_FILE} ${MODEL_NAME}
+[--inference-addr ${INFERENCE_ADDR}] [--result-image ${RESULT_IMAGE}] [--device ${DEVICE}]
+```
+
+Example:
+
+```shell
+python tools/torchserve/test_torchserve.py \
+demo/demo.png \
+configs/fcn/fcn_r50-d8_512x1024_40k_cityscapes.py \
+checkpoint/fcn_r50-d8_512x1024_40k_cityscapes_20200604_192608-efe53f0d.pth \
+fcn
+```
diff --git a/docs/en/user_guides/visualization.md b/docs/en/user_guides/visualization.md
new file mode 100644
index 0000000000..e7c3359cc9
--- /dev/null
+++ b/docs/en/user_guides/visualization.md
@@ -0,0 +1,174 @@
+# Visualization
+
+MMSegmentation 1.x provides convenient ways for monitoring training status and visualizing data and model predictions.
+
+## Training Status Monitor
+
+MMSegmentation 1.x uses TensorBoard to monitor training status.
+
+### TensorBoard Configuration
+
+Install TensorBoard following the [official instructions](https://www.tensorflow.org/install), e.g.
+
+```shell
+pip install tensorboardX
+pip install future tensorboard
+```
+
+Add `TensorboardVisBackend` to the `vis_backends` of `visualizer` in the `default_runtime.py` config file:
+
+```python
+vis_backends = [dict(type='LocalVisBackend'),
+                dict(type='TensorboardVisBackend')]
+visualizer = dict(
+    type='SegLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+```
+
+### Examining scalars in TensorBoard
+
+Launch a training experiment, e.g.
+
+```shell
+python tools/train.py configs/pspnet/pspnet_r50-d8_4xb4-80k_ade20k-512x512.py --work-dir work_dirs/test_visual
+```
+
+After training starts, find the `vis_data` path under the work dir; for example, the `vis_data` path of this particular run is:
+
+```shell
+work_dirs/test_visual/20220810_115248/vis_data
+```
+
+The scalar file in the `vis_data` path includes the learning rate, losses, data_time, etc., and also records metric results. You can refer to the [logging tutorial](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/logging.html) in MMEngine to log custom data. View the TensorBoard visualization results with the following command:
+
+```shell
+tensorboard --logdir work_dirs/test_visual/20220810_115248/vis_data
+```
+
+## Data and Results Visualization
+
+### Visualize Data Samples during Model Testing or Validation
+
+MMSegmentation provides `SegVisualizationHook`, a [hook](https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/hook.md) that visualizes the ground truth and predictions of segmentation during model testing and evaluation. Its configuration is in `default_hooks`; please see the [Runner tutorial](https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/runner.md) for more details.
+
+For example, in `_base_/schedules/schedule_20k.py`, modify the `SegVisualizationHook` configuration and set `draw` to `True` to enable the storage of network inference results. `interval` indicates the sampling interval of the prediction results: when set to 1, every inference result of the network will be saved. `interval` is set to 50 by default:
+
+```python
+default_hooks = dict(
+    timer=dict(type='IterTimerHook'),
+    logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False),
+    param_scheduler=dict(type='ParamSchedulerHook'),
+    checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=2000),
+    sampler_seed=dict(type='DistSamplerSeedHook'),
+    visualization=dict(type='SegVisualizationHook', draw=True, interval=1))
+```
+
+After launching the training experiment, visualization results are stored in a local folder during the validation loop; when evaluating a model on a dataset, the prediction results are likewise stored locally.
+The stored results of the local visualization are kept in `vis_image` under `$WORK_DIRS/vis_data`, e.g.:
+
+```shell
+work_dirs/test_visual/20220810_115248/vis_data/vis_image
+```
+
+In addition, if `TensorboardVisBackend` is added in `vis_backends`, as [above](#tensorboard-configuration),
+we can also run the following command to view them in TensorBoard:
+
+```shell
+tensorboard --logdir work_dirs/test_visual/20220810_115248/vis_data
+```
+
+### Visualize a Single Data Sample
+
+If you want to visualize a single data sample, we suggest using `SegLocalVisualizer`.
+
+`SegLocalVisualizer` is a child class of `Visualizer` in MMEngine, designed for MMSegmentation visualization. For more details about `Visualizer`, please refer to the [visualization tutorial](https://github.com/open-mmlab/mmengine/blob/main/docs/en/advanced_tutorials/visualization.md) in MMEngine.
+
+Here is an example of `SegLocalVisualizer`. First, you may download the example data with the following commands:
+ +```shell +wget https://user-images.githubusercontent.com/24582831/189833109-eddad58f-f777-4fc0-b98a-6bd429143b06.png --output-document aachen_000000_000019_leftImg8bit.png +wget https://user-images.githubusercontent.com/24582831/189833143-15f60f8a-4d1e-4cbb-a6e7-5e2233869fac.png --output-document aachen_000000_000019_gtFine_labelTrainIds.png +``` + +Then you can find their local path and use the scripts below to visualize: + +```python +import mmcv +import os.path as osp +import torch +# `PixelData` is data structure for pixel-level annotations or predictions defined in MMEngine. +# Please refer to below tutorial file of data structures in MMEngine: +# https://github.com/open-mmlab/mmengine/tree/main/docs/en/advanced_tutorials/data_element.md + +from mmengine.structures import PixelData + +# `SegDataSample` is data structure interface between different components +# defined in MMSegmentation, it includes ground truth, prediction and +# predicted logits of semantic segmentation. +# Please refer to below tutorial file of `SegDataSample` for more details: +# https://github.com/open-mmlab/mmsegmentation/blob/1.x/docs/en/advanced_guides/structures.md + +from mmseg.structures import SegDataSample +from mmseg.visualization import SegLocalVisualizer + +out_file = 'out_file_cityscapes' +save_dir = './work_dirs' + +image = mmcv.imread( + osp.join( + osp.dirname(__file__), + './aachen_000000_000019_leftImg8bit.png' + ), + 'color') +sem_seg = mmcv.imread( + osp.join( + osp.dirname(__file__), + './aachen_000000_000019_gtFine_labelTrainIds.png' # noqa + ), + 'unchanged') +sem_seg = torch.from_numpy(sem_seg) +gt_sem_seg_data = dict(data=sem_seg) +gt_sem_seg = PixelData(**gt_sem_seg_data) +data_sample = SegDataSample() +data_sample.gt_sem_seg = gt_sem_seg + +seg_local_visualizer = SegLocalVisualizer( + vis_backends=[dict(type='LocalVisBackend')], + save_dir=save_dir) + +# The meta information of dataset usually includes `classes` for class names and +# `palette` for visualization color of each foreground. +# All class names and palettes are defined in the file: +# https://github.com/open-mmlab/mmsegmentation/blob/1.x/mmseg/utils/class_names.py + +seg_local_visualizer.dataset_meta = dict( + classes=('road', 'sidewalk', 'building', 'wall', 'fence', + 'pole', 'traffic light', 'traffic sign', + 'vegetation', 'terrain', 'sky', 'person', 'rider', + 'car', 'truck', 'bus', 'train', 'motorcycle', + 'bicycle'), + palette=[[128, 64, 128], [244, 35, 232], [70, 70, 70], + [102, 102, 156], [190, 153, 153], [153, 153, 153], + [250, 170, 30], [220, 220, 0], [107, 142, 35], + [152, 251, 152], [70, 130, 180], [220, 20, 60], + [255, 0, 0], [0, 0, 142], [0, 0, 70], + [0, 60, 100], [0, 80, 100], [0, 0, 230], + [119, 11, 32]]) +# When `show=True`, the results would be shown directly, +# else if `show=False`, the results would be saved in local directory folder. +seg_local_visualizer.add_datasample(out_file, image, + data_sample, show=False) +``` + +Then the visualization result of image with its corresponding ground truth could be found in `./work_dirs/vis_data/vis_image/` whose name is `out_file_cityscapes_0.png`: + +
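+For a quick check you could open the saved drawing with mmcv and matplotlib, e.g. (a small sketch assuming the default save location above):
+
+```python
+import matplotlib.pyplot as plt
+import mmcv
+
+drawn_img = mmcv.imread('./work_dirs/vis_data/vis_image/out_file_cityscapes_0.png')
+plt.imshow(mmcv.bgr2rgb(drawn_img))  # mmcv loads images in BGR order
+plt.show()
+```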
+
+If you would like to know more about visualization usage, you can refer to the [visualization tutorial](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/visualization.html) in MMEngine.
diff --git a/docs/en/user_guides/visualization_feature_map.md b/docs/en/user_guides/visualization_feature_map.md
new file mode 100644
index 0000000000..08398e514a
--- /dev/null
+++ b/docs/en/user_guides/visualization_feature_map.md
@@ -0,0 +1,201 @@
+# Wandb Feature Map Visualization
+
+MMSegmentation 1.x provides backend support for Weights & Biases to facilitate visualization and management of project results.
+
+## Wandb Configuration
+
+Install Weights & Biases following the [official instructions](https://docs.wandb.ai/quickstart), e.g.
+
+```shell
+pip install wandb
+wandb login
+```
+
+Add `WandbVisBackend` to the `vis_backends` of `visualizer` in the `default_runtime.py` config file:
+
+```python
+vis_backends=[dict(type='LocalVisBackend'),
+              dict(type='TensorboardVisBackend'),
+              dict(type='WandbVisBackend')]
+```
+
+## Examining feature map visualization in Wandb
+
+`SegLocalVisualizer` is a child class of `Visualizer` in MMEngine, designed for MMSegmentation visualization. For more details about `Visualizer`, please refer to the [visualization tutorial](https://github.com/open-mmlab/mmengine/blob/main/docs/en/advanced_tutorials/visualization.md) in MMEngine.
+
+Here is an example of `SegLocalVisualizer`. First, you may download the example data with the following commands:
+ +```shell +wget https://user-images.githubusercontent.com/24582831/189833109-eddad58f-f777-4fc0-b98a-6bd429143b06.png --output-document aachen_000000_000019_leftImg8bit.png +wget https://user-images.githubusercontent.com/24582831/189833143-15f60f8a-4d1e-4cbb-a6e7-5e2233869fac.png --output-document aachen_000000_000019_gtFine_labelTrainIds.png + +wget https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x1024_40k_cityscapes/ann_r50-d8_512x1024_40k_cityscapes_20200605_095211-049fc292.pth + +``` + +```python +# Copyright (c) OpenMMLab. All rights reserved. +from argparse import ArgumentParser +from typing import Type + +import mmcv +import torch +import torch.nn as nn + +from mmengine.model import revert_sync_batchnorm +from mmengine.structures import PixelData +from mmseg.apis import inference_model, init_model +from mmseg.structures import SegDataSample +from mmseg.utils import register_all_modules +from mmseg.visualization import SegLocalVisualizer + + +class Recorder: + """record the forward output feature map and save to data_buffer.""" + + def __init__(self) -> None: + self.data_buffer = list() + + def __enter__(self, ): + self._data_buffer = list() + + def record_data_hook(self, model: nn.Module, input: Type, output: Type): + self.data_buffer.append(output) + + def __exit__(self, *args, **kwargs): + pass + + +def visualize(args, model, recorder, result): + seg_visualizer = SegLocalVisualizer( + vis_backends=[dict(type='WandbVisBackend')], + save_dir='temp_dir', + alpha=0.5) + seg_visualizer.dataset_meta = dict( + classes=model.dataset_meta['classes'], + palette=model.dataset_meta['palette']) + + image = mmcv.imread(args.img, 'color') + + seg_visualizer.add_datasample( + name='predict', + image=image, + data_sample=result, + draw_gt=False, + draw_pred=True, + wait_time=0, + out_file=None, + show=False) + + # add feature map to wandb visualizer + for i in range(len(recorder.data_buffer)): + feature = recorder.data_buffer[i][0] # remove the batch + drawn_img = seg_visualizer.draw_featmap( + feature, image, channel_reduction='select_max') + seg_visualizer.add_image(f'feature_map{i}', drawn_img) + + if args.gt_mask: + sem_seg = mmcv.imread(args.gt_mask, 'unchanged') + sem_seg = torch.from_numpy(sem_seg) + gt_mask = dict(data=sem_seg) + gt_mask = PixelData(**gt_mask) + data_sample = SegDataSample() + data_sample.gt_sem_seg = gt_mask + + seg_visualizer.add_datasample( + name='gt_mask', + image=image, + data_sample=data_sample, + draw_gt=True, + draw_pred=False, + wait_time=0, + out_file=None, + show=False) + + seg_visualizer.add_image('image', image) + + +def main(): + parser = ArgumentParser( + description='Draw the Feature Map During Inference') + parser.add_argument('img', help='Image file') + parser.add_argument('config', help='Config file') + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument('--gt_mask', default=None, help='Path of gt mask file') + parser.add_argument('--out-file', default=None, help='Path to output file') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + parser.add_argument( + '--opacity', + type=float, + default=0.5, + help='Opacity of painted segmentation map. 
In (0, 1] range.') + parser.add_argument( + '--title', default='result', help='The image identifier.') + args = parser.parse_args() + + register_all_modules() + + # build the model from a config file and a checkpoint file + model = init_model(args.config, args.checkpoint, device=args.device) + if args.device == 'cpu': + model = revert_sync_batchnorm(model) + + # show all named module in the model and use it in source list below + for name, module in model.named_modules(): + print(name) + + source = [ + 'decode_head.fusion.stages.0.query_project.activate', + 'decode_head.context.stages.0.key_project.activate', + 'decode_head.context.bottleneck.activate' + ] + source = dict.fromkeys(source) + + count = 0 + recorder = Recorder() + # registry the forward hook + for name, module in model.named_modules(): + if name in source: + count += 1 + module.register_forward_hook(recorder.record_data_hook) + if count == len(source): + break + + with recorder: + # test a single image, and record feature map to data_buffer + result = inference_model(model, args.img) + + visualize(args, model, recorder, result) + + +if __name__ == '__main__': + main() + +``` + +Save the above code as feature_map_visual.py and execute the following code in terminal + +```shell +python feature_map_visual.py ${image} ${config} ${checkpoint} [optional args] +``` + +e.g + +```shell +python feature_map_visual.py \ +aachen_000000_000019_leftImg8bit.png \ +configs/ann/ann_r50-d8_4xb2-40k_cityscapes-512x1024.py \ +ann_r50-d8_512x1024_40k_cityscapes_20200605_095211-049fc292.pth \ +--gt_mask aachen_000000_000019_gtFine_labelTrainIds.png +``` + +The visualized image result and its corresponding feature map will appear in the wandb account. + +
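+One caveat about the script above: as written, `Recorder.__enter__` assigns `self._data_buffer` (note the leading underscore) while the hook appends to `self.data_buffer`, so entering the context manager does not actually reset the buffer, and repeated runs in the same process would keep accumulating feature maps. A corrected sketch of the context-manager part, keeping the same design:
+
+```python
+class Recorder:
+    """Record the forward output feature maps into data_buffer."""
+
+    def __init__(self) -> None:
+        self.data_buffer = []
+
+    def __enter__(self):
+        self.data_buffer = []  # reset the buffer that the hook appends to
+        return self
+
+    def record_data_hook(self, model, input, output):
+        self.data_buffer.append(output)
+
+    def __exit__(self, *args, **kwargs):
+        pass
+```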
diff --git a/docs/zh_cn/advanced_guides/add_datasets.md b/docs/zh_cn/advanced_guides/add_datasets.md new file mode 100644 index 0000000000..22fbf3462f --- /dev/null +++ b/docs/zh_cn/advanced_guides/add_datasets.md @@ -0,0 +1,199 @@ +# 新增自定义数据集 + +## 新增自定义数据集 + +在这里,我们展示如何构建一个新的数据集。 + +1. 创建一个新文件 `mmseg/datasets/example.py` + + ```python + from mmseg.registry import DATASETS + from .basesegdataset import BaseSegDataset + + + @DATASETS.register_module() + class ExampleDataset(BaseSegDataset): + + METAINFO = dict( + classes=('xxx', 'xxx', ...), + palette=[[x, x, x], [x, x, x], ...]) + + def __init__(self, aeg1, arg2): + pass + ``` + +2. 在 `mmseg/datasets/__init__.py` 中导入模块 + + ```python + from .example import ExampleDataset + ``` + +3. 通过创建一个新的数据集配置文件 `configs/_base_/datasets/example_dataset.py` 来使用它 + + ```python + dataset_type = 'ExampleDataset' + data_root = 'data/example/' + ... + ``` + +4. 在 `mmseg/utils/class_names.py` 中补充数据集元信息 + + ```python + def example_classes(): + return [ + 'xxx', 'xxx', + ... + ] + + def example_palette(): + return [ + [x, x, x], [x, x, x], + ... + ] + dataset_aliases ={ + 'example': ['example', ...], + ... + } + ``` + +**注意:** 如果新数据集不满足 mmseg 的要求,则需要在 `tools/dataset_converters/` 中准备一个数据集预处理脚本 + +## 通过重新组织数据来定制数据集 + +最简单的方法是将您的数据集进行转化,并组织成文件夹的形式。 + +如下的文件结构就是一个例子。 + +```none +├── data +│ ├── my_dataset +│ │ ├── img_dir +│ │ │ ├── train +│ │ │ │ ├── xxx{img_suffix} +│ │ │ │ ├── yyy{img_suffix} +│ │ │ │ ├── zzz{img_suffix} +│ │ │ ├── val +│ │ ├── ann_dir +│ │ │ ├── train +│ │ │ │ ├── xxx{seg_map_suffix} +│ │ │ │ ├── yyy{seg_map_suffix} +│ │ │ │ ├── zzz{seg_map_suffix} +│ │ │ ├── val + +``` + +一个训练对将由 img_dir/ann_dir 里同样首缀的文件组成。 + +有些数据集不会发布测试集或测试集的标注,如果没有测试集的标注,我们就无法在本地进行评估模型,因此我们在配置文件中将验证集设置为默认测试集。 + +关于如何构建自己的数据集或实现新的数据集类,请参阅[数据集指南](./datasets.md)以获取更多详细信息。 + +**注意:** 标注是跟图像同样的形状 (H, W),其中的像素值的范围是 `[0, num_classes - 1]`。 +您也可以使用 [pillow](https://pillow.readthedocs.io/en/stable/handbook/concepts.html#palette) 的 `'P'` 模式去创建包含颜色的标注。 + +## 通过混合数据去定制数据集 + +MMSegmentation 同样支持混合数据集去训练。 +当前它支持拼接 (concat), 重复 (repeat) 和多图混合 (multi-image mix) 数据集。 + +### 重复数据集 + +我们使用 `RepeatDataset` 作为包装 (wrapper) 去重复数据集。 +例如,假设原始数据集是 `Dataset_A`,为了重复它,配置文件如下: + +```python +dataset_A_train = dict( + type='RepeatDataset', + times=N, + dataset=dict( # 这是 Dataset_A 数据集的原始配置 + type='Dataset_A', + ... + pipeline=train_pipeline + ) +) +``` + +### 拼接数据集 + +如果要拼接不同的数据集,可以按如下方式连接数据集配置。 + +```python +dataset_A_train = dict() +dataset_B_train = dict() +concatenate_dataset = dict( + type='ConcatDataset', + datasets=[dataset_A_train, dataset_B_train]) +``` + +下面是一个更复杂的示例,它分别重复 `Dataset_A` 和 `Dataset_B` N 次和 M 次,然后连接重复的数据集。 + +```python +dataset_A_train = dict( + type='RepeatDataset', + times=N, + dataset=dict( + type='Dataset_A', + ... + pipeline=train_pipeline + ) +) +dataset_A_val = dict( + ... + pipeline=test_pipeline +) +dataset_A_test = dict( + ... + pipeline=test_pipeline +) +dataset_B_train = dict( + type='RepeatDataset', + times=M, + dataset=dict( + type='Dataset_B', + ... 
+ pipeline=train_pipeline + ) +) +train_dataloader = dict( + dataset=dict( + type='ConcatDataset', + datasets=[dataset_A_train, dataset_B_train])) + +val_dataloader = dict(dataset=dataset_A_val) +test_dataloader = dict(dataset=dataset_A_test) + +``` + +您可以参考 mmengine 的基础数据集[教程](https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/basedataset.html)以了解更多详细信息 + +### 多图混合集 + +我们使用 `MultiImageMixDataset` 作为包装(wrapper)去混合多个数据集的图片。 +`MultiImageMixDataset`可以被类似 mosaic 和 mixup 的多图混合数据増广使用。 + +`MultiImageMixDataset` 与 `Mosaic` 数据増广一起使用的例子: + +```python +train_pipeline = [ + dict(type='RandomMosaic', prob=1), + dict(type='Resize', img_scale=(1024, 512), keep_ratio=True), + dict(type='RandomFlip', prob=0.5), + dict(type='PackSegInputs') +] + +train_dataset = dict( + type='MultiImageMixDataset', + dataset=dict( + type=dataset_type, + reduce_zero_label=False, + img_dir=data_root + "images/train", + ann_dir=data_root + "annotations/train", + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + ] + ), + pipeline=train_pipeline +) + +``` diff --git a/docs/zh_cn/advanced_guides/add_metrics.md b/docs/zh_cn/advanced_guides/add_metrics.md new file mode 100644 index 0000000000..0637b44728 --- /dev/null +++ b/docs/zh_cn/advanced_guides/add_metrics.md @@ -0,0 +1,81 @@ +# 新增评测指标 + +## 使用 MMSegmentation 的源代码进行开发 + +在这里,我们用 `CustomMetric` 作为例子来展示如何开发一个新的评测指标。 + +1. 创建一个新文件 `mmseg/evaluation/metrics/custom_metric.py`。 + + ```python + from typing import List, Sequence + + from mmengine.evaluator import BaseMetric + + from mmseg.registry import METRICS + + + @METRICS.register_module() + class CustomMetric(BaseMetric): + + def __init__(self, arg1, arg2): + """ + The metric first processes each batch of data_samples and predictions, + and appends the processed results to the results list. Then it + collects all results together from all ranks if distributed training + is used. Finally, it computes the metrics of the entire dataset. + """ + + def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None: + pass + + def compute_metrics(self, results: list) -> dict: + pass + + def evaluate(self, size: int) -> dict: + pass + ``` + + 在上面的示例中,`CustomMetric` 是 `BaseMetric` 的子类。它有三个方法:`process`,`compute_metrics` 和 `evaluate`。 + + - `process()` 处理一批数据样本和预测。处理后的结果需要显示地传给 `self.results` ,将在处理所有数据样本后用于计算指标。更多细节请参考 [MMEngine 文档](https://github.com/open-mmlab/mmengine/blob/main/docs/zh_cn/design/evaluation.md) + + - `compute_metrics()` 用于从处理后的结果中计算指标。 + + - `evaluate()` 是一个接口,用于计算指标并返回结果。它将由 `ValLoop` 或 `TestLoop` 在 `Runner` 中调用。在大多数情况下,您不需要重写此方法,但如果您想做一些额外的工作,可以重写它。 + + **注意:** 您可以在[这里](https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py#L366) 找到 `Runner` 调用 `evaluate()` 方法的过程。`Runner` 是训练和测试过程的执行器,您可以在[训练引擎文档](./engine.md)中找到有关它的详细信息。 + +2. 在 `mmseg/evaluation/metrics/__init__.py` 中导入新的指标。 + + ```python + from .custom_metric import CustomMetric + __all__ = ['CustomMetric', ...] + ``` + +3. 在配置文件中设置新的评测指标 + + ```python + val_evaluator = dict(type='CustomMetric', arg1=xxx, arg2=xxx) + test_evaluator = dict(type='CustomMetric', arg1=xxx, arg2=xxx) + ``` + +## 使用发布版本的 MMSegmentation 进行开发 + +上面的示例展示了如何使用 MMSegmentation 的源代码开发新指标。如果您想使用 MMSegmentation 的发布版本开发新指标,可以按照以下步骤操作。 + +1. 创建一个新文件 `/Path/to/metrics/custom_metric.py`,实现 `process`,`compute_metrics` 和 `evaluate` 方法,`evaluate` 方法是可选的。 + +2. 
在代码或配置文件中导入新的指标。 + + ```python + from path.to.metrics import CustomMetric + ``` + + 或者 + + ```python + custom_imports = dict(imports=['/Path/to/metrics'], allow_failed_imports=False) + + val_evaluator = dict(type='CustomMetric', arg1=xxx, arg2=xxx) + test_evaluator = dict(type='CustomMetric', arg1=xxx, arg2=xxx) + ``` diff --git a/docs/zh_cn/advanced_guides/add_models.md b/docs/zh_cn/advanced_guides/add_models.md new file mode 100644 index 0000000000..e05c07c8ba --- /dev/null +++ b/docs/zh_cn/advanced_guides/add_models.md @@ -0,0 +1,260 @@ +# 新增模块 + +## 开发新组件 + +我们可以自定义 [模型文档](./models.md) 中介绍的所有组件,例如**主干网络(backbone)**、**头(head)**、**损失函数(loss function)**和**数据预处理器(data preprocessor)**。 + +### 添加新的主干网络(backbone) + +在这里,我们以 MobileNet 为例展示如何开发新的主干网络。 + +1. 创建一个新文件 `mmseg/models/backbones/mobilenet.py`。 + + ```python + import torch.nn as nn + + from mmseg.registry import MODELS + + + @MODELS.register_module() + class MobileNet(nn.Module): + + def __init__(self, arg1, arg2): + pass + + def forward(self, x): # should return a tuple + pass + + def init_weights(self, pretrained=None): + pass + ``` + +2. 在 `mmseg/models/backbones/__init__.py` 中引入模块。 + + ```python + from .mobilenet import MobileNet + ``` + +3. 在配置文件中使用它。 + + ```python + model = dict( + ... + backbone=dict( + type='MobileNet', + arg1=xxx, + arg2=xxx), + ... + ``` + +### 添加新的头(head) + +在 MMSegmentation 中,我们提供 [BaseDecodeHead](https://github.com/open-mmlab/mmsegmentation/blob/1.x/mmseg/models/decode_heads/decode_head.py#L17) 用于开发所有分割头。 +所有新实现的解码头都应该从中派生出来。 +接下来我们以 [PSPNet](https://arxiv.org/abs/1612.01105) 为例说明如何开发新的头。 + +首先,在 `mmseg/models/decode_heads/psp_head.py` 中添加一个新的解码头。 +PSPNet 实现了用于分割解码的解码头。 +为了实现解码头,在新模块中我们需要执行以下三个函数。 + +```python +from mmseg.registry import MODELS + +@MODELS.register_module() +class PSPHead(BaseDecodeHead): + + def __init__(self, pool_scales=(1, 2, 3, 6), **kwargs): + super(PSPHead, self).__init__(**kwargs) + + def init_weights(self): + pass + + def forward(self, inputs): + pass +``` + +接下来,用户需要在 `mmseg/models/decode_heads/__init__.py` 中添加模块,这样相应的注册器就可以找到并加载它们。 + +PSPNet 的配置文件如下 + +```python +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained='pretrain_model/resnet50_v1c_trick-2cccc1ad.pth', + backbone=dict( + type='ResNetV1c', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + dilations=(1, 1, 2, 4), + strides=(1, 2, 1, 1), + norm_cfg=norm_cfg, + norm_eval=False, + style='pytorch', + contract_dilation=True), + decode_head=dict( + type='PSPHead', + in_channels=2048, + in_index=3, + channels=512, + pool_scales=(1, 2, 3, 6), + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0))) + +``` + +### 添加新的损失函数(loss) + +假设您想为分割解码添加一个叫做 `MyLoss` 的新的损失函数。 +要添加新的损失函数,用户需要在 `mmseg/models/loss/my_loss.py` 中实现它。 +修饰器 `weighted_loss` 可以对损失的每个元素进行加权。 + +```python +import torch +import torch.nn as nn + +from mmseg.registry import MODELS +from .utils import weighted_loss + +@weighted_loss +def my_loss(pred, target): + assert pred.size() == target.size() and target.numel() > 0 + loss = torch.abs(pred - target) + return loss + +@MODELS.register_module() +class MyLoss(nn.Module): + + def __init__(self, reduction='mean', loss_weight=1.0): + super(MyLoss, self).__init__() + self.reduction = reduction + self.loss_weight = loss_weight + + def forward(self, + pred, + target, + weight=None, + avg_factor=None, + reduction_override=None): + assert reduction_override in 
(None, 'none', 'mean', 'sum') + reduction = ( + reduction_override if reduction_override else self.reduction) + loss = self.loss_weight * my_loss( + pred, target, weight, reduction=reduction, avg_factor=avg_factor) + return loss +``` + +然后,用户需要将其添加到 `mmseg/models/loss/__init__.py` 中。 + +```python +from .my_loss import MyLoss, my_loss + +``` + +要使用它,请修改 `loss_xx` 字段。 +然后需要修改头中的 `loss_decode` 字段。 +`loss_weight` 可用于平衡多重损失。 + +```python +loss_decode=dict(type='MyLoss', loss_weight=1.0)) +``` + +### 添加新的数据预处理器(data preprocessor) + +在 MMSegmentation 1.x 版本中,我们使用 [SegDataPreProcessor](https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/models/data_preprocessor.py#L13) 将数据复制到目标设备,并将数据预处理为默认的模型输入格式。这里我们将展示如何开发一个新的数据预处理器。 + +1. 创建一个新文件 `mmseg/models/my_datapreprocessor.py`。 + + ```python + from mmengine.model import BaseDataPreprocessor + + from mmseg.registry import MODELS + + @MODELS.register_module() + class MyDataPreProcessor(BaseDataPreprocessor): + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def forward(self, data: dict, training: bool=False) -> Dict[str, Any]: + # TODO Define the logic for data pre-processing in the forward method + pass + ``` + +2. 在 `mmseg/models/__init__.py` 中导入数据预处理器 + + ```python + from .my_datapreprocessor import MyDataPreProcessor + ``` + +3. 在配置文件中使用它。 + + ```python + model = dict( + data_preprocessor=dict(type='MyDataPreProcessor) + ... + ) + ``` + +## 开发新的分割器(segmentor) + +分割器是一种户可以通过添加自定义组件和定义算法执行逻辑来自定义其算法的算法架构。请参考[模型文档](./models.md)了解更多详情。 + +由于 MMSegmentation 中的 [BaseSegmenter](https://github.com/open-mmlab/mmsegmentation/blob/1.x/mmseg/models/segmentors/base.py#L15) 统一了前向过程的三种模式,为了开发新的分割器,用户需要重写与 `loss`、`predict` 和 `tensor` 相对应的 `loss`、`predict` 和 `_forward` 方法。 + +这里我们将展示如何开发一个新的分割器。 + +1. 创建一个新文件 `mmseg/models/segmentors/my_segmentor.py`。 + + ```python + from typing import Dict, Optional, Union + + import torch + + from mmseg.registry import MODELS + from mmseg.models import BaseSegmentor + + @MODELS.register_module() + class MySegmentor(BaseSegmentor): + def __init__(self, **kwargs): + super().__init__(**kwargs) + # TODO users should build components of the network here + + def loss(self, inputs: Tensor, data_samples: SampleList) -> dict: + """Calculate losses from a batch of inputs and data samples.""" + pass + + def predict(self, inputs: Tensor, data_samples: OptSampleList=None) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing.""" + pass + + def _forward(self, + inputs: Tensor, + data_samples: OptSampleList = None) -> Tuple[List[Tensor]]: + """Network forward process. + + Usually includes backbone, neck and head forward without any post- + processing. + """ + pass + ``` + +2. 在 `mmseg/models/segmentors/__init__.py` 中导入分割器。 + + ```python + from .my_segmentor import MySegmentor + ``` + +3. 在配置文件中使用它。 + + ```python + model = dict( + type='MySegmentor' + ... 
+ ) + ``` diff --git a/docs/zh_cn/advanced_guides/add_transforms.md b/docs/zh_cn/advanced_guides/add_transforms.md new file mode 100644 index 0000000000..d7206680d3 --- /dev/null +++ b/docs/zh_cn/advanced_guides/add_transforms.md @@ -0,0 +1,51 @@ +# 新增数据增强 + +## 自定义数据增强 + +自定义数据增强必须继承 `BaseTransform` 并实现 `transform` 函数。这里我们使用一个简单的翻转变换作为示例: + +```python +import random +import mmcv +from mmcv.transforms import BaseTransform, TRANSFORMS + +@TRANSFORMS.register_module() +class MyFlip(BaseTransform): + def __init__(self, direction: str): + super().__init__() + self.direction = direction + + def transform(self, results: dict) -> dict: + img = results['img'] + results['img'] = mmcv.imflip(img, direction=self.direction) + return results +``` + +此外,新的类需要被导入。 + +```python +from .my_pipeline import MyFlip +``` + +这样,我们就可以实例化一个 `MyFlip` 对象并使用它来处理数据字典。 + +```python +import numpy as np + +transform = MyFlip(direction='horizontal') +data_dict = {'img': np.random.rand(224, 224, 3)} +data_dict = transform(data_dict) +processed_img = data_dict['img'] +``` + +或者,我们可以在配置文件中的数据流程中使用 `MyFlip` 变换。 + +```python +pipeline = [ + ... + dict(type='MyFlip', direction='horizontal'), + ... +] +``` + +需要注意,如果要在配置文件中使用 `MyFlip`,必须确保在运行时导入了包含 `MyFlip` 的文件。 diff --git a/docs/zh_cn/advanced_guides/contribute_dataset.md b/docs/zh_cn/advanced_guides/contribute_dataset.md new file mode 100644 index 0000000000..4222de32a6 --- /dev/null +++ b/docs/zh_cn/advanced_guides/contribute_dataset.md @@ -0,0 +1,461 @@ +# 在 mmsegmentation projects 中贡献一个标准格式的数据集 + +- 在开始您的贡献流程前,请先阅读[《OpenMMLab 贡献代码指南》](https://mmcv.readthedocs.io/zh_CN/latest/community/contributing.html),以详细的了解 OpenMMLab 代码库的代码贡献流程。 +- 该教程以 [Gaofen Image Dataset (GID)](https://www.sciencedirect.com/science/article/pii/S0034425719303414) 高分 2 号卫星所拍摄的遥感图像语义分割数据集作为样例,来演示在 mmsegmentation 中的数据集贡献流程。 + +## 步骤 1: 配置 mmsegmentation 开发所需必要环境 + +- 开发所必需的环境安装请参考[中文快速入门指南](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/zh_cn/get_started.md)或[英文 get_started](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/en/get_started.md)。 + +- 如果您已安装了最新版的 pytorch、mmcv、mmengine,那么您可以跳过步骤 1 至[步骤 2](<#[步骤-2](#%E6%AD%A5%E9%AA%A4-2%E4%BB%A3%E7%A0%81%E8%B4%A1%E7%8C%AE%E5%89%8D%E7%9A%84%E5%87%86%E5%A4%87%E5%B7%A5%E4%BD%9C)>)。 + +- **注:** 在此处无需安装 mmsegmentation,只需安装开发 mmsegmentation 所必需的 pytorch、mmcv、mmengine 等即可。 + +**新建虚拟环境(如已有合适的开发环境,可跳过)** + +- 从[官方网站](https://docs.conda.io/en/latest/miniconda.html)下载并安装 Miniconda +- 创建一个 conda 环境,并激活 + +```shell +conda create --name openmmlab python=3.8 -y +conda activate openmmlab +``` + +**安装 pytorch (如环境下已安装 pytorch,可跳过)** + +- 参考 [official instructions](https://pytorch.org/get-started/locally/) 安装 **PyTorch** + +**使用 mim 安装 mmcv、mmengine** + +- 使用 [MIM](https://github.com/open-mmlab/mim) 安装 [MMCV](https://github.com/open-mmlab/mmcv) + +```shell +pip install -U openmim +mim install mmengine +mim install "mmcv>=2.0.0" +``` + +## 步骤 2:代码贡献前的准备工作 + +### 2.1 Fork mmsegmentation 仓库 + +- 通过浏览器打开[mmsegmentation 官方仓库](https://github.com/open-mmlab/mmsegmentation/tree/main)。 +- 登录您的 GitHub 账户,以下步骤均需在 GitHub 登录的情况下进行。 +- Fork mmsegmentation 仓库 + ![image](https://user-images.githubusercontent.com/50650583/233825567-b8bf273c-38f5-4487-b4c6-75ede1e283ee.png) +- Fork 之后,mmsegmentation 仓库将会出现在您的个人仓库中。 + +### 2.2 在您的代码编写软件中 git clone mmsegmentation + +这里以 VSCODE 为例 + +- 打开 VSCODE,新建终端窗口并激活您在[步骤 1 ](#%E6%AD%A5%E9%AA%A4-1-%E9%85%8D%E7%BD%AE-mmsegmentation-%E5%BC%80%E5%8F%91%E6%89%80%E9%9C%80%E5%BF%85%E8%A6%81%E7%8E%AF%E5%A2%83)中所安装的虚拟环境。 +- 在您 GitHub 的个人仓库中找到您 
Fork 的 mmsegmentation 仓库,复制其链接。 + ![image](https://github.com/AI-Tianlong/OpenMMLabCamp/assets/50650583/92ad555b-c5b2-4a7f-a800-ebee1e405ab6) +- 在终端中执行命令 + ```bash + git clone {您所复制的个人仓库的链接} + ``` + ![image](https://github.com/AI-Tianlong/OpenMMLabCamp/assets/50650583/23ba2636-e66f-4ea5-9077-9dd6b69deb1d) + **注:** 如提示以下信息,请在 GitHub 中添加 [SSH 秘钥](https://docs.github.com/en/authentication/connecting-to-github-with-ssh/generating-a-new-ssh-key-and-adding-it-to-the-ssh-agent) + ![image](https://github.com/AI-Tianlong/OpenMMLabCamp/assets/50650583/6fcab213-0739-483c-b345-c59656027377) +- 进入 mmsegmentation 目录(之后的操作均在 mmsegmentation 目录下)。 + ```bash + cd mmsegmentation + ``` +- 在终端中执行以下命令,添加官方仓库为上游仓库。 + ```bash + git remote add upstream git@github.com:open-mmlab/mmsegmentation.git + ``` +- 使用以下命令检查 remote 是否添加成功。 + ```bash + git remote -v + ``` + ![image](https://github.com/AI-Tianlong/OpenMMLabCamp/assets/50650583/beec7e5e-2b00-4e49-ab38-f0c79e346594) + +### 2.3 切换目录至 mmsegmentation 并从源码安装mmsegmentation + +在`mmsegmentation`目录下执行`pip install -v -e .`,通过源码构建方式安装 mmsegmentaion 库。 +安装完成后,您将能看到如下图所示的文件树。 +image + +### 2.4 切换分支为 dev-1.x + +正如您在[ mmsegmentation 官网](https://github.com/open-mmlab/mmsegmentation/tree/main)所见,该仓库有许多分支,默认分支`main`为稳定的发行版本,以及用于贡献者进行开发的`dev-1.x`分支。`dev-1.x`分支是贡献者们用来提交创意和 PR 的分支,`dev-1.x`分支的内容会被周期性的合入到`main`分支。 +![image](https://user-images.githubusercontent.com/50650583/233826225-f4b7299d-de23-47db-900d-dfb01ba0efc3.png) + +回到 VSCODE 中,在终端执行命令 + +```bash +git checkout dev-1.x +``` + +### 2.5 创新属于自己的新分支 + +在基于`dev-1.x`分支下,使用如下命令,创建属于您自己的分支。 + +```bash +# git checkout -b 您的GitHubID/您的分支想要实现的功能的名字 +# git checkout -b AI-Tianlong/support_GID_dataset +git checkout -b {您的GitHubID/您的分支想要实现的功能的名字} +``` + +### 2.6 配置 pre-commit + +OpenMMLab 仓库对代码质量有着较高的要求,所有提交的 PR 必须要通过代码格式检查。pre-commit 详细配置参阅[配置 pre-commit](https://mmcv.readthedocs.io/zh_CN/latest/community/contributing.html#pre-commit)。 + +## 步骤 3:在`mmsegmentation/projects`下贡献您的代码 + +**先对 GID 数据集进行分析** + +这里以贡献高分 2 号遥感图像语义分割数据集 GID 为例,GID 数据集是由我国自主研发的高分 2 号卫星所拍摄的光学遥感图像所创建,经图像预处理后共提供了 150 张 6800x7200 像素的 RGB 三通道遥感图像。并提供了两种不同类别数的数据标注,一种是包含 5 类有效物体的 RGB 标签,另一种是包含 15 类有效物体的 RGB 标签。本教程将针对 5 类标签进行数据集贡献流程讲解。 + +GID 的 5 类有效标签分别为:0-背景-\[0,0,0\](mask 标签值-标签名称-RGB 标签值)、1-建筑-\[255,0,0\]、2-农田-\[0,255,0\]、3-森林-\[0,0,255\]、4-草地-\[255,255,0\]、5-水-\[0,0,255\]。在语义分割任务中,标签是与原图尺寸一致的单通道图像,标签图像中的像素值为真实样本图像中对应像素所包含的物体的类别。GID 数据集提供的是具有 RGB 三通道的彩色标签,为了模型的训练需要将 RGB 标签转换为 mask 标签。并且由于图像尺寸为 6800x7200 像素,对于神经网络的训练来有些过大,所以将每张图像裁切成了没有重叠的 512x512 的图像以便进行训练。 +image + +### 3.1 在`mmsegmentation/projects`下创建新的项目文件夹 + +在`mmsegmentation/projects`下创建文件夹`gid_dataset` +![image](https://user-images.githubusercontent.com/50650583/233829687-8f2b6600-bc9d-48ff-a865-d462af54d55a.png) + +### 3.2 贡献您的数据集代码 + +为了最终能将您在 projects 中贡献的代码更加顺畅的移入核心库中(对代码要求质量更高),非常建议按照核心库的目录来编辑您的数据集文件。 +关于数据集有 4 个必要的文件: + +- **1** `mmseg/datasets/gid.py` 定义了数据集的尾缀、CLASSES、PALETTE、reduce_zero_label等 +- **2** `configs/_base_/gid.py` GID 数据集的配置文件,定义了数据集的`dataset_type`(数据集类型,`mmseg/datasets/gid.py`中注册的数据集的类名)、`data_root`(数据集所在的根目录,建议将数据集通过软连接的方式将数据集放至`mmsegmentation/data`)、`train_pipline`(训练的数据流)、`test_pipline`(测试和验证时的数据流)、`img_rations`(多尺度预测时的多尺度配置)、`tta_pipeline`(多尺度预测)、`train_dataloader`(训练集的数据加载器)、`val_dataloader`(验证集的数据加载器)、`test_dataloader`(测试集的数据加载器)、`val_evaluator`(验证集的评估器)、`test_evaluator`(测试集的评估器)。 +- **3** 使用了 GID 数据集的模型训练配置文件 + 
这个是可选的,但是强烈建议您添加。在核心库中,所贡献的数据集需要和参考文献中所提出的结果精度对齐,为了后期将您贡献的代码合并入核心库。如您的算力充足,最好能提供对应的模型配置文件在您贡献的数据集上所验证的结果以及相应的权重文件,并撰写较为详细的README.md文档。[示例参考结果](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/deeplabv3plus#mapillary-vistas-v12) + ![image](https://user-images.githubusercontent.com/50650583/233877682-eabe8723-bce9-40e4-a303-08c8385cb6b5.png) +- **4** 使用如下命令格式: 撰写`docs/zh_cn/user_guides/2_dataset_prepare.md`来添加您的数据集介绍,包括但不限于数据集的下载方式,数据集目录结构、数据集生成等一些必要性的文字性描述和运行命令。以更好地帮助用户能更快的实现数据集的准备工作。 + +### 3.3 贡献`tools/dataset_converters/gid.py` + +由于 GID 数据集是由未经过切分的 6800x7200 图像所构成的数据集,并且没有划分训练集、验证集与测试集。以及其标签为 RGB 彩色标签,需要将标签转换为单通道的 mask label。为了方便训练,首先将 GID 数据集进行裁切和标签转换,并进行数据集划分,构建为 mmsegmentation 所支持的格式。 + +```python +# tools/dataset_converters/gid.py +import argparse +import glob +import math +import os +import os.path as osp +from PIL import Image + +import mmcv +import numpy as np +from mmengine.utils import ProgressBar, mkdir_or_exist + +def parse_args(): + parser = argparse.ArgumentParser( + description='Convert GID dataset to mmsegmentation format') + parser.add_argument('dataset_img_path', help='GID images folder path') + parser.add_argument('dataset_label_path', help='GID labels folder path') + parser.add_argument('--tmp_dir', help='path of the temporary directory') + parser.add_argument('-o', '--out_dir', help='output path', default='data/gid') + parser.add_argument( + '--clip_size', + type=int, + help='clipped size of image after preparation', + default=256) + parser.add_argument( + '--stride_size', + type=int, + help='stride of clipping original images', + default=256) + args = parser.parse_args() + return args + +GID_COLORMAP = dict( + Background=(0, 0, 0), #0-背景-黑色 + Building=(255, 0, 0), #1-建筑-红色 + Farmland=(0, 255, 0), #2-农田-绿色 + Forest=(0, 0, 255), #3-森林-蓝色 + Meadow=(255, 255, 0),#4-草地-黄色 + Water=(0, 0, 255)#5-水-蓝色 +) +palette = list(GID_COLORMAP.values()) +classes = list(GID_COLORMAP.keys()) + +#############用列表来存一个 RGB 和一个类别的对应################ +def colormap2label(palette): + colormap2label_list = np.zeros(256**3, dtype = np.longlong) + for i, colormap in enumerate(palette): + colormap2label_list[(colormap[0] * 256 + colormap[1])*256+colormap[2]] = i + return colormap2label_list + +#############给定那个列表,和vis_png然后生成masks_png################ +def label_indices(RGB_label, colormap2label_list): + RGB_label = RGB_label.astype('int32') + idx = (RGB_label[:, :, 0] * 256 + RGB_label[:, :, 1]) * 256 + RGB_label[:, :, 2] + # print(idx.shape) + return colormap2label_list[idx] + +def RGB2mask(RGB_label, colormap2label_list): + # RGB_label = np.array(Image.open(RGB_label).convert('RGB')) #打开RGB_png + mask_label = label_indices(RGB_label, colormap2label_list) # .numpy() + return mask_label + +colormap2label_list = colormap2label(palette) + +def clip_big_image(image_path, clip_save_dir, args, to_label=False): + """ + Original image of GID dataset is very large, thus pre-processing + of them is adopted. Given fixed clip size and stride size to generate + clipped image, the intersection of width and height is determined. + For example, given one 6800 x 7200 original image, the clip size is + 256 and stride size is 256, thus it would generate 29 x 27 = 783 images + whose size are all 256 x 256. 
+ + """ + + image = mmcv.imread(image_path, channel_order='rgb') + # image = mmcv.bgr2gray(image) + + h, w, c = image.shape + clip_size = args.clip_size + stride_size = args.stride_size + + num_rows = math.ceil((h - clip_size) / stride_size) if math.ceil( + (h - clip_size) / + stride_size) * stride_size + clip_size >= h else math.ceil( + (h - clip_size) / stride_size) + 1 + num_cols = math.ceil((w - clip_size) / stride_size) if math.ceil( + (w - clip_size) / + stride_size) * stride_size + clip_size >= w else math.ceil( + (w - clip_size) / stride_size) + 1 + + x, y = np.meshgrid(np.arange(num_cols + 1), np.arange(num_rows + 1)) + xmin = x * clip_size + ymin = y * clip_size + + xmin = xmin.ravel() + ymin = ymin.ravel() + xmin_offset = np.where(xmin + clip_size > w, w - xmin - clip_size, + np.zeros_like(xmin)) + ymin_offset = np.where(ymin + clip_size > h, h - ymin - clip_size, + np.zeros_like(ymin)) + boxes = np.stack([ + xmin + xmin_offset, ymin + ymin_offset, + np.minimum(xmin + clip_size, w), + np.minimum(ymin + clip_size, h) + ], axis=1) + + if to_label: + image = RGB2mask(image, colormap2label_list) #这里得改一下 + + for count, box in enumerate(boxes): + start_x, start_y, end_x, end_y = box + clipped_image = image[start_y:end_y, + start_x:end_x] if to_label else image[ + start_y:end_y, start_x:end_x, :] + img_name = osp.basename(image_path).replace('.tif', '') + img_name = img_name.replace('_label', '') + if count % 3 == 0: + mmcv.imwrite( + clipped_image.astype(np.uint8), + osp.join( + clip_save_dir.replace('train', 'val'), + f'{img_name}_{start_x}_{start_y}_{end_x}_{end_y}.png')) + else: + mmcv.imwrite( + clipped_image.astype(np.uint8), + osp.join( + clip_save_dir, + f'{img_name}_{start_x}_{start_y}_{end_x}_{end_y}.png')) + count += 1 + +def main(): + args = parse_args() + + """ + According to this paper: https://ieeexplore.ieee.org/document/9343296/ + select 15 images contained in GID, , which cover the whole six + categories, to generate train set and validation set. + + According to Paper: https://ieeexplore.ieee.org/document/9343296/ + + """ + + if args.out_dir is None: + out_dir = osp.join('data', 'gid') + else: + out_dir = args.out_dir + + print('Making directories...') + mkdir_or_exist(osp.join(out_dir, 'img_dir', 'train')) + mkdir_or_exist(osp.join(out_dir, 'img_dir', 'val')) + mkdir_or_exist(osp.join(out_dir, 'ann_dir', 'train')) + mkdir_or_exist(osp.join(out_dir, 'ann_dir', 'val')) + + src_path_list = glob.glob(os.path.join(args.dataset_img_path, '*.tif')) + print(f'Find {len(src_path_list)} pictures') + + prog_bar = ProgressBar(len(src_path_list)) + + dst_img_dir = osp.join(out_dir, 'img_dir', 'train') + dst_label_dir = osp.join(out_dir, 'ann_dir', 'train') + + for i, img_path in enumerate(src_path_list): + label_path = osp.join(args.dataset_label_path, osp.basename(img_path.replace('.tif', '_label.tif'))) + + clip_big_image(img_path, dst_img_dir, args, to_label=False) + clip_big_image(label_path, dst_label_dir, args, to_label=True) + prog_bar.update() + + print('Done!') + +if __name__ == '__main__': + main() +``` + +### 3.4 贡献`mmseg/datasets/gid.py` + +可参考[`projects/mapillary_dataset/mmseg/datasets/mapillary.py`](https://github.com/open-mmlab/mmsegmentation/blob/main/projects/mapillary_dataset/mmseg/datasets/mapillary.py)并在此基础上修改相应变量以适配您的数据集。 + +```python +# mmseg/datasets/gid.py +# Copyright (c) OpenMMLab. All rights reserved. 
+from mmseg.datasets.basesegdataset import BaseSegDataset +from mmseg.registry import DATASETS + +# 注册数据集类 +@DATASETS.register_module() +class GID_Dataset(BaseSegDataset): + """Gaofen Image Dataset (GID) + + Dataset paper link: + https://www.sciencedirect.com/science/article/pii/S0034425719303414 + https://x-ytong.github.io/project/GID.html + + GID 6 classes: background(others), built-up, farmland, forest, meadow, water + + In This example, select 10 images from GID dataset as training set, + and select 5 images as validation set. + The selected images are listed as follows: + + GF2_PMS1__L1A0000647767-MSS1 + GF2_PMS1__L1A0001064454-MSS1 + GF2_PMS1__L1A0001348919-MSS1 + GF2_PMS1__L1A0001680851-MSS1 + GF2_PMS1__L1A0001680853-MSS1 + GF2_PMS1__L1A0001680857-MSS1 + GF2_PMS1__L1A0001757429-MSS1 + GF2_PMS2__L1A0000607681-MSS2 + GF2_PMS2__L1A0000635115-MSS2 + GF2_PMS2__L1A0000658637-MSS2 + GF2_PMS2__L1A0001206072-MSS2 + GF2_PMS2__L1A0001471436-MSS2 + GF2_PMS2__L1A0001642620-MSS2 + GF2_PMS2__L1A0001787089-MSS2 + GF2_PMS2__L1A0001838560-MSS2 + + The ``img_suffix`` is fixed to '.tif' and ``seg_map_suffix`` is + fixed to '.tif' for GID. + """ + METAINFO = dict( + classes=('Others', 'Built-up', 'Farmland', 'Forest', + 'Meadow', 'Water'), + + palette=[[0, 0, 0], [255, 0, 0], [0, 255, 0], [0, 255, 255], + [255, 255, 0], [0, 0, 255]]) + + def __init__(self, + img_suffix='.png', + seg_map_suffix='.png', + reduce_zero_label=None, + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, + **kwargs) +``` + +### 3.5 贡献使用 GID 的训练 config file + +```python +_base_ = [ + '../../../configs/_base_/models/deeplabv3plus_r50-d8.py', + './_base_/datasets/gid.py', + '../../../configs/_base_/default_runtime.py', + '../../../configs/_base_/schedules/schedule_240k.py' +] +custom_imports = dict( + imports=['projects.gid_dataset.mmseg.datasets.gid']) + +crop_size = (256, 256) +data_preprocessor = dict(size=crop_size) +model = dict( + data_preprocessor=data_preprocessor, + pretrained='open-mmlab://resnet101_v1c', + backbone=dict(depth=101), + decode_head=dict(num_classes=6), + auxiliary_head=dict(num_classes=6)) + +``` + +### 3.6 撰写`docs/zh_cn/user_guides/2_dataset_prepare.md` + +**Gaofen Image Dataset (GID)** + +- GID 数据集可在[此处](https://x-ytong.github.io/project/Five-Billion-Pixels.html)进行下载。 +- GID 数据集包含 150 张 6800x7200 的大尺寸图像,标签为 RGB 标签。 +- 此处选择 15 张图像生成训练集和验证集,该 15 张图像包含了所有六类信息。所选的图像名称如下: + +```None + GF2_PMS1__L1A0000647767-MSS1 + GF2_PMS1__L1A0001064454-MSS1 + GF2_PMS1__L1A0001348919-MSS1 + GF2_PMS1__L1A0001680851-MSS1 + GF2_PMS1__L1A0001680853-MSS1 + GF2_PMS1__L1A0001680857-MSS1 + GF2_PMS1__L1A0001757429-MSS1 + GF2_PMS2__L1A0000607681-MSS2 + GF2_PMS2__L1A0000635115-MSS2 + GF2_PMS2__L1A0000658637-MSS2 + GF2_PMS2__L1A0001206072-MSS2 + GF2_PMS2__L1A0001471436-MSS2 + GF2_PMS2__L1A0001642620-MSS2 + GF2_PMS2__L1A0001787089-MSS2 + GF2_PMS2__L1A0001838560-MSS2 +``` + +执行以下命令进行裁切及标签的转换,需要修改为您所存储 15 张图像及标签的路径。 + +``` +python projects/gid_dataset/tools/dataset_converters/gid.py [15 张图像的路径] [15 张标签的路径] +``` + +完成裁切后的 GID 数据结构如下: + +```none +mmsegmentation +├── mmseg +├── tools +├── configs +├── data +│ ├── gid +│ │ ├── ann_dir +| │ │ │ ├── train +| │ │ │ ├── val +│ │ ├── img_dir +| │ │ │ ├── train +| │ │ │ ├── val + +``` + +### 3.7 贡献的代码及文档通过`pre-commit`检查 + +使用命令 + +```bash +git add . 
+git commit -m "添加描述" +git push +``` + +### 3.8 在 GitHub 中向 mmsegmentation 提交 PR + +具体步骤可见[《OpenMMLab 贡献代码指南》](https://mmcv.readthedocs.io/zh_CN/latest/community/contributing.html) diff --git a/docs/zh_cn/advanced_guides/customize_runtime.md b/docs/zh_cn/advanced_guides/customize_runtime.md new file mode 100644 index 0000000000..a80aca6345 --- /dev/null +++ b/docs/zh_cn/advanced_guides/customize_runtime.md @@ -0,0 +1,162 @@ +# 自定义运行设定 + +## 实现自定义钩子 + +### Step 1: 创建一个新的钩子 + +MMEngine 已实现了训练和测试常用的[钩子](https://github.com/open-mmlab/mmengine/blob/main/docs/zh_cn/tutorials/hook.md), +当有定制化需求时, 可以按照如下示例实现适用于自身训练需求的钩子, 例如想修改一个超参数 `model.hyper_paramete` 的值, 让它随着训练迭代次数而变化: + +```python +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Sequence + +from mmengine.hooks import Hook +from mmengine.model import is_model_wrapper + +from mmseg.registry import HOOKS + + +@HOOKS.register_module() +class NewHook(Hook): + """Docstring for NewHook. + """ + + def __init__(self, a: int, b: int) -> None: + self.a = a + self.b = b + + def before_train_iter(self, + runner, + batch_idx: int, + data_batch: Optional[Sequence[dict]] = None) -> None: + cur_iter = runner.iter + # 当模型被包在 wrapper 里时获取这个模型 + if is_model_wrapper(runner.model): + model = runner.model.module + model.hyper_parameter = self.a * cur_iter + self.b +``` + +### Step 2: 导入一个新的钩子 + +为了让上面定义的模块可以被执行的程序发现, 这个模块需要先被导入主命名空间 (main namespace) 里面, +假设 NewHook 在 `mmseg/engine/hooks/new_hook.py` 里面, 有两种方式去实现它: + +- 修改 `mmseg/engine/hooks/__init__.py` 来导入它. + 新定义的模块应该在 `mmseg/engine/hooks/__init__.py` 里面导入, 这样注册器可以发现并添加这个新的模块: + +```python +from .new_hook import NewHook + +__all__ = [..., NewHook] +``` + +- 在配置文件里使用 custom_imports 来手动导入它. + +```python +custom_imports = dict(imports=['mmseg.engine.hooks.new_hook'], allow_failed_imports=False) +``` + +### Step 3: 修改配置文件 + +可以按照如下方式, 在训练或测试中配置并使用自定义的钩子. 不同钩子在同一位点的优先级可以参考[这里](https://github.com/open-mmlab/mmengine/blob/main/docs/zh_cn/tutorials/hook.md#%E5%86%85%E7%BD%AE%E9%92%A9%E5%AD%90), 自定义钩子如果没有指定优先, 默认是 `NORMAL`. + +```python +custom_hooks = [ + dict(type='NewHook', a=a_value, b=b_value, priority='ABOVE_NORMAL') +] +``` + +## 实现自定义优化器 + +### Step 1: 创建一个新的优化器 + +如果增加一个叫作 `MyOptimizer` 的优化器, 它有参数 `a`, `b` 和 `c`. 推荐在 `mmseg/engine/optimizers/my_optimizer.py` 文件中实现 + +```python +from mmseg.registry import OPTIMIZERS +from torch.optim import Optimizer + + +@OPTIMIZERS.register_module() +class MyOptimizer(Optimizer): + + def __init__(self, a, b, c) +``` + +### Step 2: 导入一个新的优化器 + +为了让上面定义的模块可以被执行的程序发现, 这个模块需要先被导入主命名空间 (main namespace) 里面, +假设 `MyOptimizer` 在 `mmseg/engine/optimizers/my_optimizer.py` 里面, 有两种方式去实现它: + +- 修改 `mmseg/engine/optimizers/__init__.py` 来导入它. + 新定义的模块应该在 `mmseg/engine/optimizers/__init__.py` 里面导入, 这样注册器可以发现并添加这个新的模块: + +```python +from .my_optimizer import MyOptimizer +``` + +- 在配置文件里使用 `custom_imports` 来手动导入它. + +```python +custom_imports = dict(imports=['mmseg.engine.optimizers.my_optimizer'], allow_failed_imports=False) +``` + +### Step 3: 修改配置文件 + +随后需要修改配置文件 `optim_wrapper` 里的 `optimizer` 参数, 如果要使用你自己的优化器 `MyOptimizer`, 字段可以被修改成: + +```python +optim_wrapper = dict(type='OptimWrapper', + optimizer=dict(type='MyOptimizer', + a=a_value, b=b_value, c=c_value), + clip_grad=None) +``` + +## 实现自定义优化器封装构造器 + +### Step 1: 创建一个新的优化器封装构造器 + +构造器可以用来创建优化器, 优化器包, 以及自定义模型网络不同层的超参数. 一些模型的优化器可能会根据特定的参数而调整, 例如 BatchNorm 层的 weight decay. 使用者可以通过自定义优化器构造器来精细化设定不同参数的优化策略. 
+ +```python +from mmengine.optim import DefaultOptimWrapperConstructor +from mmseg.registry import OPTIM_WRAPPER_CONSTRUCTORS + +@OPTIM_WRAPPER_CONSTRUCTORS.register_module() +class LearningRateDecayOptimizerConstructor(DefaultOptimWrapperConstructor): + def __init__(self, optim_wrapper_cfg, paramwise_cfg=None): + + def __call__(self, model): + + return my_optimizer +``` + +默认的优化器构造器在[这里](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/default_constructor.py#L19) 被实现, 它也可以用来作为新的优化器构造器的模板. + +### Step 2: 导入一个新的优化器封装构造器 + +为了让上面定义的模块可以被执行的程序发现, 这个模块需要先被导入主命名空间 (main namespace) 里面, 假设 `MyOptimizerConstructor` 在 `mmseg/engine/optimizers/my_optimizer_constructor.py` 里面, 有两种方式去实现它: + +- 修改 `mmseg/engine/optimizers/__init__.py` 来导入它. + 新定义的模块应该在 `mmseg/engine/optimizers/__init__.py` 里面导入, 这样注册器可以发现并添加这个新的模块: + +```python +from .my_optimizer_constructor import MyOptimizerConstructor +``` + +- 在配置文件里使用 `custom_imports` 来手动导入它. + +```python +custom_imports = dict(imports=['mmseg.engine.optimizers.my_optimizer_constructor'], allow_failed_imports=False) +``` + +### Step 3: 修改配置文件 + +随后需要修改配置文件 `optim_wrapper` 里的 `constructor` 参数, 如果要使用你自己的优化器封装构造器 `MyOptimizerConstructor`, 字段可以被修改成: + +```python +optim_wrapper = dict(type='OptimWrapper', + constructor='MyOptimizerConstructor', + clip_grad=None) +``` diff --git a/docs/zh_cn/advanced_guides/data_flow.md b/docs/zh_cn/advanced_guides/data_flow.md new file mode 100644 index 0000000000..20dbe07e75 --- /dev/null +++ b/docs/zh_cn/advanced_guides/data_flow.md @@ -0,0 +1,90 @@ +# 数据流 + +在本章节中,我们将介绍 [Runner](https://mmengine.readthedocs.io/zh_CN/latest/tutorials/runner.html) 管理的内部模块之间的数据流和数据格式约定。 + +## 数据流概述 + +[Runner](https://github.com/open-mmlab/mmengine/blob/main/docs/zh_cn/design/runner.md) 相当于 MMEngine 中的“集成器”。它覆盖了框架的所有方面,并肩负着组织和调度几乎所有模块的责任,这意味着各模块之间的数据流也由 `Runner` 控制。 如 [MMEngine 中的 Runner 文档](https://mmengine.readthedocs.io/zh_CN/latest/tutorials/runner.html)所示,下图展示了基本的数据流。 + +![Basic dataflow](https://user-images.githubusercontent.com/112053249/199228350-5f80699e-7fd2-4b4c-ac32-0b16b1922c2e.png) + +虚线边框、灰色填充形状代表不同的数据格式,而实心框表示模块/方法。由于 MMEngine 极大的灵活性和可扩展性,一些重要的基类可以被继承,并且它们的方法可以被覆写。 上图所示数据流仅适用于当用户没有自定义 `Runner` 中的 `TrainLoop`、`ValLoop` 和 `TestLoop`,并且没有在其自定义模型中覆写 `train_step`、`val_step` 和 `test_step` 方法时。MMSegmentation 中 loop 的默认设置如下:使用`IterBasedTrainLoop` 训练模型,共计 20000 次迭代,并且在每 2000 次迭代后进行一次验证。 + +```python +train_cfg = dict(type='IterBasedTrainLoop', max_iters=20000, val_interval=2000) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +``` + +在上图中,红色线表示 [train_step](./models.md#train_step),在每次训练迭代中,数据加载器(dataloader)从存储中加载图像并传输到数据预处理器(data preprocessor),数据预处理器会将图像放到特定的设备上,并将数据堆叠到批处理中,之后模型接受批处理数据作为输入,最后将模型的输出发送给优化器(optimizer)。蓝色线表示 [val_step](./models.md#val_step) 和 [test_step](./models.md#test_step)。这两个过程的数据流除了模型输出与 `train_step` 不同外,其余均和 `train_step` 类似。由于在评估时模型参数会被冻结,因此模型的输出将被传递给 [Evaluator](./evaluation.md#ioumetric)。 +来计算指标。 + +## MMSegmentation 中的数据流约定 + +在上面的图中,我们可以看到基本的数据流。在本节中,我们将分别介绍数据流中涉及的数据的格式约定。 + +### 数据加载器到数据预处理器 + +数据加载器(DataLoader)是 MMEngine 的训练和测试流程中的一个重要组件。 +从概念上讲,它源于 [PyTorch](https://pytorch.org/) 并保持一致。DataLoader 从文件系统加载数据,原始数据通过数据准备流程后被发送给数据预处理器。 + +MMSegmentation 在 [PackSegInputs](https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/datasets/transforms/formatting.py#L12) 中定义了默认数据格式, 它是 `train_pipeline` 和 `test_pipeline` 的最后一个组件。有关数据转换 `pipeline` 的更多信息,请参阅[数据转换文档](./transforms.md)。 + +在没有任何修改的情况下,PackSegInputs 的返回值通常是一个包含 `inputs` 和 `data_samples` 的 `dict`。以下伪代码展示了 mmseg 
中数据加载器输出的数据类型,它是从数据集中获取的一批数据样本,数据加载器将它们打包成一个字典列表。`inputs` 是输入进模型的张量列表,`data_samples` 包含了输入图像的 meta information 和相应的 ground truth。 + +```python +dict( + inputs=List[torch.Tensor], + data_samples=List[SegDataSample] +) +``` + +**注意:** [SegDataSample](https://github.com/open-mmlab/mmsegmentation/blob/1.x/mmseg/structures/seg_data_sample.py) 是 MMSegmentation 的数据结构接口,用于连接不同组件。`SegDataSample` 实现了抽象数据元素 `mmengine.structures.BaseDataElement`,更多信息请在 [MMEngine](https://github.com/open-mmlab/mmengine) 中参阅 [SegDataSample 文档](./structures.md)和[数据元素文档](https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/data_element.html)。 + +### 数据预处理器到模型 + +虽然在[上面的图](##数据流概述)中分开绘制了数据预处理器和模型,但数据预处理器是模型的一部分,因此可以在[模型教程](./models.md)中找到数据预处理器章节。 + +数据预处理器的返回值是一个包含 `inputs` 和 `data_samples` 的字典,其中 `inputs` 是批处理图像的 4D 张量,`data_samples` 中添加了一些用于数据预处理的额外元信息。当传递给网络时,字典将被解包为两个值。 以下伪代码展示了数据预处理器的返回值和模型的输入值。 + +```python +dict( + inputs=torch.Tensor, + data_samples=List[SegDataSample] +) +``` + +```python +class Network(BaseSegmentor): + + def forward(self, inputs: torch.Tensor, data_samples: List[SegDataSample], mode: str): + pass +``` + +**注意:** 模型的前向传播有 3 种模式,由输入参数 mode 控制,更多信息请参阅[模型教程](./models.md)。 + +### 模型输出 + +如[模型教程](./models.md#forward) ***([中文链接待更新](./models.md#forward))*** 所提到的 3 种前向传播具有 3 种输出。 +`train_step` 和 `test_step`(或 `val_step`)分别对应于 `'loss'` 和 `'predict'`。 + +在 `test_step` 或 `val_step` 中,推理结果会被传递给 `Evaluator` 。您可以参阅[评估文档](./evaluation.md)来获取更多关于 `Evaluator` 的信息。 + +在推理后,MMSegmentation 中的 [BaseSegmentor](https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/models/segmentors/base.py#L15) 会对推理结果进行简单的后处理以打包推理结果。神经网络生成的分割 logits,经过 `argmax` 操作后的分割 mask 和 ground truth(如果存在)将被打包到类似 `SegDataSample` 的实例。 [postprocess_result](https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/models/segmentors/base.py#L132) 的返回值是一个 **`SegDataSample`的`List`**。下图显示了这些 `SegDataSample` 实例的关键属性。 + +![SegDataSample](https://user-images.githubusercontent.com/15952744/209912225-ab46a8d9-904a-43cb-8bf1-8bec4938ed29.png) + +与数据预处理器一致,损失函数也是模型的一部分,它是[解码头](https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/models/decode_heads/decode_head.py#L142)的属性之一。 + +在 MMSegmentation 中,`decode_head` 的 [loss_by_feat](https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/models/decode_heads/decode_head.py#L291) 方法是用于计算损失的统一接口。 + +参数: + +- seg_logits (Tensor):解码头前向函数的输出 +- batch_data_samples (List\[SegDataSample\]):分割数据样本,通常包括如 `metainfo` 和 `gt_sem_seg` 等信息 + +返回值: + +- dict\[str, Tensor\]:一个损失组件的字典 + +**注意:** `train_step` 将损失传递进 OptimWrapper 以更新模型中的权重,更多信息请参阅 [train_step](./models.md#train_step)。 diff --git a/docs/zh_cn/advanced_guides/datasets.md b/docs/zh_cn/advanced_guides/datasets.md new file mode 100644 index 0000000000..b45f2d22bb --- /dev/null +++ b/docs/zh_cn/advanced_guides/datasets.md @@ -0,0 +1,363 @@ +# 数据集 + +在 MMSegmentation 算法库中, 所有 Dataset 类的功能有两个: 加载[预处理](../user_guides/2_dataset_prepare.md) 之后的数据集的信息, 和将数据送入[数据集变换流水线](https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/datasets/basesegdataset.py#L141) 中, 进行[数据变换操作](./transforms.md). 加载的数据集信息包括两类: 元信息 (meta information), 数据集本身的信息, 例如数据集总共的类别, 和它们对应调色盘信息: 数据信息 (data information) 是指每组数据中图片和对应标签的路径. 下文中介绍了 MMSegmentation 1.x 中数据集的常用接口, 和 mmseg 数据集基类中数据信息加载与修改数据集类别的逻辑, 以及数据集与数据变换流水线 (pipeline) 的关系. + +## 常用接口 + +以 Cityscapes 为例, 介绍数据集常用接口. 如需运行以下示例, 请在当前工作目录下的 `data` 目录下载并[预处理](../user_guides/2_dataset_prepare.md#cityscapes) Cityscapes 数据集. 
+ +实例化 Cityscapes 训练数据集: + +```python +from mmengine.registry import init_default_scope +from mmseg.datasets import CityscapesDataset + +init_default_scope('mmseg') + +data_root = 'data/cityscapes/' +data_prefix=dict(img_path='leftImg8bit/train', seg_map_path='gtFine/train') +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='RandomCrop', crop_size=(512, 1024), cat_max_ratio=0.75), + dict(type='RandomFlip', prob=0.5), + dict(type='PackSegInputs') +] + +dataset = CityscapesDataset(data_root=data_root, data_prefix=data_prefix, test_mode=False, pipeline=train_pipeline) +``` + +查看训练数据集长度: + +```python +print(len(dataset)) + +2975 +``` + +获取数据信息, 数据信息的类型是一个字典, 包括 `'img_path'` 字段的存放图片的路径和 `'seg_map_path'` 字段存放分割标注的路径, 以及标签重映射的字段 `'label_map'` 和 `'reduce_zero_label'`(主要功能在下文中介绍), 还有存放已加载标签字段 `'seg_fields'`, 和当前样本的索引字段 `'sample_idx'`. + +```python +# 获取数据集中第一组样本的数据信息 +print(dataset.get_data_info(0)) + +{'img_path': 'data/cityscapes/leftImg8bit/train/aachen/aachen_000000_000019_leftImg8bit.png', + 'seg_map_path': 'data/cityscapes/gtFine/train/aachen/aachen_000000_000019_gtFine_labelTrainIds.png', + 'label_map': None, + 'reduce_zero_label': False, + 'seg_fields': [], + 'sample_idx': 0} +``` + +获取数据集元信息, MMSegmentation 的数据集元信息的类型同样是一个字典, 包括 `'classes'` 字段存放数据集类别, `'palette'` 存放数据集类别对应的可视化时调色盘的颜色, 以及标签重映射的字段 `'label_map'` 和 `'reduce_zero_label'`. + +```python +print(dataset.metainfo) + +{'classes': ('road', + 'sidewalk', + 'building', + 'wall', + 'fence', + 'pole', + 'traffic light', + 'traffic sign', + 'vegetation', + 'terrain', + 'sky', + 'person', + 'rider', + 'car', + 'truck', + 'bus', + 'train', + 'motorcycle', + 'bicycle'), + 'palette': [[128, 64, 128], + [244, 35, 232], + [70, 70, 70], + [102, 102, 156], + [190, 153, 153], + [153, 153, 153], + [250, 170, 30], + [220, 220, 0], + [107, 142, 35], + [152, 251, 152], + [70, 130, 180], + [220, 20, 60], + [255, 0, 0], + [0, 0, 142], + [0, 0, 70], + [0, 60, 100], + [0, 80, 100], + [0, 0, 230], + [119, 11, 32]], + 'label_map': None, + 'reduce_zero_label': False} +``` + +数据集 `__getitem__` 方法的返回值, 是经过数据增强的样本数据的输出, 同样也是一个字典, 包括两个字段, `'inputs'` 字段是当前样本经过数据增强操作的图像, 类型为 torch.Tensor, `'data_samples'` 字段存放的数据类型是 MMSegmentation 1.x 新添加的数据结构 [`Segdatasample`](./structures.md), 其中`gt_sem_seg` 字段是经过数据增强的标签数据. 
+
+```python
+print(dataset[0])
+
+{'inputs': tensor([[[131, 130, 130, ..., 23, 23, 23],
+          [132, 132, 132, ..., 23, 22, 23],
+          [134, 133, 133, ..., 23, 23, 23],
+          ...,
+          [ 66, 67, 67, ..., 71, 71, 71],
+          [ 66, 67, 66, ..., 68, 68, 68],
+          [ 67, 67, 66, ..., 70, 70, 70]],
+
+         [[143, 143, 142, ..., 28, 28, 29],
+          [145, 145, 145, ..., 28, 28, 29],
+          [145, 145, 145, ..., 27, 28, 29],
+          ...,
+          [ 75, 75, 76, ..., 80, 81, 81],
+          [ 75, 76, 75, ..., 80, 80, 80],
+          [ 77, 76, 76, ..., 82, 82, 82]],
+
+         [[126, 125, 126, ..., 21, 21, 22],
+          [127, 127, 128, ..., 21, 21, 22],
+          [127, 127, 126, ..., 21, 21, 22],
+          ...,
+          [ 63, 63, 64, ..., 69, 69, 70],
+          [ 64, 65, 64, ..., 69, 69, 69],
+          [ 65, 66, 66, ..., 72, 71, 71]]], dtype=torch.uint8),
+ 'data_samples': <SegDataSample(
+
+    META INFORMATION
+
+    DATA FIELDS
+    _gt_sem_seg: <PixelData(
+
+            META INFORMATION
+
+            DATA FIELDS
+            data: tensor([[...]])
+        ) at 0x...>
+ ) at 0x...>}
+```
+
+## BaseSegDataset
+
+由于 MMSegmentation 中的所有数据集的基本功能均包括 (1) 加载[数据集预处理](../user_guides/2_dataset_prepare.md) 之后的数据信息和 (2) 将数据送入数据变换流水线中进行数据变换, 因此在 MMSegmentation 中将其中的共同接口抽象成 [`BaseSegDataset`](https://mmsegmentation.readthedocs.io/zh_CN/latest/api.html?highlight=BaseSegDataset#mmseg.datasets.BaseSegDataset), 它继承自 [MMEngine 的 `BaseDataset`](https://github.com/open-mmlab/mmengine/blob/main/docs/en/advanced_tutorials/basedataset.md), 遵循 OpenMMLab 数据集初始化统一流程, 支持高效的内部数据存储格式, 支持数据集拼接、数据集重复采样等功能.
+MMSegmentation 的 BaseSegDataset 重新定义了**数据信息加载方法** (`load_data_list`), 并新增了 `get_label_map` 方法用来**修改数据集的类别信息**.
+
+### 数据信息加载
+
+数据信息加载的内容是样本数据的图片路径和标签路径, 具体实现在 MMSegmentation 的 BaseSegDataset 的 [`load_data_list`](https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/datasets/basesegdataset.py#L231) 中.
+主要有两种获取图片和标签路径的方法. 当数据集目录按以下结构组织时, [`load_data_list`](https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/datasets/basesegdataset.py#L231) 会根据数据路径和后缀来解析:
+
+```
+├── data
+│   ├── my_dataset
+│   │   ├── img_dir
+│   │   │   ├── train
+│   │   │   │   ├── xxx{img_suffix}
+│   │   │   │   ├── yyy{img_suffix}
+│   │   │   ├── val
+│   │   │   │   ├── zzz{img_suffix}
+│   │   ├── ann_dir
+│   │   │   ├── train
+│   │   │   │   ├── xxx{seg_map_suffix}
+│   │   │   │   ├── yyy{seg_map_suffix}
+│   │   │   ├── val
+│   │   │   │   ├── zzz{seg_map_suffix}
+```
+
+例如 ADE20k 数据集结构如下所示:
+
+```
+├── ade
+│   ├── ADEChallengeData2016
+│   │   ├── annotations
+│   │   │   ├── training
+│   │   │   │   ├── ADE_train_00000001.png
+│   │   │   │   ├── ...
+│   │   │   ├── validation
+│   │   │   │   ├── ADE_val_00000001.png
+│   │   │   │   ├── ...
+│   │   ├── images
+│   │   │   ├── training
+│   │   │   │   ├── ADE_train_00000001.jpg
+│   │   │   │   ├── ...
+│   │   │   ├── validation
+│   │   │   │   ├── ADE_val_00000001.jpg
+│   │   │   │   ├── ...
+```
+
+实例化 ADE20k 数据集时, 输入图片和标签的路径和后缀:
+
+```python
+from mmseg.datasets import ADE20KDataset
+
+ADE20KDataset(data_root='data/ade/ADEChallengeData2016',
+              data_prefix=dict(img_path='images/training', seg_map_path='annotations/training'),
+              img_suffix='.jpg',
+              seg_map_suffix='.png',
+              reduce_zero_label=True)
+```
+
+如果数据集有标注文件, 实例化数据集时会根据输入的标注文件加载数据信息. 例如 PascalContext 数据集实例化时, 输入标注文件的内容为:
+
+```python
+2008_000008
+...
+```
+
+实例化时需要定义 `ann_file`:
+
+```python
+PascalContextDataset(data_root='data/VOCdevkit/VOC2010/',
+                     data_prefix=dict(img_path='JPEGImages', seg_map_path='SegmentationClassContext'),
+                     ann_file='ImageSets/SegmentationContext/train.txt')
+```
+
+### 数据集类别修改
+
+- 通过输入 metainfo 修改
+  `BaseSegDataset` 的子类元信息在数据集实现时定义为类变量, 例如 Cityscapes 的 `METAINFO` 变量:
+
+```python
+class CityscapesDataset(BaseSegDataset):
+    """Cityscapes dataset.
+
+    The ``img_suffix`` is fixed to '_leftImg8bit.png' and ``seg_map_suffix`` is
+    fixed to '_gtFine_labelTrainIds.png' for Cityscapes dataset.
+ """ + METAINFO = dict( + classes=('road', 'sidewalk', 'building', 'wall', 'fence', 'pole', + 'traffic light', 'traffic sign', 'vegetation', 'terrain', + 'sky', 'person', 'rider', 'car', 'truck', 'bus', 'train', + 'motorcycle', 'bicycle'), + palette=[[128, 64, 128], [244, 35, 232], [70, 70, 70], [102, 102, 156], + [190, 153, 153], [153, 153, 153], [250, 170, + 30], [220, 220, 0], + [107, 142, 35], [152, 251, 152], [70, 130, 180], + [220, 20, 60], [255, 0, 0], [0, 0, 142], [0, 0, 70], + [0, 60, 100], [0, 80, 100], [0, 0, 230], [119, 11, 32]]) + +``` + +这里的 `'classes'` 中定义了 Cityscapes 数据集标签中的类别名, 如果训练时只关注几个交通工具类别, **忽略其他类别**, +在实例化 Cityscapes 数据集时通过定义 `metainfo` 输入参数的 classes 的字段来修改数据集的元信息: + +```python +from mmseg.datasets import CityscapesDataset + +data_root = 'data/cityscapes/' +data_prefix=dict(img_path='leftImg8bit/train', seg_map_path='gtFine/train') +# metainfo 中只保留以下 classes +metainfo=dict(classes=( 'car', 'truck', 'bus', 'train', 'motorcycle', 'bicycle')) +dataset = CityscapesDataset(data_root=data_root, data_prefix=data_prefix, metainfo=metainfo) + +print(dataset.metainfo) + +{'classes': ('car', 'truck', 'bus', 'train', 'motorcycle', 'bicycle'), + 'palette': [[0, 0, 142], + [0, 0, 70], + [0, 60, 100], + [0, 80, 100], + [0, 0, 230], + [119, 11, 32], + [128, 64, 128], + [244, 35, 232], + [70, 70, 70], + [102, 102, 156], + [190, 153, 153], + [153, 153, 153], + [250, 170, 30], + [220, 220, 0], + [107, 142, 35], + [152, 251, 152], + [70, 130, 180], + [220, 20, 60], + [255, 0, 0]], + # 类别索引为 255 的像素,在计算损失时会被忽略 + 'label_map': {0: 255, + 1: 255, + 2: 255, + 3: 255, + 4: 255, + 5: 255, + 6: 255, + 7: 255, + 8: 255, + 9: 255, + 10: 255, + 11: 255, + 12: 255, + 13: 0, + 14: 1, + 15: 2, + 16: 3, + 17: 4, + 18: 5}, + 'reduce_zero_label': False} +``` + +可以看到, 数据集元信息的类别和默认 Cityscapes 不同. 并且, 定义了标签重映射的字段 `label_map` 用来修改每个分割掩膜上的像素的类别索引, 分割标签类别会根据 `label_map`, 将类别重映射, [具体实现](https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/datasets/basesegdataset.py#L151): + +```python +gt_semantic_seg_copy = gt_semantic_seg.copy() +for old_id, new_id in results['label_map'].items(): + gt_semantic_seg[gt_semantic_seg_copy == old_id] = new_id +``` + +- 通过 `reduce_zero_label` 修改 + 对于常见的忽略 0 号标签的场景, `BaseSegDataset` 的子类中可以用 `reduce_zero_label` 输入参数来控制。`reduce_zero_label` (默认为 `False`) + 用来控制是否将标签 0 忽略, 当该参数为 `True` 时(最常见的应用是 ADE20k 数据集), 对分割标签中第 0 个类别对应的类别索引改为 255 (MMSegmentation 模型中计算损失时, 默认忽略 255), 其他类别对应的类别索引减一: + +```python +gt_semantic_seg[gt_semantic_seg == 0] = 255 +gt_semantic_seg = gt_semantic_seg - 1 +gt_semantic_seg[gt_semantic_seg == 254] = 255 +``` + +## 数据集与数据变换流水线 + +在常用接口的例子中可以看到, 输入的参数中定义了数据变换流水线参数 `pipeline`, 数据集 `__getitem__` 方法返回经过数据变换的值. 
+当数据集输入参数没有定义 pipeline, 返回值和 `get_data_info` 方法返回值相同, 例如: + +```python +from mmseg.datasets import CityscapesDataset + +data_root = 'data/cityscapes/' +data_prefix=dict(img_path='leftImg8bit/train', seg_map_path='gtFine/train') +dataset = CityscapesDataset(data_root=data_root, data_prefix=data_prefix, test_mode=False) + +print(dataset[0]) + +{'img_path': 'data/cityscapes/leftImg8bit/train/aachen/aachen_000000_000019_leftImg8bit.png', + 'seg_map_path': 'data/cityscapes/gtFine/train/aachen/aachen_000000_000019_gtFine_labelTrainIds.png', + 'label_map': None, + 'reduce_zero_label': False, + 'seg_fields': [], + 'sample_idx': 0} +``` diff --git a/docs/zh_cn/advanced_guides/engine.md b/docs/zh_cn/advanced_guides/engine.md new file mode 100644 index 0000000000..79b4c8d229 --- /dev/null +++ b/docs/zh_cn/advanced_guides/engine.md @@ -0,0 +1,281 @@ +# 训练引擎 + +MMEngine 定义了一些[基础循环控制器](https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py) 例如基于轮次的训练循环 (`EpochBasedTrainLoop`), 基于迭代次数的训练循环 (`IterBasedTrainLoop`), 标准的验证循环 (`ValLoop`) 和标准的测试循环 (`TestLoop`). +OpenMMLab 的算法库如 MMSegmentation 将模型训练, 测试和推理抽象为执行器(`Runner`) 来处理. 用户可以直接使用 MMEngine 中的默认执行器, 也可以对执行器进行修改以满足定制化需求. 这个文档主要介绍用户如何配置已有的运行设定, 钩子和优化器的基本概念与使用方法. + +## 配置运行设定 + +### 配置训练长度 + +循环控制器指的是训练, 验证和测试时的执行流程, 在配置文件里面使用 `train_cfg`, `val_cfg` 和 `test_cfg` 来构建这些流程. MMSegmentation 在 `configs/_base_/schedules` 文件夹里面的 `train_cfg` 设置常用的训练长度. +例如, 使用基于迭代次数的训练循环 (`IterBasedTrainLoop`) 去训练 80,000 个迭代次数, 并且每 8,000 iteration 做一次验证, 可以如下设置: + +```python +train_cfg = dict(type='IterBasedTrainLoop', max_iters=80000, val_interval=8000) +``` + +### 配置训练优化器 + +这里是一个 SGD 优化器的例子: + +```python +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005), + clip_grad=None) +``` + +OpenMMLab 支持 PyTorch 里面所有的优化器, 更多细节可以参考 MMEngine [优化器文档](https://github.com/open-mmlab/mmengine/blob/main/docs/zh_cn/tutorials/optim_wrapper.md). + +需要强调的是, `optim_wrapper` 是 `runner` 的变量, 所以需要配置优化器时配置的字段是 `optim_wrapper` 字段. 更多关于优化器的使用方法, 可以看下面优化器的章节. + +### 配置训练参数调度器 + +在配置训练参数调度器前, 推荐先了解 [MMEngine 文档](https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md) 里面关于参数调度器的基本概念. + +以下是一个参数调度器的例子, 训练时前 1,000 个 iteration 时采用线性变化的学习率策略作为训练预热, 从 1,000 iteration 之后直到最后 16,000 个 iteration 时则采用默认的多项式学习率衰减: + +```python +param_scheduler = [ + dict(type='LinearLR', by_epoch=False, start_factor=0.1, begin=0, end=1000), + dict( + type='PolyLR', + eta_min=1e-4, + power=0.9, + begin=1000, + end=160000, + by_epoch=False, + ) +] +``` + +注意: 当修改 `train_cfg` 里面 `max_iters` 的时候, 请确保参数调度器 `param_scheduler` 里面的参数也被同时修改. + +## 钩子 (Hook) + +### 介绍 + +OpenMMLab 将模型训练和测试过程抽象为 `Runner`, 插入钩子可以实现在 `Runner` 中不同的训练和测试节点 (例如 "每个训练 iter 前后", "每个验证 iter 前后" 等不同阶段) 所需要的相应功能. 更多钩子机制的介绍可以参考[这里](https://www.calltutors.com/blog/what-is-hook). + +`Runner` 中所使用的钩子分为两类: + +- 默认钩子 (default hooks) + +它们实现了训练时所必需的功能, 在配置文件中用 `default_hooks` 定义传给 `Runner`, `Runner` 通过 [`register_default_hooks`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/runner.py#L1780) 方法注册. +钩子有对应的优先级, 优先级越高, 越早被执行器调用. 如果优先级一样, 被调用的顺序和钩子注册的顺序一致. +不建议用户修改默认钩子的优先级, 可以参考 [mmengine hooks 文档](https://github.com/open-mmlab/mmengine/blob/main/docs/zh_cn/tutorials/hook.md) 了解钩子优先级的定义. 
+下面是 MMSegmentation 中所用到的默认钩子: + +| 钩子 | 功能 | 优先级 | +| :--------------------------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------: | :---------------: | +| [IterTimerHook](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/iter_timer_hook.py) | 记录 iteration 花费的时间. | NORMAL (50) | +| [LoggerHook](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/logger_hook.py) | 从 `Runner` 里不同的组件中收集日志记录, 并将其输出到终端, JSON 文件, tensorboard, wandb 等下游. | BELOW_NORMAL (60) | +| [ParamSchedulerHook](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/param_scheduler_hook.py) | 更新优化器里面的一些超参数, 例如学习率的动量. | LOW (70) | +| [CheckpointHook](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py) | 规律性地保存 checkpoint 文件. | VERY_LOW (90) | +| [DistSamplerSeedHook](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/sampler_seed_hook.py) | 确保分布式采样器 shuffle 是打开的. | NORMAL (50) | +| [SegVisualizationHook](https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/visualization/local_visualizer.py) | 可视化验证和测试过程里的预测结果. | NORMAL (50) | + +MMSegmentation 会在 [`defualt_hooks`](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/_base_/schedules/schedule_160k.py#L19-L25) 里面注册一些训练所必需功能的钩子:: + +```python +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=32000), + sampler_seed=dict(type='DistSamplerSeedHook'), + visualization=dict(type='SegVisualizationHook')) +``` + +以上默认钩子除 `SegVisualizationHook` 外都是在 MMEngine 中所实现, `SegVisualizationHook` 是在 MMSegmentation 里被实现的钩子, 之后会专门介绍. + +- 修改默认的钩子 + +以 `default_hooks` 里面的 `logger` 和 `checkpoint` 为例, 我们来介绍如何修改 `default_hooks` 中默认的钩子. + +(1) 模型保存配置 + +`default_hooks` 使用 `checkpoint` 字段来初始化[模型保存钩子 (CheckpointHook)](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/checkpoint_hook.py#L19). + +```python +checkpoint = dict(type='CheckpointHook', interval=1) +``` + +用户可以设置 `max_keep_ckpts` 来只保存少量的检查点或者用 `save_optimizer` 来决定是否保存 optimizer 的信息. +更多相关参数的细节可以参考[这里](https://mmengine.readthedocs.io/zh_CN/latest/api/generated/mmengine.hooks.CheckpointHook.html#checkpointhook). + +(2) 日志配置 + +`日志钩子 (LoggerHook)` 被用来收集 `执行器 (Runner)` 里面不同组件的日志信息然后写入终端, JSON 文件, tensorboard 和 wandb 等地方. + +```python +logger=dict(type='LoggerHook', interval=10) +``` + +在最新的 1.x 版本的 MMSegmentation 里面, 一些日志钩子 (LoggerHook) 例如 `TextLoggerHook`, `WandbLoggerHook` 和 `TensorboardLoggerHook` 将不再被使用. +作为替代, MMEngine 使用 `LogProcessor` 来处理上述钩子处理的信息, 它们现在在 [`MessageHub`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/logging/message_hub.py#L17), +[`WandbVisBackend`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py#L324) 和 [`TensorboardVisBackend`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/visualization/vis_backend.py#L472) 里面. + +具体使用方法如下, 配置可视化器和同时指定可视化后端, 这里使用 Tensorboard 作为可视化器的后端: + +```python +# TensorboardVisBackend +visualizer = dict( + type='SegLocalVisualizer', vis_backends=[dict(type='TensorboardVisBackend')], name='visualizer') +``` + +关于更多相关用法, 可以参考 [MMEngine 可视化后端用户教程](https://github.com/open-mmlab/mmengine/blob/main/docs/zh_cn/advanced_tutorials/visualization.md). 
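+
+如果需要同时保留多种日志输出, 也可以给可视化器注册多个后端。下面是一个把本地存储、Tensorboard 和 WandB 组合在一起的配置示意(`LocalVisBackend`、`TensorboardVisBackend` 和 `WandbVisBackend` 均为 MMEngine 中已有的可视化后端, 具体组合请按自己的需求取舍):
+
+```python
+# 同时使用本地、Tensorboard 与 WandB 三个可视化后端(组合仅作示意)
+visualizer = dict(
+    type='SegLocalVisualizer',
+    vis_backends=[
+        dict(type='LocalVisBackend'),        # 日志写入本地 work_dir
+        dict(type='TensorboardVisBackend'),  # 写入 Tensorboard
+        dict(type='WandbVisBackend'),        # 同步到 WandB
+    ],
+    name='visualizer')
+```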
+ +- 自定义钩子 (custom hooks) + +自定义钩子在配置通过 `custom_hooks` 定义, `Runner` 通过 [`register_custom_hooks`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/runner.py#L1820) 方法注册. +自定义钩子优先级需要在配置文件里设置, 如果没有设置, 则会被默认设置为 `NORMAL`. 下面是部分 MMEngine 中实现的自定义钩子: + +| 钩子 | 用法 | +| :----------------------------------------------------------------------------------------------------: | :------------------------------------------------------------------------------------------: | +| [EMAHook](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/ema_hook.py) | 在模型训练时使用指数滑动平均 (Exponential Moving Average, EMA). | +| [EmptyCacheHook](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/empty_cache_hook.py) | 在训练时释放所有没有被缓存占用的 GPU 显存. | +| [SyncBuffersHook](https://github.com/open-mmlab/mmengine/blob/main/mmengine/hooks/sync_buffer_hook.py) | 在每个训练 Epoch 结束时同步模型 buffer 里的参数例如 BN 里的 `running_mean` 和 `running_var`. | + +以下是 `EMAHook` 的用例, 配置文件中, 将已经实现的自定义钩子的配置作为 `custom_hooks` 列表中的成员. + +```python +custom_hooks = [ + dict(type='EMAHook', start_iters=500, priority='NORMAL') +] +``` + +### SegVisualizationHook + +MMSegmentation 实现了 [`SegVisualizationHook`](https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/engine/hooks/visualization_hook.py#L17), 用来在验证和测试时可视化预测结果. +`SegVisualizationHook` 重写了基类 `Hook` 中的 `_after_iter` 方法, 在验证或测试时, 根据指定的迭代次数间隔调用 `visualizer` 的 `add_datasample` 方法绘制语义分割结果, 具体实现如下: + +```python +... +@HOOKS.register_module() +class SegVisualizationHook(Hook): +... + def _after_iter(self, + runner: Runner, + batch_idx: int, + data_batch: dict, + outputs: Sequence[SegDataSample], + mode: str = 'val') -> None: +... + # 如果是训练阶段或者 self.draw 为 False 则直接跳出 + if self.draw is False or mode == 'train': + return +... + if self.every_n_inner_iters(batch_idx, self.interval): + for output in outputs: + img_path = output.img_path + img_bytes = self.file_client.get(img_path) + img = mmcv.imfrombytes(img_bytes, channel_order='rgb') + window_name = f'{mode}_{osp.basename(img_path)}' + + self._visualizer.add_datasample( + window_name, + img, + data_sample=output, + show=self.show, + wait_time=self.wait_time, + step=runner.iter) + +``` + +关于可视化更多的细节可以查看[这里](../user_guides/visualization.md). + +## 优化器 + +在上面配置运行设定里, 我们给出了配置训练优化器的简单示例. 本章节将进一步详细介绍在 MMSegmentation 里如何配置优化器. + +### 优化器封装 + +OpenMMLab 2.0 设计了优化器封装, 它支持不同的训练策略, 包括混合精度训练、梯度累加和梯度截断等, 用户可以根据需求选择合适的训练策略. +优化器封装还定义了一套标准的参数更新流程, 用户可以基于这一套流程, 在同一套代码里, 实现不同训练策略的切换. 如果想了解更多, 可以参考 [MMEngine 优化器封装文档](https://github.com/open-mmlab/mmengine/blob/main/docs/zh_cn/tutorials/optim_wrapper.md). + +以下是 MMSegmentation 中常用的使用方法: + +#### 配置 PyTorch 支持的优化器 + +OpenMMLab 2.0 支持 PyTorch 原生所有优化器, 参考[这里](https://github.com/open-mmlab/mmengine/blob/main/docs/zh_cn/tutorials/optim_wrapper.md#%E7%AE%80%E5%8D%95%E9%85%8D%E7%BD%AE). + +在配置文件中设置训练时 `Runner` 所使用的优化器, 需要定义 `optim_wrapper`, 而不是 `optimizer`, 下面是一个配置训练中优化器的例子: + +```python +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005), + clip_grad=None) +``` + +#### 配置梯度裁剪 + +当模型训练需要使用梯度裁剪的训练技巧式, 可以按照如下示例进行配置: + +```python +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) +optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer, + clip_grad=dict(max_norm=0.01, norm_type=2)) +``` + +这里 `max_norm` 指的是裁剪后梯度的最大值, `norm_type` 指的是裁剪梯度时使用的范数. 相关方法可参考 [torch.nn.utils.clip_grad_norm\_](https://pytorch.org/docs/stable/generated/torch.nn.utils.clip_grad_norm_.html). 
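+
+为了更直观地理解 `max_norm` 的含义, 下面用原生 PyTorch 做一个最小演示(仅为示意, 并不对应 `OptimWrapper` 的内部实现):
+
+```python
+import torch
+
+# 构造一个带梯度的参数并做一次反向传播
+param = torch.randn(10, requires_grad=True)
+loss = (param ** 2).sum()
+loss.backward()
+
+# 将梯度的整体 L2 范数裁剪到不超过 0.01, 返回值为裁剪前的范数
+total_norm = torch.nn.utils.clip_grad_norm_([param], max_norm=0.01, norm_type=2)
+print(total_norm)         # 裁剪前的梯度范数
+print(param.grad.norm())  # 裁剪后不超过 0.01
+```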
+ +#### 配置混合精度训练 + +当需要使用混合精度训练降低内存时, 可以使用 `AmpOptimWrapper`, 具体配置如下: + +```python +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0001) +optim_wrapper = dict(type='AmpOptimWrapper', optimizer=optimizer) +``` + +[`AmpOptimWrapper`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/amp_optimizer_wrapper.py#L20) 中 `loss_scale` 的默认设置是 `dynamic`. + +#### 配置模型网络不同层的超参数 + +在模型训练中, 如果想在优化器里为不同参数分别设置优化策略, 例如设置不同的学习率、权重衰减等超参数, 可以通过设置配置文件里 `optim_wrapper` 中的 `paramwise_cfg` 来实现. + +下面的配置文件以 [ViT `optim_wrapper`](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/vit/vit_vit-b16-ln_mln_upernet_8xb2-160k_ade20k-512x512.py#L15-L27) 为例介绍 `paramwise_cfg` 参数使用. +训练时将 `pos_embed`, `mask_token`, `norm` 模块的 weight decay 参数的系数设置成 0. +即: 在训练时, 这些模块的 weight decay 将被变为 `weight_decay * decay_mult`=0. + +```python +optimizer = dict( + type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.01) +optim_wrapper = dict( + type='OptimWrapper', + optimizer=optimizer, + paramwise_cfg=dict( + custom_keys={ + 'pos_embed': dict(decay_mult=0.), + 'cls_token': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.) + })) +``` + +其中 `decay_mult` 指的是对应参数的权重衰减的系数. +关于更多 `paramwise_cfg` 的使用可以在 [MMEngine 优化器封装文档](https://github.com/open-mmlab/mmengine/blob/main/docs/zh_cn/tutorials/optim_wrapper.md) 里面查到. + +### 优化器封装构造器 + +默认的优化器封装构造器 [`DefaultOptimWrapperConstructor`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/default_constructor.py#L19) 根据输入的 `optim_wrapper` 和 `optim_wrapper` 中定义的 `paramwise_cfg` 来构建训练中使用的优化器. 当 [`DefaultOptimWrapperConstructor`](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/default_constructor.py#L19) 功能不能满足需求时, 可以自定义优化器封装构造器来实现超参数的配置. + +MMSegmentation 中的实现了 [`LearningRateDecayOptimizerConstructor`](https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/engine/optimizers/layer_decay_optimizer_constructor.py#L104), 可以对以 ConvNeXt, BEiT 和 MAE 为骨干网络的模型训练时, 骨干网络的模型参数的学习率按照定义的衰减比例(`decay_rate`)逐层递减, 在配置文件中的配置如下: + +```python +optim_wrapper = dict( + _delete_=True, + type='AmpOptimWrapper', + optimizer=dict( + type='AdamW', lr=0.0001, betas=(0.9, 0.999), weight_decay=0.05), + paramwise_cfg={ + 'decay_rate': 0.9, + 'decay_type': 'stage_wise', + 'num_layers': 12 + }, + constructor='LearningRateDecayOptimizerConstructor', + loss_scale='dynamic') +``` + +`_delete_=True` 的作用是 OpenMMLab Config 中的忽略继承的配置, 在该代码片段中忽略继承的 `optim_wrapper` 配置, 更多 `_delete_` 字段的内容可以参考 [MMEngine 文档](https://github.com/open-mmlab/mmengine/blob/main/docs/zh_cn/advanced_tutorials/config.md#%E5%88%A0%E9%99%A4%E5%AD%97%E5%85%B8%E4%B8%AD%E7%9A%84-key). diff --git a/docs/zh_cn/advanced_guides/evaluation.md b/docs/zh_cn/advanced_guides/evaluation.md new file mode 100644 index 0000000000..dc93a46e13 --- /dev/null +++ b/docs/zh_cn/advanced_guides/evaluation.md @@ -0,0 +1,158 @@ +# 模型评测 + +模型评测过程会分别在 [ValLoop](https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py#L300) 和 [TestLoop](https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/loops.py#L373) 中被执行,用户可以在训练期间或使用配置文件中简单设置的测试脚本进行模型性能评估。`ValLoop` 和 `TestLoop` 属于 [Runner](https://github.com/open-mmlab/mmengine/blob/main/mmengine/runner/runner.py#L59),它们会在第一次被调用时构建。由于 `dataloader` 与 `evaluator` 是必需的参数,所以要成功构建 `ValLoop`,在构建 `Runner` 时必须设置 `val_dataloader` 和 `val_evaluator`,`TestLoop` 亦然。有关 Runner 设计的更多信息,请参阅 [MMEngine](https://github.com/open-mmlab/mmengine) 的[文档](https://github.com/open-mmlab/mmengine/blob/main/docs/zh_cn/design/runner.md)。 + +
+
+*(图:测试/验证 数据流)*
+
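+
+结合上图, 构建验证/测试循环所需的配置字段可以概括为下面的骨架(字段名来自 MMEngine 的 `Runner`;dataloader 与 evaluator 的完整写法见下文 ADE20K 示例):
+
+```python
+# 验证循环:val_cfg、val_evaluator、val_dataloader 三者缺一不可(骨架示意)
+val_cfg = dict(type='ValLoop')
+val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU'])
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=4,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict())  # dataset 字段按实际数据集配置填写
+
+# 测试循环与之一一对应
+test_cfg = dict(type='TestLoop')
+test_evaluator = val_evaluator
+test_dataloader = val_dataloader
+```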
+ +在 MMSegmentation 中,默认情况下,我们将 dataloader 和 metrics 的设置写在数据集配置文件中,并将 evaluation loop 的配置写在 `schedule_x` 配置文件中。 + +例如,在 ADE20K 配置文件 `configs/_base_/dataset/ADE20K.py` 中,在第37到48行,我们配置了 `val_dataloader`,在第51行,我们选择 `IoUMetric` 作为 evaluator,并设置 `mIoU` 作为指标: + +```python +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='images/validation', + seg_map_path='annotations/validation'), + pipeline=test_pipeline)) + +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) +``` + +为了能够在训练期间进行评估模型,我们将评估配置添加到了 `configs/schedules/schedule_40k.py` 文件的第15至16行: + +```python +train_cfg = dict(type='IterBasedTrainLoop', max_iters=40000, val_interval=4000) +val_cfg = dict(type='ValLoop') +``` + +使用以上两种设置,MMSegmentation 在 40K 迭代训练期间,每 4000 次迭代进行一次模型 **mIoU** 指标的评估。 + +如果我们希望在训练后测试模型,则需要将 `test_dataloader`、`test_evaluator` 和 `test_cfg` 配置添加到配置文件中。 + +```python +test_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='images/validation', + seg_map_path='annotations/validation'), + pipeline=test_pipeline)) + +test_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) +test_cfg = dict(type='TestLoop') +``` + +在 MMSegmentation 中,默认情况下,`test_dataloader` 和 `test_evaluator` 的设置与 `ValLoop` 的 dataloader 和 evaluator 相同,我们可以修改这些设置以满足我们的需要。 + +## IoUMetric + +MMSegmentation 基于 [MMEngine](https://github.com/open-mmlab/mmengine) 提供的 [BaseMetric](https://github.com/open-mmlab/mmengine/blob/main/mmengine/evaluator/metric.py) 实现 [IoUMetric](https://github.com/open-mmlab/mmsegmentation/blob/1.x/mmseg/evaluation/metrics/iou_metric.py) 和 [CityscapesMetric](https://github.com/open-mmlab/mmsegmentation/blob/1.x/mmseg/evaluation/metrics/citys_metric.py),以评估模型的性能。有关统一评估接口的更多详细信息,请参阅[文档](https://mmengine.readthedocs.io/zh_CN/latest/tutorials/evaluation.html)。 + +这里我们简要介绍 `IoUMetric` 的参数和两种主要方法。 + +除了 `collect_device` 和 `prefix` 之外,`IoUMetric` 的构建还包含一些其他参数。 + +构造函数的参数: + +- ignore_index(int)- 将在评估中忽略的类别索引。默认值:255。 +- iou_metrics(list\[str\] | str)- 需要计算的指标,可选项包括 'mIoU'、'mDice' 和 'mFscore'。 +- nan_to_num(int,可选)- 如果指定,NaN 值将被用户定义的数字替换。默认值:None。 +- beta(int)- 决定综合评分中 recall 的权重。默认值:1。 +- collect_device(str)- 用于在分布式训练期间从不同进程收集结果的设备名称。必须是 'cpu' 或 'gpu'。默认为 'cpu'。 +- prefix(str,可选)- 将添加到指标名称中的前缀,以消除不同 evaluator 的同名指标的歧义。如果参数中未提供前缀,则将使用 self.default_prefix 进行替代。默认为 None。 + +`IoUMetric` 实现 IoU 指标的计算,`IoUMetric` 的两个核心方法是 `process` 和 `compute_metrics`。 + +- `process` 方法处理一批 data 和 data_samples。 +- `compute_metrics` 方法根据处理的结果计算指标。 + +### IoUMetric.process + +参数: + +- data_batch(Any)- 来自 dataloader 的一批数据。 +- data_samples(Sequence\[dict\])- 模型的一批输出。 + +返回值: + +此方法没有返回值,因为处理的结果将存储在 `self.results` 中,以在处理完所有批次后进行指标的计算。 + +### IoUMetric.compute_metrics + +参数: + +- results(list)- 每个批次的处理结果。 + +返回值: + +- Dict\[str,float\] - 计算的指标。指标的名称为 key,值是相应的结果。key 主要包括 **aAcc**、**mIoU**、**mAcc**、**mDice**、**mFscore**、**mPrecision**、**mPrecall**。 + +## CityscapesMetric + +`CityscapesMetric` 使用由 Cityscapes 官方提供的 [CityscapesScripts](https://github.com/mcordts/cityscapesScripts) 进行模型性能的评估。 + +### 使用方法 + +在使用之前,请先安装 `cityscapesscripts` 包: + +```shell +pip install cityscapesscripts +``` + +由于 `IoUMetric` 在 MMSegmentation 中作为默认的 evaluator 使用,如果您想使用 `CityscapesMetric`,则需要自定义配置文件。在自定义配置文件中,应按如下方式替换默认 evaluator。 + 
+```python +val_evaluator = dict(type='CityscapesMetric', output_dir='tmp') +test_evaluator = val_evaluator +``` + +### 接口 + +构造函数的参数: + +- output_dir (str) - 预测结果输出的路径 +- ignore_index (int) - 将在评估中忽略的类别索引。默认值:255。 +- format_only (bool) - 只为提交进行结果格式化而不进行评估。当您希望将结果格式化为特定格式并将其提交给测试服务器时有用。默认为 False。 +- keep_results (bool) - 是否保留结果。当 `format_only` 为 True 时,`keep_results` 必须为 True。默认为 False。 +- collect_device (str) - 用于在分布式训练期间从不同进程收集结果的设备名称。必须是 'cpu' 或 'gpu'。默认为 'cpu'。 +- prefix (str,可选) - 将添加到指标名称中的前缀,以消除不同 evaluator 的同名指标的歧义。如果参数中未提供前缀,则将使用 self.default_prefix 进行替代。默认为 None。 + +#### CityscapesMetric.process + +该方法将在图像上绘制 mask,并将绘制的图像保存到 `work_dir` 中。 + +参数: + +- data_batch(dict)- 来自 dataloader 的一批数据。 +- data_samples(Sequence\[dict\])- 模型的一批输出。 + +返回值: + +此方法没有返回值,因为处理的结果将存储在 `self.results` 中,以在处理完所有批次后进行指标的计算。 + +#### CityscapesMetric.compute_metrics + +此方法将调用 `cityscapessscripts.evaluation.evalPixelLevelSemanticLabeling` 工具来计算指标。 + +参数: + +- results(list)- 数据集的测试结果。 + +返回值: + +- dict\[str:float\] - Cityscapes 评测结果。 diff --git a/docs/zh_cn/advanced_guides/index.rst b/docs/zh_cn/advanced_guides/index.rst new file mode 100644 index 0000000000..2aec1ac9cf --- /dev/null +++ b/docs/zh_cn/advanced_guides/index.rst @@ -0,0 +1,26 @@ +基本概念 +*************** + +.. toctree:: + :maxdepth: 1 + + data_flow.md + structures.md + models.md + datasets.md + transforms.md + evaluation.md + engine.md + training_tricks.md + +自定义组件 +************************ + +.. toctree:: + :maxdepth: 1 + + add_models.md + add_datasets.md + add_transforms.md + add_metrics.md + customize_runtime.md diff --git a/docs/zh_cn/advanced_guides/models.md b/docs/zh_cn/advanced_guides/models.md new file mode 100644 index 0000000000..6eb22517a4 --- /dev/null +++ b/docs/zh_cn/advanced_guides/models.md @@ -0,0 +1,177 @@ +# 模型 + +我们通常将深度学习任务中的神经网络定义为模型,这个模型即是算法的核心。[MMEngine](https://github.com/open-mmlab/mmengine) 抽象出了一个统一模型 [BaseModel](https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/base_model/base_model.py#L16) 以标准化训练、测试和其他过程。MMSegmentation 实现的所有模型都继承自 `BaseModel`,并且在 MMSegmention 中,我们实现了前向传播并为语义分割算法添加了一些功能。 + +## 常用组件 + +### 分割器(Segmentor) + +在 MMSegmentation 中,我们将网络架构抽象为**分割器**,它是一个包含网络所有组件的模型。我们已经实现了**编码器解码器(EncoderDecoder)**和**级联编码器解码器(CascadeEncoderDecoder)**,它们通常由**数据预处理器**、**骨干网络**、**解码头**和**辅助头**组成。 + +### 数据预处理器(Data preprocessor) + +**数据预处理器**是将数据复制到目标设备并将数据预处理为模型输入格式的部分。 + +### 主干网络(Backbone) + +**主干网络**是将图像转换为特征图的部分,例如没有最后全连接层的 **ResNet-50**。 + +### 颈部(Neck) + +**颈部**是连接主干网络和头的部分。它对主干网络生成的原始特征图进行一些改进或重新配置。例如 **Feature Pyramid Network(FPN)**。 + +### 解码头(Decode head) + +**解码头**是将特征图转换为分割掩膜的部分,例如 **PSPNet**。 + +### 辅助头(Auxiliary head) + +**辅助头**是一个可选组件,它将特征图转换为仅用于计算辅助损失的分割掩膜。 + +## 基本接口 + +MMSegmentation 封装 `BaseModel` 并实现了 [BaseSegmentor](https://github.com/open-mmlab/mmsegmentation/blob/1.x/mmseg/models/segmentors/base.py#L15) 类,主要提供 `forward`、`train_step`、`val_step` 和 `test_step` 接口。接下来将详细介绍这些接口。 + +### forward + +
+
+*(图:编码器解码器数据流)*
+
+*(图:级联编码器解码器数据流)*
+
+
+`forward` 方法返回训练、验证、测试和简单推理过程的损失或预测。
+
+该方法应接受三种模式:“tensor”、“predict” 和 “loss”:
+
+- “tensor”:前向推理整个网络并返回张量或张量数组,无需任何后处理,与常见的 `nn.Module` 相同。
+- “predict”:前向推理并返回预测值,这些预测值将被完全处理到 `SegDataSample` 列表中。
+- “loss”:前向推理并根据给定的输入和数据样本返回损失的`字典`。
+
+**注:**[SegDataSample](https://github.com/open-mmlab/mmsegmentation/blob/1.x/mmseg/structures/seg_data_sample.py) 是 MMSegmentation 的数据结构接口,用作不同组件之间的接口。`SegDataSample` 实现了抽象数据元素 `mmengine.structures.BaseDataElement`,请参阅 [MMEngine](https://github.com/open-mmlab/mmengine) 中的 [SegDataSample 文档](https://mmsegmentation.readthedocs.io/zh_CN/1.x/advanced_guides/structures.html)和[数据元素文档](https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/data_element.html)了解更多信息。
+
+注意,此方法不处理反向传播或优化器更新,它们在 `train_step` 方法中完成。
+
+参数:
+
+- inputs(torch.Tensor)- 通常为形状是 (N, C, ...) 的输入张量。
+- data_samples(list\[[SegDataSample](https://github.com/open-mmlab/mmsegmentation/blob/1.x/mmseg/structures/seg_data_sample.py)\])- 分割数据样本。它通常包括 `metainfo` 和 `gt_sem_seg` 等信息。默认值为 None。
+- mode (str) - 返回什么类型的值。默认为 'tensor'。
+
+返回值:
+
+- `dict` 或 `list`:
+  - 如果 `mode == "loss"`,则返回用于反向过程和日志记录的损失张量`字典`。
+  - 如果 `mode == "predict"`,则返回 `SegDataSample` 的`列表`,推理结果将被递增地添加到传递给 forward 方法的 `data_samples` 参数中,每个 `SegDataSample` 包含以下关键词:
+    - pred_sem_seg (`PixelData`):语义分割的预测结果。
+    - seg_logits (`PixelData`):归一化前的语义分割 logits 预测值。
+  - 如果 `mode == "tensor"`,则返回`张量`或`张量数组`的`字典`以供自定义使用。
+
+### 预测模式
+
+我们在[配置文档](../user_guides/1_config.md)中简要描述了模型配置的字段,这里我们详细介绍 `model.test_cfg` 字段。`model.test_cfg` 用于控制前向行为,`"predict"` 模式下的 `forward` 方法可以在两种模式下运行:
+
+- `whole_inference`:如果 `cfg.model.test_cfg.mode == 'whole'`,则模型将使用完整图像进行推理。
+
+  `whole_inference` 模式的一个示例配置:
+
+  ```python
+  model = dict(
+    type='EncoderDecoder',
+    ...
+    test_cfg=dict(mode='whole')
+  )
+  ```
+
+- `slide_inference`:如果 `cfg.model.test_cfg.mode == 'slide'`,则模型将通过滑动窗口进行推理。**注意:** 如果选择 `slide` 模式,还应指定 `cfg.model.test_cfg.stride` 和 `cfg.model.test_cfg.crop_size`。
+
+  `slide_inference` 模式的一个示例配置:
+
+  ```python
+  model = dict(
+    type='EncoderDecoder',
+    ...
+    test_cfg=dict(mode='slide', crop_size=256, stride=170)
+  )
+  ```
+
+### train_step
+
+`train_step` 方法调用 `loss` 模式的前向接口以获得损失`字典`。`BaseModel` 类实现了默认的模型训练过程,包括预处理、模型前向传播、损失计算、优化和反向传播。
+
+参数:
+
+- data (dict or tuple or list) - 从数据集采样的数据。在 MMSegmentation 中,数据字典包含 `inputs` 和 `data_samples` 两个字段。
+- optim_wrapper (OptimWrapper) - 用于更新模型参数的 OptimWrapper 实例。
+
+**注:**[OptimWrapper](https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/optimizer_wrapper.py#L17) 提供了一个用于更新参数的通用接口,请参阅 [MMEngine](https://github.com/open-mmlab/mmengine) 中的优化器封装[文档](https://mmengine.readthedocs.io/zh_CN/latest/tutorials/optim_wrapper.html)了解更多信息。
+
+返回值:
+
+- Dict\[str, `torch.Tensor`\]:用于记录日志的张量的`字典`。
+
+
+*(图:train_step 数据流)*
+
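+
+为帮助理解, 下面给出 `train_step` 核心流程的简化示意(依据 MMEngine `BaseModel` 的默认实现整理, 并非逐行源码):
+
+```python
+# BaseModel.train_step 的简化流程示意
+def train_step(self, data, optim_wrapper):
+    with optim_wrapper.optim_context(self):
+        # 数据预处理并搬运到目标设备
+        data = self.data_preprocessor(data, training=True)
+        # loss 模式前向, 得到损失字典
+        losses = self._run_forward(data, mode='loss')
+    # 汇总各项损失, 并整理出用于日志记录的标量
+    parsed_losses, log_vars = self.parse_losses(losses)
+    # 反向传播并更新模型参数
+    optim_wrapper.update_params(parsed_losses)
+    return log_vars
+```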
+
+### val_step
+
+`val_step` 方法调用 `predict` 模式的前向接口并返回预测结果,预测结果将进一步被传递给评测器的 `process` 接口和钩子的 `after_val_iter` 接口。
+
+参数:
+
+- data (`dict` or `tuple` or `list`) - 从数据集中采样的数据。在 MMSegmentation 中,数据字典包含 `inputs` 和 `data_samples` 两个字段。
+
+返回值:
+
+- `list` - 给定数据的预测结果。
+
+
+*(图:test_step/val_step 数据流)*
+
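+
+与 `train_step` 类似, `val_step` 的核心流程也可以简化为几行(同样依据 MMEngine `BaseModel` 的默认实现整理, 仅作示意):
+
+```python
+# BaseModel.val_step 的简化流程示意:predict 模式前向, 不涉及梯度更新
+def val_step(self, data):
+    data = self.data_preprocessor(data, training=False)
+    return self._run_forward(data, mode='predict')  # 返回 List[SegDataSample]
+```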
+
+### test_step
+
+`BaseModel` 中 `test_step` 与 `val_step` 的实现相同。
+
+## 数据预处理器(Data Preprocessor)
+
+MMSegmentation 实现的 [SegDataPreProcessor](https://github.com/open-mmlab/mmsegmentation/blob/1.x/mmseg/models/data_preprocessor.py#L13) 继承自 [MMEngine](https://github.com/open-mmlab/mmengine) 实现的 [BaseDataPreprocessor](https://github.com/open-mmlab/mmengine/blob/main/mmengine/model/base_model/data_preprocessor.py#L18),提供数据预处理和将数据复制到目标设备的功能。
+
+Runner 在构建阶段将模型传送到指定的设备,而 [SegDataPreProcessor](https://github.com/open-mmlab/mmsegmentation/blob/1.x/mmseg/models/data_preprocessor.py#L13) 在 `train_step`、`val_step` 和 `test_step` 中将数据传送到指定设备,之后处理后的数据将被进一步传递给模型。
+
+`SegDataPreProcessor` 构造函数的参数:
+
+- mean (Sequence\[Number\], 可选) - R、G、B 通道的像素平均值。默认为 None。
+- std (Sequence\[Number\], 可选) - R、G、B 通道的像素标准差。默认为 None。
+- size (tuple, 可选) - 固定的填充大小。
+- size_divisor (int, 可选) - 填充大小的除法因子。
+- pad_val (float, 可选) - 填充值。默认值:0。
+- seg_pad_val (float, 可选) - 分割图的填充值。默认值:255。
+- bgr_to_rgb (bool) - 是否将图像从 BGR 转换为 RGB。默认为 False。
+- rgb_to_bgr (bool) - 是否将图像从 RGB 转换为 BGR。默认为 False。
+- batch_augments (list\[dict\], 可选) - 批量化的数据增强。默认值为 None。
+
+数据将按如下方式处理:
+
+- 收集数据并将其移动到目标设备。
+- 用定义的 `pad_val` 将输入填充到统一大小,并用定义的 `seg_pad_val` 填充分割图。
+- 将输入堆叠成 batch_inputs。
+- 如果输入的形状为 (3, H, W),则将输入从 BGR 转换为 RGB。
+- 使用定义的标准差和平均值对图像做归一化。
+- 在训练期间进行如 Mixup 和 CutMix 的批量化数据增强。
+
+`forward` 方法的参数:
+
+- data (dict) - 从数据加载器采样的数据。
+- training (bool) - 是否启用训练时数据增强。
+
+`forward` 方法的返回值:
+
+- Dict:与模型输入格式相同的数据。
diff --git a/docs/zh_cn/advanced_guides/structures.md b/docs/zh_cn/advanced_guides/structures.md
new file mode 100644
index 0000000000..958e011a7b
--- /dev/null
+++ b/docs/zh_cn/advanced_guides/structures.md
@@ -0,0 +1,102 @@
+# 数据结构
+
+为了统一模型和各功能模块之间的输入和输出的接口, 在 OpenMMLab 2.0 MMEngine 中定义了一套抽象数据结构, 实现了基础的增/删/查/改功能, 支持不同设备间的数据迁移, 也支持了如 `.cpu()`, `.cuda()`, `.get()` 和 `.detach()` 的类字典和张量的操作。具体可以参考 [MMEngine 文档](https://github.com/open-mmlab/mmengine/blob/main/docs/en/advanced_tutorials/data_element.md)。
+
+同样的, MMSegmentation 亦遵循了 OpenMMLab 2.0 各模块间的接口协议, 定义了 `SegDataSample` 用来封装语义分割任务所需要的数据。
+
+## 语义分割数据 SegDataSample
+
+[SegDataSample](mmseg.structures.SegDataSample) 包括了三个主要数据字段 `gt_sem_seg`, `pred_sem_seg` 和 `seg_logits`, 分别用来存放标注信息, 预测结果和预测的未归一化的 logits 值。
+
+| 字段         | 类型                      | 描述                          |
+| ------------ | ------------------------- | ----------------------------- |
+| gt_sem_seg   | [`PixelData`](#pixeldata) | 图像标注信息.                 |
+| pred_sem_seg | [`PixelData`](#pixeldata) | 图像预测结果.                 |
+| seg_logits   | [`PixelData`](#pixeldata) | 模型预测未归一化的 logits 值. |
+
+以下示例代码展示了 `SegDataSample` 的使用方法:
+
+```python
+import torch
+from mmengine.structures import PixelData
+from mmseg.structures import SegDataSample
+
+img_meta = dict(img_shape=(4, 4, 3),
+                pad_shape=(4, 4, 3))
+data_sample = SegDataSample()
+# 定义 gt_segmentations 用于封装标注信息
+gt_segmentations = PixelData(metainfo=img_meta)
+gt_segmentations.data = torch.randint(0, 2, (1, 4, 4))
+
+# 增加和处理 SegDataSample 中的属性
+data_sample.gt_sem_seg = gt_segmentations
+assert 'gt_sem_seg' in data_sample
+assert 'data' in data_sample.gt_sem_seg
+assert 'img_shape' in data_sample.gt_sem_seg.metainfo_keys()
+print(data_sample.gt_sem_seg.shape)
+'''
+(4, 4)
+'''
+print(data_sample)
+'''
+<SegDataSample(
+
+    META INFORMATION
+
+    DATA FIELDS
+    gt_sem_seg: <PixelData(
+
+            META INFORMATION
+            img_shape: (4, 4, 3)
+            pad_shape: (4, 4, 3)
+
+            DATA FIELDS
+            data: tensor([[...]])
+        ) at 0x...>
+) at 0x1c2aae44d60>
+'''
+
+# 删除和修改 SegDataSample 中的属性
+data_sample = SegDataSample()
+gt_segmentations = PixelData(metainfo=img_meta)
+gt_segmentations.data = torch.randint(0, 2, (1, 4, 4))
+data_sample.gt_sem_seg = gt_segmentations
+data_sample.gt_sem_seg.set_metainfo(dict(img_shape=(4, 4, 9), pad_shape=(4, 4, 9)))
+del data_sample.gt_sem_seg.img_shape
+
+# 类张量的操作
+data_sample = SegDataSample()
+gt_segmentations = PixelData(metainfo=img_meta)
+gt_segmentations.data = torch.randint(0, 2, (1, 4, 4))
+cuda_gt_segmentations = gt_segmentations.cuda()
+cuda_gt_segmentations = gt_segmentations.to('cuda:0')
+cpu_gt_segmentations = cuda_gt_segmentations.cpu()
+cpu_gt_segmentations = cuda_gt_segmentations.to('cpu')
+```
+
+## 在 SegDataSample 中自定义新的属性
+
+如果你想在 `SegDataSample` 中自定义新的属性, 你可以参考下面的 [SegDataSample](https://github.com/open-mmlab/mmsegmentation/blob/1.x/mmseg/structures/seg_data_sample.py) 示例:
+
+```python
+class SegDataSample(BaseDataElement):
+    ...
+
+    @property
+    def xxx_property(self) -> xxxData:
+        return self._xxx_property
+
+    @xxx_property.setter
+    def xxx_property(self, value: xxxData) -> None:
+        self.set_field(value, '_xxx_property', dtype=xxxData)
+
+    @xxx_property.deleter
+    def xxx_property(self) -> None:
+        del self._xxx_property
+```
+
+这样, 一个新的属性 `xxx_property` 就被增加到 `SegDataSample` 里面了。
diff --git a/docs/zh_cn/advanced_guides/training_tricks.md b/docs/zh_cn/advanced_guides/training_tricks.md
new file mode 100644
index 0000000000..e5b8e4dae1
--- /dev/null
+++ b/docs/zh_cn/advanced_guides/training_tricks.md
@@ -0,0 +1,74 @@
+# 训练技巧
+
+MMSegmentation 支持如下训练技巧:
+
+## 主干网络和解码头组件使用不同的学习率 (Learning Rate, LR)
+
+在语义分割里,一些方法会让解码头组件的学习率大于主干网络的学习率,这样可以获得更好的表现或更快的收敛。
+
+在 MMSegmentation 里面,您可以在配置文件里添加如下几行,让解码头组件的学习率是主干网络组件的 10 倍:
+
+```python
+optim_wrapper=dict(
+    paramwise_cfg = dict(
+        custom_keys={
+            'head': dict(lr_mult=10.)}))
+```
+
+通过这种修改,任何被分组到 `'head'` 的参数的学习率都将乘以 10。您也可以参照 [MMEngine 文档](https://mmengine.readthedocs.io/zh_CN/latest/tutorials/optim_wrapper.html#id6) 获取更详细的信息。
+
+## 在线难样本挖掘 (Online Hard Example Mining, OHEM)
+
+MMSegmentation 中实现了像素采样器,训练时可以对特定像素进行采样,例如 OHEM(Online Hard Example Mining)可以缓解样本不平衡问题。
+如下例子是使用 PSPNet 训练并采用 OHEM 策略的配置:
+
+```python
+_base_ = './pspnet_r50-d8_512x1024_40k_cityscapes.py'
+model=dict(
+    decode_head=dict(
+        sampler=dict(type='OHEMPixelSampler', thresh=0.7, min_kept=100000)))
+```
+
+通过这种方式,只有置信分数在 0.7 以下的像素点会被拿来训练,且训练时至少保留 100000 个像素点。如果没有指定 `thresh`,则会选择损失最大的前 `min_kept` 个像素点。
+
+## 类别平衡损失 (Class Balanced Loss)
+
+对于类别分布不平衡的数据集,您可以改变每个类别的损失权重。这里以 cityscapes 数据集为例:
+
+```python
+_base_ = './pspnet_r50-d8_512x1024_40k_cityscapes.py'
+model=dict(
+    decode_head=dict(
+        loss_decode=dict(
+            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0,
+            # DeepLab 对 cityscapes 使用这种权重
+            class_weight=[0.8373, 0.9180, 0.8660, 1.0345, 1.0166, 0.9969, 0.9754,
+                          1.0489, 0.8786, 1.0023,
0.9539, 0.9843, 1.1116, 0.9037, + 1.0865, 1.0955, 1.0865, 1.1529, 1.0507]))) +``` + +`class_weight` 将被作为 `weight` 参数,传递给 `CrossEntropyLoss`。详细信息请参照 [PyTorch 文档](https://pytorch.org/docs/stable/nn.html?highlight=crossentropy#torch.nn.CrossEntropyLoss) 。 + +## 同时使用多种损失函数 (Multiple Losses) + +对于训练时损失函数的计算,我们目前支持多个损失函数同时使用。 以 `unet` 使用 `DRIVE` 数据集训练为例, +使用 `CrossEntropyLoss` 和 `DiceLoss` 的 `1:3` 的加权和作为损失函数。配置文件写为: + +```python +_base_ = './fcn_unet_s5-d16_64x64_40k_drive.py' +model = dict( + decode_head=dict(loss_decode=[ + dict(type='CrossEntropyLoss', loss_name='loss_ce', loss_weight=1.0), + dict(type='DiceLoss', loss_name='loss_dice', loss_weight=3.0) + ]), + auxiliary_head=dict(loss_decode=[ + dict(type='CrossEntropyLoss', loss_name='loss_ce', loss_weight=1.0), + dict(type='DiceLoss', loss_name='loss_dice', loss_weight=3.0) + ]), +) +``` + +通过这种方式,确定训练过程中损失函数的权重 `loss_weight` 和在训练日志里的名字 `loss_name`。 + +注意: `loss_name` 的名字必须带有 `loss_` 前缀,这样它才能被包括在计算图里。 diff --git a/docs/zh_cn/advanced_guides/transforms.md b/docs/zh_cn/advanced_guides/transforms.md new file mode 100644 index 0000000000..e5f3bebf6d --- /dev/null +++ b/docs/zh_cn/advanced_guides/transforms.md @@ -0,0 +1,119 @@ +# 数据增强变化 + +在本教程中,我们将介绍 MMSegmentation 中数据增强变化流程的设计。 + +本指南的结构如下: + +- [数据增强变化](#数据增强变化) + - [数据增强变化流程设计](#数据增强变化流程设计) + - [数据加载](#数据加载) + - [预处理](#预处理) + - [格式修改](#格式修改) + +## 数据增强变化流程设计 + +按照惯例,我们使用 `Dataset` 和 `DataLoader` 多进程地加载数据。`Dataset` 返回与模型 forward 方法的参数相对应的数据项的字典。由于语义分割中的数据可能大小不同,我们在 MMCV 中引入了一种新的 `DataContainer` 类型,以帮助收集和分发不同大小的数据。参见[此处](https://github.com/open-mmlab/mmcv/blob/master/mmcv/parallel/data_container.py)了解更多详情。 + +在 MMSegmentation 的 1.x 版本中,所有数据转换都继承自 [`BaseTransform`](https://github.com/open-mmlab/mmcv/blob/2.x/mmcv/transforms/base.py#L6). + +转换的输入和输出类型都是字典。一个简单的示例如下: + +```python +>>> from mmseg.datasets.transforms import LoadAnnotations +>>> transforms = LoadAnnotations() +>>> img_path = './data/cityscapes/leftImg8bit/train/aachen/aachen_000000_000019_leftImg8bit.png.png' +>>> gt_path = './data/cityscapes/gtFine/train/aachen/aachen_000015_000019_gtFine_instanceTrainIds.png' +>>> results = dict( +>>> img_path=img_path, +>>> seg_map_path=gt_path, +>>> reduce_zero_label=False, +>>> seg_fields=[]) +>>> data_dict = transforms(results) +>>> print(data_dict.keys()) +dict_keys(['img_path', 'seg_map_path', 'reduce_zero_label', 'seg_fields', 'gt_seg_map']) +``` + +数据准备流程和数据集是解耦的。通常,数据集定义如何处理标注,数据流程定义准备数据字典的所有步骤。流程由一系列操作组成。每个操作都将字典作为输入,并为接下来的转换输出字典。 + +操作分为数据加载、预处理、格式修改和测试数据增强。 + +这里是 PSPNet 的流程示例: + +```python +crop_size = (512, 1024) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict( + type='RandomResize', + scale=(2048, 1024), + ratio_range=(0.5, 2.0), + keep_ratio=True), + dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(2048, 1024), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to resize data transform + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +``` + +对于每个操作,我们列出了 `添加`/`更新`/`删除` 相关的字典字段。在流程前,我们可以从数据集直接获得的信息是 `img_path` 和 `seg_map_path`。 + +### 数据加载 + +`LoadImageFromFile`:从文件加载图像。 + +- 添加:`img`,`img_shape`,`ori_shape` + +`LoadAnnotations`:加载数据集提供的语义分割图。 + +- 添加:`seg_fields`,`gt_seg_map` + +### 预处理 + +`RandomResize`:随机调整图像和分割图大小。 + +-添加:`scale`,`scale_factor`,`keep_ratio` 
+-更新:`img`,`img_shape`,`gt_seg_map` + +`Resize`:调整图像和分割图的大小。 + +-添加:`scale`,`scale_factor`,`keep_ratio` +-更新:`img`,`gt_seg_map`,`img_shape` + +`RandomCrop`:随机裁剪图像和分割图。 + +-更新:`img`,`gt_seg_map`,`img_shape` + +`RandomFlip`:翻转图像和分割图。 + +-添加:`flip`,`flip_direction` +-更新:`img`,`gt_seg_map` + +`PhotoMetricDistortion`:按顺序对图像应用光度失真,每个变换的应用概率为 0.5。随机对比度的位置是第二或倒数第二(分别为下面的模式 0 或 1)。 + +``` +1.随机亮度 +2.随机对比度(模式 0) +3.将颜色从 BGR 转换为 HSV +4.随机饱和度 +5.随机色调 +6.将颜色从 HSV 转换为 BGR +7.随机对比度(模式 1) +``` + +- 更新:`img` + +### 格式修改 + +`PackSegInputs`:为语义分段打包输入数据。 + +- 添加:`inputs`,`data_sample` +- 删除:由 `meta_keys` 指定的 keys(合并到 data_sample 的 metainfo 中),所有其他 keys diff --git a/docs/zh_cn/api.rst b/docs/zh_cn/api.rst index 8285841dc6..3478aa9361 100644 --- a/docs/zh_cn/api.rst +++ b/docs/zh_cn/api.rst @@ -3,56 +3,93 @@ mmseg.apis .. automodule:: mmseg.apis :members: -mmseg.core +mmseg.datasets -------------- -seg -^^^^^^^^^^ -.. automodule:: mmseg.core.seg - :members: - -evaluation +datasets ^^^^^^^^^^ -.. automodule:: mmseg.core.evaluation +.. automodule:: mmseg.datasets :members: -utils -^^^^^^^^^^ -.. automodule:: mmseg.core.utils +transforms +^^^^^^^^^^^^ +.. automodule:: mmseg.datasets.transforms :members: -mmseg.datasets +mmseg.engine -------------- -datasets +hooks ^^^^^^^^^^ -.. automodule:: mmseg.datasets +.. automodule:: mmseg.engine.hooks :members: -pipelines -^^^^^^^^^^ -.. automodule:: mmseg.datasets.pipelines +optimizers +^^^^^^^^^^^^^^^ +.. automodule:: mmseg.engine.optimizers :members: -mmseg.models +mmseg.evaluation -------------- -segmentors +metrics ^^^^^^^^^^ -.. automodule:: mmseg.models.segmentors +.. automodule:: mmseg.evaluation.metrics :members: +mmseg.models +-------------- + backbones -^^^^^^^^^^ +^^^^^^^^^^^^^^^^^^ .. automodule:: mmseg.models.backbones :members: decode_heads -^^^^^^^^^^^^ +^^^^^^^^^^^^^^^ .. automodule:: mmseg.models.decode_heads :members: +segmentors +^^^^^^^^^^ +.. automodule:: mmseg.models.segmentors + :members: + losses ^^^^^^^^^^ .. automodule:: mmseg.models.losses :members: + +necks +^^^^^^^^^^^^ +.. automodule:: mmseg.models.necks + :members: + +utils +^^^^^^^^^^ +.. automodule:: mmseg.models.utils + :members: + + +mmseg.structures +-------------------- + +structures +^^^^^^^^^^^^^^^^^ +.. automodule:: mmseg.structures + :members: + +sampler +^^^^^^^^^^ +.. automodule:: mmseg.structures.sampler + :members: + +mmseg.visualization +-------------------- +.. automodule:: mmseg.visualization + :members: + +mmseg.utils +-------------- +.. 
automodule:: mmseg.utils + :members: diff --git a/docs/zh_cn/conf.py b/docs/zh_cn/conf.py index 353b0bc725..18420558dc 100644 --- a/docs/zh_cn/conf.py +++ b/docs/zh_cn/conf.py @@ -28,7 +28,7 @@ def get_version(): - with open(version_file, 'r') as f: + with open(version_file) as f: exec(compile(f.read(), version_file, 'exec')) return locals()['__version__'] diff --git a/docs/zh_cn/dataset_prepare.md b/docs/zh_cn/dataset_prepare.md deleted file mode 100644 index 6b9c8216e5..0000000000 --- a/docs/zh_cn/dataset_prepare.md +++ /dev/null @@ -1,319 +0,0 @@ -## 准备数据集 - -推荐用软链接,将数据集根目录链接到 `$MMSEGMENTATION/data` 里。如果您的文件夹结构是不同的,您也许可以试着修改配置文件里对应的路径。 - -```none -mmsegmentation -├── mmseg -├── tools -├── configs -├── data -│ ├── cityscapes -│ │ ├── leftImg8bit -│ │ │ ├── train -│ │ │ ├── val -│ │ ├── gtFine -│ │ │ ├── train -│ │ │ ├── val -│ ├── VOCdevkit -│ │ ├── VOC2012 -│ │ │ ├── JPEGImages -│ │ │ ├── SegmentationClass -│ │ │ ├── ImageSets -│ │ │ │ ├── Segmentation -│ │ ├── VOC2010 -│ │ │ ├── JPEGImages -│ │ │ ├── SegmentationClassContext -│ │ │ ├── ImageSets -│ │ │ │ ├── SegmentationContext -│ │ │ │ │ ├── train.txt -│ │ │ │ │ ├── val.txt -│ │ │ ├── trainval_merged.json -│ │ ├── VOCaug -│ │ │ ├── dataset -│ │ │ │ ├── cls -│ ├── ade -│ │ ├── ADEChallengeData2016 -│ │ │ ├── annotations -│ │ │ │ ├── training -│ │ │ │ ├── validation -│ │ │ ├── images -│ │ │ │ ├── training -│ │ │ │ ├── validation -│ ├── CHASE_DB1 -│ │ ├── images -│ │ │ ├── training -│ │ │ ├── validation -│ │ ├── annotations -│ │ │ ├── training -│ │ │ ├── validation -│ ├── DRIVE -│ │ ├── images -│ │ │ ├── training -│ │ │ ├── validation -│ │ ├── annotations -│ │ │ ├── training -│ │ │ ├── validation -│ ├── HRF -│ │ ├── images -│ │ │ ├── training -│ │ │ ├── validation -│ │ ├── annotations -│ │ │ ├── training -│ │ │ ├── validation -│ ├── STARE -│ │ ├── images -│ │ │ ├── training -│ │ │ ├── validation -│ │ ├── annotations -│ │ │ ├── training -│ │ │ ├── validation -| ├── dark_zurich -| │   ├── gps -| │   │   ├── val -| │   │   └── val_ref -| │   ├── gt -| │   │   └── val -| │   ├── LICENSE.txt -| │   ├── lists_file_names -| │   │   ├── val_filenames.txt -| │   │   └── val_ref_filenames.txt -| │   ├── README.md -| │   └── rgb_anon -| │   | ├── val -| │   | └── val_ref -| ├── NighttimeDrivingTest -| | ├── gtCoarse_daytime_trainvaltest -| | │   └── test -| | │   └── night -| | └── leftImg8bit -| | | └── test -| | | └── night -│ ├── loveDA -│ │ ├── img_dir -│ │ │ ├── train -│ │ │ ├── val -│ │ │ ├── test -│ │ ├── ann_dir -│ │ │ ├── train -│ │ │ ├── val -│ ├── potsdam -│ │ ├── img_dir -│ │ │ ├── train -│ │ │ ├── val -│ │ ├── ann_dir -│ │ │ ├── train -│ │ │ ├── val -│ ├── vaihingen -│ │ ├── img_dir -│ │ │ ├── train -│ │ │ ├── val -│ │ ├── ann_dir -│ │ │ ├── train -│ │ │ ├── val -│ ├── iSAID -│ │ ├── img_dir -│ │ │ ├── train -│ │ │ ├── val -│ │ │ ├── test -│ │ ├── ann_dir -│ │ │ ├── train -│ │ │ ├── val -``` - -### Cityscapes - -注册成功后,数据集可以在 [这里](https://www.cityscapes-dataset.com/downloads/) 下载。 - -通常情况下,`**labelTrainIds.png` 被用来训练 cityscapes。 -基于 [cityscapesscripts](https://github.com/mcordts/cityscapesScripts), -我们提供了一个 [脚本](https://github.com/open-mmlab/mmsegmentation/blob/master/tools/convert_datasets/cityscapes.py), -去生成 `**labelTrainIds.png`。 - -```shell -# --nproc 8 意味着有 8 个进程用来转换,它也可以被忽略。 -python tools/convert_datasets/cityscapes.py data/cityscapes --nproc 8 -``` - -### Pascal VOC - -Pascal VOC 2012 可以在 [这里](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar) 下载。 -此外,许多最近在 Pascal VOC 数据集上的工作都会利用增广的数据,它们可以在 
[这里](http://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/semantic_contours/benchmark.tgz) 找到。 - -如果您想使用增广后的 VOC 数据集,请运行下面的命令来将数据增广的标注转成正确的格式。 - -```shell -# --nproc 8 意味着有 8 个进程用来转换,它也可以被忽略。 -python tools/convert_datasets/voc_aug.py data/VOCdevkit data/VOCdevkit/VOCaug --nproc 8 -``` - -关于如何拼接数据集 (concatenate) 并一起训练它们,更多细节请参考 [拼接连接数据集](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/zh_cn/tutorials/customize_datasets.md#%E6%8B%BC%E6%8E%A5%E6%95%B0%E6%8D%AE%E9%9B%86) 。 - -### ADE20K - -ADE20K 的训练集和验证集可以在 [这里](http://data.csail.mit.edu/places/ADEchallenge/ADEChallengeData2016.zip) 下载。 -您还可以在 [这里](http://data.csail.mit.edu/places/ADEchallenge/release_test.zip) 下载验证集。 - -### Pascal Context - -Pascal Context 的训练集和验证集可以在 [这里](http://host.robots.ox.ac.uk/pascal/VOC/voc2010/VOCtrainval_03-May-2010.tar) 下载。 -注册成功后,您还可以在 [这里](http://host.robots.ox.ac.uk:8080/eval/downloads/VOC2010test.tar) 下载验证集。 - -为了从原始数据集里切分训练集和验证集, 您可以在 [这里](https://codalabuser.blob.core.windows.net/public/trainval_merged.json) -下载 trainval_merged.json。 - -如果您想使用 Pascal Context 数据集, -请安装 [细节](https://github.com/zhanghang1989/detail-api) 然后再运行如下命令来把标注转换成正确的格式。 - -```shell -python tools/convert_datasets/pascal_context.py data/VOCdevkit data/VOCdevkit/VOC2010/trainval_merged.json -``` - -### CHASE DB1 - -CHASE DB1 的训练集和验证集可以在 [这里](https://staffnet.kingston.ac.uk/~ku15565/CHASE_DB1/assets/CHASEDB1.zip) 下载。 - -为了将 CHASE DB1 数据集转换成 MMSegmentation 的格式,您需要运行如下命令: - -```shell -python tools/convert_datasets/chase_db1.py /path/to/CHASEDB1.zip -``` - -这个脚本将自动生成正确的文件夹结构。 - -### DRIVE - -DRIVE 的训练集和验证集可以在 [这里](https://drive.grand-challenge.org/) 下载。 -在此之前,您需要注册一个账号,当前 '1st_manual' 并未被官方提供,因此需要您从其他地方获取。 - -为了将 DRIVE 数据集转换成 MMSegmentation 格式,您需要运行如下命令: - -```shell -python tools/convert_datasets/drive.py /path/to/training.zip /path/to/test.zip -``` - -这个脚本将自动生成正确的文件夹结构。 - -### HRF - -首先,下载 [healthy.zip](https://www5.cs.fau.de/fileadmin/research/datasets/fundus-images/healthy.zip) [glaucoma.zip](https://www5.cs.fau.de/fileadmin/research/datasets/fundus-images/glaucoma.zip), [diabetic_retinopathy.zip](https://www5.cs.fau.de/fileadmin/research/datasets/fundus-images/diabetic_retinopathy.zip), [healthy_manualsegm.zip](https://www5.cs.fau.de/fileadmin/research/datasets/fundus-images/healthy_manualsegm.zip), [glaucoma_manualsegm.zip](https://www5.cs.fau.de/fileadmin/research/datasets/fundus-images/glaucoma_manualsegm.zip) 以及 [diabetic_retinopathy_manualsegm.zip](https://www5.cs.fau.de/fileadmin/research/datasets/fundus-images/diabetic_retinopathy_manualsegm.zip) 。 - -为了将 HRF 数据集转换成 MMSegmentation 格式,您需要运行如下命令: - -```shell -python tools/convert_datasets/hrf.py /path/to/healthy.zip /path/to/healthy_manualsegm.zip /path/to/glaucoma.zip /path/to/glaucoma_manualsegm.zip /path/to/diabetic_retinopathy.zip /path/to/diabetic_retinopathy_manualsegm.zip -``` - -这个脚本将自动生成正确的文件夹结构。 - -### STARE - -首先,下载 [stare-images.tar](http://cecas.clemson.edu/~ahoover/stare/probing/stare-images.tar), [labels-ah.tar](http://cecas.clemson.edu/~ahoover/stare/probing/labels-ah.tar) 和 [labels-vk.tar](http://cecas.clemson.edu/~ahoover/stare/probing/labels-vk.tar) 。 - -为了将 STARE 数据集转换成 MMSegmentation 格式,您需要运行如下命令: - -```shell -python tools/convert_datasets/stare.py /path/to/stare-images.tar /path/to/labels-ah.tar /path/to/labels-vk.tar -``` - -这个脚本将自动生成正确的文件夹结构。 - -### Dark Zurich - -因为我们只支持在此数据集上测试模型,所以您只需下载[验证集](https://data.vision.ee.ethz.ch/csakarid/shared/GCMA_UIoU/Dark_Zurich_val_anon.zip) 。 - -### Nighttime Driving - 
-因为我们只支持在此数据集上测试模型,所以您只需下载[测试集](http://data.vision.ee.ethz.ch/daid/NighttimeDriving/NighttimeDrivingTest.zip) 。 - -### LoveDA - -可以从 Google Drive 里下载 [LoveDA数据集](https://drive.google.com/drive/folders/1ibYV0qwn4yuuh068Rnc-w4tPi0U0c-ti?usp=sharing) 。 - -或者它还可以从 [zenodo](https://zenodo.org/record/5706578#.YZvN7SYRXdF) 下载, 您需要运行如下命令: - -```shell -# Download Train.zip -wget https://zenodo.org/record/5706578/files/Train.zip -# Download Val.zip -wget https://zenodo.org/record/5706578/files/Val.zip -# Download Test.zip -wget https://zenodo.org/record/5706578/files/Test.zip -``` - -对于 LoveDA 数据集,请运行以下命令下载并重新组织数据集 - -```shell -python tools/convert_datasets/loveda.py /path/to/loveDA -``` - -请参照 [这里](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/zh_cn/inference.md) 来使用训练好的模型去预测 LoveDA 测试集并且提交到官网。 - -关于 LoveDA 的更多细节可以在[这里](https://github.com/Junjue-Wang/LoveDA) 找到。 - -### ISPRS Potsdam - -[Potsdam](https://www2.isprs.org/commissions/comm2/wg4/benchmark/2d-sem-label-potsdam/) -数据集是一个有着2D 语义分割内容标注的城市遥感数据集。 -数据集可以从挑战[主页](https://www2.isprs.org/commissions/comm2/wg4/benchmark/data-request-form/) 获得。 -需要其中的 '2_Ortho_RGB.zip' 和 '5_Labels_all_noBoundary.zip'。 - -对于 Potsdam 数据集,请运行以下命令下载并重新组织数据集 - -```shell -python tools/convert_datasets/potsdam.py /path/to/potsdam -``` - -使用我们默认的配置, 将生成 3456 张图片的训练集和 2016 张图片的验证集。 - -### ISPRS Vaihingen - -[Vaihingen](https://www2.isprs.org/commissions/comm2/wg4/benchmark/2d-sem-label-vaihingen/) -数据集是一个有着2D 语义分割内容标注的城市遥感数据集。 - -数据集可以从挑战 [主页](https://www2.isprs.org/commissions/comm2/wg4/benchmark/data-request-form/). -需要其中的 'ISPRS_semantic_labeling_Vaihingen.zip' 和 'ISPRS_semantic_labeling_Vaihingen_ground_truth_eroded_COMPLETE.zip'。 - -对于 Vaihingen 数据集,请运行以下命令下载并重新组织数据集 - -```shell -python tools/convert_datasets/vaihingen.py /path/to/vaihingen -``` - -使用我们默认的配置 (`clip_size`=512, `stride_size`=256), 将生成 344 张图片的训练集和 398 张图片的验证集。 - -### iSAID - -iSAID 数据集(训练集/验证集/测试集)的图像可以从 [DOTA-v1.0](https://captain-whu.github.io/DOTA/dataset.html) 下载. - -iSAID 数据集(训练集/验证集)的注释可以从 [iSAID](https://captain-whu.github.io/iSAID/dataset.html) 下载. - -该数据集是一个大规模的实例分割(也可以用于语义分割)的遥感数据集. - -下载后,在数据集转换前,您需要将数据集文件夹调整成如下格式. 
- -``` -│ ├── iSAID -│ │ ├── train -│ │ │ ├── images -│ │ │ │ ├── part1.zip -│ │ │ │ ├── part2.zip -│ │ │ │ ├── part3.zip -│ │ │ ├── Semantic_masks -│ │ │ │ ├── images.zip -│ │ ├── val -│ │ │ ├── images -│ │ │ │ ├── part1.zip -│ │ │ ├── Semantic_masks -│ │ │ │ ├── images.zip -│ │ ├── test -│ │ │ ├── images -│ │ │ │ ├── part1.zip -│ │ │ │ ├── part2.zip -``` - -```shell -python tools/convert_datasets/isaid.py /path/to/iSAID -``` - -使用我们默认的配置 (`patch_width`=896, `patch_height`=896, `overlap_area`=384), 将生成 33978 张图片的训练集和 11644 张图片的验证集。 diff --git a/docs/zh_cn/device/npu.md b/docs/zh_cn/device/npu.md new file mode 100644 index 0000000000..d50439d040 --- /dev/null +++ b/docs/zh_cn/device/npu.md @@ -0,0 +1,39 @@ +# NPU (华为 昇腾) + +## 使用方法 + +请参考 [MMCV 的安装文档](https://mmcv.readthedocs.io/en/latest/get_started/build.html#build-mmcv-full-on-ascend-npu-machine) 来安装 NPU 版本的 MMCV。 + +以下展示单机四卡场景的运行指令: + +```shell +bash tools/dist_train.sh configs/deeplabv3/deeplabv3_r50-d8_4xb2-40k_cityscapes-512x1024.py 4 +``` + +以下展示单机单卡下的运行指令: + +```shell +python tools/train.py configs/deeplabv3/deeplabv3_r50-d8_4xb2-40k_cityscapes-512x1024.py +``` + +## 模型验证结果 + +| Model | mIoU | Config | Download | +| :-----------------: | :---: | :----------------------------------------------------------------------------------------------------------------------------------------- | :------------------------------------------------------------------------------------------------------------------------------------------ | +| [deeplabv3](<>) | 78.85 | [config](https://github.com/open-mmlab/mmsegmentation/tree/1.x/configs/deeplabv3/deeplabv3_r50-d8_4xb2-40k_cityscapes-512x1024.py) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/device/npu/deeplabv3_r50-d8_4xb2-40k_cityscapes-512x1024_20230115_205626.json) | +| [deeplabv3plus](<>) | 79.23 | [config](https://github.com/open-mmlab/mmsegmentation/tree/1.x/configs/deeplabv3plus/deeplabv3plus_r50-d8_4xb2-40k_cityscapes-512x1024.py) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/device/npu/deeplabv3plus_r50-d8_4xb2-40k_cityscapes-512x1024_20230116_043450.json) | +| [hrnet](<>) | 78.1 | [config](https://github.com/open-mmlab/mmsegmentation/tree/1.x/configs/hrnet/fcn_hr18_4xb2-40k_cityscapes-512x1024.py) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/device/npu/fcn_hr18_4xb2-40k_cityscapes-512x1024_20230116_215821.json) | +| [fcn](<>) | 74.15 | [config](https://github.com/open-mmlab/mmsegmentation/tree/1.x/configs/fcn/fcn_r50-d8_4xb2-40k_cityscapes-512x1024.py) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/device/npu/fcn_r50-d8_4xb2-40k_cityscapes-512x1024_20230111_083014.json) | +| [icnet](<>) | 69.25 | [config](https://github.com/open-mmlab/mmsegmentation/tree/1.x/configs/icnet/icnet_r50-d8_4xb2-80k_cityscapes-832x832.py) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/device/npu/icnet_r50-d8_4xb2-80k_cityscapes-832x832_20230119_002929.json) | +| [pspnet](<>) | 77.21 | [config](https://github.com/open-mmlab/mmsegmentation/tree/1.x/configs/pspnet/pspnet_r50b-d8_4xb2-80k_cityscapes-512x1024.py) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/device/npu/pspnet_r50b-d8_4xb2-80k_cityscapes-512x1024_20230114_042721.json) | +| [unet](<>) | 68.86 | [config](https://github.com/open-mmlab/mmsegmentation/tree/1.x/configs/unet/unet-s5-d16_fcn_4xb4-160k_cityscapes-512x1024.py) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/device/npu/unet-s5-d16_fcn_4xb4-160k_cityscapes-512x1024_20230129_224750.json) | +| [upernet](<>) | 
77.81 | [config](https://github.com/open-mmlab/mmsegmentation/tree/1.x/configs/upernet/upernet_r50_4xb2-40k_cityscapes-512x1024.py) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/device/npu/upernet_r50_4xb2-40k_cityscapes-512x1024_20230129_014634.json) | +| [apcnet](<>) | 78.02 | [config](https://github.com/open-mmlab/mmsegmentation/tree/1.x/configs/apcnet/apcnet_r50-d8_4xb2-40k_cityscapes-512x1024.py) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/device/npu/apcnet_r50-d8_4xb2-40k_cityscapes-512x1024_20230209_212545.json) | +| [bisenetv1](<>) | 76.04 | [config](https://github.com/open-mmlab/mmsegmentation/tree/1.x/configs/bisenetv1/bisenetv1_r50-d32_4xb4-160k_cityscapes-1024x1024.py) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/device/npu/bisenetv1_r50-d32_4xb4-160k_cityscapes-1024x1024_20230201_023946.json) | +| [bisenetv2](<>) | 72.44 | [config](https://github.com/open-mmlab/mmsegmentation/tree/1.x/configs/bisenetv2/bisenetv2_fcn_4xb4-amp-160k_cityscapes-1024x1024.py) | [log](https://download.openmmlab.com/mmsegmentation/v0.5/device/npu/bisenetv2_fcn_4xb4-amp-160k_cityscapes-1024x1024_20230205_215606.json) | + +**注意:** + +- 如果没有特别标记,NPU 上的使用混合精度训练的结果与使用 FP32 的 GPU 上的结果相同。 + +**以上模型结果由华为昇腾团队提供** diff --git a/docs/zh_cn/faq.md b/docs/zh_cn/faq.md deleted file mode 100644 index fa35b2d84a..0000000000 --- a/docs/zh_cn/faq.md +++ /dev/null @@ -1,8 +0,0 @@ -# 常见问题解答(FAQ) - -我们在这里列出了使用时的一些常见问题及其相应的解决方案。 如果您发现有一些问题被遗漏,请随时提 PR 丰富这个列表。 如果您无法在此获得帮助,请使用 [issue模板](https://github.com/open-mmlab/mmsegmentation/blob/master/.github/ISSUE_TEMPLATE/error-report.md/)创建问题,但是请在模板中填写所有必填信息,这有助于我们更快定位问题。 - -## 如何获知模型训练时需要的显卡数量 - -- 看模型的config文件的命名。可以参考[学习配置文件](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/zh_cn/tutorials/config.md)中的`配置文件命名风格`部分。比如,对于名字为`segformer_mit-b0_8x1_1024x1024_160k_cityscapes.py`的config文件,`8x1`代表训练其对应的模型需要的卡数为8,每张卡中的batch size为1。 -- 看模型的log文件。点开该模型的log文件,并在其中搜索`nGPU`,在`nGPU`后的数字个数即训练时所需的卡数。比如,在log文件中搜索`nGPU`得到`nGPU 0,1,2,3,4,5,6,7`的记录,则说明训练该模型需要使用八张卡。 diff --git a/docs/zh_cn/get_started.md b/docs/zh_cn/get_started.md index 3b92a2ee2d..ca375f3d9d 100644 --- a/docs/zh_cn/get_started.md +++ b/docs/zh_cn/get_started.md @@ -1,246 +1,209 @@ -## 依赖 - -- Linux or macOS (Windows下支持需要 mmcv-full,但运行时可能会有一些问题。) -- Python 3.6+ -- PyTorch 1.3+ -- CUDA 9.2+ (如果您基于源文件编译 PyTorch, CUDA 9.0也可以使用) -- GCC 5+ -- [MMCV](https://mmcv.readthedocs.io/en/latest/#installation) - -可编译的 MMSegmentation 和 MMCV 版本如下所示,请对照对应版本安装以避免安装问题。 - -| MMSegmentation 版本 | MMCV 版本 | MMClassification 版本 | -| :-----------------: | :-------------------------: | :---------------------: | -| master | mmcv-full>=1.4.4, \<=1.6.0 | mmcls>=0.20.1, \<=1.0.0 | -| 0.24.1 | mmcv-full>=1.4.4, \<=1.6.0 | mmcls>=0.20.1, \<=1.0.0 | -| 0.23.0 | mmcv-full>=1.4.4, \<=1.6.0 | mmcls>=0.20.1, \<=1.0.0 | -| 0.22.0 | mmcv-full>=1.4.4, \<=1.6.0 | mmcls>=0.20.1, \<=1.0.0 | -| 0.21.1 | mmcv-full>=1.4.4, \<=1.6.0 | Not required | -| 0.20.2 | mmcv-full>=1.3.13, \<=1.6.0 | Not required | -| 0.19.0 | mmcv-full>=1.3.13, \<1.3.17 | Not required | -| 0.18.0 | mmcv-full>=1.3.13, \<1.3.17 | Not required | -| 0.17.0 | mmcv-full>=1.3.7, \<1.3.17 | Not required | -| 0.16.0 | mmcv-full>=1.3.7, \<1.3.17 | Not required | -| 0.15.0 | mmcv-full>=1.3.7, \<1.3.17 | Not required | -| 0.14.1 | mmcv-full>=1.3.7, \<1.3.17 | Not required | -| 0.14.0 | mmcv-full>=1.3.1, \<1.3.2 | Not required | -| 0.13.0 | mmcv-full>=1.3.1, \<1.3.2 | Not required | -| 0.12.0 | mmcv-full>=1.1.4, \<1.3.2 | Not required | -| 0.11.0 | 
mmcv-full>=1.1.4, \<1.3.0 | Not required | -| 0.10.0 | mmcv-full>=1.1.4, \<1.3.0 | Not required | -| 0.9.0 | mmcv-full>=1.1.4, \<1.3.0 | Not required | -| 0.8.0 | mmcv-full>=1.1.4, \<1.2.0 | Not required | -| 0.7.0 | mmcv-full>=1.1.2, \<1.2.0 | Not required | -| 0.6.0 | mmcv-full>=1.1.2, \<1.2.0 | Not required | - -注意: 如果您已经安装好 mmcv, 您首先需要运行 `pip uninstall mmcv`。 -如果 mmcv 和 mmcv-full 同时被安装,会报错 `ModuleNotFoundError`。 +# 开始:安装和运行 MMSeg -## 安装 - -a. 创建一个 conda 虚拟环境并激活它 +## 预备知识 -```shell -conda create -n open-mmlab python=3.10 -y -conda activate open-mmlab +本教程中,我们将会演示如何使用 PyTorch 准备环境。 -``` +MMSegmentation 可以在 Linux, Windows 和 macOS 系统上运行,并且需要安装 Python 3.7+, CUDA 10.2+ 和 PyTorch 1.8+ -b. 按照[官方教程](https://pytorch.org/) 安装 PyTorch 和 totchvision, -这里我们使用 PyTorch1.11.0 和 CUDA11.3, -您也可以切换至其他版本 - -```shell -conda install pytorch=1.11.0 torchvision cudatoolkit=11.3 -c pytorch -``` +**注意:** +如果您已经安装了 PyTorch, 可以跳过该部分,直接到[下一小节](##安装)。否则,您可以按照以下步骤操作。 -c. 按照 [官方教程](https://mmcv.readthedocs.io/en/latest/#installation) -安装 [MMCV](https://mmcv.readthedocs.io/en/latest/) , -`mmcv` 或 `mmcv-full` 和 MMSegmentation 均兼容,但对于 CCNet 和 PSANet,`mmcv-full` 里的 CUDA 运算是必须的 +**步骤 0.** 从[官方网站](https://docs.conda.io/en/latest/miniconda.html)下载并安装 Miniconda -**在 Linux 下安装 mmcv:** - -为了安装 MMCV, 我们推荐使用下面的这种预编译好的 MMCV. +**步骤 1.** 创建一个 conda 环境,并激活 ```shell -pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/{cu_version}/{torch_version}/index.html +conda create --name openmmlab python=3.8 -y +conda activate openmmlab ``` -请替换 url 里面的 `{cu_version}` 和 `{torch_version}` 为您想要使用的版本. mmcv-full 仅在 -PyTorch 1.x.0 上面编译, 因为在 1.x.0 和 1.x.1 之间通常是兼容的. 如果您的 PyTorch 版本是 1.x.1, -您可以安装用 PyTorch 1.x.0 编译的 mmcv-full 而它通常是可以正常使用的. -例如, 用 `CUDA 11.1` and `PyTorch 1.11.0` 安装使用 `mmcv-full`, 使用如下命令: +**Step 2.** 参考 [official instructions](https://pytorch.org/get-started/locally/) 安装 PyTorch + +在 GPU 平台上: ```shell -pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.11/index.html +conda install pytorch torchvision -c pytorch ``` -请查看 [这里](https://github.com/open-mmlab/mmcv#installation) 来找到适配不同 PyTorch 和 CUDA 版本的 MMCV. - -您也可以采用下面的命令来从源码编译 MMCV (可选) +在 CPU 平台上 ```shell -git clone https://github.com/open-mmlab/mmcv.git -cd mmcv -MMCV_WITH_OPS=1 pip install -e . # package mmcv-full, which contains cuda ops, will be installed after this step -# OR pip install -e . # package mmcv, which contains no cuda ops, will be installed after this step -cd .. +conda install pytorch torchvision cpuonly -c pytorch ``` -**重点:** 如果您已经安装了 MMCV, 您需要先运行 `pip uninstall mmcv`. 因为如果 `mmcv` 和 `mmcv-full` 被同时安装, 将会报错 `ModuleNotFoundError`. +## 安装 -**在 Windows 下安装 mmcv (有风险):** +我们建议用户遵循我们的最佳实践来安装 MMSegmentation 。但是整个过程是高度自定义的。更多信息请参见[自定义安装](##自定义安装)部分。 -对于 Windows, MMCV 的安装需要本地 C++ 编译工具, 例如 cl.exe。 请添加编译工具至 %PATH%。 +### 最佳实践 -如果您已经在电脑上安装好Windows SDK 和 Visual Studio,cl.exe 的一个典型路径看起来如下: +**步骤 0.** 使用 [MIM](https://github.com/open-mmlab/mim) 安装 [MMCV](https://github.com/open-mmlab/mmcv) ```shell -C:\Program Files (x86)\Microsoft Visual Studio\2019\Professional\VC\Tools\MSVC\14.26.28801\bin\Hostx86\x64 +pip install -U openmim +mim install mmengine +mim install "mmcv>=2.0.0" ``` -或者您需要从网上下载 cl 编译工具并安装至路径。 +**步骤 1.** 安装 MMSegmentation -随后,从 github 克隆 mmcv 并通过 pip 安装: +情况 a: 如果您想立刻开发和运行 mmsegmentation,您可通过源码安装: ```shell -git clone https://github.com/open-mmlab/mmcv.git -cd mmcv -pip install -e . +git clone -b main https://github.com/open-mmlab/mmsegmentation.git +cd mmsegmentation +pip install -v -e . 
+# '-v' 表示详细模式,更多的输出 +# '-e' 表示以可编辑模式安装工程, +# 因此对代码所做的任何修改都生效,无需重新安装 ``` -或直接: +情况 b: 如果您把 mmsegmentation 作为依赖库或者第三方库,可以通过 pip 安装: ```shell -pip install mmcv +pip install "mmsegmentation>=1.0.0" ``` -当前,mmcv-full 并不完全在 windows 上支持。 +### 验证是否安装成功 + +为了验证 MMSegmentation 是否正确安装,我们提供了一些示例代码来运行一个推理 demo 。 -d. 安装 MMSegmentation +**步骤 1.** 下载配置文件和模型文件 ```shell -pip install mmsegmentation # 安装最新版本 +mim download mmsegmentation --config pspnet_r50-d8_4xb2-40k_cityscapes-512x1024 --dest . ``` -或者 +该下载过程可能需要花费几分钟,这取决于您的网络环境。当下载结束,您将看到以下两个文件在您当前工作目录:`pspnet_r50-d8_4xb2-40k_cityscapes-512x1024.py` 和 `pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth` -```shell -pip install git+https://github.com/open-mmlab/mmsegmentation.git # 安装 master 分支 -``` +**步骤 2.** 验证推理 demo -此外,如果您想安装 `dev` 模式的 MMSegmentation, 运行如下命令: +选项 (a). 如果您通过源码安装了 mmsegmentation,运行以下命令即可: ```shell -git clone https://github.com/open-mmlab/mmsegmentation.git -cd mmsegmentation -pip install -e . # 或者 "python setup.py develop" +python demo/image_demo.py demo/demo.png configs/pspnet/pspnet_r50-d8_4xb2-40k_cityscapes-512x1024.py pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth --device cuda:0 --out-file result.jpg ``` -注意: +您将在当前文件夹中看到一个新图像 `result.jpg`,其中所有目标都覆盖了分割 mask -1. 当在 windows 下训练和测试模型时,请确保路径下所有的'\\' 被替换成 '/', - 在 python 代码里可以使用`.replace('\\', '/')`处理路径的字符串 -2. `version+git_hash` 也将被保存进 meta 训练模型里,即0.5.0+c415a2e -3. 当 MMsegmentation 以 `dev` 模式被安装时,本地对代码的修改将不需要重新安装即可产生作用 -4. 如果您想使用 `opencv-python-headless` 替换 `opencv-python`,您可以在安装 MMCV 前安装它 -5. 一些依赖项是可选的。简单的运行 `pip install -e .` 将仅安装最必要的一些依赖。为了使用可选的依赖项如`cityscapessripts`, - 要么手动使用 `pip install -r requirements/optional.txt` 安装,要么专门从pip下安装(即 `pip install -e .[optional]`, - 其中选项可设置为 `all`, `tests`, `build`, 和 `optional`) +选项 (b). 如果您通过 pip 安装 mmsegmentation, 打开您的 python 解释器,复制粘贴以下代码: -### 完整的安装脚本 +```python +from mmseg.apis import inference_model, init_model, show_result_pyplot +import mmcv -#### Linux +config_file = 'pspnet_r50-d8_4xb2-40k_cityscapes-512x1024.py' +checkpoint_file = 'pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth' -这里便是一个完整安装 MMSegmentation 的脚本,使用 conda 并链接了数据集的路径(以您的数据集路径为 $DATA_ROOT 来安装)。 +# 根据配置文件和模型文件建立模型 +model = init_model(config_file, checkpoint_file, device='cuda:0') -```shell -conda create -n open-mmlab python=3.10 -y -conda activate open-mmlab +# 在单张图像上测试并可视化 +img = 'demo/demo.png' # or img = mmcv.imread(img), 这样仅需下载一次 +result = inference_model(model, img) +# 在新的窗口可视化结果 +show_result_pyplot(model, img, result, show=True) +# 或者将可视化结果保存到图像文件夹中 +# 您可以修改分割 map 的透明度 (0, 1]. +show_result_pyplot(model, img, result, show=True, out_file='result.jpg', opacity=0.5) +# 在一段视频上测试并可视化分割结果 +video = mmcv.VideoReader('video.mp4') +for frame in video: + result = inference_model(model, frame) + show_result_pyplot(model, frame, result, wait_time=1) +``` -conda install pytorch=1.11.0 torchvision cudatoolkit=11.3 -c pytorch -pip install mmcv-full -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.11.0/index.html -git clone https://github.com/open-mmlab/mmsegmentation.git -cd mmsegmentation -pip install -e . 
# 或者 "python setup.py develop" +您可以修改上面的代码来测试单个图像或视频,这两个选项都可以验证安装是否成功。 -mkdir data -ln -s $DATA_ROOT data -``` +### 自定义安装 -#### Windows (有风险) +#### CUDA 版本 -这里便是一个完整安装 MMSegmentation 的脚本,使用 conda 并链接了数据集的路径(以您的数据集路径为 %DATA_ROOT% 来安装)。 -注意:它必须是一个绝对路径。 +当安装 PyTorch 的时候,您需要指定 CUDA 的版本, 如果您不确定选择哪个版本,请遵循我们的建议: -```shell -conda create -n open-mmlab python=3.10 -y -conda activate open-mmlab +- 对于基于 Ampere 的 NVIDIA GPUs, 例如 GeForce 30 系列和 NVIDIA A100, 必须要求是 CUDA 11. +- 对于更老的 NVIDIA GPUs, CUDA 11 is backward compatible, but CUDA 10.2 提供了更好的兼容性,以及更加的轻量化 -conda install pytorch=1.11.0 torchvision cudatoolkit=11.3 -c pytorch -set PATH=full\path\to\your\cpp\compiler;%PATH% -pip install mmcv +请确保 GPU 驱动满足最小的版本需求。详情请参考这个[表格](https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html#cuda-major-component-versions__table-cuda-toolkit-driver-versions) -git clone https://github.com/open-mmlab/mmsegmentation.git -cd mmsegmentation -pip install -e . # 或者 "python setup.py develop" +**注意:** +如果您按照我们的最佳实践,安装 CUDA 运行库就足够了,因为不需要 CUDA 代码在本地编译。 但是如果您希望从源码编译 MMCV 或者需要开发其他的 CUDA 算子,您需要从 NVIDIA 的[官网](https://developer.nvidia.com/cuda-downloads)安装完整的 CUDA 工具,同时它的版本需要与 PyTorch 的 CUDA 版本匹配。即 `conda install` 命令中指定的 cudatoolkit 版本。 + +#### 不使用 MIM 安装 MMCV + +MMCV 包含 C++ 和 CUDA 扩展,因此与 PyTorch 的依赖方式比较复杂。MIM 自动解决了这种依赖关系,使安装更容易。然而,MIM 也并不是必须的。 + +为了使用 pip 而不是 MIM 安装 MMCV, 请参考 [MMCV 安装指南](https://mmcv.readthedocs.io/en/latest/get_started/installation.html). 这需要手动指定一个基于 PyTorch 版本及其 CUDA 版本的 find-url. + +例如,以下命令可为 PyTorch 1.10.x and CUDA 11.3 安装 mmcv==2.0.0 -mklink /D data %DATA_ROOT% +```shell +pip install mmcv==2.0.0 -f https://download.openmmlab.com/mmcv/dist/cu113/torch1.10/index.html ``` -#### 使用多版本 MMSegmentation 进行开发 +#### 在仅有 CPU 的平台安装 + +MMSegmentation 可以在仅有 CPU 的版本上运行。在 CPU 模式,您可以训练(需要 MMCV 版本 >= 2.0.0),测试和推理模型。 + +#### 在 Google Colab 上安装 -训练和测试脚本已经修改了 `PYTHONPATH` 来确保使用当前路径的MMSegmentation。 +[Google Colab](https://research.google.com/) 通常已经安装了 PyTorch,因此我们仅需要通过以下命令安装 MMCV 和 MMSegmentation。 -为了使用当前环境默认安装的 MMSegmentation 而不是正在工作的 MMSegmentation,您可以在那些脚本里移除下面的内容: +**步骤 1.** 使用 [MIM](https://github.com/open-mmlab/mim) 安装 [MMCV](https://github.com/open-mmlab/mmcv) ```shell -PYTHONPATH="$(dirname $0)/..":$PYTHONPATH +!pip3 install openmim +!mim install mmengine +!mim install "mmcv>=2.0.0" ``` -## 验证 +**Step 2.** 通过源码安装 MMSegmentation + +```shell +!git clone https://github.com/open-mmlab/mmsegmentation.git +%cd mmsegmentation +!git checkout main +!pip install -e . +``` -为了验证 MMSegmentation 和它所需要的环境是否正确安装,我们可以使用样例 python 代码来初始化一个 segmentor 并推理一张 demo 图像。 +**Step 3.** 验证 ```python -from mmseg.apis import inference_model, init_model -import mmcv +import mmseg +print(mmseg.__version__) +# 示例输出: 1.0.0 +``` -config_file = 'configs/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes.py' -checkpoint_file = 'checkpoints/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth' +**注意:** +在 Jupyter 中, 感叹号 `!` 用于调用外部可执行命令,`%cd` 是一个 [magic command](https://ipython.readthedocs.io/en/stable/interactive/magics.html#magic-cd) 可以改变当前 python 的工作目录。 -# 从一个 config 配置文件和 checkpoint 文件里创建分割模型 -model = init_model(config_file, checkpoint_file, device='cuda:0') +### 通过 Docker 使用 MMSegmentation -# 测试一张样例图片并得到结果 -img = 'test.jpg' # 或者 img = mmcv.imread(img), 这将只加载图像一次. 
-result = inference_model(model, img) -# 在新的窗口里可视化结果 -model.show_result(img, result, show=True) -# 或者保存图片文件的可视化结果 -# 您可以改变 segmentation map 的不透明度(opacity),在(0, 1]之间。 -model.show_result(img, result, out_file='result.jpg', opacity=0.5) +我们提供了一个 [Dockerfile](https://github.com/open-mmlab/mmsegmentation/blob/master/docker/Dockerfile) 来建立映像。确保您的 [docker 版本](https://docs.docker.com/engine/install/) >=19.03. -# 测试一个视频并得到分割结果 -video = mmcv.VideoReader('video.mp4') -for frame in video: - result = inference_model(model, frame) - model.show_result(frame, result, wait_time=1) +```shell +# 通过 PyTorch 1.11, CUDA 11.3 建立映像 +# 如果您使用其他版本,修改 Dockerfile 即可 +docker build -t mmsegmentation docker/ ``` -当您完成 MMSegmentation 的安装时,上述代码应该可以成功运行。 - -我们还提供一个 demo 脚本去可视化单张图片。 +运行: ```shell -python demo/image_demo.py ${IMAGE_FILE} ${CONFIG_FILE} ${CHECKPOINT_FILE} [--device ${DEVICE_NAME}] [--palette-thr ${PALETTE}] +docker run --gpus all --shm-size=8g -it -v {DATA_DIR}:/mmsegmentation/data mmsegmentation ``` -样例: +### 可选依赖 + +#### 安装 GDAL + +[GDAL](https://gdal.org/) 是一个用于栅格和矢量地理空间数据格式的转换库。安装 GDAL 可以读取复杂格式和极大的遥感图像。 ```shell -python demo/image_demo.py demo/demo.jpg configs/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes.py \ - checkpoints/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth --device cuda:0 --palette cityscapes +conda install GDAL ``` -推理的 demo 文档可在此查询:[demo/inference_demo.ipynb](../demo/inference_demo.ipynb) 。 +## 问题解答 + +如果您在安装过程中遇到了其他问题,请第一时间查阅 [FAQ](notes/faq.md) 文件。如果没有找到答案,您也可以在 GitHub 上提出 [issue](https://github.com/open-mmlab/mmsegmentation/issues/new/choose) diff --git a/docs/zh_cn/imgs/qq_group_qrcode.jpg b/docs/zh_cn/imgs/qq_group_qrcode.jpg deleted file mode 100644 index 417347449f..0000000000 Binary files a/docs/zh_cn/imgs/qq_group_qrcode.jpg and /dev/null differ diff --git a/docs/zh_cn/imgs/seggroup_qrcode.jpg b/docs/zh_cn/imgs/seggroup_qrcode.jpg deleted file mode 100644 index 9684582ee1..0000000000 Binary files a/docs/zh_cn/imgs/seggroup_qrcode.jpg and /dev/null differ diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst index 4ac9211b62..ce5e49977d 100644 --- a/docs/zh_cn/index.rst +++ b/docs/zh_cn/index.rst @@ -8,36 +8,34 @@ get_started.md .. toctree:: - :maxdepth: 1 - :caption: 数据集准备 + :maxdepth: 2 + :caption: 用户指南 - dataset_prepare.md + user_guides/index.rst .. toctree:: - :maxdepth: 1 - :caption: 模型库 + :maxdepth: 2 + :caption: 进阶指南 - model_zoo.md - modelzoo_statistics.md + advanced_guides/index.rst .. toctree:: - :maxdepth: 2 - :caption: 快速启动 + :maxdepth: 1 + :caption: 迁移指引 - train.md - inference.md + migration/index.rst .. toctree:: - :maxdepth: 2 - :caption: 教程 + :caption: 接口文档(英文) - tutorials/index.rst + api.rst .. toctree:: - :maxdepth: 2 - :caption: 实用工具与脚本 + :maxdepth: 1 + :caption: 模型库 - useful_tools.md + model_zoo.md + modelzoo_statistics.md .. toctree:: :maxdepth: 2 @@ -51,10 +49,6 @@ switch_language.md -.. 
toctree:: - :caption: 接口文档(英文) - - api.rst Indices and tables ================== diff --git a/docs/zh_cn/inference.md b/docs/zh_cn/inference.md deleted file mode 100644 index a9bd9b04f6..0000000000 --- a/docs/zh_cn/inference.md +++ /dev/null @@ -1,127 +0,0 @@ -## 使用预训练模型推理 - -我们提供测试脚本来评估完整数据集(Cityscapes, PASCAL VOC, ADE20k 等)上的结果,同时为了使其他项目的整合更容易,也提供一些高级 API。 - -### 测试一个数据集 - -- 单卡 GPU -- CPU -- 单节点多卡 GPU -- 多节点 - -您可以使用以下命令来测试一个数据集。 - -```shell -# 单卡 GPU 测试 -python tools/test.py ${配置文件} ${检查点文件} [--out ${结果文件}] [--eval ${评估指标}] [--show] - -# CPU: 如果机器没有 GPU, 则跟上述单卡 GPU 测试一致 -# CPU: 如果机器有 GPU, 那么先禁用 GPU 再运行单 GPU 测试脚本 -export CUDA_VISIBLE_DEVICES=-1 # 禁用 GPU -python tools/test.py ${配置文件} ${检查点文件} [--out ${结果文件}] [--eval ${评估指标}] [--show] - -# 多卡GPU 测试 -./tools/dist_test.sh ${配置文件} ${检查点文件} ${GPU数目} [--out ${结果文件}] [--eval ${评估指标}] -``` - -可选参数: - -- `RESULT_FILE`: pickle 格式的输出结果的文件名,如果不专门指定,结果将不会被专门保存成文件。(MMseg v0.17 之后,args.out 将只会保存评估时的中间结果或者是分割图的保存路径。) -- `EVAL_METRICS`: 在结果里将被评估的指标。这主要取决于数据集, `mIoU` 对于所有数据集都可获得,像 Cityscapes 数据集可以通过 `cityscapes` 命令来专门评估,就像标准的 `mIoU`一样。 -- `--show`: 如果被指定,分割结果将会在一张图像里画出来并且在另一个窗口展示。它仅仅是用来调试与可视化,并且仅针对单卡 GPU 测试。请确认 GUI 在您的环境里可用,否则您也许会遇到报错 `cannot connect to X server` -- `--show-dir`: 如果被指定,分割结果将会在一张图像里画出来并且保存在指定文件夹里。它仅仅是用来调试与可视化,并且仅针对单卡GPU测试。使用该参数时,您的环境不需要 GUI。 -- `--eval-options`: 评估时的可选参数,当设置 `efficient_test=True` 时,它将会保存中间结果至本地文件里以节约 CPU 内存。请确认您本地硬盘有足够的存储空间(大于20GB)。(MMseg v0.17 之后,`efficient_test` 不再生效,我们重构了 test api,通过使用一种渐近式的方式来提升评估和保存结果的效率。) - -例子: - -假设您已经下载检查点文件至文件夹 `checkpoints/` 里。 - -1. 测试 PSPNet 并可视化结果。按下任何键会进行到下一张图 - - ```shell - python tools/test.py configs/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes.py \ - checkpoints/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth \ - --show - ``` - -2. 测试 PSPNet 并保存画出的图以便于之后的可视化 - - ```shell - python tools/test.py configs/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes.py \ - checkpoints/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth \ - --show-dir psp_r50_512x1024_40ki_cityscapes_results - ``` - -3. 在数据集 PASCAL VOC (不保存测试结果) 上测试 PSPNet 并评估 mIoU - - ```shell - python tools/test.py configs/pspnet/pspnet_r50-d8_512x1024_20k_voc12aug.py \ - checkpoints/pspnet_r50-d8_512x1024_20k_voc12aug_20200605_003338-c57ef100.pth \ - --eval mAP - ``` - -4. 使用4卡 GPU 测试 PSPNet,并且在标准 mIoU 和 cityscapes 指标里评估模型 - - ```shell - ./tools/dist_test.sh configs/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes.py \ - checkpoints/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth \ - 4 --out results.pkl --eval mIoU cityscapes - ``` - - 注意:在 cityscapes mIoU 和我们的 mIoU 指标会有一些差异 (~0.1%) 。因为 cityscapes 默认是根据类别样本数的多少进行加权平均,而我们对所有的数据集都是采取直接平均的方法来得到 mIoU。 - -5. 在 cityscapes 数据集上4卡 GPU 测试 PSPNet, 并生成 png 文件以便提交给官方评估服务器 - - 首先,在配置文件里添加内容: `configs/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes.py`, - - ```python - data = dict( - test=dict( - img_dir='leftImg8bit/test', - ann_dir='gtFine/test')) - ``` - - 随后,进行测试。 - - ```shell - ./tools/dist_test.sh configs/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes.py \ - checkpoints/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth \ - 4 --format-only --eval-options "imgfile_prefix=./pspnet_test_results" - ``` - - 您会在文件夹 `./pspnet_test_results` 里得到生成的 png 文件。 - 您也许可以运行 `zip -r results.zip pspnet_test_results/` 并提交 zip 文件给 [evaluation server](https://www.cityscapes-dataset.com/submit/) 。 - -6. 
在 Cityscapes 数据集上使用 CPU 高效内存选项来测试 DeeplabV3+ `mIoU` 指标 (没有保存测试结果) - - ```shell - python tools/test.py \ - configs/deeplabv3plus/deeplabv3plus_r18-d8_512x1024_80k_cityscapes.py \ - deeplabv3plus_r18-d8_512x1024_80k_cityscapes_20201226_080942-cff257fe.pth \ - --eval-options efficient_test=True \ - --eval mIoU - ``` - - 使用 `pmap` 可查看 CPU 内存情况, `efficient_test=True` 会使用约 2.25GB 的 CPU 内存, `efficient_test=False` 会使用约 11.06GB 的 CPU 内存。 这个可选参数可以节约很多 CPU 内存。(MMseg v0.17 之后, `efficient_test` 参数将不再生效, 我们使用了一种渐近的方式来更加有效快速地评估和保存结果。) - -7. 在 LoveDA 数据集上1卡 GPU 测试 PSPNet, 并生成 png 文件以便提交给官方评估服务器 - - 首先,在配置文件里添加内容: `configs/pspnet/pspnet_r50-d8_512x512_80k_loveda.py`, - - ```python - data = dict( - test=dict( - img_dir='img_dir/test', - ann_dir='ann_dir/test')) - ``` - - 随后,进行测试。 - - ```shell - python ./tools/test.py configs/pspnet/pspnet_r50-d8_512x512_80k_loveda.py \ - checkpoints/pspnet_r50-d8_512x512_80k_loveda_20211104_155728-88610f9f.pth \ - --format-only --eval-options "imgfile_prefix=./pspnet_test_results" - ``` - - 您会在文件夹 `./pspnet_test_results` 里得到生成的 png 文件。 - 您也许可以运行 `zip -r -j Results.zip pspnet_test_results/` 并提交 zip 文件给 [evaluation server](https://codalab.lisn.upsaclay.fr/competitions/421) 。 diff --git a/docs/zh_cn/migration/index.rst b/docs/zh_cn/migration/index.rst new file mode 100644 index 0000000000..854b9e61d0 --- /dev/null +++ b/docs/zh_cn/migration/index.rst @@ -0,0 +1,8 @@ +迁移 +*************** + +.. toctree:: + :maxdepth: 1 + + interface.md + package.md diff --git a/docs/zh_cn/migration/interface.md b/docs/zh_cn/migration/interface.md new file mode 100644 index 0000000000..42f91bf50a --- /dev/null +++ b/docs/zh_cn/migration/interface.md @@ -0,0 +1,523 @@ +# 从 MMSegmentation 0.x 迁移 + +## 引言 + +本指南介绍了 MMSegmentation 0.x 和 MMSegmentation1.x 在表现和 API 方面的基本区别,以及这些与迁移过程的关系。 + +## 新的依赖 + +MMSegmentation 1.x 依赖于一些新的软件包,您可以准备一个新的干净环境,然后根据[安装教程](../get_started.md)重新安装。 + +或手动安装以下软件包。 + +1. [MMEngine](https://github.com/open-mmlab/mmengine):MMEngine 是 OpenMMLab 2.0 架构的核心,我们将许多与计算机视觉无关的内容从 MMCV 拆分到 MMEngine 中。 + +2. [MMCV](https://github.com/open-mmlab/mmcv):OpenMMLab 的计算机视觉包。这不是一个新的依赖,但您需要将其升级到 **2.0.0** 或以上的版本。 + +3. [MMClassification](https://github.com/open-mmlab/mmclassification)(可选):OpenMMLab 的图像分类工具箱和基准。这不是一个新的依赖,但您需要将其升级到 **1.0.0rc6** 版本。 + +4. [MMDetection](https://github.com/open-mmlab/mmdetection)(可选): OpenMMLab 的目标检测工具箱和基准。这不是一个新的依赖,但您需要将其升级到 **3.0.0** 或以上的版本。 + +## 启动训练 + +OpenMMLab 2.0 的主要改进是发布了 MMEngine,它为启动训练任务的统一接口提供了通用且强大的执行器。 + +与 MMSeg 0.x 相比,MMSeg 1.x 在 `tools/train.py` 中提供的命令行参数更少 + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+| 功能 | 原版 | 新版 |
+| :--- | :--- | :--- |
+| 加载预训练模型 | `--load_from=$CHECKPOINT` | `--cfg-options load_from=$CHECKPOINT` |
+| 从特定检查点恢复训练 | `--resume-from=$CHECKPOINT` | `--resume=$CHECKPOINT` |
+| 从最新的检查点恢复训练 | `--auto-resume` | `--resume='auto'` |
+| 训练期间是否不评估检查点 | `--no-validate` | `--cfg-options val_cfg=None val_dataloader=None val_evaluator=None` |
+| 指定训练设备 | `--gpu-id=$DEVICE_ID` | - |
+| 是否为不同进程设置不同的种子 | `--diff-seed` | `--cfg-options randomness.diff_rank_seed=True` |
+| 是否为 CUDNN 后端设置确定性选项 | `--deterministic` | `--cfg-options randomness.deterministic=True` |
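+
+例如,在 1.x 中加载预训练权重并开启确定性训练,可以把上表中的多个新参数合并到同一个 `--cfg-options` 里(一个最小示意,配置文件路径与 `$CHECKPOINT` 均为占位):
+
+```shell
+python tools/train.py configs/pspnet/pspnet_r50-d8_4xb2-40k_cityscapes-512x1024.py \
+    --cfg-options load_from=$CHECKPOINT randomness.deterministic=True
+```
+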
+
+## 测试启动
+
+与训练启动类似,MMSegmentation 1.x 的测试启动脚本 `tools/test.py` 也仅提供关键的命令行参数,以下是测试启动方式的区别,更多关于测试启动的细节请参考[这里](../user_guides/4_train_test.md)。
+
+| 功能 | 0.x | 1.x |
+| :--- | :--- | :--- |
+| 指定评测指标 | `--eval mIoU` | `--cfg-options test_evaluator.type=IoUMetric` |
+| 测试时数据增强 | `--aug-test` | `--tta` |
+| 测试时是否只保存预测结果不计算评测指标 | `--format-only` | `--cfg-options test_evaluator.format_only=True` |
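+
+例如,测试时只保存预测结果而不计算评测指标,在 1.x 中可写为(路径均为占位符):
+
+```shell
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} \
+    --cfg-options test_evaluator.format_only=True
+```
+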
+
+## 配置文件
+
+### 模型设置
+
+`model.backbone`、`model.neck`、`model.decode_head` 和 `model.loss` 字段没有更改。
+
+添加 `model.data_preprocessor` 字段以配置 `DataPreProcessor`,包括:
+
+- `mean`(Sequence,可选):R、G、B 通道的像素平均值。默认为 None。
+
+- `std`(Sequence,可选):R、G、B 通道的像素标准差。默认为 None。
+
+- `size`(Sequence,可选):固定的填充大小。
+
+- `size_divisor`(int,可选):填充后的图像尺寸可以被该值整除。
+
+- `seg_pad_val`(float,可选):分割图的填充值。默认值:255。
+
+- `padding_mode`(str):填充类型。默认值:'constant'。
+
+  - constant:常量值填充,填充值由 `pad_val` 指定。
+
+- `bgr_to_rgb`(bool):是否将图像从 BGR 转换为 RGB。默认为 False。
+
+- `rgb_to_bgr`(bool):是否将图像从 RGB 转换为 BGR。默认为 False。
+
+**注:**
+有关详细信息,请参阅[模型文档](../advanced_guides/models.md)。
+
+### 数据集设置
+
+**data** 的更改:
+
+原版 `data` 字段被拆分为 `train_dataloader`、`val_dataloader` 和 `test_dataloader`,允许我们以细粒度配置它们。例如,您可以在训练和测试期间指定不同的采样器和批次大小。
+`samples_per_gpu` 重命名为 `batch_size`。
+`workers_per_gpu` 重命名为 `num_workers`。
+
+**原版**
+
+```python
+data = dict(
+    samples_per_gpu=4,
+    workers_per_gpu=4,
+    train=dict(...),
+    val=dict(...),
+    test=dict(...),
+)
+```
+
+**新版**
+
+```python
+train_dataloader = dict(
+    batch_size=4,
+    num_workers=4,
+    dataset=dict(...),
+    sampler=dict(type='DefaultSampler', shuffle=True)  # 必须
+)
+
+val_dataloader = dict(
+    batch_size=4,
+    num_workers=4,
+    dataset=dict(...),
+    sampler=dict(type='DefaultSampler', shuffle=False)  # 必须
+)
+
+test_dataloader = val_dataloader
+```
+
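+基于这种拆分,可以为不同的数据集划分单独调整加载参数,例如把验证/测试阶段的 batch size 设为 1(一个最小示意,取值仅作演示,请按实际显存与评估需求选择):
+
+```python
+val_dataloader = dict(
+    batch_size=1,  # 验证阶段常用单张图像,便于整图评估
+    num_workers=4,
+    dataset=dict(...),
+    sampler=dict(type='DefaultSampler', shuffle=False))
+test_dataloader = val_dataloader
+```
+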
+
+**数据增强变换流程**变更
+
+- 原始格式转换 **`ToTensor`**、**`ImageToTensor`**、**`Collect`** 组合为 [`PackSegInputs`](mmseg.datasets.transforms.PackSegInputs)
+- 我们不建议在数据集流程中执行 **`Normalize`** 和 **`Pad`**。请将其从流程中删除,并将相关参数设置在 `data_preprocessor` 字段中(见下文「训练流程」对比之后的示例)。
+- MMSeg 1.x 中原始的 **`Resize`** 已更改为 **`RandomResize`**,输入参数 `img_scale` 重命名为 `scale`,`keep_ratio` 的默认值修改为 False。
+- 原始的 `test_pipeline` 将单尺度和多尺度测试结合在一起,在 MMSeg 1.x 中,我们将其分为 `test_pipeline` 和 `tta_pipeline`。
+
+**注:**
+我们将一些数据转换工作转移到了数据预处理器中,如归一化,请参阅[文档](package.md)了解更多详细信息。
+
+训练流程
+
+**原版**
+
+```python
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', reduce_zero_label=True),
+    dict(type='Resize', img_scale=(2560, 640), ratio_range=(0.5, 2.0)),
+    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PhotoMetricDistortion'),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
+    dict(type='DefaultFormatBundle'),
+    dict(type='Collect', keys=['img', 'gt_semantic_seg']),
+]
+```
+
+**新版**
+
+```python
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations', reduce_zero_label=True),
+    dict(
+        type='RandomResize',
+        scale=(2560, 640),
+        ratio_range=(0.5, 2.0),
+        keep_ratio=True),
+    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PhotoMetricDistortion'),
+    dict(type='PackSegInputs')
+]
+```
+
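+承接上文「不建议在流程中执行 Normalize 和 Pad」的说明,下面给出一个把归一化与填充参数写入 `data_preprocessor` 的最小草图(`SegDataPreProcessor` 为 MMSeg 1.x 的数据预处理器类型;均值、方差取常见的 ImageNet 统计值,裁剪尺寸仅作演示):
+
+```python
+crop_size = (512, 512)  # 仅作演示,需与训练流程中的裁剪尺寸一致
+data_preprocessor = dict(
+    type='SegDataPreProcessor',
+    mean=[123.675, 116.28, 103.53],  # 需与所用预训练权重的统计值一致
+    std=[58.395, 57.12, 57.375],
+    bgr_to_rgb=True,
+    size=crop_size,        # 固定的填充大小
+    seg_pad_val=255)       # 分割图的填充值
+model = dict(data_preprocessor=data_preprocessor)  # 与模型的其余字段合并使用
+```
+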
+
+测试流程
+
+**原版**
+
+```python
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(2560, 640),
+        # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75],
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='RandomFlip'),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
+```
+
+**新版**
+
+```python
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='Resize', scale=(2560, 640), keep_ratio=True),
+    dict(type='LoadAnnotations', reduce_zero_label=True),
+    dict(type='PackSegInputs')
+]
+img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
+tta_pipeline = [
+    dict(type='LoadImageFromFile', backend_args=None),
+    dict(
+        type='TestTimeAug',
+        transforms=[
+            [
+                dict(type='Resize', scale_factor=r, keep_ratio=True)
+                for r in img_ratios
+            ],
+            [
+                dict(type='RandomFlip', prob=0., direction='horizontal'),
+                dict(type='RandomFlip', prob=1., direction='horizontal')
+            ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')]
+        ])
+]
+```
+
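+如上文「测试启动」一节的参数表所示,拆分出的 `tta_pipeline` 在测试时通过 `--tta` 开关启用,例如(路径均为占位符):
+
+```shell
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} --tta
+```
+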
+
+**`evaluation`** 中的更改:
+
+- **`evaluation`** 字段被拆分为 `val_evaluator` 和 `test_evaluator`,并且不再支持 `interval` 和 `save_best` 参数:
+  `interval` 已移动到 `train_cfg.val_interval`,`save_best` 已移动到 `default_hooks.checkpoint.save_best`,`pre_eval` 已删除。
+- `IoU` 已更改为 `IoUMetric`。
+
+**原版**
+
+```python
+evaluation = dict(interval=2000, metric='mIoU', pre_eval=True)
+```
+
+**新版**
+
+```python
+val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU'])
+test_evaluator = val_evaluator
+```
+
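+下面补充一个示意,展示 `interval` 与 `save_best` 迁移之后的写法(一个最小草图,迭代数仅作演示,`save_best='mIoU'` 假设以 mIoU 作为保存最优权重的指标):
+
+```python
+# 旧版 evaluation.interval 对应 train_cfg.val_interval
+train_cfg = dict(type='IterBasedTrainLoop', max_iters=160000, val_interval=2000)
+# 旧版 evaluation.save_best 对应 default_hooks.checkpoint.save_best
+default_hooks = dict(
+    checkpoint=dict(
+        type='CheckpointHook', by_epoch=False, interval=2000,
+        save_best='mIoU'))
+```
+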
+
+### Optimizer 和 Schedule 设置
+
+**`optimizer`** 和 **`optimizer_config`** 中的更改:
+
+- 现在我们使用 `optim_wrapper` 字段来指定优化过程的所有配置,`optimizer` 成为 `optim_wrapper` 的一个子字段。
+- `paramwise_cfg` 也改为 `optim_wrapper` 的子字段,不再放在 `optimizer` 中。
+- `optimizer_config` 已被删除,它的所有配置都移动到了 `optim_wrapper` 中。
+- `grad_clip` 重命名为 `clip_grad`。
+
+**原版**
+
+```python
+optimizer = dict(type='AdamW', lr=0.0001, weight_decay=0.0005)
+optimizer_config = dict(grad_clip=dict(max_norm=1, norm_type=2))
+```
+
+**新版**
+
+```python
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.0005),
+    clip_grad=dict(max_norm=1, norm_type=2))
+```
+
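+对于上文提到的 `paramwise_cfg`,迁移后的写法示意如下(一个最小草图,`lr_mult=0.1` 仅作演示,表示主干网络使用 0.1 倍学习率):
+
+```python
+optim_wrapper = dict(
+    type='OptimWrapper',
+    optimizer=dict(type='AdamW', lr=0.0001, weight_decay=0.0005),
+    # paramwise_cfg 现在与 optimizer 平级,都位于 optim_wrapper 内
+    paramwise_cfg=dict(custom_keys={'backbone': dict(lr_mult=0.1)}))
+```
+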
+
+**`lr_config`** 中的更改:
+
+- 我们将 `lr_config` 字段删除,并使用新的 `param_scheduler` 替代。
+- 我们删除了与 `warmup` 相关的参数,因为我们使用 scheduler 组合来实现该功能。
+
+新的 scheduler 组合机制非常灵活,您可以使用它来设计多种学习率/动量曲线。有关详细信息,请参见[教程](TODO)。
+
+**原版**
+
+```python
+lr_config = dict(
+    policy='poly',
+    warmup='linear',
+    warmup_iters=1500,
+    warmup_ratio=1e-6,
+    power=1.0,
+    min_lr=0.0,
+    by_epoch=False)
+```
+
+**新版**
+
+```python
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=1e-6, by_epoch=False, begin=0, end=1500),
+    dict(
+        type='PolyLR',
+        power=1.0,
+        begin=1500,
+        end=160000,
+        eta_min=0.0,
+        by_epoch=False,
+    )
+]
+```
+
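+得益于这种组合机制,更换学习率曲线只需替换列表中的调度器。例如把 poly 衰减换成余弦退火(一个最小草图,使用 MMEngine 的 `CosineAnnealingLR`,迭代数取值仅作演示):
+
+```python
+param_scheduler = [
+    # 前 1500 次迭代线性预热
+    dict(type='LinearLR', start_factor=1e-6, by_epoch=False, begin=0, end=1500),
+    # 之后按余弦曲线衰减到 eta_min
+    dict(type='CosineAnnealingLR', begin=1500, end=160000, eta_min=0.0, by_epoch=False)
+]
+```
+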
+
+**`runner`** 中的更改:
+
+原版 `runner` 字段中的大多数配置被移动到 `train_cfg`、`val_cfg` 和 `test_cfg` 中,以在训练、验证和测试中配置 loop。
+
+**原版**
+
+```python
+runner = dict(type='IterBasedRunner', max_iters=20000)
+```
+
+**新版**
+
+```python
+# `val_interval` 是旧版本的 `evaluation.interval`。
+train_cfg = dict(type='IterBasedTrainLoop', max_iters=20000, val_interval=2000)
+val_cfg = dict(type='ValLoop')  # 使用默认的验证循环。
+test_cfg = dict(type='TestLoop')  # 使用默认的测试循环。
+```
+
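+若原版使用的是 `EpochBasedRunner`,迁移方式类似,改用按 epoch 的训练循环即可(一个最小草图,epoch 数仅作演示):
+
+```python
+train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=100, val_interval=1)
+val_cfg = dict(type='ValLoop')
+test_cfg = dict(type='TestLoop')
+```
+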
+
+事实上,在 OpenMMLab 2.0 中,我们引入了 `Loop` 来控制训练、验证和测试中的行为,`Runner` 的功能也相应发生了变化。您可以在 [MMEngine](https://github.com/open-mmlab/mmengine/) 的[执行器教程](https://github.com/open-mmlab/mmengine/blob/main/docs/zh_cn/design/runner.md)中找到更多的详细信息。
+
+### 运行时设置
+
+**`checkpoint_config`** 和 **`log_config`** 中的更改:
+
+`checkpoint_config` 被移动到 `default_hooks.checkpoint` 中,`log_config` 被移动到 `default_hooks.logger` 中。
+并且我们将许多钩子设置从脚本代码移动到运行时配置的 `default_hooks` 字段中。
+
+```python
+default_hooks = dict(
+    # 记录每次迭代的时间。
+    timer=dict(type='IterTimerHook'),
+
+    # 每 50 次迭代打印一次日志。
+    logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False),
+
+    # 启用参数调度程序。
+    param_scheduler=dict(type='ParamSchedulerHook'),
+
+    # 每 2000 次迭代保存一次检查点。
+    checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=2000),
+
+    # 在分布式环境中设置采样器种子。
+    sampler_seed=dict(type='DistSamplerSeedHook'),
+
+    # 验证结果可视化。
+    visualization=dict(type='SegVisualizationHook'))
+```
+
+此外,我们将原版 logger 拆分为 logger 和 visualizer:logger 用于记录信息,visualizer 则负责把日志信息展示到不同的后端,如终端和 TensorBoard。
+
+**原版**
+
+```python
+log_config = dict(
+    interval=100,
+    hooks=[
+        dict(type='TextLoggerHook'),
+        dict(type='TensorboardLoggerHook'),
+    ])
+```
+
+**新版**
+
+```python
+default_hooks = dict(
+    ...
+    logger=dict(type='LoggerHook', interval=100),
+)
+vis_backends = [dict(type='LocalVisBackend'),
+                dict(type='TensorboardVisBackend')]
+visualizer = dict(
+    type='SegLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+```
+
+
+**`load_from`** 和 **`resume_from`** 中的更改:
+
+- 删除 `resume_from`。我们使用 `resume` 和 `load_from` 来替换它。
+  - 如果 `resume=True` 且 `load_from` 为 **not None**,则从 `load_from` 中的检查点恢复训练。
+  - 如果 `resume=True` 且 `load_from` 为 **None**,则尝试从工作目录中的最新检查点恢复。
+  - 如果 `resume=False` 且 `load_from` 为 **not None**,则只加载检查点,而不继续训练。
+  - 如果 `resume=False` 且 `load_from` 为 **None**,则不加载或恢复。
+
+**`dist_params`** 中的更改:`dist_params` 字段现在是 `env_cfg` 的子字段,并且 `env_cfg` 中还有一些新的配置。
+
+```python
+env_cfg = dict(
+    # 是否启用 cudnn_benchmark
+    cudnn_benchmark=False,
+
+    # 设置多进程参数
+    mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
+
+    # 设置分布式参数
+    dist_cfg=dict(backend='nccl'),
+)
+```
+
+**`workflow`** 的改动:`workflow` 相关功能被删除。
+
+新字段 **`visualizer`**:visualizer 是 OpenMMLab 2.0 体系结构中的新设计。我们在 runner 中使用 visualizer 实例来处理结果和日志可视化,并保存到不同的后端。更多详细信息,请参阅[可视化教程](../user_guides/visualization.md)。
+
+新字段 **`default_scope`**:搜索所有注册模块的起点。MMSegmentation 中的 `default_scope` 为 `mmseg`。请参见[注册器教程](https://github.com/open-mmlab/mmengine/blob/main/docs/zh_cn/advanced_tutorials/registry.md)了解更多详情。 diff --git a/docs/zh_cn/migration/package.md b/docs/zh_cn/migration/package.md new file mode 100644 index 0000000000..19e5f18c9c --- /dev/null +++ b/docs/zh_cn/migration/package.md @@ -0,0 +1,113 @@
+# 包结构更改
+
+本节介绍您可能关心的 MMSeg 0.x 和 1.x 之间包结构的变化。
+
+| MMSegmentation 0.x | MMSegmentation 1.x |
+| :----------------: | :----------------: |
+| mmseg.api          | mmseg.api          |
+| - mmseg.core       | + mmseg.engine     |
+| mmseg.datasets     | mmseg.datasets     |
+| mmseg.models       | mmseg.models       |
+| - mmseg.ops        | + mmseg.structure  |
+| mmseg.utils        | mmseg.utils        |
+|                    | + mmseg.evaluation |
+|                    | + mmseg.registry   |
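+
+以最常用的推理接口为例,下文「修改的包」一节所述的重命名在代码中大致对应如下(一个最小示意,配置与权重文件名仅为占位,取自本文前述的下载示例):
+
+```python
+# MMSegmentation 0.x 的写法:
+# from mmseg.apis import init_segmentor, inference_segmentor
+# model = init_segmentor(config_file, checkpoint_file)
+# result = inference_segmentor(model, img)
+
+# MMSegmentation 1.x 的写法:
+from mmseg.apis import init_model, inference_model
+
+config_file = 'pspnet_r50-d8_4xb2-40k_cityscapes-512x1024.py'  # 仅为占位示例
+checkpoint_file = 'pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth'
+img = 'demo/demo.png'
+
+model = init_model(config_file, checkpoint_file)
+result = inference_model(model, img)
+```
+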
+ +## 已删除的包 + +### `mmseg.core` + +在 OpenMMLab 2.0 中,`core` 包已被删除。`core` 的 `hooks` 和 `optimizers` 被移动到了 `mmseg.engine` 中,而 `core` 中的 `evaluation` 目前是 mmseg.evaluation。 + +## `mmseg.ops` + +`ops` 包含 `encoding` 和 `wrappers`,它们被移到了 `mmseg.models.utils` 中。 + +## 增加的包 + +### `mmseg.engine` + +OpenMMLab 2.0 增加了一个新的深度学习训练基础库 MMEngine。它是所有 OpenMMLab 代码库的训练引擎。 +mmseg 的 `engine` 包是一些用于语义分割任务的定制模块,如 `SegVisualizationHook` 用于可视化分割掩膜。 + +### `mmseg.structure` + +在 OpenMMLab 2.0 中,我们为计算机视觉任务设计了数据结构,在 mmseg 中,我们在 `structure` 包中实现了 `SegDataSample`。 + +### `mmseg.evaluation` + +我们将所有评估指标都移动到了 `mmseg.evaluation` 中。 + +### `mmseg.registry` + +我们将 MMSegmentation 中所有类型模块的注册实现移动到 `mmseg.registry` 中。 + +## 修改的包 + +### `mmseg.apis` + +OpenMMLab 2.0 尝试支持计算机视觉的多任务统一接口,并发布了更强的 [`Runner`](https://github.com/open-mmlab/mmengine/blob/main/docs/zh_cn/design/runner.md),因此 MMSeg 1.x 删除了 `train.py` 和 `test.py` 中的模块,并将 `init_segmentor` 重命名为 `init_model`,将 `inference_segmentor` 重命名为 `inference_model`。 + +以下是 `mmseg.apis` 的更改: + +| 函数 | 变化 | +| :-------------------: | :--------------------------------------------- | +| `init_segmentor` | 重命名为 `init_model` | +| `inference_segmentor` | 重命名为 `inference_model` | +| `show_result_pyplot` | 基于 `SegLocalVisualizer` 实现 | +| `train_model` | 删除,使用 `runner.train` 训练。 | +| `multi_gpu_test` | 删除,使用 `runner.test` 测试。 | +| `single_gpu_test` | 删除,使用 `runner.test` 测试。 | +| `set_random_seed` | 删除,使用 `mmengine.runner.set_random_seed`。 | +| `init_random_seed` | 删除,使用 `mmengine.dist.sync_random_seed`。 | + +### `mmseg.datasets` + +OpenMMLab 2.0 将 `BaseDataset` 定义为数据集的函数和接口,MMSegmentation 1.x 也遵循此协议,并定义了从 `BaseDataset` 继承的 `BaseSegDataset`。MMCV 2.x 收集多种任务的通用数据转换,例如分类、检测、分割,因此 MMSegmentation 1.x 使用这些数据转换并将其从 mmseg.dataset 中删除。 + +| 包/模块 | 更改 | +| :-------------------: | :----------------------------------------------------------------------------------- | +| `mmseg.pipelines` | 移动到 `mmcv.transforms` 中 | +| `mmseg.sampler` | 移动到 `mmengine.dataset.sampler` 中 | +| `CustomDataset` | 重命名为 `BaseSegDataset` 并从 MMEngine 中的 `BaseDataset` 继承 | +| `DefaultFormatBundle` | 替换为 `PackSegInputs` | +| `LoadImageFromFile` | 移动到 `mmcv.transforms.LoadImageFromFile` 中 | +| `LoadAnnotations` | 移动到 `mmcv.transforms.LoadAnnotations` 中 | +| `Resize` | 移动到 `mmcv.transforms` 中并拆分为 `Resize`,`RandomResize` 和 `RandomChoiceResize` | +| `RandomFlip` | 移动到 `mmcv.transforms.RandomFlip` 中 | +| `Pad` | 移动到 `mmcv.transforms.Pad` 中 | +| `Normalize` | 移动到 `mmcv.transforms.Normalize` 中 | +| `Compose` | 移动到 `mmcv.transforms.Compose` 中 | +| `ImageToTensor` | 移动到 `mmcv.transforms.ImageToTensor` 中 | + +### `mmseg.models` + +`models` 没有太大变化,只是从以前的 `mmseg.ops` 添加了 `encoding` 和 `wrappers` diff --git a/docs/zh_cn/model_zoo.md b/docs/zh_cn/model_zoo.md index b9a0986517..bd5721579f 100644 --- a/docs/zh_cn/model_zoo.md +++ b/docs/zh_cn/model_zoo.md @@ -121,7 +121,7 @@ 请参考 [Mixed Precision (FP16) Training 在 BiSeNetV2 训练的样例](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv2/bisenetv2_fcn_fp16_4x4_1024x1024_160k_cityscapes.py) 获得详细信息。 -## 速度标定 +## 速度标定(待更新) ### 硬件 diff --git a/docs/zh_cn/modelzoo_statistics.md b/docs/zh_cn/modelzoo_statistics.md new file mode 100644 index 0000000000..b057575a25 --- /dev/null +++ b/docs/zh_cn/modelzoo_statistics.md @@ -0,0 +1,102 @@ +# 模型库统计数据 + +- 论文数量: 47 + + - ALGORITHM: 36 + - BACKBONE: 11 + +- 模型数量: 612 + + - \[ALGORITHM\] [ANN](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ann) (16 ckpts) + + - \[ALGORITHM\] 
[APCNet](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/apcnet) (12 ckpts) + + - \[BACKBONE\] [BEiT](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/beit) (2 ckpts) + + - \[ALGORITHM\] [BiSeNetV1](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv1) (11 ckpts) + + - \[ALGORITHM\] [BiSeNetV2](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/bisenetv2) (4 ckpts) + + - \[ALGORITHM\] [CCNet](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ccnet) (16 ckpts) + + - \[ALGORITHM\] [CGNet](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/cgnet) (2 ckpts) + + - \[BACKBONE\] [ConvNeXt](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/convnext) (6 ckpts) + + - \[ALGORITHM\] [DANet](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/danet) (16 ckpts) + + - \[ALGORITHM\] [DeepLabV3](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3) (41 ckpts) + + - \[ALGORITHM\] [DeepLabV3+](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/deeplabv3plus) (42 ckpts) + + - \[ALGORITHM\] [DMNet](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dmnet) (12 ckpts) + + - \[ALGORITHM\] [DNLNet](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dnlnet) (12 ckpts) + + - \[ALGORITHM\] [DPT](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/dpt) (1 ckpts) + + - \[ALGORITHM\] [EMANet](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/emanet) (4 ckpts) + + - \[ALGORITHM\] [EncNet](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/encnet) (12 ckpts) + + - \[ALGORITHM\] [ERFNet](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/erfnet) (1 ckpts) + + - \[ALGORITHM\] [FastFCN](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fastfcn) (12 ckpts) + + - \[ALGORITHM\] [Fast-SCNN](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fastscnn) (1 ckpts) + + - \[ALGORITHM\] [FCN](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/fcn) (41 ckpts) + + - \[ALGORITHM\] [GCNet](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/gcnet) (16 ckpts) + + - \[BACKBONE\] [HRNet](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/hrnet) (37 ckpts) + + - \[ALGORITHM\] [ICNet](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/icnet) (12 ckpts) + + - \[ALGORITHM\] [ISANet](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/isanet) (16 ckpts) + + - \[ALGORITHM\] [K-Net](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/knet) (7 ckpts) + + - \[BACKBONE\] [MAE](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/mae) (1 ckpts) + + - \[ALGORITHM\] [Mask2Former](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/mask2former) (13 ckpts) + + - \[ALGORITHM\] [MaskFormer](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/maskformer) (4 ckpts) + + - \[BACKBONE\] [MobileNetV2](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/mobilenet_v2) (8 ckpts) + + - \[BACKBONE\] [MobileNetV3](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/mobilenet_v3) (4 ckpts) + + - \[ALGORITHM\] [NonLocal Net](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/nonlocal_net) (16 ckpts) + + - \[ALGORITHM\] [OCRNet](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/ocrnet) (24 ckpts) + + - \[ALGORITHM\] 
[PointRend](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/point_rend) (4 ckpts) + + - \[BACKBONE\] [PoolFormer](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/poolformer) (5 ckpts) + + - \[ALGORITHM\] [PSANet](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/psanet) (16 ckpts) + + - \[ALGORITHM\] [PSPNet](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/pspnet) (54 ckpts) + + - \[BACKBONE\] [ResNeSt](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/resnest) (8 ckpts) + + - \[ALGORITHM\] [SegFormer](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segformer) (13 ckpts) + + - \[ALGORITHM\] [Segmenter](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/segmenter) (5 ckpts) + + - \[ALGORITHM\] [Semantic FPN](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/sem_fpn) (4 ckpts) + + - \[ALGORITHM\] [SETR](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/setr) (7 ckpts) + + - \[ALGORITHM\] [STDC](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/stdc) (4 ckpts) + + - \[BACKBONE\] [Swin Transformer](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/swin) (6 ckpts) + + - \[BACKBONE\] [Twins](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/twins) (12 ckpts) + + - \[ALGORITHM\] [UNet](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/unet) (25 ckpts) + + - \[ALGORITHM\] [UPerNet](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/upernet) (16 ckpts) + + - \[BACKBONE\] [Vision Transformer](https://github.com/open-mmlab/mmsegmentation/blob/master/configs/vit) (11 ckpts) diff --git a/docs/zh_cn/notes/faq.md b/docs/zh_cn/notes/faq.md new file mode 100644 index 0000000000..aa99c259c8 --- /dev/null +++ b/docs/zh_cn/notes/faq.md @@ -0,0 +1,125 @@ +# 常见问题解答(FAQ) + +我们在这里列出了使用时的一些常见问题及其相应的解决方案。 如果您发现有一些问题被遗漏,请随时提 PR 丰富这个列表。 如果您无法在此获得帮助,请使用 [issue 模板](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/.github/ISSUE_TEMPLATE/error-report.md/)创建问题,但是请在模板中填写所有必填信息,这有助于我们更快定位问题。 + +## 安装 + +兼容的 MMSegmentation 和 MMCV 版本如下。请安装正确版本的 MMCV 以避免安装问题。 + +| MMSegmentation version | MMCV version | MMEngine version | MMClassification (optional) version | MMDetection (optional) version | +| :--------------------: | :----------------------------: | :---------------: | :---------------------------------: | :----------------------------: | +| dev-1.x branch | mmcv >= 2.0.0 | MMEngine >= 0.7.4 | mmpretrain>=1.0.0rc7 | mmdet >= 3.0.0 | +| main branch | mmcv >= 2.0.0 | MMEngine >= 0.7.4 | mmpretrain>=1.0.0rc7 | mmdet >= 3.0.0 | +| 1.2.2 | mmcv >= 2.0.0 | MMEngine >= 0.7.4 | mmpretrain>=1.0.0rc7 | mmdet >= 3.0.0 | +| 1.2.1 | mmcv >= 2.0.0 | MMEngine >= 0.7.4 | mmpretrain>=1.0.0rc7 | mmdet >= 3.0.0 | +| 1.2.0 | mmcv >= 2.0.0 | MMEngine >= 0.7.4 | mmpretrain>=1.0.0rc7 | mmdet >= 3.0.0 | +| 1.1.2 | mmcv >= 2.0.0 | MMEngine >= 0.7.4 | mmpretrain>=1.0.0rc7 | mmdet >= 3.0.0 | +| 1.1.1 | mmcv >= 2.0.0 | MMEngine >= 0.7.4 | mmpretrain>=1.0.0rc7 | mmdet >= 3.0.0 | +| 1.1.0 | mmcv >= 2.0.0 | MMEngine >= 0.7.4 | mmpretrain>=1.0.0rc7 | mmdet >= 3.0.0 | +| 1.0.0 | mmcv >= 2.0.0rc4 | MMEngine >= 0.7.1 | mmcls==1.0.0rc6 | mmdet >= 3.0.0 | +| 1.0.0rc6 | mmcv >= 2.0.0rc4 | MMEngine >= 0.5.0 | mmcls>=1.0.0rc0 | mmdet >= 3.0.0rc6 | +| 1.0.0rc5 | mmcv >= 2.0.0rc4 | MMEngine >= 0.2.0 | mmcls>=1.0.0rc0 | mmdet>=3.0.0rc6 | +| 1.0.0rc4 | mmcv == 2.0.0rc3 | MMEngine >= 0.1.0 | mmcls>=1.0.0rc0 | mmdet>=3.0.0rc4, \<=3.0.0rc5 | +| 
1.0.0rc3 | mmcv == 2.0.0rc3 | MMEngine >= 0.1.0 | mmcls>=1.0.0rc0 | mmdet>=3.0.0rc4, \<=3.0.0rc5 | +| 1.0.0rc2 | mmcv == 2.0.0rc3 | MMEngine >= 0.1.0 | mmcls>=1.0.0rc0 | mmdet>=3.0.0rc4, \<=3.0.0rc5 | +| 1.0.0rc1 | mmcv >= 2.0.0rc1, \<=2.0.0rc3> | MMEngine >= 0.1.0 | mmcls>=1.0.0rc0 | Not required | +| 1.0.0rc0 | mmcv >= 2.0.0rc1, \<=2.0.0rc3> | MMEngine >= 0.1.0 | mmcls>=1.0.0rc0 | Not required | + +如果您已经安装了版本不合适的 mmcv,请先运行`pip uninstall mmcv`卸载已安装的 mmcv,如您先前安装的为 mmcv-full(存在于 OpenMMLab 1.x),请运行`pip uninstall mmcv-full`进行卸载。 + +- 如出现 "No module named 'mmcv'" + 1. 使用`pip uninstall mmcv`卸载环境中现有的 mmcv + 2. 按照[安装说明](../get_started.md)安装对应的 mmcv + +## 如何获知模型训练时需要的显卡数量 + +- 看模型的 config 文件命名。可以参考[了解配置文件](../user_guides/1_config.md)中的`配置文件命名风格`部分。比如,对于名字为`segformer_mit-b0_8xb1-160k_cityscapes-1024x1024.py`的 config 文件,`8xb1`代表训练其对应的模型需要的卡数为 8,每张卡中的 batch size 为 1。 +- 看模型的 log 文件。点开该模型的 log 文件,并在其中搜索`nGPU`,在`nGPU`后的数字个数即训练时所需的卡数。比如,在 log 文件中搜索`nGPU`得到`nGPU 0,1,2,3,4,5,6,7`的记录,则说明训练该模型需要使用八张卡。 + +## auxiliary head 是什么 + +简单来说,这是一个提高准确率的深度监督技术。在训练阶段,`decode_head`用于输出语义分割的结果,`auxiliary_head` 只是增加了一个辅助损失,其产生的分割结果对你的模型结果没有影响,仅在在训练中起作用。您可以阅读这篇[论文](https://arxiv.org/pdf/1612.01105.pdf)了解更多信息。 + +## 运行测试脚本时如何输出绘制分割掩膜的图像 + +在测试脚本中,我们提供了`--out`参数来控制是否输出保存预测的分割掩膜图像。您可以运行以下命令输出测试结果: + +```shell +python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} --out ${OUTPUT_DIR} +``` + +更多用例细节可查阅[文档](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/docs/zh_cn/user_guides/4_train_test.md#%E6%B5%8B%E8%AF%95%E5%B9%B6%E4%BF%9D%E5%AD%98%E5%88%86%E5%89%B2%E7%BB%93%E6%9E%9C),[PR #2712](https://github.com/open-mmlab/mmsegmentation/pull/2712) 以及[迁移文档](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/docs/zh_cn/migration/interface.md#%E6%B5%8B%E8%AF%95%E5%90%AF%E5%8A%A8)了解相关说明。 + +## 如何处理二值分割任务? + +MMSegmentation 使用 `num_classes` 和 `out_channels` 来控制模型最后一层 `self.conv_seg` 的输出。更多细节可以参考 [这里](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/mmseg/models/decode_heads/decode_head.py)。 + +`num_classes` 应该和数据集本身类别个数一致,当是二值分割时,数据集只有前景和背景两类,所以 `num_classes` 为 2. `out_channels` 控制模型最后一层的输出的通道数,通常和 `num_classes` 相等,但当二值分割时候,可以有两种处理方法, 分别是: + +- 设置 `out_channels=2`,在训练时以 Cross Entropy Loss 作为损失函数,在推理时使用 `F.softmax()` 归一化 logits 值,然后通过 `argmax()` 得到每个像素的预测结果。 + +- 设置 `out_channels=1`,在训练时以 Binary Cross Entropy Loss 作为损失函数,在推理时使用 `F.sigmoid()` 和 `threshold` 得到预测结果,`threshold` 默认为 0.3。 + +对于实现上述两种计算二值分割的方法,需要在 `decode_head` 和 `auxiliary_head` 的配置里修改。下面是对样例 [pspnet_unet_s5-d16.py](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/_base_/models/pspnet_unet_s5-d16.py) 做出的对应修改。 + +- (1) `num_classes=2`, `out_channels=2` 并在 `CrossEntropyLoss` 里面设置 `use_sigmoid=False`。 + +```python +decode_head=dict( + type='PSPHead', + in_channels=64, + in_index=4, + num_classes=2, + out_channels=2, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), +auxiliary_head=dict( + type='FCNHead', + in_channels=128, + in_index=3, + num_classes=2, + out_channels=2, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), +``` + +- (2) `num_classes=2`, `out_channels=1` 并在 `CrossEntropyLoss` 里面设置 `use_sigmoid=True`. 
+ +```python +decode_head=dict( + type='PSPHead', + in_channels=64, + in_index=4, + num_classes=2, + out_channels=1, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0)), +auxiliary_head=dict( + type='FCNHead', + in_channels=128, + in_index=3, + num_classes=2, + out_channels=1, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=True, loss_weight=0.4)), +``` + +## `reduce_zero_label` 的作用 + +数据集中 `reduce_zero_label` 参数类型为布尔类型,默认为 False,它的功能是为了忽略数据集 label 0。具体做法是将 label 0 改为 255,其余 label 相应编号减 1,同时 decode head 里将 255 设为 ignore index,即不参与 loss 计算。 +以下是 `reduce_zero_label` 具体实现逻辑: + +```python +if self.reduce_zero_label: + # avoid using underflow conversion + gt_semantic_seg[gt_semantic_seg == 0] = 255 + gt_semantic_seg = gt_semantic_seg - 1 + gt_semantic_seg[gt_semantic_seg == 254] = 255 +``` + +关于您的数据集是否需要使用 reduce_zero_label,有以下两类情况: + +- 例如在 [Potsdam](https://github.com/open-mmlab/mmsegmentation/blob/1.x/docs/en/user_guides/2_dataset_prepare.md#isprs-potsdam) 数据集上,有 0-不透水面、1-建筑、2-低矮植被、3-树、4-汽车、5-杂乱,六类。但该数据集提供了两种 RGB 标签,一种为图像边缘处有黑色像素的标签,另一种是没有黑色边缘的标签。对于有黑色边缘的标签,在 [dataset_converters.py](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/tools/dataset_converters/potsdam.py)中,其将黑色边缘转换为 label 0,其余标签分别为 1-不透水面、2-建筑、3-低矮植被、4-树、5-汽车、6-杂乱,那么此时,就应该在数据集 [potsdam.py](https://github.com/open-mmlab/mmsegmentation/blob/ff95416c3b5ce8d62b9289f743531398efce534f/mmseg/datasets/potsdam.py#L23) 中将`reduce_zero_label=True`。如果使用的是没有黑色边缘的标签,那么 mask label 中只有 0-5,此时就应该使`reduce_zero_label=False`。需要结合您的实际情况来使用。 +- 例如在第 0 类为 background 类别的数据集上,如果您最终是需要将背景和您的其余类别分开时,是不需要使用`reduce_zero_label`的,此时在数据集中应该将其设置为`reduce_zero_label=False` + +**注意:** 使用 `reduce_zero_label` 请确认数据集原始类别个数,如果只有两类,需要关闭 `reduce_zero_label` 即设置 `reduce_zero_label=False`。 diff --git a/docs/zh_cn/overview.md b/docs/zh_cn/overview.md new file mode 100644 index 0000000000..ed147956d0 --- /dev/null +++ b/docs/zh_cn/overview.md @@ -0,0 +1,75 @@ +# 概述 + +本章节向您介绍 MMSegmentation 框架以及语义分割相关的基本概念。我们还提供了关于 MMSegmentation 的详细教程链接。 + +## 什么是语义分割? + +语义分割是将图像中属于同一目标类别的部分聚类在一起的任务。它也是一种像素级预测任务,因为图像中的每一个像素都将根据类别进行分类。该任务的一些示例基准有 [Cityscapes](https://www.cityscapes-dataset.com/benchmarks/), [PASCAL VOC](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/) 和 [ADE20K](https://groups.csail.mit.edu/vision/datasets/ADE20K/) 。通常用平均交并比 (Mean IoU) 和像素准确率 (Pixel Accuracy) 这两个指标来评估模型。 + +## 什么是 MMSegmentation? + +MMSegmentation 是一个工具箱,它为语义分割任务的统一实现和模型评估提供了一个框架,并且高质量实现了常用的语义分割方法和数据集。 + +MMSeg 主要包含了 apis, structures, datasets, models, engine, evaluation 和 visualization 这七个主要部分。 + +- **apis** 提供了模型推理的高级api + +- **structures** 提供了分割任务的数据结构 `SegDataSample` + +- **datasets** 支持用于语义分割的多种数据集 + + - **transforms** 包含多种数据增强变换 + +- **models** 是分割器最重要的部分,包含了分割器的不同组件 + + - **segmentors** 定义了所有分割模型类 + - **data_preprocessors** 用于预处理模型的输入数据 + - **backbones** 包含各种骨干网络,可将图像映射为特征图 + - **necks** 包含各种模型颈部组件,用于连接分割头和骨干网络 + - **decode_heads** 包含各种分割头,将特征图作为输入,并预测分割结果 + - **losses** 包含各种损失函数 + +- **engine** 是运行时组件的一部分,扩展了 [MMEngine](https://github.com/open-mmlab/mmengine) 的功能 + + - **optimizers** 提供了优化器和优化器封装 + - **hooks** 提供了 runner 的各种钩子 + +- **evaluation** 提供了评估模型性能的不同指标 + +- **visualization** 分割结果的可视化工具 + +## 如何使用本指南? + +以下是详细步骤,将带您一步步学习如何使用 MMSegmentation : + +1. 有关安装说明,请参阅 [开始你的第一步](get_started.md)。 + +2. 
对于初学者来说,MMSegmentation 是开始语义分割之旅的最好选择,因为这里实现了许多 SOTA 模型以及经典的模型 [model](model_zoo.md) 。另外,将各类组件和高级 API 結合使用,可以更便捷的执行分割任务。关于 MMSegmentation 的基本用法,请参考下面的教程: + + - [配置](user_guides/1_config.md) + - [数据预处理](user_guides/2_dataset_prepare.md) + - [推理](user_guides/3_inference.md) + - [训练和测试](user_guides/4_train_test.md) + +3. 如果你想了解 MMSegmentation 工作的基本类和功能,请参考下面的教程来深入研究: + + - [数据流](advanced_guides/data_flow.md) + - [结构](advanced_guides/structures.md) + - [模型](advanced_guides/models.md) + - [数据集](advanced_guides/datasets.md) + - [评估](advanced_guides/evaluation.md) + +4. MMSegmentation 也为用户自定义和一些前沿的研究提供了教程,请参考下面的教程来建立你自己的分割项目: + + - [添加新的模型](advanced_guides/add_models.md) + - [添加新的数据集](advanced_guides/add_datasets.md) + - [添加新的 transform](advanced_guides/add_transforms.md) + - [自定义 runtime](advanced_guides/customize_runtime.md) + +5. 如果您更熟悉 MMSegmentation v0.x , 以下是 MMSegmentation v0.x 迁移到 v1.x 的文档 + + - [迁移](migration/index.rst) + +## 参考来源 + +- https://paperswithcode.com/task/semantic-segmentation/codeless#task-home diff --git a/docs/zh_cn/stat.py b/docs/zh_cn/stat.py index b3a1d73069..7a86302e32 100755 --- a/docs/zh_cn/stat.py +++ b/docs/zh_cn/stat.py @@ -18,13 +18,15 @@ for f in files: url = osp.dirname(f.replace('../../', url_prefix)) - with open(f, 'r') as content_file: + with open(f) as content_file: content = content_file.read() title = content.split('\n')[0].replace('#', '').strip() - ckpts = set(x.lower().strip() - for x in re.findall(r'https?://download.*\.pth', content) - if 'mmsegmentation' in x) + ckpts = { + x.lower().strip() + for x in re.findall(r'https?://download.*\.pth', content) + if 'mmsegmentation' in x + } if len(ckpts) == 0: continue @@ -34,7 +36,7 @@ assert len(_papertype) > 0 papertype = _papertype[0] - paper = set([(papertype, title)]) + paper = {(papertype, title)} titles.append(title) num_ckpts += len(ckpts) diff --git a/docs/zh_cn/train.md b/docs/zh_cn/train.md deleted file mode 100644 index a54f28f03f..0000000000 --- a/docs/zh_cn/train.md +++ /dev/null @@ -1,159 +0,0 @@ -## 训练一个模型 - -MMSegmentation 可以执行分布式训练和非分布式训练,分别使用 `MMDistributedDataParallel` 和 `MMDataParallel` 命令。 - -所有的输出(日志 log 和检查点 checkpoints )将被保存到工作路径文件夹里,它可以通过配置文件里的 `work_dir` 指定。 - -在一定迭代轮次后,我们默认在验证集上评估模型表现。您可以在训练配置文件中添加间隔参数来改变评估间隔。 - -```python -evaluation = dict(interval=4000) # 每4000 iterations 评估一次模型的性能 -``` - -**\*重要提示\***: 在配置文件里的默认学习率是针对4卡 GPU 和2张图/GPU (此时 batchsize = 4x2 = 8)来设置的。 -同样,您也可以使用8卡 GPU 和 1张图/GPU 的设置,因为所有的模型均使用 cross-GPU 的 SyncBN 模式。 - -我们可以在训练速度和 GPU 显存之间做平衡。当模型或者 Batch Size 比较大的时,可以传递`--cfg-options model.backbone.with_cp=True` ,使用 `with_cp` 来节省显存,但是速度会更慢,因为原先使用 `with_cp` 时,是逐层反向传播(Back Propagation, BP),不会保存所有的梯度。 - -### 使用单台机器训练 - -#### 使用单卡 GPU 训练 - -```shell -python tools/train.py ${CONFIG_FILE} [可选参数] -``` - -如果您想在命令里定义工作文件夹路径,您可以添加一个参数`--work-dir ${工作路径}`。 - -#### 使用 CPU 训练 - -如果计算机没有 GPU,那么使用 CPU 训练的流程和使用单 GPU 训练的流程一致。如果计算机有 GPU 但是想使用 CPU,我们仅需要在训练流程开始前禁用 GPU。 - -```shell -export CUDA_VISIBLE_DEVICES=-1 -``` - -之后运行单 GPU 训练脚本即可。 - -```{warning} -我们不推荐用户使用 CPU 进行训练,这太过缓慢。我们支持这个功能是为了方便用户在没有 GPU 的机器上进行调试。 -``` - -#### 使用多卡 GPU 训练 - -```shell -sh tools/dist_train.sh ${CONFIG_FILE} ${GPUS} [可选参数] -``` - -可选参数可以为: - -- `--no-validate` (**不推荐**): 训练时代码库默认会在每 k 轮迭代后在验证集上进行评估,如果不需评估使用命令 `--no-validate` -- `--work-dir ${工作路径}`: 在配置文件里重写工作路径文件夹 -- `--resume-from ${检查点文件}`: 继续使用先前的检查点 (checkpoint) 文件(可以继续训练过程) -- `--load-from ${检查点文件}`: 从一个检查点 (checkpoint) 文件里加载权重(对另一个任务进行精调) -- `--deterministic`: 选择此模式会减慢训练速度,但结果易于复现 - -`resume-from` 和 `load-from` 的区别: - -- `resume-from` 
加载出模型权重和优化器状态包括迭代轮数等 -- `load-from` 仅加载模型权重,从第0轮开始训练 - -示例: - -```shell -# 模型的权重和日志将会存储在这个路径下: WORK_DIR=work_dirs/pspnet_r50-d8_512x512_80k_ade20k/ -# 如果work_dir没有被设定,它将会被自动生成 -sh tools/dist_train.sh configs/pspnet/pspnet_r50-d8_512x512_80k_ade20k.py 8 --work_dir work_dirs/pspnet_r50-d8_512x512_80k_ade20k/ --deterministic -``` - -**注意**: 在训练时,模型的和日志保存在“work_dirs/”下的配置文件的相同文件夹结构中。不建议使用自定义的“work_dirs/”,因为验证脚本可以从配置文件名中推断工作目录。如果你想在其他地方保存模型的权重,请使用符号链接,例如: - -```shell -ln -s ${YOUR_WORK_DIRS} ${MMSEG}/work_dirs -``` - -#### 在单个机器上启动多个任务 - -如果您在单个机器上启动多个任务,例如在8卡 GPU 的一个机器上有2个4卡 GPU 的训练任务,您需要特别对每个任务指定不同的端口(默认为29500)来避免通讯冲突。否则,将会有报错信息 `RuntimeError: Address already in use`。 - -如果您使用命令 `dist_train.sh` 来启动一个训练任务,您可以在命令行的用环境变量 `PORT` 设置端口: - -```shell -CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 sh tools/dist_train.sh ${CONFIG_FILE} 4 -CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 sh tools/dist_train.sh ${CONFIG_FILE} 4 -``` - -### 使用多台机器训练 - -如果您想使用由 ethernet 连接起来的多台机器, 您可以使用以下命令: - -在第一台机器上: - -```shell -NNODES=2 NODE_RANK=0 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR sh tools/dist_train.sh $CONFIG $GPUS -``` - -在第二台机器上: - -```shell -NNODES=2 NODE_RANK=1 PORT=$MASTER_PORT MASTER_ADDR=$MASTER_ADDR sh tools/dist_train.sh $CONFIG $GPUS -``` - -但是,如果您不使用高速网路连接这几台机器的话,训练将会非常慢。 - -### 使用slurm管理任务 - -Slurm是一个很好的计算集群作业调度系统。在由Slurm管理的集群中,可以使用slurm_train.sh来进行训练。它同时支持单节点和多节点训练。 - -在多台机器上训练: - -```shell -[GPUS=${GPUS}] sh tools/slurm_train.sh ${PARTITION} ${JOB_NAME} ${CONFIG_FILE} --work-dir ${WORK_DIR} -``` - -这里有一个在dev分区上使用16块GPUs来训练PSPNet的例子: - -```shell -GPUS=16 sh tools/slurm_train.sh dev pspr50 configs/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes.py work_dirs/pspnet_r50-d8_512x1024_40k_cityscapes/ -``` - -当使用 `slurm_train.sh` 在一个节点上启动多个任务时,需要指定不同的端口号,这里提供了三种设置: - -方式1: - -在`config1.py`中设置: - -```python -dist_params = dict(backend='nccl', port=29500) -``` - -在`config2.py`中设置: - -```python -dist_params = dict(backend='nccl', port=29501) -``` - -然后就可以使用config1.py和config2.py启动两个作业: - -```shell -CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS=4 sh tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config1.py tmp_work_dir_1 -CUDA_VISIBLE_DEVICES=4,5,6,7 GPUS=4 sh tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config2.py tmp_work_dir_2 -``` - -方式2: - -您可以设置不同的通信端口,而不需要修改配置文件,但必须设置“cfg-options”,以覆盖配置文件中的默认端口。 - -```shell -CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS=4 sh tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config1.py tmp_work_dir_1 --cfg-options dist_params.port=29500 -CUDA_VISIBLE_DEVICES=4,5,6,7 GPUS=4 sh tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config2.py tmp_work_dir_2 --cfg-options dist_params.port=29501 -``` - -方式3: - -您可以使用环境变量’ MASTER_PORT ‘在命令中设置端口: - -```shell -CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS=4 MASTER_PORT=29500 sh tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config1.py tmp_work_dir_1 -CUDA_VISIBLE_DEVICES=4,5,6,7 GPUS=4 MASTER_PORT=29501 sh tools/slurm_train.sh ${PARTITION} ${JOB_NAME} config2.py tmp_work_dir_2 -``` diff --git a/docs/zh_cn/tutorials/config.md b/docs/zh_cn/tutorials/config.md deleted file mode 100644 index 7cee611269..0000000000 --- a/docs/zh_cn/tutorials/config.md +++ /dev/null @@ -1,377 +0,0 @@ -# 教程 1: 学习配置文件 - -我们整合了模块和继承设计到我们的配置里,这便于做很多实验。如果您想查看配置文件,您可以运行 `python tools/print_config.py /PATH/TO/CONFIG` 去查看完整的配置文件。您还可以传递参数 -`--cfg-options xxx.yyy=zzz` 去查看更新的配置。 - -## 配置文件的结构 - -在 `config/_base_` 文件夹下面有4种基本组件类型: 数据集(dataset),模型(model),训练策略(schedule)和运行时的默认设置(default runtime)。许多方法都可以方便地通过组合这些组件进行实现。 -这样,像 DeepLabV3, PSPNet 这样的模型可以容易地被构造。被来自 `_base_` 下的组件来构建的配置叫做 _原始配置 (primitive)_。 - 
-对于所有在同一个文件夹下的配置文件,推荐**只有一个**对应的**原始配置**文件。所有其他的配置文件都应该继承自这个**原始配置**文件。这样就能保证配置文件的最大继承深度为 3。 - -为了便于理解,我们推荐社区贡献者继承已有的方法配置文件。 -例如,如果一些修改是基于 DeepLabV3,使用者首先首先应该通过指定 `_base_ = ../deeplabv3/deeplabv3_r50_512x1024_40ki_cityscapes.py`来继承基础 DeepLabV3 结构,再去修改配置文件里其他内容以完成继承。 - -如果您正在构建一个完整的新模型,它完全没有和已有的方法共享一些结构,您可能需要在 `configs` 下面创建一个文件夹 `xxxnet`。 -更详细的文档,请参照 [mmcv](https://mmcv.readthedocs.io/en/latest/understand_mmcv/config.html) 。 - -## 配置文件命名风格 - -我们按照下面的风格去命名配置文件,社区贡献者被建议使用同样的风格。 - -``` -{model}_{backbone}_[misc]_[gpu x batch_per_gpu]_{resolution}_{iterations}_{dataset} -``` - -`{xxx}` 是被要求的文件 `[yyy]` 是可选的。 - -- `{model}`: 模型种类,例如 `psp`, `deeplabv3` 等等 -- `{backbone}`: 主干网络种类,例如 `r50` (ResNet-50), `x101` (ResNeXt-101) -- `[misc]`: 模型中各式各样的设置/插件,例如 `dconv`, `gcb`, `attention`, `mstrain` -- `[gpu x batch_per_gpu]`: GPU数目 和每个 GPU 的样本数, 默认为 `8x2` -- `{iterations}`: 训练迭代轮数,如`160k` -- `{dataset}`: 数据集,如 `cityscapes`, `voc12aug`, `ade` - -## PSPNet 的一个例子 - -为了帮助使用者熟悉这个流行的语义分割框架的完整配置文件和模块,我们在下面对使用 ResNet50V1c 的 PSPNet 的配置文件做了详细的注释说明。 -更多的详细使用和其他模块的替代项请参考 API 文档。 - -```python -norm_cfg = dict(type='SyncBN', requires_grad=True) # 分割框架通常使用 SyncBN -model = dict( - type='EncoderDecoder', # 分割器(segmentor)的名字 - pretrained='open-mmlab://resnet50_v1c', # 将被加载的 ImageNet 预训练主干网络 - backbone=dict( - type='ResNetV1c', # 主干网络的类别。 可用选项请参考 mmseg/models/backbones/resnet.py - depth=50, # 主干网络的深度。通常为 50 和 101。 - num_stages=4, # 主干网络状态(stages)的数目,这些状态产生的特征图作为后续的 head 的输入。 - out_indices=(0, 1, 2, 3), # 每个状态产生的特征图输出的索引。 - dilations=(1, 1, 2, 4), # 每一层(layer)的空心率(dilation rate)。 - strides=(1, 2, 1, 1), # 每一层(layer)的步长(stride)。 - norm_cfg=dict( # 归一化层(norm layer)的配置项。 - type='SyncBN', # 归一化层的类别。通常是 SyncBN。 - requires_grad=True), # 是否训练归一化里的 gamma 和 beta。 - norm_eval=False, # 是否冻结 BN 里的统计项。 - style='pytorch', # 主干网络的风格,'pytorch' 意思是步长为2的层为 3x3 卷积, 'caffe' 意思是步长为2的层为 1x1 卷积。 - contract_dilation=True), # 当空洞 > 1, 是否压缩第一个空洞层。 - decode_head=dict( - type='PSPHead', # 解码头(decode head)的类别。 可用选项请参考 mmseg/models/decode_heads。 - in_channels=2048, # 解码头的输入通道数。 - in_index=3, # 被选择的特征图(feature map)的索引。 - channels=512, # 解码头中间态(intermediate)的通道数。 - pool_scales=(1, 2, 3, 6), # PSPHead 平均池化(avg pooling)的规模(scales)。 细节请参考文章内容。 - dropout_ratio=0.1, # 进入最后分类层(classification layer)之前的 dropout 比例。 - num_classes=19, # 分割前景的种类数目。 通常情况下,cityscapes 为19,VOC为21,ADE20k 为150。 - norm_cfg=dict(type='SyncBN', requires_grad=True), # 归一化层的配置项。 - align_corners=False, # 解码里调整大小(resize)的 align_corners 参数。 - loss_decode=dict( # 解码头(decode_head)里的损失函数的配置项。 - type='CrossEntropyLoss', # 在分割里使用的损失函数的类别。 - use_sigmoid=False, # 在分割里是否使用 sigmoid 激活。 - loss_weight=1.0)), # 解码头里损失的权重。 - auxiliary_head=dict( - type='FCNHead', # 辅助头(auxiliary head)的种类。可用选项请参考 mmseg/models/decode_heads。 - in_channels=1024, # 辅助头的输入通道数。 - in_index=2, # 被选择的特征图(feature map)的索引。 - channels=256, # 辅助头中间态(intermediate)的通道数。 - num_convs=1, # FCNHead 里卷积(convs)的数目. 辅助头里通常为1。 - concat_input=False, # 在分类层(classification layer)之前是否连接(concat)输入和卷积的输出。 - dropout_ratio=0.1, # 进入最后分类层(classification layer)之前的 dropout 比例。 - num_classes=19, # 分割前景的种类数目。 通常情况下,cityscapes 为19,VOC为21,ADE20k 为150。 - norm_cfg=dict(type='SyncBN', requires_grad=True), # 归一化层的配置项。 - align_corners=False, # 解码里调整大小(resize)的 align_corners 参数。 - loss_decode=dict( # 辅助头(auxiliary head)里的损失函数的配置项。 - type='CrossEntropyLoss', # 在分割里使用的损失函数的类别。 - use_sigmoid=False, # 在分割里是否使用 sigmoid 激活。 - loss_weight=0.4))) # 辅助头里损失的权重。默认设置为0.4。 -train_cfg = dict() # train_cfg 当前仅是一个占位符。 -test_cfg = dict(mode='whole') # 测试模式, 选项是 'whole' 和 'sliding'. 
'whole': 整张图像全卷积(fully-convolutional)测试。 'sliding': 图像上做滑动裁剪窗口(sliding crop window)。 -dataset_type = 'CityscapesDataset' # 数据集类型,这将被用来定义数据集。 -data_root = 'data/cityscapes/' # 数据的根路径。 -img_norm_cfg = dict( # 图像归一化配置,用来归一化输入的图像。 - mean=[123.675, 116.28, 103.53], # 预训练里用于预训练主干网络模型的平均值。 - std=[58.395, 57.12, 57.375], # 预训练里用于预训练主干网络模型的标准差。 - to_rgb=True) # 预训练里用于预训练主干网络的图像的通道顺序。 -crop_size = (512, 1024) # 训练时的裁剪大小 -train_pipeline = [ #训练流程 - dict(type='LoadImageFromFile'), # 第1个流程,从文件路径里加载图像。 - dict(type='LoadAnnotations'), # 第2个流程,对于当前图像,加载它的注释信息。 - dict(type='Resize', # 变化图像和其注释大小的数据增广的流程。 - img_scale=(2048, 1024), # 图像的最大规模。 - ratio_range=(0.5, 2.0)), # 数据增广的比例范围。 - dict(type='RandomCrop', # 随机裁剪当前图像和其注释大小的数据增广的流程。 - crop_size=(512, 1024), # 随机裁剪图像生成 patch 的大小。 - cat_max_ratio=0.75), # 单个类别可以填充的最大区域的比例。 - dict( - type='RandomFlip', # 翻转图像和其注释大小的数据增广的流程。 - flip_ratio=0.5), # 翻转图像的概率 - dict(type='PhotoMetricDistortion'), # 光学上使用一些方法扭曲当前图像和其注释的数据增广的流程。 - dict( - type='Normalize', # 归一化当前图像的数据增广的流程。 - mean=[123.675, 116.28, 103.53], # 这些键与 img_norm_cfg 一致,因为 img_norm_cfg 被 - std=[58.395, 57.12, 57.375], # 用作参数。 - to_rgb=True), - dict(type='Pad', # 填充当前图像到指定大小的数据增广的流程。 - size=(512, 1024), # 填充的图像大小。 - pad_val=0, # 图像的填充值。 - seg_pad_val=255), # 'gt_semantic_seg'的填充值。 - dict(type='DefaultFormatBundle'), # 流程里收集数据的默认格式捆。 - dict(type='Collect', # 决定数据里哪些键被传递到分割器里的流程。 - keys=['img', 'gt_semantic_seg']) -] -test_pipeline = [ - dict(type='LoadImageFromFile'), # 第1个流程,从文件路径里加载图像。 - dict( - type='MultiScaleFlipAug', # 封装测试时数据增广(test time augmentations)。 - img_scale=(2048, 1024), # 决定测试时可改变图像的最大规模。用于改变图像大小的流程。 - flip=False, # 测试时是否翻转图像。 - transforms=[ - dict(type='Resize', # 使用改变图像大小的数据增广。 - keep_ratio=True), # 是否保持宽和高的比例,这里的图像比例设置将覆盖上面的图像规模大小的设置。 - dict(type='RandomFlip'), # 考虑到 RandomFlip 已经被添加到流程里,当 flip=False 时它将不被使用。 - dict( - type='Normalize', # 归一化配置项,值来自 img_norm_cfg。 - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - to_rgb=True), - dict(type='ImageToTensor', # 将图像转为张量 - keys=['img']), - dict(type='Collect', # 收集测试时必须的键的收集流程。 - keys=['img']) - ]) -] -data = dict( - samples_per_gpu=2, # 单个 GPU 的 Batch size - workers_per_gpu=2, # 单个 GPU 分配的数据加载线程数 - train=dict( # 训练数据集配置 - type='CityscapesDataset', # 数据集的类别, 细节参考自 mmseg/datasets/。 - data_root='data/cityscapes/', # 数据集的根目录。 - img_dir='leftImg8bit/train', # 数据集图像的文件夹。 - ann_dir='gtFine/train', # 数据集注释的文件夹。 - pipeline=[ # 流程, 由之前创建的 train_pipeline 传递进来。 - dict(type='LoadImageFromFile'), - dict(type='LoadAnnotations'), - dict( - type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)), - dict(type='RandomCrop', crop_size=(512, 1024), cat_max_ratio=0.75), - dict(type='RandomFlip', flip_ratio=0.5), - dict(type='PhotoMetricDistortion'), - dict( - type='Normalize', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - to_rgb=True), - dict(type='Pad', size=(512, 1024), pad_val=0, seg_pad_val=255), - dict(type='DefaultFormatBundle'), - dict(type='Collect', keys=['img', 'gt_semantic_seg']) - ]), - val=dict( # 验证数据集的配置 - type='CityscapesDataset', - data_root='data/cityscapes/', - img_dir='leftImg8bit/val', - ann_dir='gtFine/val', - pipeline=[ # 由之前创建的 test_pipeline 传递的流程。 - dict(type='LoadImageFromFile'), - dict( - type='MultiScaleFlipAug', - img_scale=(2048, 1024), - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - dict(type='RandomFlip'), - dict( - type='Normalize', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - to_rgb=True), - dict(type='ImageToTensor', keys=['img']), - 
dict(type='Collect', keys=['img']) - ]) - ]), - test=dict( - type='CityscapesDataset', - data_root='data/cityscapes/', - img_dir='leftImg8bit/val', - ann_dir='gtFine/val', - pipeline=[ - dict(type='LoadImageFromFile'), - dict( - type='MultiScaleFlipAug', - img_scale=(2048, 1024), - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - dict(type='RandomFlip'), - dict( - type='Normalize', - mean=[123.675, 116.28, 103.53], - std=[58.395, 57.12, 57.375], - to_rgb=True), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img']) - ]) - ])) -log_config = dict( # 注册日志钩 (register logger hook) 的配置文件。 - interval=50, # 打印日志的间隔 - hooks=[ - # dict(type='TensorboardLoggerHook') # 同样支持 Tensorboard 日志 - dict(type='TextLoggerHook', by_epoch=False) - ]) -dist_params = dict(backend='nccl') # 用于设置分布式训练的参数,端口也同样可被设置。 -log_level = 'INFO' # 日志的级别。 -load_from = None # 从一个给定路径里加载模型作为预训练模型,它并不会消耗训练时间。 -resume_from = None # 从给定路径里恢复检查点(checkpoints),训练模式将从检查点保存的轮次开始恢复训练。 -workflow = [('train', 1)] # runner 的工作流程。 [('train', 1)] 意思是只有一个工作流程而且工作流程 'train' 仅执行一次。根据 `runner.max_iters` 工作流程训练模型的迭代轮数为40000次。 -cudnn_benchmark = True # 是否是使用 cudnn_benchmark 去加速,它对于固定输入大小的可以提高训练速度。 -optimizer = dict( # 用于构建优化器的配置文件。支持 PyTorch 中的所有优化器,同时它们的参数与PyTorch里的优化器参数一致。 - type='SGD', # 优化器种类,更多细节可参考 https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/optimizer/default_constructor.py#L13。 - lr=0.01, # 优化器的学习率,参数的使用细节请参照对应的 PyTorch 文档。 - momentum=0.9, # 动量 (Momentum) - weight_decay=0.0005) # SGD 的衰减权重 (weight decay)。 -optimizer_config = dict() # 用于构建优化器钩 (optimizer hook) 的配置文件,执行细节请参考 https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/hooks/optimizer.py#L8。 -lr_config = dict( - policy='poly', # 调度流程的策略,同样支持 Step, CosineAnnealing, Cyclic 等. 请从 https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/hooks/lr_updater.py#L9 参考 LrUpdater 的细节。 - power=0.9, # 多项式衰减 (polynomial decay) 的幂。 - min_lr=0.0001, # 用来稳定训练的最小学习率。 - by_epoch=False) # 是否按照每个 epoch 去算学习率。 -runner = dict( - type='IterBasedRunner', # 将使用的 runner 的类别 (例如 IterBasedRunner 或 EpochBasedRunner)。 - max_iters=40000) # 全部迭代轮数大小,对于 EpochBasedRunner 使用 `max_epochs` 。 -checkpoint_config = dict( # 设置检查点钩子 (checkpoint hook) 的配置文件。执行时请参考 https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/hooks/checkpoint.py。 - by_epoch=False, # 是否按照每个 epoch 去算 runner。 - interval=4000) # 保存的间隔 -evaluation = dict( # 构建评估钩 (evaluation hook) 的配置文件。细节请参考 mmseg/core/evaluation/eval_hook.py。 - interval=4000, # 评估的间歇点 - metric='mIoU') # 评估的指标 - - -``` - -## FAQ - -### 忽略基础配置文件里的一些域内容。 - -有时,您也许会设置 `_delete_=True` 去忽略基础配置文件里的一些域内容。 -您也许可以参照 [mmcv](https://mmcv.readthedocs.io/en/latest/understand_mmcv/config.html#inherit-from-base-config-with-ignored-fields) 来获得一些简单的指导。 - -在 MMSegmentation 里,例如为了改变 PSPNet 的主干网络的某些内容: - -```python -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='MaskRCNN', - pretrained='torchvision://resnet50', - backbone=dict( - type='ResNetV1c', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - dilations=(1, 1, 2, 4), - strides=(1, 2, 1, 1), - norm_cfg=norm_cfg, - norm_eval=False, - style='pytorch', - contract_dilation=True), - decode_head=dict(...), - auxiliary_head=dict(...)) -``` - -`ResNet` 和 `HRNet` 使用不同的关键词去构建。 - -```python -_base_ = '../pspnet/psp_r50_512x1024_40ki_cityscpaes.py' -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - pretrained='open-mmlab://msra/hrnetv2_w32', - backbone=dict( - _delete_=True, - type='HRNet', - norm_cfg=norm_cfg, - extra=dict( - stage1=dict( - 
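-                # 每个 stage 通过 num_modules/num_branches/block/num_blocks/num_channels 共同定义 HRNet 各分辨率分支的结构。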
num_modules=1, - num_branches=1, - block='BOTTLENECK', - num_blocks=(4, ), - num_channels=(64, )), - stage2=dict( - num_modules=1, - num_branches=2, - block='BASIC', - num_blocks=(4, 4), - num_channels=(32, 64)), - stage3=dict( - num_modules=4, - num_branches=3, - block='BASIC', - num_blocks=(4, 4, 4), - num_channels=(32, 64, 128)), - stage4=dict( - num_modules=3, - num_branches=4, - block='BASIC', - num_blocks=(4, 4, 4, 4), - num_channels=(32, 64, 128, 256)))), - decode_head=dict(...), - auxiliary_head=dict(...)) -``` - -`_delete_=True` 将用新的键去替换 `backbone` 域内所有老的键。 - -### 使用配置文件里的中间变量 - -配置文件里会使用一些中间变量,例如数据集里的 `train_pipeline`/`test_pipeline`。 -需要注意的是,在子配置文件里修改中间变量时,使用者需要再次传递这些变量给对应的域。 -例如,我们想改变在训练或测试时,PSPNet 的多尺度策略 (multi scale strategy),`train_pipeline`/`test_pipeline` 是我们想要修改的中间变量。 - -```python -_base_ = '../pspnet/psp_r50_512x1024_40ki_cityscapes.py' -crop_size = (512, 1024) -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) -train_pipeline = [ - dict(type='LoadImageFromFile'), - dict(type='LoadAnnotations'), - dict(type='Resize', img_scale=(2048, 1024), ratio_range=(1.0, 2.0)), # 改成 [1., 2.] - dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), - dict(type='RandomFlip', flip_ratio=0.5), - dict(type='PhotoMetricDistortion'), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), - dict(type='DefaultFormatBundle'), - dict(type='Collect', keys=['img', 'gt_semantic_seg']), -] -test_pipeline = [ - dict(type='LoadImageFromFile'), - dict( - type='MultiScaleFlipAug', - img_scale=(2048, 1024), - img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], # 改成多尺度测试 (multi scale testing)。 - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - dict(type='RandomFlip'), - dict(type='Normalize', **img_norm_cfg), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img']), - ]) -] -data = dict( - train=dict(pipeline=train_pipeline), - val=dict(pipeline=test_pipeline), - test=dict(pipeline=test_pipeline)) -``` - -我们首先定义新的 `train_pipeline`/`test_pipeline` 然后传递到 `data` 里。 - -同样的,如果我们想从 `SyncBN` 切换到 `BN` 或者 `MMSyncBN`,我们需要配置文件里的每一个 `norm_cfg`。 - -```python -_base_ = '../pspnet/psp_r50_512x1024_40ki_cityscpaes.py' -norm_cfg = dict(type='BN', requires_grad=True) -model = dict( - backbone=dict(norm_cfg=norm_cfg), - decode_head=dict(norm_cfg=norm_cfg), - auxiliary_head=dict(norm_cfg=norm_cfg)) -``` diff --git a/docs/zh_cn/tutorials/customize_datasets.md b/docs/zh_cn/tutorials/customize_datasets.md deleted file mode 100644 index 2de1398e4d..0000000000 --- a/docs/zh_cn/tutorials/customize_datasets.md +++ /dev/null @@ -1,209 +0,0 @@ -# 教程 2: 自定义数据集 - -## 通过重新组织数据来定制数据集 - -最简单的方法是将您的数据集进行转化,并组织成文件夹的形式。 - -如下的文件结构就是一个例子。 - -```none -├── data -│ ├── my_dataset -│ │ ├── img_dir -│ │ │ ├── train -│ │ │ │ ├── xxx{img_suffix} -│ │ │ │ ├── yyy{img_suffix} -│ │ │ │ ├── zzz{img_suffix} -│ │ │ ├── val -│ │ ├── ann_dir -│ │ │ ├── train -│ │ │ │ ├── xxx{seg_map_suffix} -│ │ │ │ ├── yyy{seg_map_suffix} -│ │ │ │ ├── zzz{seg_map_suffix} -│ │ │ ├── val - -``` - -一个训练对将由 img_dir/ann_dir 里同样首缀的文件组成。 - -如果给定 `split` 参数,只有部分在 img_dir/ann_dir 里的文件会被加载。 -我们可以对被包括在 split 文本里的文件指定前缀。 - -除此以外,一个 split 文本如下所示: - -```none -xxx -zzz -``` - -只有 - -`data/my_dataset/img_dir/train/xxx{img_suffix}`, -`data/my_dataset/img_dir/train/zzz{img_suffix}`, -`data/my_dataset/ann_dir/train/xxx{seg_map_suffix}`, -`data/my_dataset/ann_dir/train/zzz{seg_map_suffix}` 将被加载。 - -注意:标注是跟图像同样的形状 (H, W),其中的像素值的范围是 `[0, num_classes 
- 1]`。 -您也可以使用 [pillow](https://pillow.readthedocs.io/en/stable/handbook/concepts.html#palette) 的 `'P'` 模式去创建包含颜色的标注。 - -## 通过混合数据去定制数据集 - -MMSegmentation 同样支持混合数据集去训练。 -当前它支持拼接 (concat), 重复 (repeat) 和多图混合 (multi-image mix)数据集。 - -### 重复数据集 - -我们使用 `RepeatDataset` 作为包装 (wrapper) 去重复数据集。 -例如,假设原始数据集是 `Dataset_A`,为了重复它,配置文件如下: - -```python -dataset_A_train = dict( - type='RepeatDataset', - times=N, - dataset=dict( # 这是 Dataset_A 数据集的原始配置 - type='Dataset_A', - ... - pipeline=train_pipeline - ) - ) -``` - -### 拼接数据集 - -有2种方式去拼接数据集。 - -1. 如果您想拼接的数据集是同样的类型,但有不同的标注文件, - 您可以按如下操作去拼接数据集的配置文件: - - 1. 您也许可以拼接两个标注文件夹 `ann_dir` - - ```python - dataset_A_train = dict( - type='Dataset_A', - img_dir = 'img_dir', - ann_dir = ['anno_dir_1', 'anno_dir_2'], - pipeline=train_pipeline - ) - ``` - - 2. 您也可以去拼接两个 `split` 文件列表 - - ```python - dataset_A_train = dict( - type='Dataset_A', - img_dir = 'img_dir', - ann_dir = 'anno_dir', - split = ['split_1.txt', 'split_2.txt'], - pipeline=train_pipeline - ) - ``` - - 3. 您也可以同时拼接 `ann_dir` 文件夹和 `split` 文件列表 - - ```python - dataset_A_train = dict( - type='Dataset_A', - img_dir = 'img_dir', - ann_dir = ['anno_dir_1', 'anno_dir_2'], - split = ['split_1.txt', 'split_2.txt'], - pipeline=train_pipeline - ) - ``` - - 在这样的情况下, `ann_dir_1` 和 `ann_dir_2` 分别对应于 `split_1.txt` 和 `split_2.txt` - -2. 如果您想拼接不同的数据集,您可以如下去拼接数据集的配置文件: - - ```python - dataset_A_train = dict() - dataset_B_train = dict() - - data = dict( - imgs_per_gpu=2, - workers_per_gpu=2, - train = [ - dataset_A_train, - dataset_B_train - ], - val = dataset_A_val, - test = dataset_A_test - ) - ``` - -一个更复杂的例子如下:分别重复 `Dataset_A` 和 `Dataset_B` N 次和 M 次,然后再去拼接重复后的数据集 - -```python -dataset_A_train = dict( - type='RepeatDataset', - times=N, - dataset=dict( - type='Dataset_A', - ... - pipeline=train_pipeline - ) -) -dataset_A_val = dict( - ... - pipeline=test_pipeline -) -dataset_A_test = dict( - ... - pipeline=test_pipeline -) -dataset_B_train = dict( - type='RepeatDataset', - times=M, - dataset=dict( - type='Dataset_B', - ... 
- pipeline=train_pipeline - ) -) -data = dict( - imgs_per_gpu=2, - workers_per_gpu=2, - train = [ - dataset_A_train, - dataset_B_train - ], - val = dataset_A_val, - test = dataset_A_test -) - -``` - -### 多图混合集 - -我们使用 `MultiImageMixDataset` 作为包装(wrapper)去混合多个数据集的图片。 -`MultiImageMixDataset`可以被类似mosaic和mixup的多图混合数据増广使用。 - -`MultiImageMixDataset`与`Mosaic`数据増广一起使用的例子: - -```python -train_pipeline = [ - dict(type='RandomMosaic', prob=1), - dict(type='Resize', img_scale=(1024, 512), keep_ratio=True), - dict(type='RandomFlip', prob=0.5), - dict(type='Normalize', **img_norm_cfg), - dict(type='DefaultFormatBundle'), - dict(type='Collect', keys=['img', 'gt_semantic_seg']), -] - -train_dataset = dict( - type='MultiImageMixDataset', - dataset=dict( - classes=classes, - palette=palette, - type=dataset_type, - reduce_zero_label=False, - img_dir=data_root + "images/train", - ann_dir=data_root + "annotations/train", - pipeline=[ - dict(type='LoadImageFromFile'), - dict(type='LoadAnnotations'), - ] - ), - pipeline=train_pipeline -) - -``` diff --git a/docs/zh_cn/tutorials/customize_models.md b/docs/zh_cn/tutorials/customize_models.md deleted file mode 100644 index 8bfbb55ecc..0000000000 --- a/docs/zh_cn/tutorials/customize_models.md +++ /dev/null @@ -1,230 +0,0 @@ -# 教程 4: 自定义模型 - -## 自定义优化器 (optimizer) - -假设您想增加一个新的叫 `MyOptimizer` 的优化器,它的参数分别为 `a`, `b`, 和 `c`。 -您首先需要在一个文件里实现这个新的优化器,例如在 `mmseg/core/optimizer/my_optimizer.py` 里面: - -```python -from mmcv.runner import OPTIMIZERS -from torch.optim import Optimizer - - -@OPTIMIZERS.register_module -class MyOptimizer(Optimizer): - - def __init__(self, a, b, c) - -``` - -然后增加这个模块到 `mmseg/core/optimizer/__init__.py` 里面,这样注册器 (registry) 将会发现这个新的模块并添加它: - -```python -from .my_optimizer import MyOptimizer -``` - -之后您可以在配置文件的 `optimizer` 域里使用 `MyOptimizer`, -如下所示,在配置文件里,优化器被 `optimizer` 域所定义: - -```python -optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) -``` - -为了使用您自己的优化器,域可以被修改为: - -```python -optimizer = dict(type='MyOptimizer', a=a_value, b=b_value, c=c_value) -``` - -我们已经支持了 PyTorch 自带的全部优化器,唯一修改的地方是在配置文件里的 `optimizer` 域。例如,如果您想使用 `ADAM`,尽管数值表现会掉点,还是可以如下修改: - -```python -optimizer = dict(type='Adam', lr=0.0003, weight_decay=0.0001) -``` - -使用者可以直接按照 PyTorch [文档教程](https://pytorch.org/docs/stable/optim.html?highlight=optim#module-torch.optim) 去设置参数。 - -## 定制优化器的构造器 (optimizer constructor) - -对于优化,一些模型可能会有一些特别定义的参数,例如批归一化 (BatchNorm) 层里面的权重衰减 (weight decay)。 -使用者可以通过定制优化器的构造器来微调这些细粒度的优化器参数。 - -```python -from mmcv.utils import build_from_cfg - -from mmcv.runner import OPTIMIZER_BUILDERS -from .cocktail_optimizer import CocktailOptimizer - - -@OPTIMIZER_BUILDERS.register_module -class CocktailOptimizerConstructor(object): - - def __init__(self, optim_wrapper_cfg, paramwise_cfg=None): - - def __call__(self, model): - - return my_optimizer - -``` - -## 开发和增加新的组件(Module) - -MMSegmentation 里主要有2种组件: - -- 主干网络 (backbone): 通常是卷积网络的堆叠,来做特征提取,例如 ResNet, HRNet -- 解码头 (decoder head): 用于语义分割图的解码的组件(得到分割结果) - -### 添加新的主干网络 - -这里我们以 MobileNet 为例,展示如何增加新的主干组件: - -1. 创建一个新的文件 `mmseg/models/backbones/mobilenet.py` - -```python -import torch.nn as nn - -from ..registry import BACKBONES - - -@BACKBONES.register_module -class MobileNet(nn.Module): - - def __init__(self, arg1, arg2): - pass - - def forward(self, x): # should return a tuple - pass - - def init_weights(self, pretrained=None): - pass -``` - -2. 在 `mmseg/models/backbones/__init__.py` 里面导入模块 - -```python -from .mobilenet import MobileNet -``` - -3. 在您的配置文件里使用它 - -```python -model = dict( - ... 
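-    # 省略号处为 model 的其余字段(如 decode_head 等),保持不变;这里只需指定新的主干网络。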
- backbone=dict( - type='MobileNet', - arg1=xxx, - arg2=xxx), - ... -``` - -### 增加新的解码头 (decoder head)组件 - -在 MMSegmentation 里面,对于所有的分割头,我们提供一个基类解码头 [BaseDecodeHead](https://github.com/open-mmlab/mmsegmentation/blob/master/mmseg/models/decode_heads/decode_head.py) 。 -所有新建的解码头都应该继承它。这里我们以 [PSPNet](https://arxiv.org/abs/1612.01105) 为例, -展示如何开发和增加一个新的解码头组件: - -首先,在 `mmseg/models/decode_heads/psp_head.py` 里添加一个新的解码头。 -PSPNet 中实现了一个语义分割的解码头。为了实现一个解码头,我们只需要在新构造的解码头中实现如下的3个函数: - -```python -@HEADS.register_module() -class PSPHead(BaseDecodeHead): - - def __init__(self, pool_scales=(1, 2, 3, 6), **kwargs): - super(PSPHead, self).__init__(**kwargs) - - def init_weights(self): - - def forward(self, inputs): - -``` - -接着,使用者需要在 `mmseg/models/decode_heads/__init__.py` 里面添加这个模块,这样对应的注册器 (registry) 可以查找并加载它们。 - -PSPNet的配置文件如下所示: - -```python -norm_cfg = dict(type='SyncBN', requires_grad=True) -model = dict( - type='EncoderDecoder', - pretrained='pretrain_model/resnet50_v1c_trick-2cccc1ad.pth', - backbone=dict( - type='ResNetV1c', - depth=50, - num_stages=4, - out_indices=(0, 1, 2, 3), - dilations=(1, 1, 2, 4), - strides=(1, 2, 1, 1), - norm_cfg=norm_cfg, - norm_eval=False, - style='pytorch', - contract_dilation=True), - decode_head=dict( - type='PSPHead', - in_channels=2048, - in_index=3, - channels=512, - pool_scales=(1, 2, 3, 6), - dropout_ratio=0.1, - num_classes=19, - norm_cfg=norm_cfg, - align_corners=False, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0))) - -``` - -### 增加新的损失函数 - -假设您想添加一个新的损失函数 `MyLoss` 到语义分割解码器里。 -为了添加一个新的损失函数,使用者需要在 `mmseg/models/losses/my_loss.py` 里面去实现它。 -`weighted_loss` 可以对计算损失时的每个样本做加权。 - -```python -import torch -import torch.nn as nn - -from ..builder import LOSSES -from .utils import weighted_loss - -@weighted_loss -def my_loss(pred, target): - assert pred.size() == target.size() and target.numel() > 0 - loss = torch.abs(pred - target) - return loss - -@LOSSES.register_module -class MyLoss(nn.Module): - - def __init__(self, reduction='mean', loss_weight=1.0): - super(MyLoss, self).__init__() - self.reduction = reduction - self.loss_weight = loss_weight - - def forward(self, - pred, - target, - weight=None, - avg_factor=None, - reduction_override=None): - assert reduction_override in (None, 'none', 'mean', 'sum') - reduction = ( - reduction_override if reduction_override else self.reduction) - loss = self.loss_weight * my_loss( - pred, target, weight, reduction=reduction, avg_factor=avg_factor) - return loss -``` - -然后使用者需要在 `mmseg/models/losses/__init__.py` 里面添加它: - -```python -from .my_loss import MyLoss, my_loss - -``` - -为了使用它,修改 `loss_xxx` 域。之后您需要在解码头组件里修改 `loss_decode` 域。 -`loss_weight` 可以被用来对不同的损失函数做加权。 - -```python -loss_decode=dict(type='MyLoss', loss_weight=1.0)) -``` diff --git a/docs/zh_cn/tutorials/customize_runtime.md b/docs/zh_cn/tutorials/customize_runtime.md deleted file mode 100644 index 654d3f7167..0000000000 --- a/docs/zh_cn/tutorials/customize_runtime.md +++ /dev/null @@ -1,248 +0,0 @@ -# 教程 6: 自定义运行设定 - -## 自定义优化设定 - -### 自定义 PyTorch 支持的优化器 - -我们已经支持 PyTorch 自带的所有优化器,唯一需要修改的地方是在配置文件里的 `optimizer` 域里面。 -例如,如果您想使用 `ADAM` (注意如下操作可能会让模型表现下降),可以使用如下修改: - -```python -optimizer = dict(type='Adam', lr=0.0003, weight_decay=0.0001) -``` - -为了修改模型的学习率,使用者仅需要修改配置文件里 optimizer 的 `lr` 即可。 -使用者可以参照 PyTorch 的 [API 文档](https://pytorch.org/docs/stable/optim.html?highlight=optim#module-torch.optim) -直接设置参数。 - -### 自定义自己实现的优化器 - -#### 1. 
定义一个新的优化器 - -一个自定义的优化器可以按照如下去定义: - -假如您想增加一个叫做 `MyOptimizer` 的优化器,它的参数分别有 `a`, `b`, 和 `c`。 -您需要创建一个叫 `mmseg/core/optimizer` 的新文件夹。 -然后再在文件,即 `mmseg/core/optimizer/my_optimizer.py` 里面去实现这个新优化器: - -```python -from .registry import OPTIMIZERS -from torch.optim import Optimizer - - -@OPTIMIZERS.register_module() -class MyOptimizer(Optimizer): - - def __init__(self, a, b, c) - -``` - -#### 2. 增加优化器到注册表 (registry) - -为了让上述定义的模块被框架发现,首先这个模块应该被导入到主命名空间 (main namespace) 里。 -有两种方式可以实现它。 - -- 修改 `mmseg/core/optimizer/__init__.py` 来导入它 - - 新的被定义的模块应该被导入到 `mmseg/core/optimizer/__init__.py` 这样注册表将会发现新的模块并添加它 - -```python -from .my_optimizer import MyOptimizer -``` - -- 在配置文件里使用 `custom_imports` 去手动导入它 - -```python -custom_imports = dict(imports=['mmseg.core.optimizer.my_optimizer'], allow_failed_imports=False) -``` - -`mmseg.core.optimizer.my_optimizer` 模块将会在程序运行的开始被导入,并且 `MyOptimizer` 类将会自动注册。 -需要注意只有包含 `MyOptimizer` 类的包 (package) 应当被导入。 -而 `mmseg.core.optimizer.my_optimizer.MyOptimizer` **不能** 被直接导入。 - -事实上,使用者完全可以用另一个按这样导入方法的文件夹结构,只要模块的根路径已经被添加到 `PYTHONPATH` 里面。 - -#### 3. 在配置文件里定义优化器 - -之后您可以在配置文件的 `optimizer` 域里面使用 `MyOptimizer` -在配置文件里,优化器被定义在 `optimizer` 域里,如下所示: - -```python -optimizer = dict(type='SGD', lr=0.02, momentum=0.9, weight_decay=0.0001) -``` - -为了使用您自己的优化器,这个域可以被改成: - -```python -optimizer = dict(type='MyOptimizer', a=a_value, b=b_value, c=c_value) -``` - -### 自定义优化器的构造器 (constructor) - -有些模型可能需要在优化器里有一些特别参数的设置,例如 批归一化层 (BatchNorm layers) 的 权重衰减 (weight decay)。 -使用者可以通过自定义优化器的构造器去微调这些细粒度参数。 - -```python -from mmcv.utils import build_from_cfg - -from mmcv.runner.optimizer import OPTIMIZER_BUILDERS, OPTIMIZERS -from mmseg.utils import get_root_logger -from .my_optimizer import MyOptimizer - - -@OPTIMIZER_BUILDERS.register_module() -class MyOptimizerConstructor(object): - - def __init__(self, optim_wrapper_cfg, paramwise_cfg=None): - - def __call__(self, model): - - return my_optimizer - -``` - -默认的优化器构造器的实现可以参照 [这里](https://github.com/open-mmlab/mmcv/blob/9ecd6b0d5ff9d2172c49a182eaa669e9f27bb8e7/mmcv/runner/optimizer/default_constructor.py#L11) ,它也可以被用作新的优化器构造器的模板。 - -### 额外的设置 - -优化器没有实现的一些技巧应该通过优化器构造器 (optimizer constructor) 或者钩子 (hook) 去实现,如设置基于参数的学习率 (parameter-wise learning rates)。我们列出一些常见的设置,它们可以稳定或加速模型的训练。 -如果您有更多的设置,欢迎在 PR 和 issue 里面提交。 - -- __使用梯度截断 (gradient clip) 去稳定训练__: - - 一些模型需要梯度截断去稳定训练过程,如下所示 - - ```python - optimizer_config = dict( - _delete_=True, grad_clip=dict(max_norm=35, norm_type=2)) - ``` - - 如果您的配置继承自已经设置了 `optimizer_config` 的基础配置 (base config),您可能需要 `_delete_=True` 来重写那些不需要的设置。更多细节请参照 [配置文件文档](https://mmsegmentation.readthedocs.io/en/latest/config.html) 。 - -- __使用动量计划表 (momentum schedule) 去加速模型收敛__: - - 我们支持动量计划表去让模型基于学习率修改动量,这样可能让模型收敛地更快。 - 动量计划表经常和学习率计划表 (LR scheduler) 一起使用,例如如下配置文件就在 3D 检测里经常使用以加速收敛。 - 更多细节请参考 [CyclicLrUpdater](https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/lr_updater.py#L327) 和 [CyclicMomentumUpdater](https://github.com/open-mmlab/mmcv/blob/f48241a65aebfe07db122e9db320c31b685dc674/mmcv/runner/hooks/momentum_updater.py#L130) 的实现。 - - ```python - lr_config = dict( - policy='cyclic', - target_ratio=(10, 1e-4), - cyclic_times=1, - step_ratio_up=0.4, - ) - momentum_config = dict( - policy='cyclic', - target_ratio=(0.85 / 0.95, 1), - cyclic_times=1, - step_ratio_up=0.4, - ) - ``` - -## 自定义训练计划表 - -我们根据默认的训练迭代步数 40k/80k 来设置学习率,这在 MMCV 里叫做 [`PolyLrUpdaterHook`](https://github.com/open-mmlab/mmcv/blob/826d3a7b68596c824fa1e2cb89b6ac274f52179c/mmcv/runner/hooks/lr_updater.py#L196) 。 
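-
-例如,一个典型的 poly 策略配置大致如下(字段与本文 PSPNet 完整配置示例中的 `lr_config` 一致,数值仅作演示):
-
-```python
-lr_config = dict(
-    policy='poly',   # 即 PolyLrUpdaterHook
-    power=0.9,       # 多项式衰减(polynomial decay)的幂
-    min_lr=1e-4,     # 用来稳定训练的最小学习率
-    by_epoch=False)  # 按 iteration 而非 epoch 更新学习率
-```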
-我们也支持许多其他的学习率计划表:[这里](https://github.com/open-mmlab/mmcv/blob/master/mmcv/runner/hooks/lr_updater.py) ,例如 `CosineAnnealing` 和 `Poly` 计划表。下面是一些例子: - -- 步计划表 Step schedule: - - ```python - lr_config = dict(policy='step', step=[9, 10]) - ``` - -- 余弦退火计划表 ConsineAnnealing schedule: - - ```python - lr_config = dict( - policy='CosineAnnealing', - warmup='linear', - warmup_iters=1000, - warmup_ratio=1.0 / 10, - min_lr_ratio=1e-5) - ``` - -## 自定义工作流 (workflow) - -工作流是一个专门定义运行顺序和轮数 (running order and epochs) 的列表 (phase, epochs)。 -默认情况下它设置成: - -```python -workflow = [('train', 1)] -``` - -意思是训练是跑 1 个 epoch。有时候使用者可能想检查模型在验证集上的一些指标(如 损失 loss,精确性 accuracy),我们可以这样设置工作流: - -```python -[('train', 1), ('val', 1)] -``` - -于是 1 个 epoch 训练,1 个 epoch 验证将交替运行。 - -**注意**: - -1. 模型的参数在验证的阶段不会被自动更新 -2. 配置文件里的关键词 `total_epochs` 仅控制训练的 epochs 数目,而不会影响验证时的工作流 -3. 工作流 `[('train', 1), ('val', 1)]` 和 `[('train', 1)]` 将不会改变 `EvalHook` 的行为,因为 `EvalHook` 被 `after_train_epoch` - 调用而且验证的工作流仅仅影响通过调用 `after_val_epoch` 的钩子 (hooks)。因此, `[('train', 1), ('val', 1)]` 和 `[('train', 1)]` - 的区别仅在于 runner 将在每次训练 epoch 结束后计算在验证集上的损失 - -## 自定义钩 (hooks) - -### 使用 MMCV 实现的钩子 (hooks) - -如果钩子已经在 MMCV 里被实现,如下所示,您可以直接修改配置文件来使用钩子: - -```python -custom_hooks = [ - dict(type='MyHook', a=a_value, b=b_value, priority='NORMAL') -] -``` - -### 修改默认的运行时间钩子 (runtime hooks) - -以下的常用的钩子没有被 `custom_hooks` 注册: - -- log_config -- checkpoint_config -- evaluation -- lr_config -- optimizer_config -- momentum_config - -在这些钩子里,只有 logger hook 有 `VERY_LOW` 优先级,其他的优先级都是 `NORMAL`。 -上述提及的教程已经包括了如何修改 `optimizer_config`,`momentum_config` 和 `lr_config`。 -这里我们展示我们如何处理 `log_config`, `checkpoint_config` 和 `evaluation`。 - -#### 检查点配置文件 (Checkpoint config) - -MMCV runner 将使用 `checkpoint_config` 去初始化 [`CheckpointHook`](https://github.com/open-mmlab/mmcv/blob/9ecd6b0d5ff9d2172c49a182eaa669e9f27bb8e7/mmcv/runner/hooks/checkpoint.py#L9). 
- -```python -checkpoint_config = dict(interval=1) -``` - -使用者可以设置 `max_keep_ckpts` 来仅保存一小部分检查点或者通过 `save_optimizer` 来决定是否保存优化器的状态字典 (state dict of optimizer)。 更多使用参数的细节请参考 [这里](https://mmcv.readthedocs.io/en/latest/api.html#mmcv.runner.CheckpointHook) 。 - -#### 日志配置文件 (Log config) - -`log_config` 包裹了许多日志钩 (logger hooks) 而且能去设置间隔 (intervals)。现在 MMCV 支持 `WandbLoggerHook`, `MlflowLoggerHook` 和 `TensorboardLoggerHook`。 -详细的使用请参照 [文档](https://mmcv.readthedocs.io/en/latest/api.html#mmcv.runner.LoggerHook) 。 - -```python -log_config = dict( - interval=50, - hooks=[ - dict(type='TextLoggerHook'), - dict(type='TensorboardLoggerHook') - ]) -``` - -#### 评估配置文件 (Evaluation config) - -`evaluation` 的配置文件将被用来初始化 [`EvalHook`](https://github.com/open-mmlab/mmsegmentation/blob/e3f6f655d69b777341aec2fe8829871cc0beadcb/mmseg/core/evaluation/eval_hooks.py#L7) 。 -除了 `interval` 键,其他的像 `metric` 这样的参数将被传递给 `dataset.evaluate()` 。 - -```python -evaluation = dict(interval=1, metric='mIoU') -``` diff --git a/docs/zh_cn/tutorials/data_pipeline.md b/docs/zh_cn/tutorials/data_pipeline.md deleted file mode 100644 index 119ae98a5e..0000000000 --- a/docs/zh_cn/tutorials/data_pipeline.md +++ /dev/null @@ -1,166 +0,0 @@ -# 教程 3: 自定义数据流程 - -## 数据流程的设计 - -按照通常的惯例,我们使用 `Dataset` 和 `DataLoader` 做多线程的数据加载。`Dataset` 返回一个数据内容的字典,里面对应于模型前传方法的各个参数。 -因为在语义分割中,输入的图像数据具有不同的大小,我们在 MMCV 里引入一个新的 `DataContainer` 类别去帮助收集和分发不同大小的输入数据。 - -更多细节,请查看[这里](https://github.com/open-mmlab/mmcv/blob/master/mmcv/parallel/data_container.py) 。 - -数据的准备流程和数据集是解耦的。通常一个数据集定义了如何处理标注数据(annotations)信息,而一个数据流程定义了准备一个数据字典的所有步骤。一个流程包括了一系列操作,每个操作里都把一个字典作为输入,然后再输出一个新的字典给下一个变换操作。 - -这些操作可分为数据加载 (data loading),预处理 (pre-processing),格式变化 (formatting) 和测试时数据增强 (test-time augmentation)。 - -下面的例子就是 PSPNet 的一个流程: - -```python -img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) -crop_size = (512, 1024) -train_pipeline = [ - dict(type='LoadImageFromFile'), - dict(type='LoadAnnotations'), - dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)), - dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), - dict(type='RandomFlip', flip_ratio=0.5), - dict(type='PhotoMetricDistortion'), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), - dict(type='DefaultFormatBundle'), - dict(type='Collect', keys=['img', 'gt_semantic_seg']), -] -test_pipeline = [ - dict(type='LoadImageFromFile'), - dict( - type='MultiScaleFlipAug', - img_scale=(2048, 1024), - # img_ratios=[0.5, 0.75, 1.0, 1.25, 1.5, 1.75], - flip=False, - transforms=[ - dict(type='Resize', keep_ratio=True), - dict(type='RandomFlip'), - dict(type='Normalize', **img_norm_cfg), - dict(type='ImageToTensor', keys=['img']), - dict(type='Collect', keys=['img']), - ]) -] -``` - -对于每个操作,我们列出它添加、更新、移除的相关字典域 (dict fields): - -### 数据加载 Data loading - -`LoadImageFromFile` - -- 增加: img, img_shape, ori_shape - -`LoadAnnotations` - -- 增加: gt_semantic_seg, seg_fields - -### 预处理 Pre-processing - -`Resize` - -- 增加: scale, scale_idx, pad_shape, scale_factor, keep_ratio -- 更新: img, img_shape, \*seg_fields - -`RandomFlip` - -- 增加: flip -- 更新: img, \*seg_fields - -`Pad` - -- 增加: pad_fixed_size, pad_size_divisor -- 更新: img, pad_shape, \*seg_fields - -`RandomCrop` - -- 更新: img, pad_shape, \*seg_fields - -`Normalize` - -- 增加: img_norm_cfg -- 更新: img - -`SegRescale` - -- 更新: gt_semantic_seg - -`PhotoMetricDistortion` - -- 更新: img - -### 格式 Formatting - -`ToTensor` - -- 更新: 由 `keys` 指定 - -`ImageToTensor` - -- 更新: 由 `keys` 指定 - 
-`Transpose` - -- 更新: 由 `keys` 指定 - -`ToDataContainer` - -- 更新: 由 `keys` 指定 - -`DefaultFormatBundle` - -- 更新: img, gt_semantic_seg - -`Collect` - -- 增加: img_meta (the keys of img_meta is specified by `meta_keys`) -- 移除: all other keys except for those specified by `keys` - -### 测试时数据增强 Test time augmentation - -`MultiScaleFlipAug` - -## 拓展和使用自定义的流程 - -1. 在任何一个文件里写一个新的流程,例如 `my_pipeline.py`,它以一个字典作为输入并且输出一个字典 - - ```python - from mmseg.datasets import PIPELINES - - @PIPELINES.register_module() - class MyTransform: - - def __call__(self, results): - results['dummy'] = True - return results - ``` - -2. 导入一个新类 - - ```python - from .my_pipeline import MyTransform - ``` - -3. 在配置文件里使用它 - - ```python - img_norm_cfg = dict( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True) - crop_size = (512, 1024) - train_pipeline = [ - dict(type='LoadImageFromFile'), - dict(type='LoadAnnotations'), - dict(type='Resize', img_scale=(2048, 1024), ratio_range=(0.5, 2.0)), - dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), - dict(type='RandomFlip', flip_ratio=0.5), - dict(type='PhotoMetricDistortion'), - dict(type='Normalize', **img_norm_cfg), - dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255), - dict(type='MyTransform'), - dict(type='DefaultFormatBundle'), - dict(type='Collect', keys=['img', 'gt_semantic_seg']), - ] - ``` diff --git a/docs/zh_cn/tutorials/index.rst b/docs/zh_cn/tutorials/index.rst deleted file mode 100644 index e1a67a8b44..0000000000 --- a/docs/zh_cn/tutorials/index.rst +++ /dev/null @@ -1,9 +0,0 @@ -.. toctree:: - :maxdepth: 2 - - config.md - customize_datasets.md - data_pipeline.md - customize_models.md - training_tricks.md - customize_runtime.md diff --git a/docs/zh_cn/tutorials/training_tricks.md b/docs/zh_cn/tutorials/training_tricks.md deleted file mode 100644 index f67759aa4f..0000000000 --- a/docs/zh_cn/tutorials/training_tricks.md +++ /dev/null @@ -1,95 +0,0 @@ -# 教程 5: 训练技巧 - -MMSegmentation 支持如下训练技巧: - -## 主干网络和解码头组件使用不同的学习率 (Learning Rate, LR) - -在语义分割里,一些方法会让解码头组件的学习率大于主干网络的学习率,这样可以获得更好的表现或更快的收敛。 - -在 MMSegmentation 里面,您也可以在配置文件里添加如下行来让解码头组件的学习率是主干组件的10倍。 - -```python -optimizer=dict( - paramwise_cfg = dict( - custom_keys={ - 'head': dict(lr_mult=10.)})) -``` - -通过这种修改,任何被分组到 `'head'` 的参数的学习率都将乘以10。您也可以参照 [MMCV 文档](https://mmcv.readthedocs.io/en/latest/api.html#mmcv.runner.DefaultOptimizerConstructor) 获取更详细的信息。 - -## 在线难样本挖掘 (Online Hard Example Mining, OHEM) - -对于训练时采样,我们在 [这里](https://github.com/open-mmlab/mmsegmentation/tree/master/mmseg/core/seg/sampler) 做了像素采样器。 -如下例子是使用 PSPNet 训练并采用 OHEM 策略的配置: - -```python -_base_ = './pspnet_r50-d8_512x1024_40k_cityscapes.py' -model=dict( - decode_head=dict( - sampler=dict(type='OHEMPixelSampler', thresh=0.7, min_kept=100000)) ) -``` - -通过这种方式,只有置信分数在0.7以下的像素值点会被拿来训练。在训练时我们至少要保留100000个像素值点。如果 `thresh` 并未被指定,前 `min_kept` -个损失的像素值点才会被选择。 - -## 类别平衡损失 (Class Balanced Loss) - -对于不平衡类别分布的数据集,您也许可以改变每个类别的损失权重。这里以 cityscapes 数据集为例: - -```python -_base_ = './pspnet_r50-d8_512x1024_40k_cityscapes.py' -model=dict( - decode_head=dict( - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0, - # DeepLab 对 cityscapes 使用这种权重 - class_weight=[0.8373, 0.9180, 0.8660, 1.0345, 1.0166, 0.9969, 0.9754, - 1.0489, 0.8786, 1.0023, 0.9539, 0.9843, 1.1116, 0.9037, - 1.0865, 1.0955, 1.0865, 1.1529, 1.0507]))) -``` - -`class_weight` 将被作为 `weight` 参数,传递给 `CrossEntropyLoss`。详细信息请参照 [PyTorch 文档](https://pytorch.org/docs/stable/nn.html?highlight=crossentropy#torch.nn.CrossEntropyLoss) 。 - -## 
同时使用多种损失函数 (Multiple Losses) - -对于训练时损失函数的计算,我们目前支持多个损失函数同时使用。 以 `unet` 使用 `DRIVE` 数据集训练为例, -使用 `CrossEntropyLoss` 和 `DiceLoss` 的 `1:3` 的加权和作为损失函数。配置文件写为: - -```python -_base_ = './fcn_unet_s5-d16_64x64_40k_drive.py' -model = dict( - decode_head=dict(loss_decode=[dict(type='CrossEntropyLoss', loss_name='loss_ce', loss_weight=1.0), - dict(type='DiceLoss', loss_name='loss_dice', loss_weight=3.0)]), - auxiliary_head=dict(loss_decode=[dict(type='CrossEntropyLoss', loss_name='loss_ce',loss_weight=1.0), - dict(type='DiceLoss', loss_name='loss_dice', loss_weight=3.0)]), - ) -``` - -通过这种方式,确定训练过程中损失函数的权重 `loss_weight` 和在训练日志里的名字 `loss_name`。 - -注意: `loss_name` 的名字必须带有 `loss_` 前缀,这样它才能被包括在反传的图里。 - -## 在损失函数中忽略特定的 label 类别 - -默认设置 `avg_non_ignore=False`, 即每个像素都用来计算损失函数。尽管其中的一些像素属于需要被忽略的类别。 - -对于训练时损失函数的计算,我们目前支持使用 `avg_non_ignore` 和 `ignore_index` 来忽略 label 特定的类别。 这样损失函数将只在非忽略类别像素中求平均值,会获得更好的表现。这里是[相关 PR](https://github.com/open-mmlab/mmsegmentation/pull/1409)。以 `unet` 使用 `Cityscapes` 数据集训练为例, -在计算损失函数时,忽略 label 为0的背景,并且仅在不被忽略的像素上计算均值。配置文件写为: - -```python -_base_ = './fcn_unet_s5-d16_4x4_512x1024_160k_cityscapes.py' -model = dict( - decode_head=dict( - ignore_index=0, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0, avg_non_ignore=True), - auxiliary_head=dict( - ignore_index=0, - loss_decode=dict( - type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0, avg_non_ignore=True)), - )) -``` - -通过这种方式,确定训练过程中损失函数的权重 `loss_weight` 和在训练日志里的名字 `loss_name`。 - -注意: `loss_name` 的名字必须带有 `loss_` 前缀,这样它才能被包括在反传的图里。 diff --git a/docs/zh_cn/useful_tools.md b/docs/zh_cn/useful_tools.md deleted file mode 100644 index d6a498696f..0000000000 --- a/docs/zh_cn/useful_tools.md +++ /dev/null @@ -1,368 +0,0 @@ -## 常用工具 - -除了训练和测试的脚本,我们在 `tools/` 文件夹路径下还提供许多有用的工具。 - -### 计算参数量(params)和计算量( FLOPs) (试验性) - -我们基于 [flops-counter.pytorch](https://github.com/sovrasov/flops-counter.pytorch) -提供了一个用于计算给定模型参数量和计算量的脚本。 - -```shell -python tools/get_flops.py ${CONFIG_FILE} [--shape ${INPUT_SHAPE}] -``` - -您将得到如下的结果: - -```none -============================== -Input shape: (3, 2048, 1024) -Flops: 1429.68 GMac -Params: 48.98 M -============================== -``` - -**注意**: 这个工具仍然是试验性的,我们无法保证数字是正确的。您可以拿这些结果做简单的实验的对照,在写技术文档报告或者论文前您需要再次确认一下。 - -(1) 计算量与输入的形状有关,而参数量与输入的形状无关,默认的输入形状是 (1, 3, 1280, 800); -(2) 一些运算操作,如 GN 和其他定制的运算操作没有加入到计算量的计算中。 - -### 发布模型 - -在您上传一个模型到云服务器之前,您需要做以下几步: -(1) 将模型权重转成 CPU 张量; -(2) 删除记录优化器状态 (optimizer states)的相关信息; -(3) 计算检查点文件 (checkpoint file) 的哈希编码(hash id)并且将哈希编码加到文件名中。 - -```shell -python tools/publish_model.py ${INPUT_FILENAME} ${OUTPUT_FILENAME} -``` - -例如, - -```shell -python tools/publish_model.py work_dirs/pspnet/latest.pth psp_r50_hszhao_200ep.pth -``` - -最终输出文件将是 `psp_r50_512x1024_40ki_cityscapes-{hash id}.pth`。 - -### 导出 ONNX (试验性) - -我们提供了一个脚本来导出模型到 [ONNX](https://github.com/onnx/onnx) 格式。被转换的模型可以通过工具 [Netron](https://github.com/lutzroeder/netron) -来可视化。除此以外,我们同样支持对 PyTorch 和 ONNX 模型的输出结果做对比。 - -```bash -python tools/pytorch2onnx.py \ - ${CONFIG_FILE} \ - --checkpoint ${CHECKPOINT_FILE} \ - --output-file ${ONNX_FILE} \ - --input-img ${INPUT_IMG} \ - --shape ${INPUT_SHAPE} \ - --rescale-shape ${RESCALE_SHAPE} \ - --show \ - --verify \ - --dynamic-export \ - --cfg-options \ - model.test_cfg.mode="whole" -``` - -各个参数的描述: - -- `config` : 模型配置文件的路径 -- `--checkpoint` : 模型检查点文件的路径 -- `--output-file`: 输出的 ONNX 模型的路径。如果没有专门指定,它默认是 `tmp.onnx` -- `--input-img` : 用来转换和可视化的一张输入图像的路径 -- `--shape`: 模型的输入张量的高和宽。如果没有专门指定,它将被设置成 `test_pipeline` 的 `img_scale` -- `--rescale-shape`: 
改变输出的形状。设置这个值来避免 OOM,它仅在 `slide` 模式下可以用 -- `--show`: 是否打印输出模型的结构。如果没有被专门指定,它将被设置成 `False` -- `--verify`: 是否验证一个输出模型的正确性 (correctness)。如果没有被专门指定,它将被设置成 `False` -- `--dynamic-export`: 是否导出形状变化的输入与输出的 ONNX 模型。如果没有被专门指定,它将被设置成 `False` -- `--cfg-options`: 更新配置选项 - -**注意**: 这个工具仍然是试验性的,目前一些自定义操作还没有被支持 - -### 评估 ONNX 模型 - -我们提供 `tools/deploy_test.py` 去评估不同后端的 ONNX 模型。 - -#### 先决条件 - -- 安装 onnx 和 onnxruntime-gpu - - ```shell - pip install onnx onnxruntime-gpu - ``` - -- 参考 [如何在 MMCV 里构建 tensorrt 插件](https://mmcv.readthedocs.io/en/latest/tensorrt_plugin.html#how-to-build-tensorrt-plugins-in-mmcv) 安装TensorRT (可选) - -#### 使用方法 - -```bash -python tools/deploy_test.py \ - ${CONFIG_FILE} \ - ${MODEL_FILE} \ - ${BACKEND} \ - --out ${OUTPUT_FILE} \ - --eval ${EVALUATION_METRICS} \ - --show \ - --show-dir ${SHOW_DIRECTORY} \ - --cfg-options ${CFG_OPTIONS} \ - --eval-options ${EVALUATION_OPTIONS} \ - --opacity ${OPACITY} \ -``` - -各个参数的描述: - -- `config`: 模型配置文件的路径 -- `model`: 被转换的模型文件的路径 -- `backend`: 推理的后端,可选项:`onnxruntime`, `tensorrt` -- `--out`: 输出结果成 pickle 格式文件的路径 -- `--format-only` : 不评估直接给输出结果的格式。通常用在当您想把结果输出成一些测试服务器需要的特定格式时。如果没有被专门指定,它将被设置成 `False`。 注意这个参数是用 `--eval` 来 **手动添加** -- `--eval`: 评估指标,取决于每个数据集的要求,例如 "mIoU" 是大多数据集的指标而 "cityscapes" 仅针对 Cityscapes 数据集。注意这个参数是用 `--format-only` 来 **手动添加** -- `--show`: 是否展示结果 -- `--show-dir`: 涂上结果的图像被保存的文件夹的路径 -- `--cfg-options`: 重写配置文件里的一些设置,`xxx=yyy` 格式的键值对将被覆盖到配置文件里 -- `--eval-options`: 自定义的评估的选项, `xxx=yyy` 格式的键值对将成为 `dataset.evaluate()` 函数的参数变量 -- `--opacity`: 涂上结果的分割图的透明度,范围在 (0, 1\] 之间 - -#### 结果和模型 - -| 模型 | 配置文件 | 数据集 | 评价指标 | PyTorch | ONNXRuntime | TensorRT-fp32 | TensorRT-fp16 | -| :--------: | :---------------------------------------------: | :--------: | :------: | :-----: | :---------: | :-----------: | :-----------: | -| FCN | fcn_r50-d8_512x1024_40k_cityscapes.py | cityscapes | mIoU | 72.2 | 72.2 | 72.2 | 72.2 | -| PSPNet | pspnet_r50-d8_512x1024_40k_cityscapes.py | cityscapes | mIoU | 77.8 | 77.8 | 77.8 | 77.8 | -| deeplabv3 | deeplabv3_r50-d8_512x1024_40k_cityscapes.py | cityscapes | mIoU | 79.0 | 79.0 | 79.0 | 79.0 | -| deeplabv3+ | deeplabv3plus_r50-d8_512x1024_40k_cityscapes.py | cityscapes | mIoU | 79.6 | 79.5 | 79.5 | 79.5 | -| PSPNet | pspnet_r50-d8_769x769_40k_cityscapes.py | cityscapes | mIoU | 78.2 | 78.1 | | | -| deeplabv3 | deeplabv3_r50-d8_769x769_40k_cityscapes.py | cityscapes | mIoU | 78.5 | 78.3 | | | -| deeplabv3+ | deeplabv3plus_r50-d8_769x769_40k_cityscapes.py | cityscapes | mIoU | 78.9 | 78.7 | | | - -**注意**: TensorRT 仅在使用 `whole mode` 测试模式时的配置文件里可用。 - -### 导出 TorchScript (试验性) - -我们同样提供一个脚本去把模型导出成 [TorchScript](https://pytorch.org/docs/stable/jit.html) 格式。您可以使用 pytorch C++ API [LibTorch](https://pytorch.org/docs/stable/cpp_index.html) 去推理训练好的模型。 -被转换的模型能被像 [Netron](https://github.com/lutzroeder/netron) 的工具来可视化。此外,我们还支持 PyTorch 和 TorchScript 模型的输出结果的比较。 - -```shell -python tools/pytorch2torchscript.py \ - ${CONFIG_FILE} \ - --checkpoint ${CHECKPOINT_FILE} \ - --output-file ${ONNX_FILE} - --shape ${INPUT_SHAPE} - --verify \ - --show -``` - -各个参数的描述: - -- `config` : pytorch 模型的配置文件的路径 -- `--checkpoint` : pytorch 模型的检查点文件的路径 -- `--output-file`: TorchScript 模型输出的路径,如果没有被专门指定,它将被设置成 `tmp.pt` -- `--input-img` : 用来转换和可视化的输入图像的路径 -- `--shape`: 模型的输入张量的宽和高。如果没有被专门指定,它将被设置成 `512 512` -- `--show`: 是否打印输出模型的追踪图 (traced graph),如果没有被专门指定,它将被设置成 `False` -- `--verify`: 是否验证一个输出模型的正确性 (correctness),如果没有被专门指定,它将被设置成 `False` - -**注意**: 目前仅支持 PyTorch>=1.8.0 版本 - -**注意**: 这个工具仍然是试验性的,一些自定义操作符目前还不被支持 - -例子: - -- 导出 PSPNet 在 cityscapes 数据集上的 
pytorch 模型 - - ```shell - python tools/pytorch2torchscript.py configs/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes.py \ - --checkpoint checkpoints/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth \ - --output-file checkpoints/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pt \ - --shape 512 1024 - ``` - -### 导出 TensorRT (试验性) - -一个导出 [ONNX](https://github.com/onnx/onnx) 模型成 [TensorRT](https://developer.nvidia.com/tensorrt) 格式的脚本 - -先决条件 - -- 按照 [ONNXRuntime in mmcv](https://mmcv.readthedocs.io/en/latest/deployment/onnxruntime_op.html) 和 [TensorRT plugin in mmcv](https://github.com/open-mmlab/mmcv/blob/master/docs/en/deployment/tensorrt_plugin.md) ,用 ONNXRuntime 自定义运算 (custom ops) 和 TensorRT 插件安装 `mmcv-full` -- 使用 [pytorch2onnx](#convert-to-onnx-experimental) 将模型从 PyTorch 转成 ONNX - -使用方法 - -```bash -python ${MMSEG_PATH}/tools/onnx2tensorrt.py \ - ${CFG_PATH} \ - ${ONNX_PATH} \ - --trt-file ${OUTPUT_TRT_PATH} \ - --min-shape ${MIN_SHAPE} \ - --max-shape ${MAX_SHAPE} \ - --input-img ${INPUT_IMG} \ - --show \ - --verify -``` - -各个参数的描述: - -- `config` : 模型的配置文件 -- `model` : 输入的 ONNX 模型的路径 -- `--trt-file` : 输出的 TensorRT 引擎的路径 -- `--max-shape` : 模型的输入的最大形状 -- `--min-shape` : 模型的输入的最小形状 -- `--fp16` : 做 fp16 模型转换 -- `--workspace-size` : 在 GiB 里的最大工作空间大小 (Max workspace size) -- `--input-img` : 用来可视化的图像 -- `--show` : 做结果的可视化 -- `--dataset` : Palette provider, 默认为 `CityscapesDataset` -- `--verify` : 验证 ONNXRuntime 和 TensorRT 的输出 -- `--verbose` : 当创建 TensorRT 引擎时,是否详细做信息日志。默认为 False - -**注意**: 仅在全图测试模式 (whole mode) 下测试过 - -## 其他内容 - -### 打印完整的配置文件 - -`tools/print_config.py` 会逐字逐句的打印整个配置文件,展开所有的导入。 - -```shell -python tools/print_config.py \ - ${CONFIG} \ - --graph \ - --cfg-options ${OPTIONS [OPTIONS...]} \ -``` - -各个参数的描述: - -- `config` : pytorch 模型的配置文件的路径 -- `--graph` : 是否打印模型的图 (models graph) -- `--cfg-options`: 自定义替换配置文件的选项 - -### 对训练日志 (training logs) 画图 - -`tools/analyze_logs.py` 会画出给定的训练日志文件的 loss/mIoU 曲线,首先需要 `pip install seaborn` 安装依赖包。 - -```shell -python tools/analyze_logs.py xxx.log.json [--keys ${KEYS}] [--legend ${LEGEND}] [--backend ${BACKEND}] [--style ${STYLE}] [--out ${OUT_FILE}] -``` - -示例: - -- 对 mIoU, mAcc, aAcc 指标画图 - - ```shell - python tools/analyze_logs.py log.json --keys mIoU mAcc aAcc --legend mIoU mAcc aAcc - ``` - -- 对 loss 指标画图 - - ```shell - python tools/analyze_logs.py log.json --keys loss --legend loss - ``` - -### 转换其他仓库的权重 - -`tools/model_converters/` 提供了若干个预训练权重转换脚本,支持将其他仓库的预训练权重的 key 转换为与 MMSegmentation 相匹配的 key。 - -#### ViT Swin MiT Transformer 模型 - -- ViT - -`tools/model_converters/vit2mmseg.py` 将 timm 预训练模型转换到 MMSegmentation。 - -```shell -python tools/model_converters/vit2mmseg.py ${SRC} ${DST} -``` - -- Swin - - `tools/model_converters/swin2mmseg.py` 将官方预训练模型转换到 MMSegmentation。 - - ```shell - python tools/model_converters/swin2mmseg.py ${SRC} ${DST} - ``` - -- SegFormer - - `tools/model_converters/mit2mmseg.py` 将官方预训练模型转换到 MMSegmentation。 - - ```shell - python tools/model_converters/mit2mmseg.py ${SRC} ${DST} - ``` - -## 模型服务 - -为了用 [`TorchServe`](https://pytorch.org/serve/) 服务 `MMSegmentation` 的模型 , 您可以遵循如下流程: - -### 1. 将 model 从 MMSegmentation 转换到 TorchServe - -```shell -python tools/mmseg2torchserve.py ${CONFIG_FILE} ${CHECKPOINT_FILE} \ ---output-folder ${MODEL_STORE} \ ---model-name ${MODEL_NAME} -``` - -**注意**: ${MODEL_STORE} 需要设置为某个文件夹的绝对路径 - -### 2. 构建 `mmseg-serve` 容器镜像 (docker image) - -```shell -docker build -t mmseg-serve:latest docker/serve/ -``` - -### 3. 
运行 `mmseg-serve` - -请查阅官方文档: [使用容器运行 TorchServe](https://github.com/pytorch/serve/blob/master/docker/README.md#running-torchserve-in-a-production-docker-environment) - -为了在 GPU 环境下使用, 您需要安装 [nvidia-docker](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). 若在 CPU 环境下使用,您可以忽略添加 `--gpus` 参数。 - -示例: - -```shell -docker run --rm \ ---cpus 8 \ ---gpus device=0 \ --p8080:8080 -p8081:8081 -p8082:8082 \ ---mount type=bind,source=$MODEL_STORE,target=/home/model-server/model-store \ -mmseg-serve:latest -``` - -阅读关于推理 (8080), 管理 (8081) 和指标 (8082) APIs 的 [文档](https://github.com/pytorch/serve/blob/072f5d088cce9bb64b2a18af065886c9b01b317b/docs/rest_api.md) 。 - -### 4. 测试部署 - -```shell -curl -O https://raw.githubusercontent.com/open-mmlab/mmsegmentation/master/resources/3dogs.jpg -curl http://127.0.0.1:8080/predictions/${MODEL_NAME} -T 3dogs.jpg -o 3dogs_mask.png -``` - -得到的响应将是一个 ".png" 的分割掩码. - -您可以按照如下方法可视化输出: - -```python -import matplotlib.pyplot as plt -import mmcv -plt.imshow(mmcv.imread("3dogs_mask.png", "grayscale")) -plt.show() -``` - -看到的东西将会和下图类似: - -![3dogs_mask](../../resources/3dogs_mask.png) - -然后您可以使用 `test_torchserve.py` 比较 torchserve 和 pytorch 的结果,并将它们可视化。 - -```shell -python tools/torchserve/test_torchserve.py ${IMAGE_FILE} ${CONFIG_FILE} ${CHECKPOINT_FILE} ${MODEL_NAME} -[--inference-addr ${INFERENCE_ADDR}] [--result-image ${RESULT_IMAGE}] [--device ${DEVICE}] -``` - -示例: - -```shell -python tools/torchserve/test_torchserve.py \ -demo/demo.png \ -configs/fcn/fcn_r50-d8_512x1024_40k_cityscapes.py \ -checkpoint/fcn_r50-d8_512x1024_40k_cityscapes_20200604_192608-efe53f0d.pth \ -fcn -``` diff --git a/docs/zh_cn/user_guides/1_config.md b/docs/zh_cn/user_guides/1_config.md new file mode 100644 index 0000000000..dfcf0f9655 --- /dev/null +++ b/docs/zh_cn/user_guides/1_config.md @@ -0,0 +1,577 @@ +# 教程1:了解配置文件 + +我们将模块化和继承性设计融入到我们的配置文件系统中,方便进行各种实验。如果您想查看配置文件,你可以运行 `python tools/misc/print_config.py /PATH/TO/CONFIG` 来查看完整的配置文件。你也可以通过传递参数 `--cfg-options xxx.yyy=zzz` 来查看更新的配置信息。 + +## 配置文件的结构 + +在 `config/_base_ ` 文件夹下面有4种基本组件类型: 数据集(dataset),模型(model),训练策略(schedule)和运行时的默认设置(default runtime)。许多模型都可以很容易地通过组合这些组件进行实现,比如 DeepLabV3,PSPNet。使用 `_base_` 下的组件构建的配置信息叫做原始配置 (primitive)。 + +对于同一个文件夹下的所有配置文件,建议**只有一个**对应的**原始配置文件**。所有其他的配置文件都应该继承自这个原始配置文件,从而保证每个配置文件的最大继承深度为 3。 + +为了便于理解,我们建议社区贡献者从现有的方法继承。例如,如果您在 DeepLabV3 基础上进行了一些修改,用户可以先通过指定 `_base_ = ../deeplabv3/deeplabv3_r50-d8_4xb2-40k_cityscapes-512x1024.py` 继承基本的 DeepLabV3 结构,然后在配置文件中修改必要的字段。 + +如果你正在构建一个全新的方法,它不与现有的任何方法共享基本组件,您可以在`config`下创建一个新的文件夹`xxxnet` ,详细文档请参考[mmengine](https://mmengine.readthedocs.io/en/latest/tutorials/config.html)。 + +## 配置文件命名风格 + +我们遵循以下格式来命名配置文件,建议社区贡献者遵循相同的风格。 + +```text +{algorithm name}_{model component names [component1]_[component2]_[...]}_{training settings}_{training dataset information}_{testing dataset information} +``` + +配置文件的文件名分为五个部分,组成文件名每一个部分和组件之间都用`_`连接,每个部分或组件中的每个单词都要用`-`连接。 + +- `{algorithm name}`: 算法的名称,如 `deeplabv3`, `pspnet` 等。 +- `{model component names}`: 算法中使用的组件名称,如主干(backbone)、解码头(head)等。例如,`r50-d8 `表示使用ResNet50主干网络,并使用主干网络的8倍下采样输出作为下一级的输入。 +- `{training settings}`: 训练时的参数设置,如 `batch size`、数据增强(augmentation)、损失函数(loss)、学习率调度器(learning rate scheduler)和训练轮数(epochs/iterations)。例如: `4xb4-ce-linearlr-40K` 意味着使用4个gpu,每个gpu4个图像,使用交叉熵损失函数(CrossEntropy),线性学习率调度程序,训练40K iterations。 + 一些缩写: + - `{gpu x batch_per_gpu}`: GPU数量和每个GPU的样本数。`bN ` 表示每个GPU的batch size为N,如 `8xb2` 为8个gpu x 每个gpu2张图像的缩写。如果未提及,则默认使用 `4xb4 `。 + - `{schedule}`: 训练计划,选项有`20k`,`40k`等。`20k ` 和 `40k` 
分别表示20000次迭代(iterations)和40000次迭代(iterations)。 +- `{training dataset information}`: 训练数据集名称,如 `cityscapes `, `ade20k ` 等,以及输入分辨率。例如: `cityscapes-768x768 `表示使用 `cityscapes` 数据集进行训练,输入分辨率为`768x768 `。 +- `{testing dataset information}` (可选): 测试数据集名称。当您的模型在一个数据集上训练但在另一个数据集上测试时,请将测试数据集名称添加到此处。如果没有这一部分,则意味着模型是在同一个数据集上进行训练和测试的。 + +## PSPNet 的一个例子 + +为了帮助用户熟悉对这个现代语义分割系统的完整配置文件和模块,我们对使用ResNet50V1c作为主干网络的PSPNet的配置文件作如下的简要注释和说明。要了解更详细的用法和每个模块对应的替换方法,请参阅API文档。 + +```python +_base_ = [ + '../_base_/models/pspnet_r50-d8.py', '../_base_/datasets/cityscapes.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_40k.py' +] # 我们可以在基本配置文件的基础上 构建新的配置文件 +crop_size = (512, 1024) +data_preprocessor = dict(size=crop_size) +model = dict(data_preprocessor=data_preprocessor) +``` + +`_base_/models/pspnet_r50-d8.py`是使用ResNet50V1c作为主干网络的PSPNet的基本模型配置文件。 + +```python +# 模型设置 +norm_cfg = dict(type='SyncBN', requires_grad=True) # 分割框架通常使用 SyncBN +data_preprocessor = dict( # 数据预处理的配置项,通常包括图像的归一化和增强 + type='SegDataPreProcessor', # 数据预处理的类型 + mean=[123.675, 116.28, 103.53], # 用于归一化输入图像的平均值 + std=[58.395, 57.12, 57.375], # 用于归一化输入图像的标准差 + bgr_to_rgb=True, # 是否将图像从 BGR 转为 RGB + pad_val=0, # 图像的填充值 + seg_pad_val=255) # 'gt_seg_map'的填充值 +model = dict( + type='EncoderDecoder', # 分割器(segmentor)的名字 + data_preprocessor=data_preprocessor, + pretrained='open-mmlab://resnet50_v1c', # 加载使用 ImageNet 预训练的主干网络 + backbone=dict( + type='ResNetV1c', # 主干网络的类别,更多细节请参考 mmseg/models/backbones/resnet.py + depth=50, # 主干网络的深度,通常为 50 和 101 + num_stages=4, # 主干网络状态(stages)的数目 + out_indices=(0, 1, 2, 3), # 每个状态(stage)产生的特征图输出的索引 + dilations=(1, 1, 2, 4), # 每一层(layer)的空心率(dilation rate) + strides=(1, 2, 1, 1), # 每一层(layer)的步长(stride) + norm_cfg=norm_cfg, # 归一化层(norm layer)的配置项 + norm_eval=False, # 是否冻结 BN 里的统计项 + style='pytorch', # 主干网络的风格,'pytorch' 意思是步长为2的层为 3x3 卷积, 'caffe' 意思是步长为2的层为 1x1 卷积 + contract_dilation=True), # 当空洞率 > 1, 是否压缩第一个空洞层 + decode_head=dict( + type='PSPHead', # 解码头(decode head)的类别。可用选项请参 mmseg/models/decode_heads + in_channels=2048, # 解码头的输入通道数 + in_index=3, # 被选择特征图(feature map)的索引 + channels=512, # 解码头中间态(intermediate)的通道数 + pool_scales=(1, 2, 3, 6), # PSPHead 平均池化(avg pooling)的规模(scales)。 细节请参考文章内容 + dropout_ratio=0.1, # 进入最后分类层(classification layer)之前的 dropout 比例 + num_classes=19, # 分割前景的种类数目。 通常情况下,cityscapes 为19,VOC为21,ADE20k 为150 + norm_cfg=norm_cfg, # 归一化层的配置项 + align_corners=False, # 解码过程中调整大小(resize)的 align_corners 参数 + loss_decode=dict( # 解码头(decode_head)里的损失函数的配置项 + type='CrossEntropyLoss', # 分割时使用的损失函数的类别 + use_sigmoid=False, # 分割时是否使用 sigmoid 激活 + loss_weight=1.0)), # 解码头的损失权重 + auxiliary_head=dict( + type='FCNHead', # 辅助头(auxiliary head)的种类。可用选项请参考 mmseg/models/decode_heads + in_channels=1024, # 辅助头的输入通道数 + in_index=2, # 被选择的特征图(feature map)的索引 + channels=256, # 辅助头中间态(intermediate)的通道数 + num_convs=1, # FCNHead 里卷积(convs)的数目,辅助头中通常为1 + concat_input=False, # 在分类层(classification layer)之前是否连接(concat)输入和卷积的输出 + dropout_ratio=0.1, # 进入最后分类层(classification layer)之前的 dropout 比例 + num_classes=19, # 分割前景的种类数目。 通常情况下,cityscapes 为19,VOC为21,ADE20k 为150 + norm_cfg=norm_cfg, # 归一化层的配置项 + align_corners=False, # 解码过程中调整大小(resize)的 align_corners 参数 + loss_decode=dict( # 辅助头(auxiliary head)里的损失函数的配置项 + type='CrossEntropyLoss', # 分割时使用的损失函数的类别 + use_sigmoid=False, # 分割时是否使用 sigmoid 激活 + loss_weight=0.4)), # 辅助头损失的权重,默认设置为0.4 + # 模型训练和测试设置项 + train_cfg=dict(), # train_cfg 当前仅是一个占位符 + test_cfg=dict(mode='whole')) # 测试模式,可选参数为 'whole' 和 'slide'. 
'whole': 在整张图像上做全卷积(fully-convolutional)测试。'slide': 在输入图像上做滑窗预测
+```
+
+`_base_/datasets/cityscapes.py` 是数据集的基本配置文件。
+
+```python
+# 数据集设置
+dataset_type = 'CityscapesDataset' # 数据集类型,这将被用来定义数据集
+data_root = 'data/cityscapes/' # 数据的根路径
+crop_size = (512, 1024) # 训练时的裁剪大小
+train_pipeline = [ # 训练流程
+    dict(type='LoadImageFromFile'), # 第1个流程,从文件路径里加载图像
+    dict(type='LoadAnnotations'), # 第2个流程,对于当前图像,加载它的标注图像
+    dict(type='RandomResize', # 调整输入图像大小(resize)和其标注图像的数据增广流程
+        scale=(2048, 1024), # 图像缩放的大小
+        ratio_range=(0.5, 2.0), # 数据增广的比例范围
+        keep_ratio=True), # 调整图像大小时是否保持纵横比
+    dict(type='RandomCrop', # 随机裁剪当前图像和其标注图像的数据增广流程
+        crop_size=crop_size, # 随机裁剪的大小
+        cat_max_ratio=0.75), # 单个类别可以填充的最大区域的比例
+    dict(type='RandomFlip', # 翻转图像和其标注图像的数据增广流程
+        prob=0.5), # 翻转图像的概率
+    dict(type='PhotoMetricDistortion'), # 光学上使用一些方法扭曲当前图像和其标注图像的数据增广流程
+    dict(type='PackSegInputs') # 打包用于语义分割的输入数据
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'), # 第1个流程,从文件路径里加载图像
+    dict(type='Resize', # 使用调整图像大小(resize)增强
+        scale=(2048, 1024), # 图像缩放的大小
+        keep_ratio=True), # 在调整图像大小时是否保留长宽比
+    # 在 `Resize` 之后加载标注图像
+    # 标注不需要做调整图像大小(resize)的数据变换
+    dict(type='LoadAnnotations'), # 加载数据集提供的语义分割标注
+    dict(type='PackSegInputs') # 打包用于语义分割的输入数据
+]
+train_dataloader = dict( # 训练数据加载器(dataloader)的配置
+    batch_size=2, # 每一个GPU的batch size大小
+    num_workers=2, # 为每一个GPU预读取数据的进程个数
+    persistent_workers=True, # 在一个epoch结束后不销毁worker进程,可以加快训练速度
+    sampler=dict(type='InfiniteSampler', shuffle=True), # 训练时进行随机洗牌(shuffle)
+    dataset=dict( # 训练数据集配置
+        type=dataset_type, # 数据集类型,详见 mmseg/datasets/
+        data_root=data_root, # 数据集的根目录
+        data_prefix=dict(
+            img_path='leftImg8bit/train', seg_map_path='gtFine/train'), # 训练数据的前缀
+        pipeline=train_pipeline)) # 数据处理流程,它通过之前创建的 train_pipeline 传递
+val_dataloader = dict(
+    batch_size=1, # 每一个GPU的batch size大小
+    num_workers=4, # 为每一个GPU预读取数据的进程个数
+    persistent_workers=True, # 在一个epoch结束后不销毁worker进程,可以加快训练速度
+    sampler=dict(type='DefaultSampler', shuffle=False), # 验证时不进行随机洗牌(shuffle)
+    dataset=dict( # 验证数据集配置
+        type=dataset_type, # 数据集类型,详见 mmseg/datasets/
+        data_root=data_root, # 数据集的根目录
+        data_prefix=dict(
+            img_path='leftImg8bit/val', seg_map_path='gtFine/val'), # 验证数据的前缀
+        pipeline=test_pipeline)) # 数据处理流程,它通过之前创建的 test_pipeline 传递
+test_dataloader = val_dataloader
+# 精度评估方法,我们在这里使用 IoUMetric 进行评估
+val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU'])
+test_evaluator = val_evaluator
+```
+
+`_base_/schedules/schedule_40k.py` 是训练策略的基本配置文件。
+
+```python
+# 优化器
+optimizer = dict(type='SGD', # 优化器种类,更多细节可参考 https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/optimizer/default_constructor.py
+                lr=0.01, # 优化器的学习率,参数的使用细节请参照对应的 PyTorch 文档
+                momentum=0.9, # 动量大小 (Momentum)
+                weight_decay=0.0005) # SGD 的权重衰减 (weight decay)
+optim_wrapper = dict(type='OptimWrapper', # 优化器包装器(Optimizer wrapper)为更新参数提供了一个公共接口
+                optimizer=optimizer, # 用于更新模型参数的优化器(Optimizer)
+                clip_grad=None) # 如果 'clip_grad' 不是 None,它将作为 `torch.nn.utils.clip_grad` 的参数
+# 学习策略
+param_scheduler = [
+    dict(
+        type='PolyLR', # 调度流程的策略,同样支持 Step, CosineAnnealing, Cyclic 等.
请从 https://github.com/open-mmlab/mmengine/blob/main/mmengine/optim/scheduler/lr_scheduler.py 参考 LrUpdater 的细节 + eta_min=1e-4, # 训练结束时的最小学习率 + power=0.9, # 多项式衰减 (polynomial decay) 的幂 + begin=0, # 开始更新参数的时间步(step) + end=40000, # 停止更新参数的时间步(step) + by_epoch=False) # 是否按照 epoch 计算训练时间 +] +# 40k iteration 的训练计划 +train_cfg = dict(type='IterBasedTrainLoop', max_iters=40000, val_interval=4000) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +# 默认钩子(hook)配置 +default_hooks = dict( + timer=dict(type='IterTimerHook'), # 记录迭代过程中花费的时间 + logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False), # 从'Runner'的不同组件收集和写入日志 + param_scheduler=dict(type='ParamSchedulerHook'), # 更新优化器中的一些超参数,例如学习率 + checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=4000), # 定期保存检查点(checkpoint) + sampler_seed=dict(type='DistSamplerSeedHook')) # 用于分布式训练的数据加载采样器 +``` + +in `_base_/default_runtime.py` + +```python +# 将注册表的默认范围设置为mmseg +default_scope = 'mmseg' +# environment +env_cfg = dict( + cudnn_benchmark=True, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl'), +) +log_level = 'INFO' +log_processor = dict(by_epoch=False) +load_from = None # 从文件中加载检查点(checkpoint) +resume = False # 是否从已有的模型恢复 +``` + +这些都是用于训练和测试PSPNet的配置文件,要加载和解析它们,我们可以使用[MMEngine](https://github.com/open-mmlab/mmengine)实现的[Config](https://mmengine.readthedocs.io/en/latest/tutorials/config.html)。 + +```python +from mmengine.config import Config + +cfg = Config.fromfile('configs/pspnet/pspnet_r50-d8_4xb2-40k_cityscapes-512x1024.py') +print(cfg.train_dataloader) +``` + +```shell +{'batch_size': 2, + 'num_workers': 2, + 'persistent_workers': True, + 'sampler': {'type': 'InfiniteSampler', 'shuffle': True}, + 'dataset': {'type': 'CityscapesDataset', + 'data_root': 'data/cityscapes/', + 'data_prefix': {'img_path': 'leftImg8bit/train', + 'seg_map_path': 'gtFine/train'}, + 'pipeline': [{'type': 'LoadImageFromFile'}, + {'type': 'LoadAnnotations'}, + {'type': 'RandomResize', + 'scale': (2048, 1024), + 'ratio_range': (0.5, 2.0), + 'keep_ratio': True}, + {'type': 'RandomCrop', 'crop_size': (512, 1024), 'cat_max_ratio': 0.75}, + {'type': 'RandomFlip', 'prob': 0.5}, + {'type': 'PhotoMetricDistortion'}, + {'type': 'PackSegInputs'}]}} +``` + +`cfg `是`mmengine.config.Config `的一个实例。它的接口与dict对象相同,也允许将配置值作为属性访问。更多信息请参见[MMEngine](https://github.com/open-mmlab/mmengine)中的[config tutorial](https://mmengine.readthedocs.io/en/latest/tutorials/config.html)。 + +## FAQ + +### 忽略基础配置文件里的一些字段 + +有时,您可以设置`_delete_=True `来忽略基本配置文件中的某些字段。您可以参考[MMEngine](https://github.com/open-mmlab/mmengine)中的[config tutorial](https://mmengine.readthedocs.io/en/latest/tutorials/config.html)来获得一些简单的指导。 + +例如,在MMSegmentation中,如果您想在下面的配置文件`pspnet.py `中修改PSPNet的主干网络: + +```python +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + type='EncoderDecoder', + pretrained='torchvision://resnet50', + backbone=dict( + type='ResNetV1c', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + dilations=(1, 1, 2, 4), + strides=(1, 2, 1, 1), + norm_cfg=norm_cfg, + norm_eval=False, + style='pytorch', + contract_dilation=True), + decode_head=dict( + type='PSPHead', + in_channels=2048, + in_index=3, + channels=512, + pool_scales=(1, 2, 3, 6), + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0))) +``` + +用以下代码加载并解析配置文件`pspnet.py`: + +```python +from mmengine.config import Config + +cfg = 
Config.fromfile('pspnet.py')
+print(cfg.model)
+```
+
+```shell
+{'type': 'EncoderDecoder',
+ 'pretrained': 'torchvision://resnet50',
+ 'backbone': {'type': 'ResNetV1c',
+  'depth': 50,
+  'num_stages': 4,
+  'out_indices': (0, 1, 2, 3),
+  'dilations': (1, 1, 2, 4),
+  'strides': (1, 2, 1, 1),
+  'norm_cfg': {'type': 'SyncBN', 'requires_grad': True},
+  'norm_eval': False,
+  'style': 'pytorch',
+  'contract_dilation': True},
+ 'decode_head': {'type': 'PSPHead',
+  'in_channels': 2048,
+  'in_index': 3,
+  'channels': 512,
+  'pool_scales': (1, 2, 3, 6),
+  'dropout_ratio': 0.1,
+  'num_classes': 19,
+  'norm_cfg': {'type': 'SyncBN', 'requires_grad': True},
+  'align_corners': False,
+  'loss_decode': {'type': 'CrossEntropyLoss',
+   'use_sigmoid': False,
+   'loss_weight': 1.0}}}
+```
+
+`ResNet` 和 `HRNet` 使用不同的关键字构建。编写一个新的配置文件 `hrnet.py`,如下所示:
+
+```python
+_base_ = 'pspnet.py'
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+    pretrained='open-mmlab://msra/hrnetv2_w32',
+    backbone=dict(
+        _delete_=True,
+        type='HRNet',
+        norm_cfg=norm_cfg,
+        extra=dict(
+            stage1=dict(
+                num_modules=1,
+                num_branches=1,
+                block='BOTTLENECK',
+                num_blocks=(4, ),
+                num_channels=(64, )),
+            stage2=dict(
+                num_modules=1,
+                num_branches=2,
+                block='BASIC',
+                num_blocks=(4, 4),
+                num_channels=(32, 64)),
+            stage3=dict(
+                num_modules=4,
+                num_branches=3,
+                block='BASIC',
+                num_blocks=(4, 4, 4),
+                num_channels=(32, 64, 128)),
+            stage4=dict(
+                num_modules=3,
+                num_branches=4,
+                block='BASIC',
+                num_blocks=(4, 4, 4, 4),
+                num_channels=(32, 64, 128, 256)))))
+```
+
+用以下代码加载并解析配置文件 `hrnet.py`:
+
+```python
+from mmengine.config import Config
+cfg = Config.fromfile('hrnet.py')
+print(cfg.model)
+```
+
+```shell
+{'type': 'EncoderDecoder',
+ 'pretrained': 'open-mmlab://msra/hrnetv2_w32',
+ 'backbone': {'type': 'HRNet',
+  'norm_cfg': {'type': 'SyncBN', 'requires_grad': True},
+  'extra': {'stage1': {'num_modules': 1,
+    'num_branches': 1,
+    'block': 'BOTTLENECK',
+    'num_blocks': (4,),
+    'num_channels': (64,)},
+   'stage2': {'num_modules': 1,
+    'num_branches': 2,
+    'block': 'BASIC',
+    'num_blocks': (4, 4),
+    'num_channels': (32, 64)},
+   'stage3': {'num_modules': 4,
+    'num_branches': 3,
+    'block': 'BASIC',
+    'num_blocks': (4, 4, 4),
+    'num_channels': (32, 64, 128)},
+   'stage4': {'num_modules': 3,
+    'num_branches': 4,
+    'block': 'BASIC',
+    'num_blocks': (4, 4, 4, 4),
+    'num_channels': (32, 64, 128, 256)}}},
+ 'decode_head': {'type': 'PSPHead',
+  'in_channels': 2048,
+  'in_index': 3,
+  'channels': 512,
+  'pool_scales': (1, 2, 3, 6),
+  'dropout_ratio': 0.1,
+  'num_classes': 19,
+  'norm_cfg': {'type': 'SyncBN', 'requires_grad': True},
+  'align_corners': False,
+  'loss_decode': {'type': 'CrossEntropyLoss',
+   'use_sigmoid': False,
+   'loss_weight': 1.0}}}
+```
+
+`_delete_=True` 将用新的键去替换 `backbone` 字段内所有旧的键。
+
+### 使用配置文件里的中间变量
+
+配置文件中会使用一些中间变量,例如数据集(datasets)字段里的 `train_pipeline`/`test_pipeline`。需要注意的是,在子配置文件里修改中间变量时,您需要再次传递这些变量给对应的字段。例如,我们想改变在训练或测试 PSPNet 时采用的多尺度策略(multi scale strategy),`train_pipeline`/`test_pipeline` 就是我们需要修改的中间变量。
+
+```python
+_base_ = '../pspnet/pspnet_r50-d8_4xb4-40k_cityscapes-512x1024.py'
+crop_size = (512, 1024)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations'),
+    dict(type='RandomResize',
+        scale=(2048, 1024),
+        ratio_range=(1., 2.),
+        keep_ratio=True),
+    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PhotoMetricDistortion'),
+    dict(type='PackSegInputs'),
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='Resize',
+        scale=(2048, 1024),
+        keep_ratio=True),
+    dict(type='LoadAnnotations'),
+    dict(type='PackSegInputs')
+]
+train_dataset = dict(
+    type=dataset_type,
+    data_root=data_root,
+    data_prefix=dict(
+        img_path='leftImg8bit/train', seg_map_path='gtFine/train'),
+    pipeline=train_pipeline)
+test_dataset = dict(
+    type=dataset_type,
+    data_root=data_root,
+    data_prefix=dict(
+        img_path='leftImg8bit/val', seg_map_path='gtFine/val'),
+    pipeline=test_pipeline)
+train_dataloader = dict(dataset=train_dataset)
+val_dataloader = dict(dataset=test_dataset)
+test_dataloader = val_dataloader
+```
+
+我们首先需要定义新的 `train_pipeline`/`test_pipeline`,然后传递到 `dataset` 里。
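+
+可以像前文一样用 `Config.fromfile` 加载合并后的配置,检查中间变量是否真的被传递到了 dataloader 中(这里假设上述子配置保存为 `pspnet_multiscale.py`,文件名仅作演示):
+
+```python
+from mmengine.config import Config
+
+cfg = Config.fromfile('pspnet_multiscale.py')
+# 若覆盖成功,输出的 pipeline 中 RandomResize 的 ratio_range 应为 (1.0, 2.0)
+print(cfg.train_dataloader.dataset.pipeline)
+```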
+
+类似的,如果我们想从 `SyncBN` 切换到 `BN` 或者 `MMSyncBN`,我们需要替换配置文件里的每一个 `norm_cfg`。
+
+```python
+_base_ = '../pspnet/pspnet_r50-d8_4xb4-40k_cityscapes-512x1024.py'
+norm_cfg = dict(type='BN', requires_grad=True)
+model = dict(
+    backbone=dict(norm_cfg=norm_cfg),
+    decode_head=dict(norm_cfg=norm_cfg),
+    auxiliary_head=dict(norm_cfg=norm_cfg))
+```
+
+## 通过脚本参数修改配置文件
+
+在 [training script](https://github.com/open-mmlab/mmsegmentation/blob/1.x/tools/train.py) 和 [testing script](https://github.com/open-mmlab/mmsegmentation/blob/1.x/tools/test.py) 中,我们支持脚本参数 `--cfg-options`,它可以帮助用户覆盖所使用的配置中的一些设置:`xxx=yyy` 格式的键值对将合并到配置文件中。
+
+例如,这是一个简化的脚本 `demo_script.py`:
+
+```python
+import argparse
+
+from mmengine.config import Config, DictAction
+
+def parse_args():
+    parser = argparse.ArgumentParser(description='Script Example')
+    parser.add_argument('config', help='train config file path')
+    parser.add_argument(
+        '--cfg-options',
+        nargs='+',
+        action=DictAction,
+        help='override some settings in the used config, the key-value pair '
+        'in xxx=yyy format will be merged into config file. If the value to '
+        'be overwritten is a list, it should be like key="[a,b]" or key=a,b '
+        'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" '
+        'Note that the quotation marks are necessary and that no white space '
+        'is allowed.')
+    args = parser.parse_args()
+    return args
+
+def main():
+    args = parse_args()
+
+    cfg = Config.fromfile(args.config)
+    if args.cfg_options is not None:
+        cfg.merge_from_dict(args.cfg_options)
+
+    print(cfg)
+
+if __name__ == '__main__':
+    main()
+```
+
+一个配置文件示例 `demo_config.py` 如下所示:
+
+```python
+backbone = dict(
+    type='ResNetV1c',
+    depth=50,
+    num_stages=4,
+    out_indices=(0, 1, 2, 3),
+    dilations=(1, 1, 2, 4),
+    strides=(1, 2, 1, 1),
+    norm_eval=False,
+    style='pytorch',
+    contract_dilation=True)
+```
+
+运行 `demo_script.py`:
+
+```shell
+python demo_script.py demo_config.py
+```
+
+```shell
+Config (path: demo_config.py): {'backbone': {'type': 'ResNetV1c', 'depth': 50, 'num_stages': 4, 'out_indices': (0, 1, 2, 3), 'dilations': (1, 1, 2, 4), 'strides': (1, 2, 1, 1), 'norm_eval': False, 'style': 'pytorch', 'contract_dilation': True}}
+```
+
+通过脚本参数修改配置:
+
+```shell
+python demo_script.py demo_config.py --cfg-options backbone.depth=101
+```
+
+```shell
+Config (path: demo_config.py): {'backbone': {'type': 'ResNetV1c', 'depth': 101, 'num_stages': 4, 'out_indices': (0, 1, 2, 3), 'dilations': (1, 1, 2, 4), 'strides': (1, 2, 1, 1), 'norm_eval': False, 'style': 'pytorch', 'contract_dilation': True}}
+```
+
+- 更新列表/元组的值。
+
+  如果要更新的值是一个 list 或 tuple,例如配置文件 `demo_config.py` 的 `backbone` 中已有 `strides=(1, 2, 1, 1)`,您想更改这个键,可以用两种方式进行指定:
+
+  1. `--cfg-options backbone.strides="(1, 1, 1, 1)"`.
注意引号 " 是支持 list/tuple 数据类型所必需的。 + + ```shell + python demo_script.py demo_config.py --cfg-options backbone.strides="(1, 1, 1, 1)" + ``` + + ```shell + Config (path: demo_config.py): {'backbone': {'type': 'ResNetV1c', 'depth': 50, 'num_stages': 4, 'out_indices': (0, 1, 2, 3), 'dilations': (1, 1, 2, 4), 'strides': (1, 1, 1, 1), 'norm_eval': False, 'style': 'pytorch', 'contract_dilation': True}} + ``` + + 2. `--cfg-options backbone.strides=1,1,1,1`. 注意,在指定的值中**不允许**有空格。 + + 另外,如果原来的类型是tuple,通过这种方式修改后会自动转换为list。 + + ```shell + python demo_script.py demo_config.py --cfg-options backbone.strides=1,1,1,1 + ``` + + ```shell + Config (path: demo_config.py): {'backbone': {'type': 'ResNetV1c', 'depth': 50, 'num_stages': 4, 'out_indices': (0, 1, 2, 3), 'dilations': (1, 1, 2, 4), 'strides': [1, 1, 1, 1], 'norm_eval': False, 'style': 'pytorch', 'contract_dilation': True}} + ``` + +```{note} + 这种修改方法仅支持修改string、int、float、boolean、None、list和tuple类型的配置项。 + 具体来说,对于list和tuple类型的配置项,它们内部的元素也必须是上述七种类型之一。 +``` diff --git a/docs/zh_cn/user_guides/2_dataset_prepare.md b/docs/zh_cn/user_guides/2_dataset_prepare.md new file mode 100644 index 0000000000..5532624bef --- /dev/null +++ b/docs/zh_cn/user_guides/2_dataset_prepare.md @@ -0,0 +1,750 @@ +# 教程2:准备数据集 + +我们建议将数据集根目录符号链接到 `$MMSEGMENTATION/data`。 +如果您的目录结构不同,您可能需要更改配置文件中相应的路径。 +对于中国境内的用户,我们也推荐通过开源数据平台 [OpenDataLab](https://opendatalab.com/) 来下载dsdl标准数据,以获得更好的下载和使用体验,这里有一个下载dsdl数据集并进行训练的案例[DSDLReadme](../../../configs/dsdl/README.md),欢迎尝试。 + +```none +mmsegmentation +├── mmseg +├── tools +├── configs +├── data +│ ├── cityscapes +│ │ ├── leftImg8bit +│ │ │ ├── train +│ │ │ ├── val +│ │ ├── gtFine +│ │ │ ├── train +│ │ │ ├── val +│ ├── VOCdevkit +│ │ ├── VOC2012 +│ │ │ ├── JPEGImages +│ │ │ ├── SegmentationClass +│ │ │ ├── ImageSets +│ │ │ │ ├── Segmentation +│ │ ├── VOC2010 +│ │ │ ├── JPEGImages +│ │ │ ├── SegmentationClassContext +│ │ │ ├── ImageSets +│ │ │ │ ├── SegmentationContext +│ │ │ │ │ ├── train.txt +│ │ │ │ │ ├── val.txt +│ │ │ ├── trainval_merged.json +│ │ ├── VOCaug +│ │ │ ├── dataset +│ │ │ │ ├── cls +│ ├── ade +│ │ ├── ADEChallengeData2016 +│ │ │ ├── annotations +│ │ │ │ ├── training +│ │ │ │ ├── validation +│ │ │ ├── images +│ │ │ │ ├── training +│ │ │ │ ├── validation +│ ├── coco_stuff10k +│ │ ├── images +│ │ │ ├── train2014 +│ │ │ ├── test2014 +│ │ ├── annotations +│ │ │ ├── train2014 +│ │ │ ├── test2014 +│ │ ├── imagesLists +│ │ │ ├── train.txt +│ │ │ ├── test.txt +│ │ │ ├── all.txt +│ ├── coco_stuff164k +│ │ ├── images +│ │ │ ├── train2017 +│ │ │ ├── val2017 +│ │ ├── annotations +│ │ │ ├── train2017 +│ │ │ ├── val2017 +│ ├── CHASE_DB1 +│ │ ├── images +│ │ │ ├── training +│ │ │ ├── validation +│ │ ├── annotations +│ │ │ ├── training +│ │ │ ├── validation +│ ├── DRIVE +│ │ ├── images +│ │ │ ├── training +│ │ │ ├── validation +│ │ ├── annotations +│ │ │ ├── training +│ │ │ ├── validation +│ ├── HRF +│ │ ├── images +│ │ │ ├── training +│ │ │ ├── validation +│ │ ├── annotations +│ │ │ ├── training +│ │ │ ├── validation +│ ├── STARE +│ │ ├── images +│ │ │ ├── training +│ │ │ ├── validation +│ │ ├── annotations +│ │ │ ├── training +│ │ │ ├── validation +| ├── dark_zurich +| │   ├── gps +| │   │   ├── val +| │   │   └── val_ref +| │   ├── gt +| │   │   └── val +| │   ├── LICENSE.txt +| │   ├── lists_file_names +| │   │   ├── val_filenames.txt +| │   │   └── val_ref_filenames.txt +| │   ├── README.md +| │   └── rgb_anon +| │   | ├── val +| │   | └── val_ref +| ├── NighttimeDrivingTest +| | ├── gtCoarse_daytime_trainvaltest +| | │   └── test +| | │   └── 
night +| | └── leftImg8bit +| | | └── test +| | | └── night +│ ├── loveDA +│ │ ├── img_dir +│ │ │ ├── train +│ │ │ ├── val +│ │ │ ├── test +│ │ ├── ann_dir +│ │ │ ├── train +│ │ │ ├── val +│ ├── potsdam +│ │ ├── img_dir +│ │ │ ├── train +│ │ │ ├── val +│ │ ├── ann_dir +│ │ │ ├── train +│ │ │ ├── val +│ ├── vaihingen +│ │ ├── img_dir +│ │ │ ├── train +│ │ │ ├── val +│ │ ├── ann_dir +│ │ │ ├── train +│ │ │ ├── val +│ ├── iSAID +│ │ ├── img_dir +│ │ │ ├── train +│ │ │ ├── val +│ │ │ ├── test +│ │ ├── ann_dir +│ │ │ ├── train +│ │ │ ├── val +│ ├── synapse +│ │ ├── img_dir +│ │ │ ├── train +│ │ │ ├── val +│ │ ├── ann_dir +│ │ │ ├── train +│ │ │ ├── val +│ ├── REFUGE +│ │ ├── images +│ │ │ ├── training +│ │ │ ├── validation +│ │ │ ├── test +│ │ ├── annotations +│ │ │ ├── training +│ │ │ ├── validation +│ │ │ ├── test +│ ├── mapillary +│ │ ├── training +│ │ │ ├── images +│ │ │ ├── v1.2 +| │ │ │ ├── instances +| │ │ │ ├── labels +| │   │   │ └── panoptic +│ │ │ ├── v2.0 +| │ │ │ ├── instances +| │ │ │ ├── labels +| │ │ │ ├── panoptic +| │   │   │ └── polygons +│ │ ├── validation +│ │ │ ├── images +| │ │ ├── v1.2 +| │ │ │ ├── instances +| │ │ │ ├── labels +| │   │   │ └── panoptic +│ │ │ ├── v2.0 +| │ │ │ ├── instances +| │ │ │ ├── labels +| │ │ │ ├── panoptic +| │   │   │ └── polygons +│ ├── bdd100k +│ │ ├── images +│ │ │ └── 10k +| │ │ │ ├── test +| │ │ │ ├── train +| │   │   │ └── val +│ │ └── labels +│ │ │ └── sem_seg +| │ │ │ ├── colormaps +| │ │ │ │ ├──train +| │ │ │ │ └──val +| │ │ │ ├── masks +| │ │ │ │ ├──train +| │ │ │ │ └──val +| │ │ │ ├── polygons +| │ │ │ │ ├──sem_seg_train.json +| │ │ │ │ └──sem_seg_val.json +| │   │   │ └── rles +| │ │ │ │ ├──sem_seg_train.json +| │ │ │ │ └──sem_seg_val.json +│ ├── nyu +│ │ ├── images +│ │ │ ├── train +│ │ │ ├── test +│ │ ├── annotations +│ │ │ ├── train +│ │ │ ├── test +``` + +## 用 MIM 下载数据集 + +通过使用 [OpenXLab](https://openxlab.org.cn/datasets),您可以直接下载开源数据集。通过平台的搜索功能,您可以快速轻松地找到他们正在寻找的数据集。使用平台上的格式化数据集,您可以高效地跨数据集执行任务。 + +如果您使用 MIM 下载,请确保版本大于 v0.3.8。您可以使用以下命令进行更新、安装、登录和数据集下载: + +```shell +# upgrade your MIM +pip install -U openmim + +# install OpenXLab CLI tools +pip install -U openxlab +# log in OpenXLab +openxlab login + +# download ADE20K by MIM +mim download mmsegmentation --dataset ade20k +``` + +## Cityscapes + +Cityscapes [官方网站](https://www.cityscapes-dataset.com/)可以下载 Cityscapes 数据集,按照官网要求注册并登陆后,数据可以在[这里](https://www.cityscapes-dataset.com/downloads/)找到。 + +按照惯例,`**labelTrainIds.png` 用于 cityscapes 训练。 +我们提供了一个基于 [cityscapesscripts](https://github.com/mcordts/cityscapesScripts) 的[脚本](https://github.com/open-mmlab/mmsegmentation/blob/1.x/tools/dataset_converters/cityscapes.py)用于生成 `**labelTrainIds.png`。 + +```shell +# --nproc 表示 8 个转换进程,也可以省略。 +python tools/dataset_converters/cityscapes.py data/cityscapes --nproc 8 +``` + +## Pascal VOC + +Pascal VOC 2012 可从[此处](http://host.robots.ox.ac.uk/pascal/VOC/voc2012/VOCtrainval_11-May-2012.tar)下载。 +此外,Pascal VOC 数据集的最新工作通常利用额外的增强数据,可以在[这里](http://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/semantic_contours/benchmark.tgz)找到。 + +如果您想使用增强的 VOC 数据集,请运行以下命令将增强数据的标注转换为正确的格式。 + +```shell +# --nproc 表示 8 个转换进程,也可以省略。 +python tools/dataset_converters/voc_aug.py data/VOCdevkit data/VOCdevkit/VOCaug --nproc 8 +``` + +请参考[拼接数据集文档](../advanced_guides/add_datasets.md#拼接数据集)及 [voc_aug 配置示例](../../../configs/_base_/datasets/pascal_voc12_aug.py)以详细了解如何将它们拼接并合并训练。 + +## ADE20K + +ADE20K 的训练和验证集可以从这个[链接](http://data.csail.mit.edu/places/ADEchallenge/ADEChallengeData2016.zip)下载。 
+如果需要下载 ADE20K 的测试集,可以从[此处](http://data.csail.mit.edu/places/ADEchallenge/release_test.zip)下载。
+
+## Pascal Context
+
+Pascal Context 的训练和验证集可以从[此处](http://host.robots.ox.ac.uk/pascal/VOC/voc2010/VOCtrainval_03-May-2010.tar)下载。注册后,您也可以从[此处](http://host.robots.ox.ac.uk:8080/eval/downloads/VOC2010test.tar)下载测试集。
+
+为了从原始数据集中划分出训练集和验证集,您可以从[此处](https://codalabuser.blob.core.windows.net/public/trainval_merged.json)下载 trainval_merged.json 文件。
+
+请先安装 [Detail](https://github.com/zhanghang1989/detail-api) 工具,然后运行以下命令将标注转换为正确的格式。
+
+```shell
+python tools/dataset_converters/pascal_context.py data/VOCdevkit data/VOCdevkit/VOC2010/trainval_merged.json
+```
+
+## COCO Stuff 10k
+
+数据可以通过 wget 在[这里](http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/cocostuff-10k-v1.1.zip)下载。
+
+对于 COCO Stuff 10k 数据集,请运行以下命令下载并转换数据集。
+
+```shell
+# 下载
+mkdir coco_stuff10k && cd coco_stuff10k
+wget http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/cocostuff-10k-v1.1.zip
+
+# 解压
+unzip cocostuff-10k-v1.1.zip
+
+# --nproc 表示 8 个转换进程,也可以省略。
+python tools/dataset_converters/coco_stuff10k.py /path/to/coco_stuff10k --nproc 8
+```
+
+按照惯例,`/path/to/coco_stuff10k/annotations/*2014/*_labelTrainIds.png` 中的 mask 标注用于 COCO Stuff 10k 的训练和测试。
+
+## COCO Stuff 164k
+
+对于 COCO Stuff 164k 数据集,请运行以下命令下载并转换增强的数据集。
+
+```shell
+# 下载
+mkdir coco_stuff164k && cd coco_stuff164k
+wget http://images.cocodataset.org/zips/train2017.zip
+wget http://images.cocodataset.org/zips/val2017.zip
+wget http://calvin.inf.ed.ac.uk/wp-content/uploads/data/cocostuffdataset/stuffthingmaps_trainval2017.zip
+
+# 解压
+unzip train2017.zip -d images/
+unzip val2017.zip -d images/
+unzip stuffthingmaps_trainval2017.zip -d annotations/
+
+# --nproc 表示 8 个转换进程,也可以省略。
+python tools/dataset_converters/coco_stuff164k.py /path/to/coco_stuff164k --nproc 8
+```
+
+按照惯例,`/path/to/coco_stuff164k/annotations/*2017/*_labelTrainIds.png` 中的 mask 标注用于 COCO Stuff 164k 的训练和测试。
+
+此数据集的详细信息可在[此处](https://github.com/nightrome/cocostuff#downloads)找到。
+
+## CHASE DB1
+
+CHASE DB1 的训练和验证集可以从[此处](https://staffnet.kingston.ac.uk/~ku15565/CHASE_DB1/assets/CHASEDB1.zip)下载。
+
+请运行以下命令,准备 CHASE DB1 数据集:
+
+```shell
+python tools/dataset_converters/chase_db1.py /path/to/CHASEDB1.zip
+```
+
+该脚本将自动调整数据集目录结构,使其满足 MMSegmentation 数据集加载要求。
+
+## DRIVE
+
+按照[官网](https://drive.grand-challenge.org/)要求,注册并登录后,便可以下载 DRIVE 的训练和验证数据集。
+
+要将 DRIVE 数据集转换为 MMSegmentation 的格式,请运行以下命令:
+
+```shell
+python tools/dataset_converters/drive.py /path/to/training.zip /path/to/test.zip
+```
+
+该脚本将自动调整数据集目录结构,使其满足 MMSegmentation 数据集加载要求。
+
+## HRF
+
+请下载 [healthy.zip](https://www5.cs.fau.de/fileadmin/research/datasets/fundus-images/healthy.zip)、[glaucoma.zip](https://www5.cs.fau.de/fileadmin/research/datasets/fundus-images/glaucoma.zip)、[diabetic_retinopathy.zip](https://www5.cs.fau.de/fileadmin/research/datasets/fundus-images/diabetic_retinopathy.zip)、[healthy_manualsegm.zip](https://www5.cs.fau.de/fileadmin/research/datasets/fundus-images/healthy_manualsegm.zip)、[glaucoma_manualsegm.zip](https://www5.cs.fau.de/fileadmin/research/datasets/fundus-images/glaucoma_manualsegm.zip) 和 [diabetic_retinopathy_manualsegm.zip](https://www5.cs.fau.de/fileadmin/research/datasets/fundus-images/diabetic_retinopathy_manualsegm.zip),无需解压,可以直接运行以下命令,准备 HRF 数据集:
+
+```shell
+python tools/dataset_converters/hrf.py /path/to/healthy.zip /path/to/healthy_manualsegm.zip /path/to/glaucoma.zip /path/to/glaucoma_manualsegm.zip /path/to/diabetic_retinopathy.zip /path/to/diabetic_retinopathy_manualsegm.zip
+```
+
+该脚本将自动调整数据集目录结构,使其满足 MMSegmentation 数据集加载要求。
+
+## STARE
+
+请下载 [stare-images.tar](http://cecas.clemson.edu/~ahoover/stare/probing/stare-images.tar)、[labels-ah.tar](http://cecas.clemson.edu/~ahoover/stare/probing/labels-ah.tar) 和 [labels-vk.tar](http://cecas.clemson.edu/~ahoover/stare/probing/labels-vk.tar),无需解压,可以直接运行以下命令,准备 STARE 数据集:
+
+```shell
+python tools/dataset_converters/stare.py /path/to/stare-images.tar /path/to/labels-ah.tar /path/to/labels-vk.tar
+```
+
+该脚本将自动调整数据集目录结构,使其满足 MMSegmentation 数据集加载要求。
+
+## Dark Zurich
+
+由于我们只支持在此数据集上的模型测试,因此您只需要下载并解压[验证数据集](https://data.vision.ee.ethz.ch/csakarid/shared/GCMA_UIoU/Dark_Zurich_val_anon.zip)。
+
+## Nighttime Driving
+
+由于我们只支持在此数据集上的模型测试,因此您只需要下载并解压[验证数据集](http://data.vision.ee.ethz.ch/daid/NighttimeDriving/NighttimeDrivingTest.zip)。
+
+## LoveDA
+
+LoveDA 数据集可以从[此处](https://drive.google.com/drive/folders/1ibYV0qwn4yuuh068Rnc-w4tPi0U0c-ti?usp=sharing)下载,或者从 [zenodo](https://zenodo.org/record/5706578#.YZvN7SYRXdF) 下载。从 zenodo 下载后,无需解压,直接运行以下命令:
+
+```shell
+# 下载 Train.zip
+wget https://zenodo.org/record/5706578/files/Train.zip
+# 下载 Val.zip
+wget https://zenodo.org/record/5706578/files/Val.zip
+# 下载 Test.zip
+wget https://zenodo.org/record/5706578/files/Test.zip
+```
+
+对于 LoveDA 数据集,请运行以下命令调整数据集目录。
+
+```shell
+python tools/dataset_converters/loveda.py /path/to/loveDA
+```
+
+可将模型在 LoveDA 测试集上的预测结果上传至数据集[测试服务器](https://codalab.lisn.upsaclay.fr/competitions/421),查看评测结果。
+
+有关 LoveDA 的更多详细信息,可查看[此处](https://github.com/Junjue-Wang/LoveDA)。
+
+## ISPRS Potsdam
+
+[Potsdam](https://www.isprs.org/education/benchmarks/UrbanSemLab/2d-sem-label-potsdam.aspx) 城市语义分割数据集用于 2D 语义分割竞赛 —— Potsdam。
+
+数据集可以在竞赛[主页](https://www.isprs.org/education/benchmarks/UrbanSemLab/default.aspx)上请求获得。
+这里也提供了 [BaiduNetdisk](https://pan.baidu.com/s/1K-cLVZnd1X7d8c26FQ-nGg?pwd=mseg)(提取码:mseg)、[Google Drive](https://drive.google.com/drive/folders/1w3EJuyUGet6_qmLwGAWZ9vw5ogeG0zLz?usp=sharing) 以及 [OpenDataLab](https://opendatalab.com/ISPRS_Potsdam/download)。
+实验中需要下载 '2_Ortho_RGB.zip' 和 '5_Labels_all_noBoundary.zip'。
+
+对于 Potsdam 数据集,请运行以下命令调整数据集目录。
+
+```shell
+python tools/dataset_converters/potsdam.py /path/to/potsdam
+```
+
+在我们的默认设置中,将生成 3456 张图像用于训练和 2016 张图像用于验证。
+
+## ISPRS Vaihingen
+
+[Vaihingen](https://www.isprs.org/education/benchmarks/UrbanSemLab/2d-sem-label-vaihingen.aspx) 城市语义分割数据集用于 2D 语义分割竞赛 —— Vaihingen。
+
+数据集可以在竞赛[主页](https://www.isprs.org/education/benchmarks/UrbanSemLab/default.aspx)上请求获得。
+这里也提供了 [BaiduNetdisk](https://pan.baidu.com/s/109D3WLrLafsuYtLeerLiiA?pwd=mseg)(提取码:mseg)、[Google Drive](https://drive.google.com/drive/folders/1w3NhvLVA2myVZqOn2pbiDXngNC7NTP_t?usp=sharing)。
+实验中需要下载 'ISPRS_semantic_labeling_Vaihingen.zip' 和 'ISPRS_semantic_labeling_Vaihingen_ground_truth_eroded_COMPLETE.zip'。
+
+对于 Vaihingen 数据集,请运行以下命令调整数据集目录。
+
+```shell
+python tools/dataset_converters/vaihingen.py /path/to/vaihingen
+```
+
+在我们的默认设置(`clip_size`=512, `stride_size`=256)中,将生成 344 张图像用于训练和 398 张图像用于验证。
+
+## iSAID
+
+iSAID 数据集可从 [DOTA-v1.0](https://captain-whu.github.io/DOTA/dataset.html) 下载训练/验证/测试集的图像数据,并从 [iSAID](https://captain-whu.github.io/iSAID/dataset.html) 下载训练/验证集的标注数据。
+
+该数据集是航空图像实例分割和语义分割任务的大规模数据集。
+
+下载 iSAID 数据集后,您可能需要按照以下结构进行数据集准备。
+
+```none
+├── data
+│ ├── iSAID
+│ │ ├── train
+│ │ │ ├── images
+│ │ │ │ ├── part1.zip
+│ │ │ │ ├── part2.zip
+│ │ │ │ ├── part3.zip
+│ │ │ ├── Semantic_masks
+│ │ │ │ ├── images.zip
+│ │ ├── val
+│ │ │ ├── images
+│ │ │ │ ├── part1.zip
+│ │ │ 
├── Semantic_masks +│ │ │ │ ├── images.zip +│ │ ├── test +│ │ │ ├── images +│ │ │ │ ├── part1.zip +│ │ │ │ ├── part2.zip +``` + +```shell +python tools/dataset_converters/isaid.py /path/to/iSAID +``` + +在我们的默认设置(`patch_width`=896, `patch_height`=896, `overlap_area`=384)中,将生成 33978 张图像用于训练和 11644 张图像用于验证。 + +## LIP(Look Into Person) dataset + +该数据集可以从[此页面](https://lip.sysuhcp.com/overview.php)下载。 + +请运行以下命令来解压数据集。 + +```shell +unzip LIP.zip +cd LIP +unzip TrainVal_images.zip +unzip TrainVal_parsing_annotations.zip +cd TrainVal_parsing_annotations +unzip TrainVal_parsing_annotations.zip +mv train_segmentations ../ +mv val_segmentations ../ +cd .. +``` + +LIP 数据集的内容包括: + +```none +├── data +│ ├── LIP +│ │ ├── train_images +│   │ │ ├── 1000_1234574.jpg +│   │ │ ├── ... +│ │ ├── train_segmentations +│   │ │ ├── 1000_1234574.png +│   │ │ ├── ... +│ │ ├── val_images +│   │ │ ├── 100034_483681.jpg +│   │ │ ├── ... +│ │ ├── val_segmentations +│   │ │ ├── 100034_483681.png +│   │ │ ├── ... +``` + +## Synapse dataset + +此数据集可以从[此页面](https://www.synapse.org/#!Synapse:syn3193805/wiki/)下载。 + +遵循 [TransUNet](https://arxiv.org/abs/2102.04306) 的数据准备设定,将原始训练集(30 次扫描)拆分为新的训练集(18 次扫描)和验证集(12 次扫描)。请运行以下命令来准备数据集。 + +```shell +unzip RawData.zip +cd ./RawData/Training +``` + +然后创建 `train.txt` 和 `val.txt` 以拆分数据集。 + +根据 TransUnet,以下是数据集的划分。 + +train.txt + +```none +img0005.nii.gz +img0006.nii.gz +img0007.nii.gz +img0009.nii.gz +img0010.nii.gz +img0021.nii.gz +img0023.nii.gz +img0024.nii.gz +img0026.nii.gz +img0027.nii.gz +img0028.nii.gz +img0030.nii.gz +img0031.nii.gz +img0033.nii.gz +img0034.nii.gz +img0037.nii.gz +img0039.nii.gz +img0040.nii.gz +``` + +val.txt + +```none +img0008.nii.gz +img0022.nii.gz +img0038.nii.gz +img0036.nii.gz +img0032.nii.gz +img0002.nii.gz +img0029.nii.gz +img0003.nii.gz +img0001.nii.gz +img0004.nii.gz +img0025.nii.gz +img0035.nii.gz +``` + +synapse 数据集的内容包括: + +```none +├── Training +│ ├── img +│ │ ├── img0001.nii.gz +│ │ ├── img0002.nii.gz +│ │ ├── ... +│ ├── label +│ │ ├── label0001.nii.gz +│ │ ├── label0002.nii.gz +│ │ ├── ... 
+│ ├── train.txt
+│ ├── val.txt
+```
+
+然后,使用此命令转换 synapse 数据集。
+
+```shell
+python tools/dataset_converters/synapse.py --dataset-path /path/to/synapse
+```
+
+注意,MMSegmentation 的默认评估指标(例如 mean dice value)是在 2D 切片图像上计算的,这与 [TransUNet](https://arxiv.org/abs/2102.04306) 等一些论文中的 3D 扫描结果是不同的。
+
+## REFUGE
+
+在 [REFUGE Challenge](https://refuge.grand-challenge.org) 官网上注册并下载 [REFUGE 数据集](https://refuge.grand-challenge.org/REFUGE2Download)。
+
+然后,解压 `REFUGE2.zip`,原始数据集的内容包括:
+
+```none
+├── REFUGE2
+│ ├── REFUGE2
+│ │ ├── Annotation-Training400.zip
+│ │ ├── REFUGE-Test400.zip
+│ │ ├── REFUGE-Test-GT.zip
+│ │ ├── REFUGE-Training400.zip
+│ │ ├── REFUGE-Validation400.zip
+│ │ ├── REFUGE-Validation400-GT.zip
+│ ├── __MACOSX
+```
+
+请运行以下命令转换 REFUGE 数据集:
+
+```shell
+python tools/dataset_converters/refuge.py --raw_data_root=/path/to/refuge/REFUGE2/REFUGE2
+```
+
+脚本会将目录结构转换如下:
+
+```none
+│ ├── REFUGE
+│ │ ├── images
+│ │ │ ├── training
+│ │ │ ├── validation
+│ │ │ ├── test
+│ │ ├── annotations
+│ │ │ ├── training
+│ │ │ ├── validation
+│ │ │ ├── test
+```
+
+包含 400 张用于训练的图像、400 张用于验证的图像和 400 张用于测试的图像,这与 REFUGE 2018 数据集相同。
+
+## Mapillary Vistas Datasets
+
+- Mapillary Vistas [官方网站](https://www.mapillary.com/dataset/vistas)可以下载 Mapillary Vistas 数据集,按照官网要求注册并登录后,数据可以在[这里](https://www.mapillary.com/dataset/vistas)找到。
+
+- Mapillary Vistas 数据集使用 8-bit with color-palette 来存储标签,不需要进行转换操作。
+
+- 假设您已将数据集 zip 文件放在 `mmsegmentation/data/mapillary` 中
+
+- 请运行以下命令来解压数据集。
+
+  ```bash
+  cd data/mapillary
+  unzip An-ZjB1Zm61yAZG0ozTymz8I8NqI4x0MrYrh26dq7kPgfu8vf9ImrdaOAVOFYbJ2pNAgUnVGBmbue9lTgdBOb5BbKXIpFs0fpYWqACbrQDChAA2fdX0zS9PcHu7fY8c-FOvyBVxPNYNFQuM.zip
+  ```
+
+- 解压后,您将获得类似于此结构的 Mapillary Vistas 数据集。语义分割 mask 标签在 `labels` 文件夹中。
+
+  ```none
+  mmsegmentation
+  ├── mmseg
+  ├── tools
+  ├── configs
+  ├── data
+  │ ├── mapillary
+  │ │ ├── training
+  │ │ │ ├── images
+  │ │ │ ├── v1.2
+  | │ │ │ ├── instances
+  | │ │ │ ├── labels
+  | │   │   │ └── panoptic
+  │ │ │ ├── v2.0
+  | │ │ │ ├── instances
+  | │ │ │ ├── labels
+  | │ │ │ ├── panoptic
+  | │   │   │ └── polygons
+  │ │ ├── validation
+  │ │ │ ├── images
+  | │ │ ├── v1.2
+  | │ │ │ ├── instances
+  | │ │ │ ├── labels
+  | │   │   │ └── panoptic
+  │ │ │ ├── v2.0
+  | │ │ │ ├── instances
+  | │ │ │ ├── labels
+  | │ │ │ ├── panoptic
+  | │   │   │ └── polygons
+  ```
+
+- 您可以在配置中使用 `MapillaryDataset_v1` 和 `MapillaryDataset_v2` 设置数据集版本。
+  可在 [V1.2](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/_base_/datasets/mapillary_v1.py) 和 [V2.0](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/_base_/datasets/mapillary_v2.py) 查看 Mapillary Vistas 数据集的配置文件。
+
+## LEVIR-CD
+
+[LEVIR-CD](https://justchenhao.github.io/LEVIR/) 是大规模遥感建筑变化检测数据集。
+
+数据集可以在[主页](https://justchenhao.github.io/LEVIR/)上请求获得。
+
+数据集的补充版本可以在[主页](https://github.com/S2Looking/Dataset)上请求获得。
+
+请下载数据集的补充版本,然后解压 `LEVIR-CD+.zip`,数据集的内容包括:
+
+```none
+│ ├── LEVIR-CD+
+│ │ ├── train
+│ │ │ ├── A
+│ │ │ ├── B
+│ │ │ ├── label
+│ │ ├── test
+│ │ │ ├── A
+│ │ │ ├── B
+│ │ │ ├── label
+```
+
+对于 LEVIR-CD 数据集,请运行以下命令无重叠地裁剪影像:
+
+```shell
+python tools/dataset_converters/levircd.py --dataset-path /path/to/LEVIR-CD+ --out_dir /path/to/LEVIR-CD
+```
+
+裁剪后的影像大小为 256x256,与原论文保持一致。
+
+## BDD100K
+
+- 可以从[官方网站](https://bdd-data.berkeley.edu/)下载 BDD100K 数据集(语义分割任务主要使用 10K 数据集),按照官网要求注册并登录后,数据可以在[这里](https://bdd-data.berkeley.edu/portal.html#download)找到。
+
+- 图像数据对应的名称是 `10K Images`,语义分割标注对应的名称是 `Segmentation`。
+
+- 下载后,可以使用以下代码进行解压。
+
+  ```bash
+  unzip ~/bdd100k_images_10k.zip -d ~/mmsegmentation/data/
+  unzip 
~/bdd100k_sem_seg_labels_trainval.zip -d ~/mmsegmentation/data/ + ``` + +就可以得到以下文件结构了: + +```none +mmsegmentation +├── mmseg +├── tools +├── configs +├── data +│ ├── bdd100k +│ │ ├── images +│ │ │ └── 10k +| │ │ │ ├── test +| │ │ │ ├── train +| │   │   │ └── val +│ │ └── labels +│ │ │ └── sem_seg +| │ │ │ ├── colormaps +| │ │ │ │ ├──train +| │ │ │ │ └──val +| │ │ │ ├── masks +| │ │ │ │ ├──train +| │ │ │ │ └──val +| │ │ │ ├── polygons +| │ │ │ │ ├──sem_seg_train.json +| │ │ │ │ └──sem_seg_val.json +| │   │   │ └── rles +| │ │ │ │ ├──sem_seg_train.json +| │ │ │ │ └──sem_seg_val.json +``` + +## NYU + +- 您可以从 [这个链接](https://drive.google.com/file/d/1wC-io-14RCIL4XTUrQLk6lBqU2AexLVp/view?usp=share_link) 下载 NYU 数据集 + +- 下载完成后,您可以使用 [tools/dataset_converters/nyu.py](/tools/dataset_converters/nyu.py) 脚本来解压和组织数据到所需的格式 + + ```bash + python tools/dataset_converters/nyu.py nyu.zip + ``` diff --git a/docs/zh_cn/user_guides/3_inference.md b/docs/zh_cn/user_guides/3_inference.md new file mode 100644 index 0000000000..0afcb4b05d --- /dev/null +++ b/docs/zh_cn/user_guides/3_inference.md @@ -0,0 +1,244 @@ +# 教程3:使用预训练模型推理 + +MMSegmentation 在 [Model Zoo](../Model_Zoo.md) 中为语义分割提供了预训练的模型,并支持多个标准数据集,包括 Cityscapes、ADE20K 等。 +本说明将展示如何使用现有模型对给定图像进行推理。 +关于如何在标准数据集上测试现有模型,请参阅本[指南](./4_train_test.md) + +MMSegmentation 为用户提供了数个接口,以便轻松使用预训练的模型进行推理。 + +- [教程3:使用预训练模型推理](#教程3使用预训练模型推理) + - [推理器](#推理器) + - [基本使用](#基本使用) + - [初始化](#初始化) + - [可视化预测结果](#可视化预测结果) + - [模型列表](#模型列表) + - [推理 API](#推理-api) + - [mmseg.apis.init_model](#mmsegapisinit_model) + - [mmseg.apis.inference_model](#mmsegapisinference_model) + - [mmseg.apis.show_result_pyplot](#mmsegapisshow_result_pyplot) + +## 推理器 + +在 MMSegmentation 中,我们提供了最**方便的**方式 `MMSegInferencer` 来使用模型。您只需 3 行代码就可以获得图像的分割掩膜。 + +### 基本使用 + +以下示例展示了如何使用 `MMSegInferencer` 对单个图像执行推理。 + +``` +>>> from mmseg.apis import MMSegInferencer +>>> # 将模型加载到内存中 +>>> inferencer = MMSegInferencer(model='deeplabv3plus_r18-d8_4xb2-80k_cityscapes-512x1024') +>>> # 推理 +>>> inferencer('demo/demo.png', show=True) +``` + +可视化结果应如下所示: + +
+*(图:分割结果可视化示例)*
+
+此外,您可以使用 `MMSegInferencer` 来处理一个包含多张图片的 `list`:
+
+```
+# 输入一个图片 list
+>>> images = [image1, image2, ...] # image1 可以是文件路径或 np.ndarray
+>>> inferencer(images, show=True, wait_time=0.5) # wait_time 是显示的间隔时间,0 表示无限等待
+
+# 或输入图像目录
+>>> images = $IMAGESDIR
+>>> inferencer(images, show=True, wait_time=0.5)
+
+# 保存可视化渲染彩色分割图和预测结果
+# out_dir 是保存输出结果的目录,img_out_dir 和 pred_out_dir 为 out_dir 的子目录,
+# 分别保存可视化渲染彩色分割图和预测结果
+>>> inferencer(images, out_dir='outputs', img_out_dir='vis', pred_out_dir='pred')
+```
+
+推理器有一个可选参数 `return_datasamples`,其默认值为 False:此时推理器的返回值为 `dict` 类型,包括 'visualization' 和 'predictions' 两个 key;如果 `return_datasamples=True`,推理器将返回 [`SegDataSample`](../advanced_guides/structures.md) 或其列表。
+
+```
+result = inferencer('demo/demo.png')
+# 结果是一个包含 'visualization' 和 'predictions' 两个 key 的 `dict`
+# 'visualization' 包含彩色分割图
+print(result['visualization'].shape)
+# (512, 683, 3)
+
+# 'predictions' 包含带有标签索引的分割掩膜
+print(result['predictions'].shape)
+# (512, 683)
+
+result = inferencer('demo/demo.png', return_datasamples=True)
+print(type(result))
+# <class 'mmseg.structures.seg_data_sample.SegDataSample'>
+
+# 输入一个图片 list
+results = inferencer(images)
+# 输出为列表
+print(type(results['visualization']), results['visualization'][0].shape)
+# <class 'list'> (512, 683, 3)
+print(type(results['predictions']), results['predictions'][0].shape)
+# <class 'list'> (512, 683)
+
+results = inferencer(images, return_datasamples=True)
+# <class 'list'>
+print(type(results[0]))
+# <class 'mmseg.structures.seg_data_sample.SegDataSample'>
+```
+
+### 初始化
+
+`MMSegInferencer` 必须使用 `model` 初始化,该 `model` 可以是模型名称、一个 `Config` 对象,或配置文件的路径。
+模型名称可以在模型的元文件(configs/xxx/metafile.yaml)中找到,比如 maskformer 的一个模型名称是 `maskformer_r50-d32_8xb2-160k_ade20k-512x512`;如果输入模型名称,模型的权重将自动下载。以下是其他输入参数:
+
+- weights(str,可选)- 权重的路径。如果未指定,并且模型是元文件中的模型名称,则权重将从元文件加载。默认为 None。
+- classes(list,可选)- 输入类别用于结果渲染。由于分割模型的预测结果是标签索引的分割图,`classes` 是一个与标签索引一一对应的类别列表。若 classes 没有定义,可视化工具将默认使用 `cityscapes` 的类别。默认为 None。
+- palette(list,可选)- 输入调色盘用于结果渲染,它是对应分类的配色列表。若 palette 没有定义,可视化工具将默认使用 `cityscapes` 的调色盘。默认为 None。
+- dataset_name(str,可选)- [数据集名称或别名](https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/utils/class_names.py#L302-L317),可视化工具将使用数据集的元信息,如类别和配色,但 `classes` 和 `palette` 具有更高的优先级。默认为 None。
+- device(str,可选)- 运行推理的设备。如果为 None,则会自动使用可用的设备。默认为 None。
+- scope(str,可选)- 模型的作用域。默认为 'mmseg'。
+
+### 可视化预测结果
+
+`MMSegInferencer` 有 4 个用于可视化预测结果的参数,您可以在初始化推理器时使用它们:
+
+- show(bool)- 是否弹出窗口显示图像。默认为 False。
+- wait_time(float)- 显示的间隔。默认值为 0。
+- img_out_dir(str)- `out_dir` 的子目录,用于保存渲染的彩色分割掩膜,因此如果要保存预测掩膜,则必须定义 `out_dir`。默认为 `vis`。
+- opacity(int,float)- 分割掩膜的透明度。默认值为 0.8。
+
+这些参数的示例请参考[基本使用](#基本使用)。
+
+### 模型列表
+
+在 MMSegmentation 中有一个非常容易列出所有模型名称的方法:
+
+```
+>>> from mmseg.apis import MMSegInferencer
+# models 是一个模型名称列表,它们将自动打印
+>>> models = MMSegInferencer.list_models('mmseg')
+```
+
+## 推理 API
+
+### mmseg.apis.init_model
+
+从配置文件初始化一个分割器。
+
+参数:
+
+- config(str,`Path` 或 `mmengine.Config`)- 配置文件路径或配置对象。
+- checkpoint(str,可选)- 权重路径。如果为 None,则模型将不会加载任何权重。
+- device(str,可选)- CPU/CUDA 设备选项。默认为 'cuda:0'。
+- cfg_options(dict,可选)- 用于覆盖所用配置中的某些设置的选项。
+
+返回值:
+
+- nn.Module:构建好的分割器。
+
+示例:
+
+```python
+from mmseg.apis import init_model
+
+config_path = 'configs/pspnet/pspnet_r50-d8_4xb2-40k_cityscapes-512x1024.py'
+checkpoint_path = 'checkpoints/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth'
+
+# 初始化不带权重的模型
+model = init_model(config_path)
+
+# 初始化模型并加载权重
+model = init_model(config_path, checkpoint_path)
+
+# 在 CPU 上初始化模型并加载权重
+model = init_model(config_path, checkpoint_path, 'cpu')
+```
+
+### mmseg.apis.inference_model
+
+使用分割器推理图像。
+
+参数:
+
+- model(nn.Module)- 加载的分割器
+- imgs(str,np.ndarray 或 
list\[str/np.ndarray\])- 图像文件或加载的图像
+
+返回值:
+
+- `SegDataSample` 或 list\[`SegDataSample`\]:如果 imgs 是列表或元组,则返回相同长度的列表类型结果,否则直接返回分割结果。
+
+**注意:** [SegDataSample](https://github.com/open-mmlab/mmsegmentation/blob/1.x/mmseg/structures/seg_data_sample.py) 是 MMSegmentation 的数据结构接口,用作不同组件之间的接口。`SegDataSample` 实现了抽象数据元素 `mmengine.structures.BaseDataElement`,请参阅 [MMEngine](https://github.com/open-mmlab/mmengine) 中的数据元素[文档](https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/data_element.html)了解更多信息。
+
+`SegDataSample` 中的参数分为几个部分:
+
+- `gt_sem_seg`(`PixelData`)- 语义分割的标注。
+- `pred_sem_seg`(`PixelData`)- 语义分割的预测。
+- `seg_logits`(`PixelData`)- 模型最后一层的输出结果。
+
+**注意:** [PixelData](https://github.com/open-mmlab/mmengine/blob/main/mmengine/structures/pixel_data.py) 是像素级标注或预测的数据结构,请参阅 [MMEngine](https://github.com/open-mmlab/mmengine) 中的 PixelData [文档](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/data_element.html)了解更多信息。
+
+示例:
+
+```python
+from mmseg.apis import init_model, inference_model
+
+config_path = 'configs/pspnet/pspnet_r50-d8_4xb2-40k_cityscapes-512x1024.py'
+checkpoint_path = 'checkpoints/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth'
+img_path = 'demo/demo.png'
+
+
+model = init_model(config_path, checkpoint_path)
+result = inference_model(model, img_path)
+```
+
+### mmseg.apis.show_result_pyplot
+
+在图像上可视化分割结果。
+
+参数:
+
+- model(nn.Module)- 加载的分割器。
+- img(str 或 np.ndarray)- 图像文件名或加载的图像。
+- result(`SegDataSample`)- SegDataSample 预测结果。
+- opacity(float)- 绘制分割图的不透明度。默认值为 `0.5`,必须在 `(0, 1]` 范围内。
+- title(str)- pyplot 图的标题。默认值为 ''。
+- draw_gt(bool)- 是否绘制 GT SegDataSample。默认为 `True`。
+- draw_pred(bool)- 是否绘制预测 SegDataSample。默认为 `True`。
+- wait_time(float)- 显示的间隔,0 是表示"无限"的特殊值。默认为 `0`。
+- show(bool)- 是否展示绘制的图像。默认为 `True`。
+- save_dir(str,可选)- 为所有存储后端保存的文件路径。如果为 `None`,则后端存储将不会保存任何数据。
+- out_file(str,可选)- 输出文件的路径。默认为 `None`。
+
+返回值:
+
+- np.ndarray:通道为 RGB 的绘制图像。
+
+示例:
+
+```python
+from mmseg.apis import init_model, inference_model, show_result_pyplot
+
+config_path = 'configs/pspnet/pspnet_r50-d8_4xb2-40k_cityscapes-512x1024.py'
+checkpoint_path = 'checkpoints/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth'
+img_path = 'demo/demo.png'
+
+
+# 从配置文件和权重文件构建模型
+model = init_model(config_path, checkpoint_path, device='cuda:0')
+
+# 推理给定图像
+result = inference_model(model, img_path)
+
+# 展示分割结果
+vis_image = show_result_pyplot(model, img_path, result)
+
+# 保存可视化结果,输出图像将在 `work_dirs/result.png` 路径下找到
+vis_image = show_result_pyplot(model, img_path, result, out_file='work_dirs/result.png')
+
+# 修改展示图像的时间,注意 0 是表示"无限"的特殊值
+vis_image = show_result_pyplot(model, img_path, result, wait_time=5)
+```
+
+**注意:** 如果当前设备没有图形用户界面,建议将 `show` 设置为 `False`,并指定 `out_file` 或 `save_dir` 来保存结果。如果您想在窗口上显示结果,则不需要特殊设置。
diff --git a/docs/zh_cn/user_guides/4_train_test.md b/docs/zh_cn/user_guides/4_train_test.md
new file mode 100644
index 0000000000..f821acaf52
--- /dev/null
+++ b/docs/zh_cn/user_guides/4_train_test.md
@@ -0,0 +1,317 @@
+# 教程4:使用现有模型进行训练和测试
+
+MMSegmentation 支持在多种设备上训练和测试模型:下文将分别介绍单 GPU、分布式以及计算集群上的训练和测试方式。通过本教程,您将知晓如何用 MMSegmentation 提供的脚本进行训练和测试。
+
+## 在单GPU上训练和测试
+
+### 在单GPU上训练
+
+`tools/train.py` 文件提供了在单 GPU 上部署训练任务的方法。
+
+基础用法如下:
+
+```shell
+python tools/train.py ${配置文件} [可选参数]
+```
+
+- `--work-dir ${工作路径}`: 重新指定工作路径
+- `--amp`: 使用自动混合精度计算
+- `--resume`: 从工作路径中保存的最新检查点文件(checkpoint)恢复训练
+- `--cfg-options ${需要覆盖的配置}`: 覆盖已载入配置中的部分设置,以 xxx=yyy 格式的键值对将被合并到配置文件中,
+  比如: '--cfg-options model.encoder.in_channels=6', 
更多细节请看[指导](./1_config.md#Modify-config-through-script-arguments)。
+
+下面是用于分布式训练的可选参数:
+
+- `--launcher`: 执行器的启动方式。允许选择的参数值有 `none`、`pytorch`、`slurm`、`mpi`。特别地,如果设置为 none,将在非分布式模式下训练。
+- `--local_rank`: 分布式中进程的序号。如果没有指定,默认设置为 0。
+
+**注意:** 命令行参数 `--resume` 和在配置文件中的参数 `load_from` 的不同之处:
+
+`--resume` 只决定是否继续使用工作路径中最新的检查点,它常常用于恢复被意外打断的训练。
+
+`load_from` 会明确指定被载入的检查点文件,且训练迭代器将从 0 开始,通常用于微调模型。
+
+如果您希望从指定的检查点恢复训练,您可以使用:
+
+```shell
+python tools/train.py ${配置文件} --resume --cfg-options load_from=${检查点}
+```
+
+**在 CPU 上训练**: 如果机器没有 GPU,则在 CPU 上训练的过程与单 GPU 训练一致;如果机器有 GPU 但不希望使用它们,只需要在训练前通过以下方式关闭 GPU 训练功能。
+
+```shell
+export CUDA_VISIBLE_DEVICES=-1
+```
+
+然后运行[上方](###在单GPU上训练)脚本。
+
+### 在单GPU上测试
+
+`tools/test.py` 文件提供了在单 GPU 上启动测试任务的方法。
+
+基础用法如下:
+
+```shell
+python tools/test.py ${配置文件} ${模型权重文件} [可选参数]
+```
+
+这个工具有几个可选参数,包括:
+
+- `--work-dir`: 如果指定了路径,结果会保存在该路径下。如果没有指定,则会保存在 `work_dirs/{配置文件名}` 路径下。
+- `--show`: 当 `--show-dir` 没有指定时,可以使用该参数,在程序运行过程中显示预测结果。
+- `--show-dir`: 绘制了分割掩膜图片的存储文件夹。如果指定了该参数,则可视化的分割掩膜将被保存到 `work_dir/timestamp/{指定路径}`。
+- `--wait-time`: 多次可视化结果的时间间隔。当 `--show` 为激活状态时发挥作用。默认为 2。
+- `--cfg-options`: 如果被具体指定,以 xxx=yyy 形式的键值对将被合并入配置文件中。
+
+**在 CPU 上测试**: 如果机器没有 GPU,则在 CPU 上测试的过程与单 GPU 测试一致;如果机器有 GPU 但不希望使用它们,只需要在测试前通过以下方式关闭 GPU 测试功能。
+
+```shell
+export CUDA_VISIBLE_DEVICES=-1
+```
+
+然后运行[上方](###在单GPU上测试)脚本。
+
+## 多GPU、多机器上训练和测试
+
+### 在多GPU上训练
+
+OpenMMLab 2.0 通过 `MMDistributedDataParallel` 实现**分布式**训练。
+
+`tools/dist_train.sh` 文件提供了在多 GPU 上部署训练任务的方法。
+
+基础用法如下:
+
+```shell
+sh tools/dist_train.sh ${配置文件} ${GPU数量} [可选参数]
+```
+
+可选参数与[上方](###在单GPU上训练)相同,并且还增加了可以指定 GPU 数量的参数。
+
+示例:
+
+```shell
+# 模型训练的检查点和日志保存在这个路径下: WORK_DIR=work_dirs/pspnet_r50-d8_4xb4-80k_ade20k-512x512/
+# 如果工作路径没有被设定,它将会被自动生成。
+sh tools/dist_train.sh configs/pspnet/pspnet_r50-d8_4xb4-80k_ade20k-512x512.py 8 --work-dir work_dirs/pspnet_r50-d8_4xb4-80k_ade20k-512x512
+```
+
+**注意**: 在训练过程中,检查点和日志保存在 `work_dirs/` 下与配置文件同名的文件夹结构中。
+不推荐自定义的工作路径,因为评估脚本依赖于源自配置文件名的路径。如果您希望将权重保存在其他地方,请用符号链接,例如:
+
+```shell
+ln -s ${您的工作路径} ${MMSEG 路径}/work_dirs
+```
+
+### 在多GPU上测试
+
+`tools/dist_test.sh` 文件提供了在多 GPU 上启动测试任务的方法。
+
+基础用法如下:
+
+```shell
+sh tools/dist_test.sh ${配置文件} ${检查点文件} ${GPU数量} [可选参数]
+```
+
+可选参数与[上方](###在单GPU上测试)相同,并且增加了可以指定 GPU 数量的参数。
+
+示例:
+
+```shell
+./tools/dist_test.sh configs/pspnet/pspnet_r50-d8_4xb2-40k_cityscapes-512x1024.py \
+    checkpoints/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth 4
+```
+
+### 在单台机器上启动多个任务
+
+如果您在单台机器上运行多个任务,比如:在 8 卡 GPU 的单台机器上执行 2 个各需 4 卡 GPU 的训练任务,您需要为每个任务指定不同的端口(默认 29500),从而避免通讯冲突。否则,会有报错信息 `RuntimeError: Address already in use`(运行错误:地址被使用)。
+
+如果您使用 `dist_train.sh` 来启动训练任务,您可以通过环境变量 `PORT` 设置端口。
+
+```shell
+CUDA_VISIBLE_DEVICES=0,1,2,3 PORT=29500 sh tools/dist_train.sh ${配置文件} 4
+CUDA_VISIBLE_DEVICES=4,5,6,7 PORT=29501 sh tools/dist_train.sh ${配置文件} 4
+```
+
+### 在多台机器上训练
+
+MMSegmentation 的分布式训练依赖 `torch.distributed`。
+因此,可以通过 PyTorch 的[运行工具 launch utility](https://pytorch.org/docs/stable/distributed.html#launch-utility) 来进行分布式训练。
+
+如果您启动的多台机器只是简单地通过以太网连接,您可以直接运行下方命令:
+
+在第一台机器上:
+
+```shell
+NNODES=2 NODE_RANK=0 PORT=${主节点端口} MASTER_ADDR=${主节点地址} sh tools/dist_train.sh ${配置文件} ${GPUS}
+```
+
+在第二台机器上:
+
+```shell
+NNODES=2 NODE_RANK=1 PORT=${主节点端口} MASTER_ADDR=${主节点地址} sh tools/dist_train.sh ${配置文件} ${GPUS}
+```
+
+通常,如果您没有使用 InfiniBand 一类的高速网络,这个过程会比较慢。
+
+## 通过 Slurm 管理任务
+
+[Slurm](https://slurm.schedmd.com/) 是一个很好的计算集群作业调度系统。
+
+### 通过 Slurm 在集群上训练
+
+在一个由 Slurm 管理的集群上,您可以使用 `slurm_train.sh` 来启动训练任务。它同时支持单节点和多节点的训练。
+
+基础用法如下:
+
+```shell
+[GPUS=${GPUS}] sh 
tools/slurm_train.sh ${分区} ${任务名} ${配置文件} [可选参数]
+```
+
+下方是一个通过名为 `dev` 的 Slurm 分区,调用 4 个 GPU 来训练 PSPNet,并将工作路径设置为共享文件系统的示例。
+
+```shell
+GPUS=4 sh tools/slurm_train.sh dev pspnet configs/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes.py --work-dir work_dir/pspnet
+```
+
+您可以检查[源码](../../../tools/slurm_train.sh)来查看全部的参数和环境变量。
+
+### 通过 Slurm 在集群上测试
+
+与训练任务相同,MMSegmentation 提供 `slurm_test.sh` 文件来启动测试任务。
+
+基础用法如下:
+
+```shell
+[GPUS=${GPUS}] sh tools/slurm_test.sh ${分区} ${任务名} ${配置文件} ${检查点文件} [可选参数]
+```
+
+您可以通过[源码](../../../tools/slurm_test.sh)来查看全部的参数和环境变量。
+
+**注意:** 使用 Slurm 时,需要设置端口,可从以下方式中选取一种。
+
+1. 我们更推荐通过 `--cfg-options` 设置端口,因为这不会改变原始配置:
+
+   ```shell
+   GPUS=4 GPUS_PER_NODE=4 sh tools/slurm_train.sh ${分区} ${任务名} config1.py ${工作路径} --cfg-options env_cfg.dist_cfg.port=29500
+   GPUS=4 GPUS_PER_NODE=4 sh tools/slurm_train.sh ${分区} ${任务名} config2.py ${工作路径} --cfg-options env_cfg.dist_cfg.port=29501
+   ```
+
+2. 通过修改配置文件设置不同的通讯端口:
+
+   在 `config1.py` 中:
+
+   ```python
+   env_cfg = dict(dist_cfg=dict(backend='nccl', port=29500))
+   ```
+
+   在 `config2.py` 中:
+
+   ```python
+   env_cfg = dict(dist_cfg=dict(backend='nccl', port=29501))
+   ```
+
+   然后您可以通过 config1.py 和 config2.py 同时启动两个任务:
+
+   ```shell
+   CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS=4 sh tools/slurm_train.sh ${分区} ${任务名} config1.py ${工作路径}
+   CUDA_VISIBLE_DEVICES=4,5,6,7 GPUS=4 sh tools/slurm_train.sh ${分区} ${任务名} config2.py ${工作路径}
+   ```
+
+3. 在命令行中通过环境变量 `MASTER_PORT` 设置端口:
+
+```shell
+CUDA_VISIBLE_DEVICES=0,1,2,3 GPUS=4 MASTER_PORT=29500 sh tools/slurm_train.sh ${分区} ${任务名} config1.py ${工作路径}
+CUDA_VISIBLE_DEVICES=4,5,6,7 GPUS=4 MASTER_PORT=29501 sh tools/slurm_train.sh ${分区} ${任务名} config2.py ${工作路径}
+```
+
+## 测试并保存分割结果
+
+### 基础使用
+
+当需要保存测试输出的分割结果时,用 `--out` 指定分割结果的输出路径:
+
+```shell
+python tools/test.py ${CONFIG_FILE} ${CHECKPOINT_FILE} --out ${OUTPUT_DIR}
+```
+
+以保存模型 `fcn_r50-d8_4xb4-80k_ade20k-512x512` 在 ADE20K 验证数据集上的结果为例:
+
+```shell
+python tools/test.py configs/fcn/fcn_r50-d8_4xb4-80k_ade20k-512x512.py ckpt/fcn_r50-d8_512x512_80k_ade20k_20200614_144016-f8ac5082.pth --out work_dirs/format_results
+```
+
+或者通过配置文件定义 `output_dir`,例如在 `configs/fcn/fcn_r50-d8_4xb4-80k_ade20k-512x512.py` 中添加 `test_evaluator` 定义:
+
+```python
+test_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU'], output_dir='work_dirs/format_results')
+```
+
+然后执行相同功能的命令不需要再使用 `--out`:
+
+```shell
+python tools/test.py configs/fcn/fcn_r50-d8_4xb4-80k_ade20k-512x512.py ckpt/fcn_r50-d8_512x512_80k_ade20k_20200614_144016-f8ac5082.pth
+```
+
+当测试的数据集没有提供标注时,评测没有真值可以参与计算,因此需要设置 `format_only=True`,同时需要修改 `test_dataloader`:由于没有标注,我们需要在数据增强变换中删掉 `dict(type='LoadAnnotations')`。以下是一个配置示例:
+
+```python
+test_evaluator = dict(
+    type='IoUMetric',
+    iou_metrics=['mIoU'],
+    format_only=True,
+    output_dir='work_dirs/format_results')
+test_dataloader = dict(
+    batch_size=1,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type='ADE20KDataset',
+        data_root='data/ade/release_test',
+        data_prefix=dict(img_path='testing'),
+        # 测试数据变换中没有加载标注
+        pipeline=[
+            dict(type='LoadImageFromFile'),
+            dict(type='Resize', scale=(2048, 512), keep_ratio=True),
+            dict(type='PackSegInputs')
+        ]))
+```
+
+然后执行测试命令:
+
+```shell
+python tools/test.py configs/fcn/fcn_r50-d8_4xb4-80k_ade20k-512x512.py ckpt/fcn_r50-d8_512x512_80k_ade20k_20200614_144016-f8ac5082.pth
+```
+
+### 测试 Cityscapes 数据集并保存输出分割结果
+
+推荐使用 `CityscapesMetric` 来保存模型在 Cityscapes 数据集上的测试结果,以下是一个配置示例:
+
+```python
+test_evaluator = dict(
+    type='CityscapesMetric',
+    
format_only=True, + keep_results=True, + output_dir='work_dirs/format_results') +test_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type='CityscapesDataset', + data_root='data/cityscapes/', + data_prefix=dict(img_path='leftImg8bit/test'), + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(2048, 1024), keep_ratio=True), + dict(type='PackSegInputs') + ])) +``` + +然后执行相同的命令,例如: + +```shell +python tools/test.py configs/fcn/fcn_r18-d8_4xb2-80k_cityscapes-512x1024.py ckpt/fcn_r18-d8_512x1024_80k_cityscapes_20201225_021327-6c50f8b4.pth +``` diff --git a/docs/zh_cn/user_guides/5_deployment.md b/docs/zh_cn/user_guides/5_deployment.md new file mode 100644 index 0000000000..b2bec02883 --- /dev/null +++ b/docs/zh_cn/user_guides/5_deployment.md @@ -0,0 +1,243 @@ +# 教程5:模型部署 + +# MMSegmentation 模型部署 + +- [教程5:模型部署](#教程5模型部署) +- [MMSegmentation 模型部署](#mmsegmentation-模型部署) + - [安装](#安装) + - [安装 mmseg](#安装-mmseg) + - [安装 mmdeploy](#安装-mmdeploy) + - [模型转换](#模型转换) + - [模型规范](#模型规范) + - [模型推理](#模型推理) + - [后端模型推理](#后端模型推理) + - [SDK 模型推理](#sdk-模型推理) + - [模型支持列表](#模型支持列表) + - [注意事项](#注意事项) + +______________________________________________________________________ + +[MMSegmentation](https://github.com/open-mmlab/mmsegmentation/tree/main) 又称`mmseg`,是一个基于 PyTorch 的开源对象分割工具箱。它是 [OpenMMLab](https://openmmlab.com/) 项目的一部分。 + +## 安装 + +### 安装 mmseg + +请参考[官网安装指南](https://mmsegmentation.readthedocs.io/en/latest/get_started.html)。 + +### 安装 mmdeploy + +mmdeploy 有以下几种安装方式: + +**方式一:** 安装预编译包 + +请参考[安装概述](https://mmdeploy.readthedocs.io/zh_CN/latest/get_started.html#mmdeploy) + +**方式二:** 一键式脚本安装 + +如果部署平台是 **Ubuntu 18.04 及以上版本**, 请参考[脚本安装说明](../01-how-to-build/build_from_script.md),完成安装过程。 +比如,以下命令可以安装 mmdeploy 以及配套的推理引擎——`ONNX Runtime`. + +```shell +git clone --recursive -b main https://github.com/open-mmlab/mmdeploy.git +cd mmdeploy +python3 tools/scripts/build_ubuntu_x64_ort.py $(nproc) +export PYTHONPATH=$(pwd)/build/lib:$PYTHONPATH +export LD_LIBRARY_PATH=$(pwd)/../mmdeploy-dep/onnxruntime-linux-x64-1.8.1/lib/:$LD_LIBRARY_PATH +``` + +**说明**: + +- 把 `$(pwd)/build/lib` 添加到 `PYTHONPATH`,目的是为了加载 mmdeploy SDK python 包 `mmdeploy_runtime`,在章节 [SDK模型推理](#sdk模型推理)中讲述其用法。 +- 在[使用 ONNX Runtime推理后端模型](#后端模型推理)时,需要加载自定义算子库,需要把 ONNX Runtime 库的路径加入环境变量 `LD_LIBRARY_PATH`中。 + +**方式三:** 源码安装 + +在方式一、二都满足不了的情况下,请参考[源码安装说明](../01-how-to-build/build_from_source.md) 安装 mmdeploy 以及所需推理引擎。 + +## 模型转换 + +你可以使用 [tools/deploy.py](https://github.com/open-mmlab/mmdeploy/tree/main/tools/deploy.py) 把 mmseg 模型一键式转换为推理后端模型。 +该工具的详细使用说明请参考[这里](https://github.com/open-mmlab/mmdeploy/tree/main/docs/en/02-how-to-run/convert_model.md#usage). + +以下,我们将演示如何把 `unet` 转换为 onnx 模型。 + +```shell +cd mmdeploy + +# download unet model from mmseg model zoo +mim download mmsegmentation --config unet-s5-d16_fcn_4xb4-160k_cityscapes-512x1024 --dest . 
+
+# convert mmseg model to onnxruntime model with dynamic shape
+python tools/deploy.py \
+    configs/mmseg/segmentation_onnxruntime_dynamic.py \
+    unet-s5-d16_fcn_4xb4-160k_cityscapes-512x1024.py \
+    fcn_unet_s5-d16_4x4_512x1024_160k_cityscapes_20211210_145204-6860854e.pth \
+    demo/resources/cityscapes.png \
+    --work-dir mmdeploy_models/mmseg/ort \
+    --device cpu \
+    --show \
+    --dump-info
+```
+
+转换的关键之一是使用正确的配置文件。项目中已内置了各后端部署[配置文件](https://github.com/open-mmlab/mmdeploy/tree/main/configs/mmseg)。
+文件的命名模式是:
+
+```
+segmentation_{backend}-{precision}_{static | dynamic}_{shape}.py
+```
+
+其中:
+
+- **{backend}:** 推理后端名称。比如,onnxruntime、tensorrt、pplnn、ncnn、openvino、coreml 等等
+- **{precision}:** 推理精度。比如,fp16、int8。不填表示 fp32
+- **{static | dynamic}:** 动态、静态 shape
+- **{shape}:** 模型输入的 shape 或者 shape 范围
+
+在上例中,你也可以把 `unet` 转为其他后端模型。比如使用 `segmentation_tensorrt-fp16_dynamic-512x1024-2048x2048.py`,把模型转为 tensorrt-fp16 模型。
+
+```{tip}
+当转 tensorrt 模型时, --device 需要被设置为 "cuda"
+```
+
+## 模型规范
+
+在使用转换后的模型进行推理之前,有必要了解转换结果的结构。它存放在 `--work-dir` 指定的路径下。
+
+上例中的 `mmdeploy_models/mmseg/ort`,结构如下:
+
+```
+mmdeploy_models/mmseg/ort
+├── deploy.json
+├── detail.json
+├── end2end.onnx
+└── pipeline.json
+```
+
+重要的是:
+
+- **end2end.onnx**: 推理引擎文件。可用 ONNX Runtime 推理
+- \***.json**: mmdeploy SDK 推理所需的 meta 信息
+
+整个文件夹被定义为 **mmdeploy SDK model**。换言之,**mmdeploy SDK model** 既包括推理引擎,也包括推理 meta 信息。
+
+## 模型推理
+
+### 后端模型推理
+
+以上述模型转换后的 `end2end.onnx` 为例,你可以使用如下代码进行推理:
+
+```python
+from mmdeploy.apis.utils import build_task_processor
+from mmdeploy.utils import get_input_shape, load_config
+import torch
+
+deploy_cfg = 'configs/mmseg/segmentation_onnxruntime_dynamic.py'
+model_cfg = './unet-s5-d16_fcn_4xb4-160k_cityscapes-512x1024.py'
+device = 'cpu'
+backend_model = ['./mmdeploy_models/mmseg/ort/end2end.onnx']
+image = './demo/resources/cityscapes.png'
+
+# read deploy_cfg and model_cfg
+deploy_cfg, model_cfg = load_config(deploy_cfg, model_cfg)
+
+# build task and backend model
+task_processor = build_task_processor(model_cfg, deploy_cfg, device)
+model = task_processor.build_backend_model(backend_model)
+
+# process input image
+input_shape = get_input_shape(deploy_cfg)
+model_inputs, _ = task_processor.create_input(image, input_shape)
+
+# do model inference
+with torch.no_grad():
+    result = model.test_step(model_inputs)
+
+# visualize results
+task_processor.visualize(
+    image=image,
+    model=model,
+    result=result[0],
+    window_name='visualize',
+    output_file='./output_segmentation.png')
+```
+
+### SDK 模型推理
+
+你也可以参考如下代码,对 SDK model 进行推理:
+
+```python
+from mmdeploy_runtime import Segmentor
+import cv2
+import numpy as np
+
+img = cv2.imread('./demo/resources/cityscapes.png')
+
+# create a segmentor
+segmentor = Segmentor(model_path='./mmdeploy_models/mmseg/ort', device_name='cpu', device_id=0)
+# perform inference
+seg = segmentor(img)
+
+# visualize inference result
+## random a palette with size 256x3
+palette = np.random.randint(0, 256, size=(256, 3))
+color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8)
+for label, color in enumerate(palette):
+    color_seg[seg == label, :] = color
+# convert to BGR
+color_seg = color_seg[..., ::-1]
+img = img * 0.5 + color_seg * 0.5
+img = img.astype(np.uint8)
+cv2.imwrite('output_segmentation.png', img)
+```
+
+除了 python API,mmdeploy SDK 还提供了诸如 C、C++、C#、Java 等多语言接口。
+你可以参考[样例](https://github.com/open-mmlab/mmdeploy/tree/main/demo)学习其他语言接口的使用方法。
+
+## 模型支持列表
+
+| Model | TorchScript | OnnxRuntime | TensorRT | ncnn | PPLNN | OpenVino |
+| 
:-------------------------------------------------------------------------------------------------------- | :---------: | :---------: | :------: | :--: | :---: | :------: | +| [FCN](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/fcn) | Y | Y | Y | Y | Y | Y | +| [PSPNet](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/pspnet)[\*](#static_shape) | Y | Y | Y | Y | Y | Y | +| [DeepLabV3](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/deeplabv3) | Y | Y | Y | Y | Y | Y | +| [DeepLabV3+](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/deeplabv3plus) | Y | Y | Y | Y | Y | Y | +| [Fast-SCNN](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/fastscnn)[\*](#static_shape) | Y | Y | Y | N | Y | Y | +| [UNet](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/unet) | Y | Y | Y | Y | Y | Y | +| [ANN](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/ann)[\*](#static_shape) | Y | Y | Y | N | N | N | +| [APCNet](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/apcnet) | Y | Y | Y | Y | N | N | +| [BiSeNetV1](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/bisenetv1) | Y | Y | Y | Y | N | Y | +| [BiSeNetV2](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/bisenetv2) | Y | Y | Y | Y | N | Y | +| [CGNet](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/cgnet) | Y | Y | Y | Y | N | Y | +| [DMNet](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/dmnet) | ? | Y | N | N | N | N | +| [DNLNet](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/dnlnet) | ? | Y | Y | Y | N | Y | +| [EMANet](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/emanet) | Y | Y | Y | N | N | Y | +| [EncNet](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/encnet) | Y | Y | Y | N | N | Y | +| [ERFNet](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/erfnet) | Y | Y | Y | Y | N | Y | +| [FastFCN](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/fastfcn) | Y | Y | Y | Y | N | Y | +| [GCNet](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/gcnet) | Y | Y | Y | N | N | N | +| [ICNet](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/icnet)[\*](#static_shape) | Y | Y | Y | N | N | Y | +| [ISANet](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/isanet)[\*](#static_shape) | N | Y | Y | N | N | Y | +| [NonLocal Net](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/nonlocal_net) | ? | Y | Y | Y | N | Y | +| [OCRNet](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/ocrnet) | Y | Y | Y | Y | N | Y | +| [PointRend](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/point_rend)[\*](#static_shape) | Y | Y | Y | N | N | N | +| [Semantic FPN](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/sem_fpn) | Y | Y | Y | Y | N | Y | +| [STDC](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/stdc) | Y | Y | Y | Y | N | Y | +| [UPerNet](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/upernet)[\*](#static_shape) | N | Y | Y | N | N | N | +| [DANet](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/danet) | ? | Y | Y | N | N | Y | +| [Segmenter](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/segmenter)[\*](#static_shape) | N | Y | Y | Y | N | Y | +| [SegFormer](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/segformer)[\*](#static_shape) | ? 
| Y | Y | N | N | Y | +| [SETR](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/setr) | ? | Y | N | N | N | Y | +| [CCNet](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/ccnet) | ? | N | N | N | N | N | +| [PSANet](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/psanet) | ? | N | N | N | N | N | +| [DPT](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/dpt) | ? | N | N | N | N | N | + +## 注意事项 + +- 所有 mmseg 模型仅支持 "whole" 推理模式。 + +- PSPNet,Fast-SCNN 仅支持静态输入,因为多数推理框架的 [nn.AdaptiveAvgPool2d](https://github.com/open-mmlab/mmsegmentation/blob/0c87f7a0c9099844eff8e90fa3db5b0d0ca02fee/mmseg/models/decode_heads/psp_head.py#L38) 不支持动态输入。 + +- 对于仅支持静态形状的模型,应使用静态形状的部署配置文件,例如 `configs/mmseg/segmentation_tensorrt_static-1024x2048.py` + +- 对于喜欢部署模型生成概率特征图的用户,将 `codebase_config = dict(with_argmax=False)` 放在部署配置中就足够了。 diff --git a/docs/zh_cn/user_guides/deploy_jetson.md b/docs/zh_cn/user_guides/deploy_jetson.md new file mode 100644 index 0000000000..6cebd9caa1 --- /dev/null +++ b/docs/zh_cn/user_guides/deploy_jetson.md @@ -0,0 +1,372 @@ +# 将 MMSeg 模型调优及部署到 NVIDIA Jetson 平台教程 + +- 请先查阅[MMSegmentation 模型部署](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/zh_cn/user_guides/5_deployment.md)文档。 +- **本教程所用 mmsegmentation 版本: v1.1.2** +- **本教程所用 NVIDIA Jetson 设备: NVIDIA Jetson AGX Orin 64G** + +
+*(图:NVIDIA Jetson AGX Orin 设备)*
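+
+动手前,可以先用几行 Python 粗略确认本机环境与教程所用版本是否对应(示意代码,输出以实际安装情况为准):
+
+```python
+import torch
+import mmseg
+
+print('mmseg:', mmseg.__version__)       # 本教程基于 v1.1.2
+print('torch:', torch.__version__)
+print('cuda available:', torch.cuda.is_available())
+```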
+ +## 1 配置 [mmsegmentation](https://github.com/open-mmlab/mmsegmentation) + +- 根据[安装和验证](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/zh_cn/get_started.md)文档,完成开发 [mmsegmentation](https://github.com/open-mmlab/mmsegmentation) 所需的 [`pytorch`](https://pytorch.org/get-started/locally/)、[`mmcv`](https://github.com/open-mmlab/mmcv)、[`mmengine`](https://github.com/open-mmlab/mmengine) 等环境依赖安装。 +- 从 GitHub 使用 git clone 命令完成 [mmsegmentation](https://github.com/open-mmlab/mmsegmentation) 下载。网络不好的同学,可通过 [MMSeg GitHub](https://github.com/open-mmlab/mmsegmentation) 页面进行 zip 的下载。 + ```bash + git clone https://github.com/open-mmlab/mmsegmentation.git + ``` +- 使用 `pip install -v -e.` 命令动态安装 mmsegmentation 。 + ```bash + cd mmsegmentation + pip install -v -e . + ``` + 提示成功安装后,可通过 `pip list` 命令查看到 mmsegmentation 已通过本地安装方式安装到了您的环境中。 + ![mmseg-install](https://github.com/AI-Tianlong/Useful-Tools/assets/50650583/a9c7bcc9-cdcc-40a4-bd7b-8153195549c8) + +## 2 准备您的数据集 + +- 本教程使用遥感图像语义分割数据集 [potsdam](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/zh_cn/user_guides/2_dataset_prepare.md#isprs-potsdam) 作为示例。 +- 根据 [potsdam 数据准备](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/zh_cn/user_guides/2_dataset_prepare.md#isprs-potsdam)文档,进行数据集下载及 MMSeg 格式的准备。 +- 数据集介绍: potsdam 数据集是以德国一个典型的历史城市 Potsdam 命名的,该城市有着大建筑群、狭窄的街道和密集的建筑结构。 potsdam 数据集包含 38 幅 6000x6000 像素的图像,空间分辨率为 5cm,数据集的示例如下图: + ![potsdam-img](https://github.com/AI-Tianlong/Useful-Tools/assets/50650583/3bc0a75b-1693-4ae6-aeea-ad502e955068) + +## 3 从 config 页面下载模型的 pth 权重文件 + +这里以 [`deeplabv3plus_r101-d8_4xb4-80k_potsdam-512x512.py`](../../configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-80k_potsdam-512x512.py) 配置文件举例,在 [configs](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/deeplabv3plus#potsdam) 页面下载权重文件, +![pth](https://github.com/AI-Tianlong/Useful-Tools/assets/50650583/8f747362-caf4-406c-808d-4ca72babb209) + +## 4 通过 [OpenMMLab deployee](https://platform.openmmlab.com/deploee) 以交互式方式进行模型转换及测速 + +### 4.1 模型转换 + +在该部分中,[OpenMMLab 官网](https://platform.openmmlab.com/deploee)提供了模型转换及模型测速的交互界面,无需任何代码,即可通过选择对应选项完成模型 ONNX 格式`xxxx.onnx` 和 TensorRT `.engine`格式的转换。 +如您的自定义 config 文件中有相对引用关系,如: + +```python +# xxxx.py +_base_ = [ + '../_base_/models/deeplabv3plus_r50-d8.py', + '../_base_/datasets/potsdam.py', + '../_base_/default_runtime.py', + '../_base_/schedules/schedule_80k.py' +] +``` + +您可以使用以下代码消除相对引用关系,以生成完整的 config 文件。 + +```python +import mmengine + +mmengine.Config.fromfile("configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-80k_potsdam-512x512.py").dump("My_config.py") +``` + +使用上述代码后,您能够看到,在`My_config.py`包含着完整的配置文件,无相对引用。这时,上传模型 config 至网页内对应处。 + +#### 创建转换任务 + +按照下图提示及自己的需求,创建转换任务并提交。 + +
+*(截图:在 OpenMMLab 平台创建模型转换任务的页面)*
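+
+上传之前,也可以再用 mmengine 加载一次导出的完整配置,确认其中已不依赖任何 `_base_` 相对引用(示意代码,`My_config.py` 为上文导出的文件名):
+
+```python
+from mmengine.config import Config
+
+cfg = Config.fromfile('My_config.py')  # 上文导出的完整配置
+print(cfg.model.type)                  # 能直接解析即说明配置是自包含的
+```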
+ +### 4.2 模型测速 + +在完成模型转换后可通过**模型测速**界面,完成在真实设备上的模型测速。 + +#### 创建测速任务 + +
+*(截图:在 OpenMMLab 平台创建模型测速任务的页面)*
+ +测速完成后,可在页面生成完整的测速报告。[查看测速报告示例](https://openmmlab-deploee.oss-cn-shanghai.aliyuncs.com/tmp/profile_speed/4352f5.txt) + +## 5 通过 OpenMMLab mmdeploy 以命令行将模型转换为ONNX格式 + +该部分可以通过 mmdeploy 库对 mmseg 训练好的模型进行推理格式的转换。这里给出一个示例,具体文档可见[ mmdeploy 模型转换文档](../../docs/zh_cn/user_guides/5_deployment.md)。 + +### 5.1 通过源码构建 mmdeploy 库 + +在您安装 mmsegmentation 库的虚拟环境下,通过 `git clone`命令从 GitHub 克隆 [mmdeploy](https://github.com/open-mmlab/mmdeploy) + +### 5.2 模型转换 + +如您的 config 中含有相对引用,仍需进行消除,如[4.1 模型转换](#4.1-模型转换)所述, +进入 mmdeploy 文件夹,执行以下命令,即可完成模型转换。 + +```bash +python tools/deploy.py \ + configs/mmseg/segmentation_onnxruntime_static-512x512.py \ + ../atl_config.py \ + ../deeplabv3plus_r18-d8_512x512_80k_potsdam_20211219_020601-75fd5bc3.pth \ + ../2_13_1024_5488_1536_6000.png \ + --work-dir ../atl_models \ + --device cpu \ + --show \ + --dump-info +``` + +```bash +# 使用方法 +python ./tools/deploy.py \ + ${部署配置文件路径} \ + ${模型配置文件路径} \ + ${模型权重路径} \ + ${输入图像路径} \ + --work-dir ${用来保存日志和模型文件路径} \ + --device ${cpu/cuda:0} \ + --show \ # 是否显示检测的结果 + --dump-info # 是否输出 SDK 信息 + +``` + +执行成功后,您将能够看到以下提示,即为转换成功。 + +```bash +10/08 17:40:44 - mmengine - INFO - visualize pytorch model success. +10/08 17:40:44 - mmengine - INFO - All process success. +``` + +
+*(截图:模型转换成功后的输出信息)*
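+
+转换完成后,还可以用 `onnx` 包对导出的文件做一次简单的结构校验(示意代码,路径沿用上文的 `../atl_models/end2end.onnx`,仅作参考):
+
+```python
+import onnx
+
+onnx_model = onnx.load('../atl_models/end2end.onnx')
+onnx.checker.check_model(onnx_model)   # 结构不合法时会抛出异常
+print(onnx_model.graph.input[0].name)  # 查看输入节点名称
+```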
+ +# 6 在 Jetson 平台进行转换及部署 + +## 6.1 环境准备 + +参考[如何在 Jetson 模组上安装 MMDeploy](https://github.com/open-mmlab/mmdeploy/blob/main/docs/zh_cn/01-how-to-build/jetsons.md)文档,完成在 Jetson 上的环境准备工作。 +**注**:安装 Pytorch,可查阅 [NVIDIA Jetson Pytorch 安装文档](https://github.com/open-mmlab/mmdeploy/blob/main/docs/zh_cn/01-how-to-build/jetsons.md)安装最新的 Pytorch。 + +### 6.1.1 创建虚拟环境 + +```bash +conda create -n {您虚拟环境的名字} python={python版本} +``` + +### 6.1.2 虚拟环境内安装Pytorch + +注意:这里不要安装最新的 pytorch 2.0,因为 pyTorch 1.11 是最后一个使用 USE_DISTRIBUTED 构建的wheel,否则会在用mmdeploy进行模型转换的时候提示`AttributeError: module 'torch.distributed' has no attribute 'ReduceOp'`的错误。参考以下链接:https://forums.developer.nvidia.com/t/module-torch-distributed-has-no-attribute-reduceop/256581/6 +下载`torch-1.11.0-cp38-cp38-linux_aarch64.whl`并安装 + +```bash +pip install torch-1.11.0-cp38-cp38-linux_aarch64.whl +``` + +执行以上命令后,您将能看到以下提示,即为安装成功。 + +```bash +Processing ./torch-1.11.0-cp38-cp38-linux_aarch64.whl +Requirement already satisfied: typing-extensions in /home/sirs/miniconda3/envs/openmmlab/lib/python3.8/site-packages (from torch==1.11.0) (4.7.1) +Installing collected packages: torch +Successfully installed torch-1.11.0 +``` + +### 6.1.3 将 Jetson Pack 自带的 tensorrt 拷贝至虚拟环境下 + +请参考[配置 TensorRT](https://github.com/open-mmlab/mmdeploy/blob/main/docs/zh_cn/01-how-to-build/jetsons.md#%E9%85%8D%E7%BD%AE-tensorrt)。 +JetPack SDK 自带 TensorRT。 但是为了能够在 Conda 环境中成功导入,我们需要将 TensorRT 拷贝进先前创建的 Conda 环境中。 + +```bash +export PYTHON_VERSION=`python3 --version | cut -d' ' -f 2 | cut -d'.' -f1,2` +cp -r /usr/lib/python${PYTHON_VERSION}/dist-packages/tensorrt* ~/miniconda/envs/{您的虚拟环境名字}/lib/python${PYTHON_VERSION}/site-packages/ +``` + +### 6.1.4 安装 MMCV + +通过`mim install mmcv`或从源码对其进行编译。 + +```bash +pip install openmim +mim install mmcv +``` + +或者从源码对其进行编译。 + +```bash +sudo apt-get install -y libssl-dev +git clone https://github.com/open-mmlab/mmcv.git +cd mmcv +pip install -e . +``` + +注:pytorch版本发生变动后,需要重新编译mmcv。 + +### 6.1.5 安装 ONNX + +注:以下方式二选一 + +- conda + ```bash + conda install -c conda-forge onnx + ``` +- pip + ```bash + python3 -m pip install onnx + ``` + +### 6.1.6 安装 ONNX Runtime + +根据网页 [ONNX Runtime](https://elinux.org/Jetson_Zoo#ONNX_Runtime) 选择合适的ONNX Runtime版本进行下载安装。 +示例: + +```bash +# Install pip wheel +$ pip3 install onnxruntime_gpu-1.10.0-cp38-cp38-linux_aarch64.whl + +``` + +## 6.2 在 Jetson AGX Orin 进行模型转换及推理 + +### 6.2.1 ONNX 模型转换 + +同[4.1 模型转换](#4.1-模型转换)相同,在 Jetson 平台下进入安装好的虚拟环境,以及mmdeploy 目录,进行模型ONNX转换。 + +```bash +python tools/deploy.py \ + configs/mmseg/segmentation_onnxruntime_static-512x512.py \ + ../atl_config.py \ + ../deeplabv3plus_r18-d8_512x512_80k_potsdam_20211219_020601-75fd5bc3.pth \ + ../2_13_3584_2560_4096_3072.png \ + --work-dir ../atl_models \ + --device cpu \ + --show \ + --dump-info + +``` + +注: 如果报错提示内容: + +```none +AttributeError: module 'torch.distributed' has no attribute 'ReduceOp' +``` + +可参考以下链接进行解决:https://forums.developer.nvidia.com/t/module-torch-distributed-has-no-attribute-reduceop/256581/6,即安装 pytorch 1.11.0 版本。 + +转换成功后,您将会看到如下信息以及包含 ONNX 模型的文件夹: + +```bash +10/09 19:58:22 - mmengine - INFO - visualize pytorch model success. +10/09 19:58:22 - mmengine - INFO - All process success. +``` + +
+*(截图:ONNX 转换成功的输出信息及生成的模型文件)*
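+
+在进入 TensorRT 转换之前,可以先用 ONNX Runtime 跑一次随机输入,确认 ONNX 模型能正常前向(示意代码,假设输入尺寸为 512x512、模型路径与上文一致):
+
+```python
+import numpy as np
+import onnxruntime as ort
+
+sess = ort.InferenceSession('../atl_models/end2end.onnx',
+                            providers=['CPUExecutionProvider'])
+inp = sess.get_inputs()[0]
+dummy = np.random.rand(1, 3, 512, 512).astype(np.float32)
+outputs = sess.run(None, {inp.name: dummy})
+print([o.shape for o in outputs])  # 输出形状应与分割结果对应
+```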
+ +### 6.2.2 TensorRT 模型转换 + +更换部署trt配置文件,进行 TensorRT 模型转换。 + +```bash +python tools/deploy.py \ + configs/mmseg/segmentation_tensorrt_static-512x512.py \ + ../atl_config.py \ + ../deeplabv3plus_r18-d8_512x512_80k_potsdam_20211219_020601-75fd5bc3.pth \ + ../2_13_3584_2560_4096_3072.png \ + --work-dir ../atl_trt_models \ + --device cuda:0 \ + --show \ + --dump-info + +``` + +转换成功后您将看到以下信息及 TensorRT 模型文件夹: + +```bash +10/09 20:15:50 - mmengine - INFO - visualize pytorch model success. +10/09 20:15:50 - mmengine - INFO - All process success. +``` + +
+*(截图:TensorRT 转换成功的输出信息及生成的模型文件)*
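+
+转换完成后,可以先确认生成的 engine 能被当前环境的 TensorRT 正常反序列化(示意代码,路径沿用上文的 `../atl_trt_models/end2end.engine`):
+
+```python
+import tensorrt as trt
+
+logger = trt.Logger(trt.Logger.WARNING)
+with open('../atl_trt_models/end2end.engine', 'rb') as f, \
+        trt.Runtime(logger) as runtime:
+    engine = runtime.deserialize_cuda_engine(f.read())
+print(engine is not None)  # True 表示反序列化成功
+```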
+ +## 6.3 模型测速 + +执行以下命令完成模型测速,详细内容请查看[ profiler ](https://github.com/open-mmlab/mmdeploy/blob/main/docs/zh_cn/02-how-to-run/useful_tools.md#profiler) + +```bash +python tools/profiler.py \ + ${DEPLOY_CFG} \ + ${MODEL_CFG} \ + ${IMAGE_DIR} \ + --model ${MODEL} \ + --device ${DEVICE} \ + --shape ${SHAPE} \ + --num-iter ${NUM_ITER} \ + --warmup ${WARMUP} \ + --cfg-options ${CFG_OPTIONS} \ + --batch-size ${BATCH_SIZE} \ + --img-ext ${IMG_EXT} +``` + +示例: + +```bash +python tools/profiler.py \ + configs/mmseg/segmentation_tensorrt_static-512x512.py \ + ../atl_config.py \ + ../atl_demo_img \ + --model /home/sirs/AI-Tianlong/OpenMMLab/atl_trt_models/end2end.engine \ + --device cuda:0 \ + --shape 512x512 \ + --num-iter 100 +``` + +测速结果 + +![image](https://github.com/AI-Tianlong/Useful-Tools/assets/50650583/874e9742-ee10-490c-9e69-17da0096c49b) + +## 6.4 模型推理 + +根据[6.2.2](#6.2.2-TensorRT-模型转换)中生成的TensorRT模型文件夹,进行模型推理。 + +```python +from mmdeploy.apis.utils import build_task_processor +from mmdeploy.utils import get_input_shape, load_config +import torch + +deploy_cfg='./mmdeploy/configs/mmseg/segmentation_tensorrt_static-512x512.py' +model_cfg='./atl_config.py' +device='cuda:0' +backend_model = ['./atl_trt_models/end2end.engine'] +image = './atl_demo_img/2_13_2048_1024_2560_1536.png' + +# read deploy_cfg and model_cfg +deploy_cfg, model_cfg = load_config(deploy_cfg, model_cfg) + +# build task and backend model +task_processor = build_task_processor(model_cfg, deploy_cfg, device) +model = task_processor.build_backend_model(backend_model) + +# process input image +input_shape = get_input_shape(deploy_cfg) +model_inputs, _ = task_processor.create_input(image, input_shape) + +# do model inference +with torch.no_grad(): + result = model.test_step(model_inputs) + +# visualize results +task_processor.visualize( + image=image, + model=model, + result=result[0], + window_name='visualize', + output_file='./output_segmentation.png') +``` + +即可得到推理结果: + +
+*(image: NVIDIA-Jetson)*
+
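+If you also need the raw label map rather than only the overlay saved by `task_processor.visualize`, the prediction can be pulled out of the returned `SegDataSample` and saved as a single-channel PNG. A minimal sketch continuing from the `result` variable above (the output filename is arbitrary):
+
+```python
+# Save the predicted label indices as a single-channel PNG.
+import numpy as np
+from PIL import Image
+
+pred = result[0].pred_sem_seg.cpu().data.numpy()[0]  # (H, W) label indices
+Image.fromarray(pred.astype(np.uint8)).save('./output_label.png')
+```
+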
diff --git a/docs/zh_cn/user_guides/index.rst b/docs/zh_cn/user_guides/index.rst new file mode 100644 index 0000000000..d0a313d31e --- /dev/null +++ b/docs/zh_cn/user_guides/index.rst @@ -0,0 +1,21 @@ +训练 & 测试 +************** + +.. toctree:: + :maxdepth: 1 + + 1_config.md + 2_dataset_prepare.md + 3_inference.md + 4_train_test.md + +实用工具 +************* + +.. toctree:: + :maxdepth: 2 + + visualization.md + useful_tools.md + deployment.md + visualization_feature_map.md diff --git a/docs/zh_cn/user_guides/useful_tools.md b/docs/zh_cn/user_guides/useful_tools.md new file mode 100644 index 0000000000..acbacb950f --- /dev/null +++ b/docs/zh_cn/user_guides/useful_tools.md @@ -0,0 +1,368 @@ +## 常用工具(待更新) + +除了训练和测试的脚本,我们在 `tools/` 文件夹路径下还提供许多有用的工具。 + +### 计算参数量(params)和计算量( FLOPs) (试验性) + +我们基于 [flops-counter.pytorch](https://github.com/sovrasov/flops-counter.pytorch) +提供了一个用于计算给定模型参数量和计算量的脚本。 + +```shell +python tools/get_flops.py ${CONFIG_FILE} [--shape ${INPUT_SHAPE}] +``` + +您将得到如下的结果: + +```none +============================== +Input shape: (3, 2048, 1024) +Flops: 1429.68 GMac +Params: 48.98 M +============================== +``` + +**注意**: 这个工具仍然是试验性的,我们无法保证数字是正确的。您可以拿这些结果做简单的实验的对照,在写技术文档报告或者论文前您需要再次确认一下。 + +(1) 计算量与输入的形状有关,而参数量与输入的形状无关,默认的输入形状是 (1, 3, 1280, 800); +(2) 一些运算操作,如 GN 和其他定制的运算操作没有加入到计算量的计算中。 + +### 发布模型 + +在您上传一个模型到云服务器之前,您需要做以下几步: +(1) 将模型权重转成 CPU 张量; +(2) 删除记录优化器状态 (optimizer states)的相关信息; +(3) 计算检查点文件 (checkpoint file) 的哈希编码(hash id)并且将哈希编码加到文件名中。 + +```shell +python tools/publish_model.py ${INPUT_FILENAME} ${OUTPUT_FILENAME} +``` + +例如, + +```shell +python tools/publish_model.py work_dirs/pspnet/latest.pth psp_r50_hszhao_200ep.pth +``` + +最终输出文件将是 `psp_r50_512x1024_40ki_cityscapes-{hash id}.pth`。 + +### 导出 ONNX (试验性) + +我们提供了一个脚本来导出模型到 [ONNX](https://github.com/onnx/onnx) 格式。被转换的模型可以通过工具 [Netron](https://github.com/lutzroeder/netron) +来可视化。除此以外,我们同样支持对 PyTorch 和 ONNX 模型的输出结果做对比。 + +```bash +python tools/pytorch2onnx.py \ + ${CONFIG_FILE} \ + --checkpoint ${CHECKPOINT_FILE} \ + --output-file ${ONNX_FILE} \ + --input-img ${INPUT_IMG} \ + --shape ${INPUT_SHAPE} \ + --rescale-shape ${RESCALE_SHAPE} \ + --show \ + --verify \ + --dynamic-export \ + --cfg-options \ + model.test_cfg.mode="whole" +``` + +各个参数的描述: + +- `config` : 模型配置文件的路径 +- `--checkpoint` : 模型检查点文件的路径 +- `--output-file`: 输出的 ONNX 模型的路径。如果没有专门指定,它默认是 `tmp.onnx` +- `--input-img` : 用来转换和可视化的一张输入图像的路径 +- `--shape`: 模型的输入张量的高和宽。如果没有专门指定,它将被设置成 `test_pipeline` 的 `img_scale` +- `--rescale-shape`: 改变输出的形状。设置这个值来避免 OOM,它仅在 `slide` 模式下可以用 +- `--show`: 是否打印输出模型的结构。如果没有被专门指定,它将被设置成 `False` +- `--verify`: 是否验证一个输出模型的正确性 (correctness)。如果没有被专门指定,它将被设置成 `False` +- `--dynamic-export`: 是否导出形状变化的输入与输出的 ONNX 模型。如果没有被专门指定,它将被设置成 `False` +- `--cfg-options`: 更新配置选项 + +**注意**: 这个工具仍然是试验性的,目前一些自定义操作还没有被支持 + +### 评估 ONNX 模型 + +我们提供 `tools/deploy_test.py` 去评估不同后端的 ONNX 模型。 + +#### 先决条件 + +- 安装 onnx 和 onnxruntime-gpu + + ```shell + pip install onnx onnxruntime-gpu + ``` + +- 参考 [如何在 MMCV 里构建 tensorrt 插件](https://mmcv.readthedocs.io/en/latest/tensorrt_plugin.html#how-to-build-tensorrt-plugins-in-mmcv) 安装TensorRT (可选) + +#### 使用方法 + +```bash +python tools/deploy_test.py \ + ${CONFIG_FILE} \ + ${MODEL_FILE} \ + ${BACKEND} \ + --out ${OUTPUT_FILE} \ + --eval ${EVALUATION_METRICS} \ + --show \ + --show-dir ${SHOW_DIRECTORY} \ + --cfg-options ${CFG_OPTIONS} \ + --eval-options ${EVALUATION_OPTIONS} \ + --opacity ${OPACITY} \ +``` + +各个参数的描述: + +- `config`: 模型配置文件的路径 +- `model`: 被转换的模型文件的路径 +- `backend`: 推理的后端,可选项:`onnxruntime`, `tensorrt` +- `--out`: 
输出结果成 pickle 格式文件的路径 +- `--format-only` : 不评估直接给输出结果的格式。通常用在当您想把结果输出成一些测试服务器需要的特定格式时。如果没有被专门指定,它将被设置成 `False`。 注意这个参数是用 `--eval` 来 **手动添加** +- `--eval`: 评估指标,取决于每个数据集的要求,例如 "mIoU" 是大多数据集的指标而 "cityscapes" 仅针对 Cityscapes 数据集。注意这个参数是用 `--format-only` 来 **手动添加** +- `--show`: 是否展示结果 +- `--show-dir`: 涂上结果的图像被保存的文件夹的路径 +- `--cfg-options`: 重写配置文件里的一些设置,`xxx=yyy` 格式的键值对将被覆盖到配置文件里 +- `--eval-options`: 自定义的评估的选项, `xxx=yyy` 格式的键值对将成为 `dataset.evaluate()` 函数的参数变量 +- `--opacity`: 涂上结果的分割图的透明度,范围在 (0, 1\] 之间 + +#### 结果和模型 + +| 模型 | 配置文件 | 数据集 | 评价指标 | PyTorch | ONNXRuntime | TensorRT-fp32 | TensorRT-fp16 | +| :--------: | :---------------------------------------------: | :--------: | :------: | :-----: | :---------: | :-----------: | :-----------: | +| FCN | fcn_r50-d8_512x1024_40k_cityscapes.py | cityscapes | mIoU | 72.2 | 72.2 | 72.2 | 72.2 | +| PSPNet | pspnet_r50-d8_512x1024_40k_cityscapes.py | cityscapes | mIoU | 77.8 | 77.8 | 77.8 | 77.8 | +| deeplabv3 | deeplabv3_r50-d8_512x1024_40k_cityscapes.py | cityscapes | mIoU | 79.0 | 79.0 | 79.0 | 79.0 | +| deeplabv3+ | deeplabv3plus_r50-d8_512x1024_40k_cityscapes.py | cityscapes | mIoU | 79.6 | 79.5 | 79.5 | 79.5 | +| PSPNet | pspnet_r50-d8_769x769_40k_cityscapes.py | cityscapes | mIoU | 78.2 | 78.1 | | | +| deeplabv3 | deeplabv3_r50-d8_769x769_40k_cityscapes.py | cityscapes | mIoU | 78.5 | 78.3 | | | +| deeplabv3+ | deeplabv3plus_r50-d8_769x769_40k_cityscapes.py | cityscapes | mIoU | 78.9 | 78.7 | | | + +**注意**: TensorRT 仅在使用 `whole mode` 测试模式时的配置文件里可用。 + +### 导出 TorchScript (试验性) + +我们同样提供一个脚本去把模型导出成 [TorchScript](https://pytorch.org/docs/stable/jit.html) 格式。您可以使用 pytorch C++ API [LibTorch](https://pytorch.org/docs/stable/cpp_index.html) 去推理训练好的模型。 +被转换的模型能被像 [Netron](https://github.com/lutzroeder/netron) 的工具来可视化。此外,我们还支持 PyTorch 和 TorchScript 模型的输出结果的比较。 + +```shell +python tools/pytorch2torchscript.py \ + ${CONFIG_FILE} \ + --checkpoint ${CHECKPOINT_FILE} \ + --output-file ${ONNX_FILE} + --shape ${INPUT_SHAPE} + --verify \ + --show +``` + +各个参数的描述: + +- `config` : pytorch 模型的配置文件的路径 +- `--checkpoint` : pytorch 模型的检查点文件的路径 +- `--output-file`: TorchScript 模型输出的路径,如果没有被专门指定,它将被设置成 `tmp.pt` +- `--input-img` : 用来转换和可视化的输入图像的路径 +- `--shape`: 模型的输入张量的宽和高。如果没有被专门指定,它将被设置成 `512 512` +- `--show`: 是否打印输出模型的追踪图 (traced graph),如果没有被专门指定,它将被设置成 `False` +- `--verify`: 是否验证一个输出模型的正确性 (correctness),如果没有被专门指定,它将被设置成 `False` + +**注意**: 目前仅支持 PyTorch>=1.8.0 版本 + +**注意**: 这个工具仍然是试验性的,一些自定义操作符目前还不被支持 + +例子: + +- 导出 PSPNet 在 cityscapes 数据集上的 pytorch 模型 + + ```shell + python tools/pytorch2torchscript.py configs/pspnet/pspnet_r50-d8_512x1024_40k_cityscapes.py \ + --checkpoint checkpoints/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pth \ + --output-file checkpoints/pspnet_r50-d8_512x1024_40k_cityscapes_20200605_003338-2966598c.pt \ + --shape 512 1024 + ``` + +### 导出 TensorRT (试验性) + +一个导出 [ONNX](https://github.com/onnx/onnx) 模型成 [TensorRT](https://developer.nvidia.com/tensorrt) 格式的脚本 + +先决条件 + +- 按照 [ONNXRuntime in mmcv](https://mmcv.readthedocs.io/en/latest/deployment/onnxruntime_op.html) 和 [TensorRT plugin in mmcv](https://github.com/open-mmlab/mmcv/blob/master/docs/en/deployment/tensorrt_plugin.md) ,用 ONNXRuntime 自定义运算 (custom ops) 和 TensorRT 插件安装 `mmcv-full` +- 使用 [pytorch2onnx](#convert-to-onnx-experimental) 将模型从 PyTorch 转成 ONNX + +使用方法 + +```bash +python ${MMSEG_PATH}/tools/onnx2tensorrt.py \ + ${CFG_PATH} \ + ${ONNX_PATH} \ + --trt-file ${OUTPUT_TRT_PATH} \ + --min-shape ${MIN_SHAPE} \ + --max-shape ${MAX_SHAPE} \ + --input-img ${INPUT_IMG} \ + --show 
\ + --verify +``` + +各个参数的描述: + +- `config` : 模型的配置文件 +- `model` : 输入的 ONNX 模型的路径 +- `--trt-file` : 输出的 TensorRT 引擎的路径 +- `--max-shape` : 模型的输入的最大形状 +- `--min-shape` : 模型的输入的最小形状 +- `--fp16` : 做 fp16 模型转换 +- `--workspace-size` : 在 GiB 里的最大工作空间大小 (Max workspace size) +- `--input-img` : 用来可视化的图像 +- `--show` : 做结果的可视化 +- `--dataset` : Palette provider, 默认为 `CityscapesDataset` +- `--verify` : 验证 ONNXRuntime 和 TensorRT 的输出 +- `--verbose` : 当创建 TensorRT 引擎时,是否详细做信息日志。默认为 False + +**注意**: 仅在全图测试模式 (whole mode) 下测试过 + +## 其他内容 + +### 打印完整的配置文件 + +`tools/print_config.py` 会逐字逐句的打印整个配置文件,展开所有的导入。 + +```shell +python tools/print_config.py \ + ${CONFIG} \ + --graph \ + --cfg-options ${OPTIONS [OPTIONS...]} \ +``` + +各个参数的描述: + +- `config` : pytorch 模型的配置文件的路径 +- `--graph` : 是否打印模型的图 (models graph) +- `--cfg-options`: 自定义替换配置文件的选项 + +### 对训练日志 (training logs) 画图 + +`tools/analyze_logs.py` 会画出给定的训练日志文件的 loss/mIoU 曲线,首先需要 `pip install seaborn` 安装依赖包。 + +```shell +python tools/analyze_logs.py xxx.log.json [--keys ${KEYS}] [--legend ${LEGEND}] [--backend ${BACKEND}] [--style ${STYLE}] [--out ${OUT_FILE}] +``` + +示例: + +- 对 mIoU, mAcc, aAcc 指标画图 + + ```shell + python tools/analyze_logs.py log.json --keys mIoU mAcc aAcc --legend mIoU mAcc aAcc + ``` + +- 对 loss 指标画图 + + ```shell + python tools/analyze_logs.py log.json --keys loss --legend loss + ``` + +### 转换其他仓库的权重 + +`tools/model_converters/` 提供了若干个预训练权重转换脚本,支持将其他仓库的预训练权重的 key 转换为与 MMSegmentation 相匹配的 key。 + +#### ViT Swin MiT Transformer 模型 + +- ViT + +`tools/model_converters/vit2mmseg.py` 将 timm 预训练模型转换到 MMSegmentation。 + +```shell +python tools/model_converters/vit2mmseg.py ${SRC} ${DST} +``` + +- Swin + + `tools/model_converters/swin2mmseg.py` 将官方预训练模型转换到 MMSegmentation。 + + ```shell + python tools/model_converters/swin2mmseg.py ${SRC} ${DST} + ``` + +- SegFormer + + `tools/model_converters/mit2mmseg.py` 将官方预训练模型转换到 MMSegmentation。 + + ```shell + python tools/model_converters/mit2mmseg.py ${SRC} ${DST} + ``` + +## 模型服务 + +为了用 [`TorchServe`](https://pytorch.org/serve/) 服务 `MMSegmentation` 的模型 , 您可以遵循如下流程: + +### 1. 将 model 从 MMSegmentation 转换到 TorchServe + +```shell +python tools/mmseg2torchserve.py ${CONFIG_FILE} ${CHECKPOINT_FILE} \ +--output-folder ${MODEL_STORE} \ +--model-name ${MODEL_NAME} +``` + +**注意**: ${MODEL_STORE} 需要设置为某个文件夹的绝对路径 + +### 2. 构建 `mmseg-serve` 容器镜像 (docker image) + +```shell +docker build -t mmseg-serve:latest docker/serve/ +``` + +### 3. 运行 `mmseg-serve` + +请查阅官方文档: [使用容器运行 TorchServe](https://github.com/pytorch/serve/blob/master/docker/README.md#running-torchserve-in-a-production-docker-environment) + +为了在 GPU 环境下使用, 您需要安装 [nvidia-docker](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html). 若在 CPU 环境下使用,您可以忽略添加 `--gpus` 参数。 + +示例: + +```shell +docker run --rm \ +--cpus 8 \ +--gpus device=0 \ +-p8080:8080 -p8081:8081 -p8082:8082 \ +--mount type=bind,source=$MODEL_STORE,target=/home/model-server/model-store \ +mmseg-serve:latest +``` + +阅读关于推理 (8080), 管理 (8081) 和指标 (8082) APIs 的 [文档](https://github.com/pytorch/serve/blob/072f5d088cce9bb64b2a18af065886c9b01b317b/docs/rest_api.md) 。 + +### 4. 测试部署 + +```shell +curl -O https://raw.githubusercontent.com/open-mmlab/mmsegmentation/master/resources/3dogs.jpg +curl http://127.0.0.1:8080/predictions/${MODEL_NAME} -T 3dogs.jpg -o 3dogs_mask.png +``` + +得到的响应将是一个 ".png" 的分割掩码. 
+
+You can visualize the output as follows:
+
+```python
+import matplotlib.pyplot as plt
+import mmcv
+plt.imshow(mmcv.imread("3dogs_mask.png", "grayscale"))
+plt.show()
+```
+
+You should see something similar to:
+
+![3dogs_mask](../../resources/3dogs_mask.png)
+
+You can then use `test_torchserve.py` to compare the results of torchserve and pytorch, and visualize them.
+
+```shell
+python tools/torchserve/test_torchserve.py ${IMAGE_FILE} ${CONFIG_FILE} ${CHECKPOINT_FILE} ${MODEL_NAME}
+[--inference-addr ${INFERENCE_ADDR}] [--result-image ${RESULT_IMAGE}] [--device ${DEVICE}]
+```
+
+Example:
+
+```shell
+python tools/torchserve/test_torchserve.py \
+demo/demo.png \
+configs/fcn/fcn_r50-d8_512x1024_40k_cityscapes.py \
+checkpoint/fcn_r50-d8_512x1024_40k_cityscapes_20200604_192608-efe53f0d.pth \
+fcn
+```
diff --git a/docs/zh_cn/user_guides/visualization.md b/docs/zh_cn/user_guides/visualization.md
new file mode 100644
index 0000000000..2ef020ba85
--- /dev/null
+++ b/docs/zh_cn/user_guides/visualization.md
@@ -0,0 +1,173 @@
+# Visualization
+
+MMSegmentation 1.x provides convenient ways to monitor the training status and to visualize data during model prediction.
+
+## Training status monitoring
+
+MMSegmentation 1.x uses TensorBoard to monitor the training status.
+
+### TensorBoard configuration
+
+Install TensorBoard following the [official guide](https://www.tensorflow.org/install):
+
+```shell
+pip install tensorboardX
+pip install future tensorboard
+```
+
+Add `TensorboardVisBackend` to `vis_backends` in the `default_runtime.py` config file:
+
+```python
+vis_backends = [dict(type='LocalVisBackend'),
+                dict(type='TensorboardVisBackend')]
+visualizer = dict(
+    type='SegLocalVisualizer', vis_backends=vis_backends, name='visualizer')
+```
+
+### Examining scalars in TensorBoard
+
+Launch a training experiment, e.g.:
+
+```shell
+python tools/train.py configs/pspnet/pspnet_r50-d8_4xb4-80k_ade20k-512x512.py --work-dir work_dir/test_visual
+```
+
+Once training starts, find the `vis_data` path under the `work_dir`; for this particular run it is:
+
+```shell
+work_dirs/test_visual/20220810_115248/vis_data
+```
+
+The scalar files under the `vis_data` path include the learning rate, losses, data_time and so on, and also record the metric results; the [logging tutorial](https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/logging.html) in MMEngine describes how to log your own data. To view them in TensorBoard, run:
+
+```shell
+tensorboard --logdir work_dirs/test_visual/20220810_115248/vis_data
+```
+
+## Data and results visualization
+
+### Visualizing data samples during model testing or validation
+
+MMSegmentation provides `SegVisualizationHook`, a [hook](https://mmengine.readthedocs.io/zh_CN/latest/tutorials/hook.html) that can visualize the ground truth and the predicted segmentation results during model testing and validation. Its configuration lives in `default_hooks`; see the [runner tutorial](https://mmengine.readthedocs.io/zh_CN/latest/tutorials/runner.html) for more details.
+
+For example, in `_base_/schedules/schedule_20k.py`, modify the `SegVisualizationHook` config: set `draw` to `True` to enable storing network inference results. `interval` is the sampling interval of the predictions; when set to 1, every inference result of the network is saved. `interval` defaults to 50:
+
+```python
+default_hooks = dict(
+    timer=dict(type='IterTimerHook'),
+    logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False),
+    param_scheduler=dict(type='ParamSchedulerHook'),
+    checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=2000),
+    sampler_seed=dict(type='DistSamplerSeedHook'),
+    visualization=dict(type='SegVisualizationHook', draw=True, interval=1))
+```
+
+After launching a training experiment, the visualizations are stored locally during the validation loop; when evaluating a model on a dataset, the prediction results are stored locally as well. They are kept in `vis_image` under `$WORK_DIRS/vis_data`, e.g.:
+
+```shell
+work_dirs/test_visual/20220810_115248/vis_data/vis_image
+```
+
+In addition, if `TensorboardVisBackend` is added to `vis_backends`, as in [TensorBoard configuration](#tensorboard-configuration), we can also view them in TensorBoard by running:
+
+```shell
+tensorboard --logdir work_dirs/test_visual/20220810_115248/vis_data
+```
+
+### Visualizing a single data sample
+
+If you want to visualize a single data sample, we recommend `SegLocalVisualizer`.
+
+`SegLocalVisualizer` is a subclass of the `Visualizer` class in MMEngine, adapted for MMSegmentation visualization; for details about `Visualizer`, see the [visualization tutorial](https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/visualization.html) in MMEngine.
+
+Here is an example using `SegLocalVisualizer`. First, download the data for this example with the following commands:
+ +
+
+```shell
+wget https://user-images.githubusercontent.com/24582831/189833109-eddad58f-f777-4fc0-b98a-6bd429143b06.png --output-document aachen_000000_000019_leftImg8bit.png
+wget https://user-images.githubusercontent.com/24582831/189833143-15f60f8a-4d1e-4cbb-a6e7-5e2233869fac.png --output-document aachen_000000_000019_gtFine_labelTrainIds.png
+```
+
+Then you can use the following script to visualize them from their local paths:
+
+```python
+import mmcv
+import os.path as osp
+import torch
+
+# `PixelData` is the data structure in MMEngine for pixel-level annotations
+# and predictions. See the MMEngine data structure tutorial:
+# https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/data_element.html#pixeldata
+
+from mmengine.structures import PixelData
+
+# `SegDataSample` is the data structure interface between different
+# components in MMSegmentation. It holds the ground truth, the predicted
+# semantic segmentation and the prediction logits. See the
+# `SegDataSample` tutorial:
+# https://github.com/open-mmlab/mmsegmentation/blob/1.x/docs/en/advanced_guides/structures.md
+
+from mmseg.structures import SegDataSample
+from mmseg.visualization import SegLocalVisualizer
+
+out_file = 'out_file_cityscapes'
+save_dir = './work_dirs'
+
+image = mmcv.imread(
+    osp.join(
+        osp.dirname(__file__),
+        './aachen_000000_000019_leftImg8bit.png'
+    ),
+    'color')
+sem_seg = mmcv.imread(
+    osp.join(
+        osp.dirname(__file__),
+        './aachen_000000_000019_gtFine_labelTrainIds.png'  # noqa
+    ),
+    'unchanged')
+sem_seg = torch.from_numpy(sem_seg)
+gt_sem_seg_data = dict(data=sem_seg)
+gt_sem_seg = PixelData(**gt_sem_seg_data)
+data_sample = SegDataSample()
+data_sample.gt_sem_seg = gt_sem_seg
+
+seg_local_visualizer = SegLocalVisualizer(
+    vis_backends=[dict(type='LocalVisBackend')],
+    save_dir=save_dir)
+
+# The dataset meta information usually includes `classes` (the class names)
+# and `palette` (the color of each foreground class for visualization).
+# All class names and palettes are defined in:
+# https://github.com/open-mmlab/mmsegmentation/blob/1.x/mmseg/utils/class_names.py
+
+seg_local_visualizer.dataset_meta = dict(
+    classes=('road', 'sidewalk', 'building', 'wall', 'fence',
+             'pole', 'traffic light', 'traffic sign',
+             'vegetation', 'terrain', 'sky', 'person', 'rider',
+             'car', 'truck', 'bus', 'train', 'motorcycle',
+             'bicycle'),
+    palette=[[128, 64, 128], [244, 35, 232], [70, 70, 70],
+             [102, 102, 156], [190, 153, 153], [153, 153, 153],
+             [250, 170, 30], [220, 220, 0], [107, 142, 35],
+             [152, 251, 152], [70, 130, 180], [220, 20, 60],
+             [255, 0, 0], [0, 0, 142], [0, 0, 70],
+             [0, 60, 100], [0, 80, 100], [0, 0, 230],
+             [119, 11, 32]])
+
+# When `show=True`, the result is displayed directly;
+# when `show=False`, the result is saved in the local folder.
+
+seg_local_visualizer.add_datasample(out_file, image,
+                                    data_sample, show=False)
+```
+
+The visualized result and its corresponding ground truth image can be found under `./work_dirs/vis_data/vis_image/`, named `out_file_cityscapes_0.png`.
+ +
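+If you would rather work with the blended result in memory than read the saved PNG back from disk, the visualizer keeps the most recently drawn image; a short follow-up to the script above (illustrative, not part of the original example):
+
+```python
+# Fetch the drawn visualization directly from the visualizer.
+drawn_img = seg_local_visualizer.get_image()  # np.ndarray in RGB order
+print(drawn_img.shape)
+```
+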
+
+If you would like to learn more about using visualization, you can refer to the [visualization tutorial](https://mmengine.readthedocs.io/en/latest/advanced_tutorials/visualization.html) in MMEngine.
diff --git a/docs/zh_cn/user_guides/visualization_feature_map.md b/docs/zh_cn/user_guides/visualization_feature_map.md
new file mode 100644
index 0000000000..fda99bb5ab
--- /dev/null
+++ b/docs/zh_cn/user_guides/visualization_feature_map.md
@@ -0,0 +1,201 @@
+# Feature Map Visualization with wandb
+
+MMSegmentation 1.x provides backend support for Weights & Biases, which makes it convenient to visualize and manage project results.
+
+## Wandb configuration
+
+Install Weights & Biases following the [official guide](https://docs.wandb.ai/quickstart):
+
+```shell
+pip install wandb
+wandb login
+```
+
+Add `WandbVisBackend` to `vis_backends`:
+
+```python
+vis_backends = [dict(type='LocalVisBackend'),
+                dict(type='TensorboardVisBackend'),
+                dict(type='WandbVisBackend')]
+```
+
+## Visualizing test data, results and feature maps
+
+`SegLocalVisualizer` is a subclass of the `Visualizer` class in MMEngine, adapted for MMSegmentation visualization; for details about `Visualizer`, see the [visualization tutorial](https://mmengine.readthedocs.io/zh_CN/latest/advanced_tutorials/visualization.html) in MMEngine.
+
+Here is an example using `SegLocalVisualizer`. First, download the data for this example with the following commands:
+ +
+
+```shell
+wget https://user-images.githubusercontent.com/24582831/189833109-eddad58f-f777-4fc0-b98a-6bd429143b06.png --output-document aachen_000000_000019_leftImg8bit.png
+wget https://user-images.githubusercontent.com/24582831/189833143-15f60f8a-4d1e-4cbb-a6e7-5e2233869fac.png --output-document aachen_000000_000019_gtFine_labelTrainIds.png
+
+wget https://download.openmmlab.com/mmsegmentation/v0.5/ann/ann_r50-d8_512x1024_40k_cityscapes/ann_r50-d8_512x1024_40k_cityscapes_20200605_095211-049fc292.pth
+```
+
+```python
+# Copyright (c) OpenMMLab. All rights reserved.
+from argparse import ArgumentParser
+from typing import Type
+
+import mmcv
+import torch
+import torch.nn as nn
+
+from mmengine.model import revert_sync_batchnorm
+from mmengine.structures import PixelData
+from mmseg.apis import inference_model, init_model
+from mmseg.structures import SegDataSample
+from mmseg.utils import register_all_modules
+from mmseg.visualization import SegLocalVisualizer
+
+
+class Recorder:
+    """Record the forward output feature maps and save them to data_buffer."""
+
+    def __init__(self) -> None:
+        self.data_buffer = list()
+
+    def __enter__(self):
+        # reset the buffer when a new recording context starts
+        self.data_buffer = list()
+        return self
+
+    def record_data_hook(self, model: nn.Module, input: Type, output: Type):
+        self.data_buffer.append(output)
+
+    def __exit__(self, *args, **kwargs):
+        pass
+
+
+def visualize(args, model, recorder, result):
+    seg_visualizer = SegLocalVisualizer(
+        vis_backends=[dict(type='WandbVisBackend')],
+        save_dir='temp_dir',
+        alpha=0.5)
+    seg_visualizer.dataset_meta = dict(
+        classes=model.dataset_meta['classes'],
+        palette=model.dataset_meta['palette'])
+
+    image = mmcv.imread(args.img, 'color')
+
+    seg_visualizer.add_datasample(
+        name='predict',
+        image=image,
+        data_sample=result,
+        draw_gt=False,
+        draw_pred=True,
+        wait_time=0,
+        out_file=None,
+        show=False)
+
+    # add the recorded feature maps to the wandb visualizer
+    for i in range(len(recorder.data_buffer)):
+        feature = recorder.data_buffer[i][0]  # remove the batch
+        drawn_img = seg_visualizer.draw_featmap(
+            feature, image, channel_reduction='select_max')
+        seg_visualizer.add_image(f'feature_map{i}', drawn_img)
+
+    if args.gt_mask:
+        sem_seg = mmcv.imread(args.gt_mask, 'unchanged')
+        sem_seg = torch.from_numpy(sem_seg)
+        gt_mask = dict(data=sem_seg)
+        gt_mask = PixelData(**gt_mask)
+        data_sample = SegDataSample()
+        data_sample.gt_sem_seg = gt_mask
+
+        seg_visualizer.add_datasample(
+            name='gt_mask',
+            image=image,
+            data_sample=data_sample,
+            draw_gt=True,
+            draw_pred=False,
+            wait_time=0,
+            out_file=None,
+            show=False)
+
+    seg_visualizer.add_image('image', image)
+
+
+def main():
+    parser = ArgumentParser(
+        description='Draw the Feature Map During Inference')
+    parser.add_argument('img', help='Image file')
+    parser.add_argument('config', help='Config file')
+    parser.add_argument('checkpoint', help='Checkpoint file')
+    parser.add_argument('--gt_mask', default=None, help='Path of gt mask file')
+    parser.add_argument('--out-file', default=None, help='Path to output file')
+    parser.add_argument(
+        '--device', default='cuda:0', help='Device used for inference')
+    parser.add_argument(
+        '--opacity',
+        type=float,
+        default=0.5,
+        help='Opacity of painted segmentation map. In (0, 1] range.')
+    parser.add_argument(
+        '--title', default='result', help='The image identifier.')
+    args = parser.parse_args()
+
+    register_all_modules()
+
+    # build the model from a config file and a checkpoint file
+    model = init_model(args.config, args.checkpoint, device=args.device)
+    if args.device == 'cpu':
+        model = revert_sync_batchnorm(model)
+
+    # print all named modules in the model; pick the ones you want to
+    # record from and put them in the source list below
+    for name, module in model.named_modules():
+        print(name)
+
+    source = [
+        'decode_head.fusion.stages.0.query_project.activate',
+        'decode_head.context.stages.0.key_project.activate',
+        'decode_head.context.bottleneck.activate'
+    ]
+    source = dict.fromkeys(source)
+
+    count = 0
+    recorder = Recorder()
+    # register the forward hooks
+    for name, module in model.named_modules():
+        if name in source:
+            count += 1
+            module.register_forward_hook(recorder.record_data_hook)
+        if count == len(source):
+            break
+
+    with recorder:
+        # test a single image and record the feature maps to data_buffer
+        result = inference_model(model, args.img)
+
+    visualize(args, model, recorder, result)
+
+
+if __name__ == '__main__':
+    main()
+```
+
+Save the code above as feature_map_visual.py and run it in a terminal:
+
+```shell
+python feature_map_visual.py ${IMAGE_FILE} ${CONFIG_FILE} ${CHECKPOINT_FILE} [optional args]
+```
+
+Example:
+
+```shell
+python feature_map_visual.py \
+aachen_000000_000019_leftImg8bit.png \
+configs/ann/ann_r50-d8_4xb2-40k_cityscapes-512x1024.py \
+ann_r50-d8_512x1024_40k_cityscapes_20200605_095211-049fc292.pth \
+--gt_mask aachen_000000_000019_gtFine_labelTrainIds.png
+```
+
+The visualized images and the corresponding feature maps will appear in your wandb account.
+ +
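+As an aside not covered by the original walkthrough: if you want to iterate locally without syncing every run to the wandb cloud, wandb supports an offline mode, assuming a standard wandb installation:
+
+```python
+# Keep wandb runs local; they can be uploaded later with `wandb sync`.
+import os
+os.environ['WANDB_MODE'] = 'offline'  # set before the visualizer is created
+```
+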
diff --git a/mmseg/__init__.py b/mmseg/__init__.py index 360abfc857..5fcb84e8c4 100644 --- a/mmseg/__init__.py +++ b/mmseg/__init__.py @@ -2,12 +2,15 @@ import warnings import mmcv +import mmengine from packaging.version import parse from .version import __version__, version_info -MMCV_MIN = '1.3.13' -MMCV_MAX = '1.6.0' +MMCV_MIN = '2.0.0rc4' +MMCV_MAX = '2.2.0' +MMENGINE_MIN = '0.5.0' +MMENGINE_MAX = '1.0.0' def digit_version(version_str: str, length: int = 4): @@ -55,8 +58,17 @@ def digit_version(version_str: str, length: int = 4): mmcv_version = digit_version(mmcv.__version__) -assert (mmcv_min_version <= mmcv_version <= mmcv_max_version), \ +assert (mmcv_min_version <= mmcv_version < mmcv_max_version), \ f'MMCV=={mmcv.__version__} is used but incompatible. ' \ - f'Please install mmcv>={mmcv_min_version}, <={mmcv_max_version}.' + f'Please install mmcv>=2.0.0rc4.' + +mmengine_min_version = digit_version(MMENGINE_MIN) +mmengine_max_version = digit_version(MMENGINE_MAX) +mmengine_version = digit_version(mmengine.__version__) + +assert (mmengine_min_version <= mmengine_version < mmengine_max_version), \ + f'MMEngine=={mmengine.__version__} is used but incompatible. ' \ + f'Please install mmengine>={mmengine_min_version}, '\ + f'<{mmengine_max_version}.' __all__ = ['__version__', 'version_info', 'digit_version'] diff --git a/mmseg/apis/__init__.py b/mmseg/apis/__init__.py index 9933b99b3c..b50a266319 100644 --- a/mmseg/apis/__init__.py +++ b/mmseg/apis/__init__.py @@ -1,4 +1,9 @@ # Copyright (c) OpenMMLab. All rights reserved. from .inference import inference_model, init_model, show_result_pyplot +from .mmseg_inferencer import MMSegInferencer +from .remote_sense_inferencer import RSImage, RSInferencer -__all__ = ['init_model', 'inference_model', 'show_result_pyplot'] +__all__ = [ + 'init_model', 'inference_model', 'show_result_pyplot', 'MMSegInferencer', + 'RSInferencer', 'RSImage' +] diff --git a/mmseg/apis/inference.py b/mmseg/apis/inference.py index bdbae1d0cc..aab11d14f4 100644 --- a/mmseg/apis/inference.py +++ b/mmseg/apis/inference.py @@ -1,73 +1,101 @@ # Copyright (c) OpenMMLab. All rights reserved. -import matplotlib.pyplot as plt +import warnings +from pathlib import Path +from typing import Optional, Union + import mmcv +import numpy as np import torch -from mmcv.parallel import collate, scatter -from mmcv.runner import load_checkpoint - -from mmseg.datasets.transforms import Compose -from mmseg.models import build_segmentor - - -def init_model(config, checkpoint=None, device='cuda:0'): +from mmengine import Config +from mmengine.registry import init_default_scope +from mmengine.runner import load_checkpoint +from mmengine.utils import mkdir_or_exist + +from mmseg.models import BaseSegmentor +from mmseg.registry import MODELS +from mmseg.structures import SegDataSample +from mmseg.utils import SampleList, dataset_aliases, get_classes, get_palette +from mmseg.visualization import SegLocalVisualizer +from .utils import ImageType, _preprare_data + + +def init_model(config: Union[str, Path, Config], + checkpoint: Optional[str] = None, + device: str = 'cuda:0', + cfg_options: Optional[dict] = None): """Initialize a segmentor from config file. Args: - config (str or :obj:`mmcv.Config`): Config file path or the config - object. + config (str, :obj:`Path`, or :obj:`mmengine.Config`): Config file path, + :obj:`Path`, or the config object. checkpoint (str, optional): Checkpoint path. If left as None, the model will not load any weights. device (str, optional) CPU/CUDA device option. 
Default 'cuda:0'. Use 'cpu' for loading model on CPU. + cfg_options (dict, optional): Options to override some settings in + the used config. Returns: nn.Module: The constructed segmentor. """ - if isinstance(config, str): - config = mmcv.Config.fromfile(config) - elif not isinstance(config, mmcv.Config): + if isinstance(config, (str, Path)): + config = Config.fromfile(config) + elif not isinstance(config, Config): raise TypeError('config must be a filename or Config object, ' 'but got {}'.format(type(config))) + if cfg_options is not None: + config.merge_from_dict(cfg_options) + if config.model.type == 'EncoderDecoder': + if 'init_cfg' in config.model.backbone: + config.model.backbone.init_cfg = None + elif config.model.type == 'MultimodalEncoderDecoder': + for k, v in config.model.items(): + if isinstance(v, dict) and 'init_cfg' in v: + config.model[k].init_cfg = None config.model.pretrained = None config.model.train_cfg = None - model = build_segmentor(config.model, test_cfg=config.get('test_cfg')) + init_default_scope(config.get('default_scope', 'mmseg')) + + model = MODELS.build(config.model) if checkpoint is not None: checkpoint = load_checkpoint(model, checkpoint, map_location='cpu') - model.CLASSES = checkpoint['meta']['CLASSES'] - model.PALETTE = checkpoint['meta']['PALETTE'] + dataset_meta = checkpoint['meta'].get('dataset_meta', None) + # save the dataset_meta in the model for convenience + if 'dataset_meta' in checkpoint.get('meta', {}): + # mmseg 1.x + model.dataset_meta = dataset_meta + elif 'CLASSES' in checkpoint.get('meta', {}): + # < mmseg 1.x + classes = checkpoint['meta']['CLASSES'] + palette = checkpoint['meta']['PALETTE'] + model.dataset_meta = {'classes': classes, 'palette': palette} + else: + warnings.simplefilter('once') + warnings.warn( + 'dataset_meta or class names are not saved in the ' + 'checkpoint\'s meta data, classes and palette will be' + 'set according to num_classes ') + num_classes = model.decode_head.num_classes + dataset_name = None + for name in dataset_aliases.keys(): + if len(get_classes(name)) == num_classes: + dataset_name = name + break + if dataset_name is None: + warnings.warn( + 'No suitable dataset found, use Cityscapes by default') + dataset_name = 'cityscapes' + model.dataset_meta = { + 'classes': get_classes(dataset_name), + 'palette': get_palette(dataset_name) + } model.cfg = config # save the config in the model for convenience model.to(device) model.eval() return model -class LoadImage: - """A simple pipeline to load image.""" - - def __call__(self, results): - """Call function to load images into results. - - Args: - results (dict): A result dict contains the file name - of the image to be read. - - Returns: - dict: ``results`` will be returned containing loaded image. - """ - - if isinstance(results['img'], str): - results['filename'] = results['img'] - results['ori_filename'] = results['img'] - else: - results['filename'] = None - results['ori_filename'] = None - img = mmcv.imread(results['img']) - results['img'] = img - results['img_shape'] = img.shape - results['ori_shape'] = img.shape - return results - - -def inference_model(model, img): +def inference_model(model: BaseSegmentor, + img: ImageType) -> Union[SegDataSample, SampleList]: """Inference image(s) with the segmentor. Args: @@ -76,61 +104,86 @@ def inference_model(model, img): images. Returns: - (list[Tensor]): The segmentation result. 
+ :obj:`SegDataSample` or list[:obj:`SegDataSample`]: + If imgs is a list or tuple, the same length list type results + will be returned, otherwise return the segmentation results directly. """ - cfg = model.cfg - device = next(model.parameters()).device # model device - # build the data pipeline - test_pipeline = [LoadImage()] + cfg.data.test.pipeline[1:] - test_pipeline = Compose(test_pipeline) # prepare data - data = dict(img=img) - data = test_pipeline(data) - data = collate([data], samples_per_gpu=1) - if next(model.parameters()).is_cuda: - # scatter to specified GPU - data = scatter(data, [device])[0] - else: - data['img_metas'] = [i.data[0] for i in data['img_metas']] + data, is_batch = _preprare_data(img, model) # forward the model with torch.no_grad(): - result = model(return_loss=False, rescale=True, **data) - return result - - -def show_result_pyplot(model, - img, - result, - palette=None, - fig_size=(15, 10), - opacity=0.5, - title='', - block=True): + results = model.test_step(data) + + return results if is_batch else results[0] + + +def show_result_pyplot(model: BaseSegmentor, + img: Union[str, np.ndarray], + result: SegDataSample, + opacity: float = 0.5, + title: str = '', + draw_gt: bool = True, + draw_pred: bool = True, + wait_time: float = 0, + show: bool = True, + with_labels: Optional[bool] = True, + save_dir=None, + out_file=None): """Visualize the segmentation results on the image. Args: model (nn.Module): The loaded segmentor. img (str or np.ndarray): Image filename or loaded image. - result (list): The segmentation result. - palette (list[list[int]]] | None): The palette of segmentation - map. If None is given, random palette will be generated. - Default: None - fig_size (tuple): Figure size of the pyplot figure. + result (SegDataSample): The prediction SegDataSample result. opacity(float): Opacity of painted segmentation map. - Default 0.5. - Must be in (0, 1] range. + Default 0.5. Must be in (0, 1] range. title (str): The title of pyplot figure. Default is ''. - block (bool): Whether to block the pyplot figure. - Default is True. + draw_gt (bool): Whether to draw GT SegDataSample. Default to True. + draw_pred (bool): Whether to draw Prediction SegDataSample. + Defaults to True. + wait_time (float): The interval of show (s). 0 is the special value + that means "forever". Defaults to 0. + show (bool): Whether to display the drawn image. + Default to True. + with_labels(bool, optional): Add semantic labels in visualization + result, Default to True. + save_dir (str, optional): Save file dir for all storage backends. + If it is None, the backend storage will not save any data. + out_file (str, optional): Path to output file. Default to None. + + + + Returns: + np.ndarray: the drawn image which channel is RGB. 
""" if hasattr(model, 'module'): model = model.module - img = model.show_result( - img, result, palette=palette, show=False, opacity=opacity) - plt.figure(figsize=fig_size) - plt.imshow(mmcv.bgr2rgb(img)) - plt.title(title) - plt.tight_layout() - plt.show(block=block) + if isinstance(img, str): + image = mmcv.imread(img, channel_order='rgb') + else: + image = img + if save_dir is not None: + mkdir_or_exist(save_dir) + # init visualizer + visualizer = SegLocalVisualizer( + vis_backends=[dict(type='LocalVisBackend')], + save_dir=save_dir, + alpha=opacity) + visualizer.dataset_meta = dict( + classes=model.dataset_meta['classes'], + palette=model.dataset_meta['palette']) + visualizer.add_datasample( + name=title, + image=image, + data_sample=result, + draw_gt=draw_gt, + draw_pred=draw_pred, + wait_time=wait_time, + out_file=out_file, + show=show, + with_labels=with_labels) + vis_img = visualizer.get_image() + + return vis_img diff --git a/mmseg/apis/mmseg_inferencer.py b/mmseg/apis/mmseg_inferencer.py new file mode 100644 index 0000000000..02a198b516 --- /dev/null +++ b/mmseg/apis/mmseg_inferencer.py @@ -0,0 +1,382 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import warnings +from typing import List, Optional, Sequence, Union + +import mmcv +import mmengine +import numpy as np +import torch +import torch.nn as nn +from mmcv.transforms import Compose +from mmengine.infer.infer import BaseInferencer, ModelType +from mmengine.model import revert_sync_batchnorm +from mmengine.registry import init_default_scope +from mmengine.runner.checkpoint import _load_checkpoint_to_model +from PIL import Image + +from mmseg.structures import SegDataSample +from mmseg.utils import ConfigType, SampleList, get_classes, get_palette +from mmseg.visualization import SegLocalVisualizer + +InputType = Union[str, np.ndarray] +InputsType = Union[InputType, Sequence[InputType]] +PredType = Union[SegDataSample, SampleList] + + +class MMSegInferencer(BaseInferencer): + """Semantic segmentation inferencer, provides inference and visualization + interfaces. Note: MMEngine >= 0.5.0 is required. + + Args: + model (str, optional): Path to the config file or the model name + defined in metafile. Take the `mmseg metafile `_ + as an example the `model` could be + "fcn_r50-d8_4xb2-40k_cityscapes-512x1024", and the weights of model + will be download automatically. If use config file, like + "configs/fcn/fcn_r50-d8_4xb2-40k_cityscapes-512x1024.py", the + `weights` should be defined. + weights (str, optional): Path to the checkpoint. If it is not specified + and model is a model name of metafile, the weights will be loaded + from metafile. Defaults to None. + classes (list, optional): Input classes for result rendering, as the + prediction of segmentation model is a segment map with label + indices, `classes` is a list which includes items responding to the + label indices. If classes is not defined, visualizer will take + `cityscapes` classes by default. Defaults to None. + palette (list, optional): Input palette for result rendering, which is + a list of color palette responding to the classes. If palette is + not defined, visualizer will take `cityscapes` palette by default. + Defaults to None. + dataset_name (str, optional): `Dataset name or alias `_ + visulizer will use the meta information of the dataset i.e. classes + and palette, but the `classes` and `palette` have higher priority. + Defaults to None. + device (str, optional): Device to run inference. 
If None, the available + device will be automatically used. Defaults to None. + scope (str, optional): The scope of the model. Defaults to 'mmseg'. + """ # noqa + + preprocess_kwargs: set = set() + forward_kwargs: set = {'mode', 'out_dir'} + visualize_kwargs: set = { + 'show', 'wait_time', 'img_out_dir', 'opacity', 'return_vis', + 'with_labels' + } + postprocess_kwargs: set = {'pred_out_dir', 'return_datasample'} + + def __init__(self, + model: Union[ModelType, str], + weights: Optional[str] = None, + classes: Optional[Union[str, List]] = None, + palette: Optional[Union[str, List]] = None, + dataset_name: Optional[str] = None, + device: Optional[str] = None, + scope: Optional[str] = 'mmseg') -> None: + # A global counter tracking the number of images processes, for + # naming of the output images + self.num_visualized_imgs = 0 + self.num_pred_imgs = 0 + init_default_scope(scope if scope else 'mmseg') + super().__init__( + model=model, weights=weights, device=device, scope=scope) + + if device == 'cpu' or not torch.cuda.is_available(): + self.model = revert_sync_batchnorm(self.model) + + assert isinstance(self.visualizer, SegLocalVisualizer) + self.visualizer.set_dataset_meta(classes, palette, dataset_name) + + def _load_weights_to_model(self, model: nn.Module, + checkpoint: Optional[dict], + cfg: Optional[ConfigType]) -> None: + """Loading model weights and meta information from cfg and checkpoint. + + Subclasses could override this method to load extra meta information + from ``checkpoint`` and ``cfg`` to model. + + Args: + model (nn.Module): Model to load weights and meta information. + checkpoint (dict, optional): The loaded checkpoint. + cfg (Config or ConfigDict, optional): The loaded config. + """ + + if checkpoint is not None: + _load_checkpoint_to_model(model, checkpoint) + checkpoint_meta = checkpoint.get('meta', {}) + # save the dataset_meta in the model for convenience + if 'dataset_meta' in checkpoint_meta: + # mmsegmentation 1.x + model.dataset_meta = { + 'classes': checkpoint_meta['dataset_meta'].get('classes'), + 'palette': checkpoint_meta['dataset_meta'].get('palette') + } + elif 'CLASSES' in checkpoint_meta: + # mmsegmentation 0.x + classes = checkpoint_meta['CLASSES'] + palette = checkpoint_meta.get('PALETTE', None) + model.dataset_meta = {'classes': classes, 'palette': palette} + else: + warnings.warn( + 'dataset_meta or class names are not saved in the ' + 'checkpoint\'s meta data, use classes of Cityscapes by ' + 'default.') + model.dataset_meta = { + 'classes': get_classes('cityscapes'), + 'palette': get_palette('cityscapes') + } + else: + warnings.warn('Checkpoint is not loaded, and the inference ' + 'result is calculated by the randomly initialized ' + 'model!') + warnings.warn( + 'weights is None, use cityscapes classes by default.') + model.dataset_meta = { + 'classes': get_classes('cityscapes'), + 'palette': get_palette('cityscapes') + } + + def __call__(self, + inputs: InputsType, + return_datasamples: bool = False, + batch_size: int = 1, + return_vis: bool = False, + show: bool = False, + wait_time: int = 0, + out_dir: str = '', + img_out_dir: str = 'vis', + pred_out_dir: str = 'pred', + **kwargs) -> dict: + """Call the inferencer. + + Args: + inputs (Union[list, str, np.ndarray]): Inputs for the inferencer. + return_datasamples (bool): Whether to return results as + :obj:`SegDataSample`. Defaults to False. + batch_size (int): Batch size. Defaults to 1. + show (bool): Whether to display the rendering color segmentation + mask in a popup window. 
Defaults to False. + wait_time (float): The interval of show (s). Defaults to 0. + out_dir (str): Output directory of inference results. Defaults + to ''. + img_out_dir (str): Subdirectory of `out_dir`, used to save + rendering color segmentation mask, so `out_dir` must be defined + if you would like to save predicted mask. Defaults to 'vis'. + pred_out_dir (str): Subdirectory of `out_dir`, used to save + predicted mask file, so `out_dir` must be defined if you would + like to save predicted mask. Defaults to 'pred'. + + **kwargs: Other keyword arguments passed to :meth:`preprocess`, + :meth:`forward`, :meth:`visualize` and :meth:`postprocess`. + Each key in kwargs should be in the corresponding set of + ``preprocess_kwargs``, ``forward_kwargs``, ``visualize_kwargs`` + and ``postprocess_kwargs``. + + + Returns: + dict: Inference and visualization results. + """ + + if out_dir != '': + pred_out_dir = osp.join(out_dir, pred_out_dir) + img_out_dir = osp.join(out_dir, img_out_dir) + else: + pred_out_dir = '' + img_out_dir = '' + + return super().__call__( + inputs=inputs, + return_datasamples=return_datasamples, + batch_size=batch_size, + show=show, + wait_time=wait_time, + img_out_dir=img_out_dir, + pred_out_dir=pred_out_dir, + return_vis=return_vis, + **kwargs) + + def visualize(self, + inputs: list, + preds: List[dict], + return_vis: bool = False, + show: bool = False, + wait_time: int = 0, + img_out_dir: str = '', + opacity: float = 0.8, + with_labels: Optional[bool] = True) -> List[np.ndarray]: + """Visualize predictions. + + Args: + inputs (list): Inputs preprocessed by :meth:`_inputs_to_list`. + preds (Any): Predictions of the model. + show (bool): Whether to display the image in a popup window. + Defaults to False. + wait_time (float): The interval of show (s). Defaults to 0. + img_out_dir (str): Output directory of rendering prediction i.e. + color segmentation mask. Defaults: '' + opacity (int, float): The transparency of segmentation mask. + Defaults to 0.8. + + Returns: + List[np.ndarray]: Visualization results. + """ + if not show and img_out_dir == '' and not return_vis: + return None + if self.visualizer is None: + raise ValueError('Visualization needs the "visualizer" term' + 'defined in the config, but got None.') + + self.visualizer.set_dataset_meta(**self.model.dataset_meta) + self.visualizer.alpha = opacity + + results = [] + + for single_input, pred in zip(inputs, preds): + if isinstance(single_input, str): + img_bytes = mmengine.fileio.get(single_input) + img = mmcv.imfrombytes(img_bytes) + img = img[:, :, ::-1] + img_name = osp.basename(single_input) + elif isinstance(single_input, np.ndarray): + img = single_input.copy() + img_num = str(self.num_visualized_imgs).zfill(8) + '_vis' + img_name = f'{img_num}.jpg' + else: + raise ValueError('Unsupported input type:' + f'{type(single_input)}') + + out_file = osp.join(img_out_dir, img_name) if img_out_dir != ''\ + else None + + self.visualizer.add_datasample( + img_name, + img, + pred, + show=show, + wait_time=wait_time, + draw_gt=False, + draw_pred=True, + out_file=out_file, + with_labels=with_labels) + if return_vis: + results.append(self.visualizer.get_image()) + self.num_visualized_imgs += 1 + + return results if return_vis else None + + def postprocess(self, + preds: PredType, + visualization: List[np.ndarray], + return_datasample: bool = False, + pred_out_dir: str = '') -> dict: + """Process the predictions and visualization results from ``forward`` + and ``visualize``. 
+ + This method should be responsible for the following tasks: + + 1. Pack the predictions and visualization results and return them. + 2. Save the predictions, if it needed. + + Args: + preds (List[Dict]): Predictions of the model. + visualization (List[np.ndarray]): The list of rendering color + segmentation mask. + return_datasample (bool): Whether to return results as datasamples. + Defaults to False. + pred_out_dir: File to save the inference results w/o + visualization. If left as empty, no file will be saved. + Defaults to ''. + + Returns: + dict: Inference and visualization results with key ``predictions`` + and ``visualization`` + + - ``visualization (Any)``: Returned by :meth:`visualize` + - ``predictions`` (List[np.ndarray], np.ndarray): Returned by + :meth:`forward` and processed in :meth:`postprocess`. + If ``return_datasample=False``, it will be the segmentation mask + with label indice. + """ + if return_datasample: + if len(preds) == 1: + return preds[0] + else: + return preds + + results_dict = {} + + results_dict['predictions'] = [] + results_dict['visualization'] = [] + + for i, pred in enumerate(preds): + pred_data = dict() + if 'pred_sem_seg' in pred.keys(): + pred_data['sem_seg'] = pred.pred_sem_seg.numpy().data[0] + elif 'pred_depth_map' in pred.keys(): + pred_data['depth_map'] = pred.pred_depth_map.numpy().data[0] + + if visualization is not None: + vis = visualization[i] + results_dict['visualization'].append(vis) + if pred_out_dir != '': + mmengine.mkdir_or_exist(pred_out_dir) + for key, data in pred_data.items(): + post_fix = '_pred.png' if key == 'sem_seg' else '_pred.npy' + img_name = str(self.num_pred_imgs).zfill(8) + post_fix + img_path = osp.join(pred_out_dir, img_name) + if key == 'sem_seg': + output = Image.fromarray(data.astype(np.uint8)) + output.save(img_path) + else: + np.save(img_path, data) + pred_data = next(iter(pred_data.values())) + results_dict['predictions'].append(pred_data) + self.num_pred_imgs += 1 + + if len(results_dict['predictions']) == 1: + results_dict['predictions'] = results_dict['predictions'][0] + if visualization is not None: + results_dict['visualization'] = \ + results_dict['visualization'][0] + return results_dict + + def _init_pipeline(self, cfg: ConfigType) -> Compose: + """Initialize the test pipeline. + + Return a pipeline to handle various input data, such as ``str``, + ``np.ndarray``. It is an abstract method in BaseInferencer, and should + be implemented in subclasses. + + The returned pipeline will be used to process a single data. + It will be used in :meth:`preprocess` like this: + + .. code-block:: python + def preprocess(self, inputs, batch_size, **kwargs): + ... + dataset = map(self.pipeline, dataset) + ... + """ + pipeline_cfg = cfg.test_dataloader.dataset.pipeline + # Loading annotations is also not applicable + for transform in ('LoadAnnotations', 'LoadDepthAnnotation'): + idx = self._get_transform_idx(pipeline_cfg, transform) + if idx != -1: + del pipeline_cfg[idx] + + load_img_idx = self._get_transform_idx(pipeline_cfg, + 'LoadImageFromFile') + if load_img_idx == -1: + raise ValueError( + 'LoadImageFromFile is not found in the test pipeline') + pipeline_cfg[load_img_idx]['type'] = 'InferencerLoader' + return Compose(pipeline_cfg) + + def _get_transform_idx(self, pipeline_cfg: ConfigType, name: str) -> int: + """Returns the index of the transform in a pipeline. + + If the transform is not found, returns -1. 
+ """ + for i, transform in enumerate(pipeline_cfg): + if transform['type'] == name: + return i + return -1 diff --git a/mmseg/apis/remote_sense_inferencer.py b/mmseg/apis/remote_sense_inferencer.py new file mode 100644 index 0000000000..6726c6ae34 --- /dev/null +++ b/mmseg/apis/remote_sense_inferencer.py @@ -0,0 +1,279 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import threading +from queue import Queue +from typing import List, Optional, Tuple + +import numpy as np +import torch +from mmengine import Config +from mmengine.model import BaseModel +from mmengine.registry import init_default_scope +from mmengine.runner import load_checkpoint + +try: + from osgeo import gdal +except ImportError: + gdal = None + +from mmseg.registry import MODELS +from .utils import _preprare_data + + +class RSImage: + """Remote sensing image class. + + Args: + img (str or gdal.Dataset): Image file path or gdal.Dataset. + """ + + def __init__(self, image): + self.dataset = gdal.Open(image, gdal.GA_ReadOnly) if isinstance( + image, str) else image + assert isinstance(self.dataset, gdal.Dataset), \ + f'{image} is not a image' + self.width = self.dataset.RasterXSize + self.height = self.dataset.RasterYSize + self.channel = self.dataset.RasterCount + self.trans = self.dataset.GetGeoTransform() + self.proj = self.dataset.GetProjection() + self.band_list = [] + self.band_list.extend( + self.dataset.GetRasterBand(c + 1) for c in range(self.channel)) + self.grids = [] + + def read(self, grid: Optional[List] = None) -> np.ndarray: + """Read image data. If grid is None, read the whole image. + + Args: + grid (Optional[List], optional): Grid to read. Defaults to None. + Returns: + np.ndarray: Image data. + """ + if grid is None: + return np.einsum('ijk->jki', self.dataset.ReadAsArray()) + assert len( + grid) >= 4, 'grid must be a list containing at least 4 elements' + data = self.dataset.ReadAsArray(*grid[:4]) + if data.ndim == 2: + data = data[np.newaxis, ...] + return np.einsum('ijk->jki', data) + + def write(self, data: Optional[np.ndarray], grid: Optional[List] = None): + """Write image data. + + Args: + grid (Optional[List], optional): Grid to write. Defaults to None. + data (Optional[np.ndarray], optional): Data to write. + Defaults to None. + + Raises: + ValueError: Either grid or data must be provided. + """ + if grid is not None: + assert len(grid) == 8, 'grid must be a list of 8 elements' + for band in self.band_list: + band.WriteArray( + data[grid[5]:grid[5] + grid[7], grid[4]:grid[4] + grid[6]], + grid[0] + grid[4], grid[1] + grid[5]) + elif data is not None: + for i in range(self.channel): + self.band_list[i].WriteArray(data[..., i]) + else: + raise ValueError('Either grid or data must be provided.') + + def create_seg_map(self, output_path: Optional[str] = None): + if output_path is None: + output_path = 'output_label.tif' + driver = gdal.GetDriverByName('GTiff') + seg_map = driver.Create(output_path, self.width, self.height, 1, + gdal.GDT_Byte) + seg_map.SetGeoTransform(self.trans) + seg_map.SetProjection(self.proj) + seg_map_img = RSImage(seg_map) + seg_map_img.path = output_path + return seg_map_img + + def create_grids(self, + window_size: Tuple[int, int], + stride: Tuple[int, int] = (0, 0)): + """Create grids for image inference. + + Args: + window_size (Tuple[int, int]): the size of the sliding window. + stride (Tuple[int, int], optional): the stride of the sliding + window. Defaults to (0, 0). + + Raises: + AssertionError: window_size must be a tuple of 2 elements. 
+ AssertionError: stride must be a tuple of 2 elements. + """ + assert len( + window_size) == 2, 'window_size must be a tuple of 2 elements' + assert len(stride) == 2, 'stride must be a tuple of 2 elements' + win_w, win_h = window_size + stride_x, stride_y = stride + + stride_x = win_w if stride_x == 0 else stride_x + stride_y = win_h if stride_y == 0 else stride_y + + x_half_overlap = (win_w - stride_x + 1) // 2 + y_half_overlap = (win_h - stride_y + 1) // 2 + + for y in range(0, self.height, stride_y): + y_end = y + win_h >= self.height + y_offset = self.height - win_h if y_end else y + y_size = win_h + y_crop_off = 0 if y_offset == 0 else y_half_overlap + y_crop_size = y_size if y_end else win_h - y_crop_off + + for x in range(0, self.width, stride_x): + x_end = x + win_w >= self.width + x_offset = self.width - win_w if x_end else x + x_size = win_w + x_crop_off = 0 if x_offset == 0 else x_half_overlap + x_crop_size = x_size if x_end else win_w - x_crop_off + + self.grids.append([ + x_offset, y_offset, x_size, y_size, x_crop_off, y_crop_off, + x_crop_size, y_crop_size + ]) + + +class RSInferencer: + """Remote sensing inference class. + + Args: + model (BaseModel): The loaded model. + batch_size (int, optional): Batch size. Defaults to 1. + thread (int, optional): Number of threads. Defaults to 1. + """ + + def __init__(self, model: BaseModel, batch_size: int = 1, thread: int = 1): + self.model = model + self.batch_size = batch_size + self.END_FLAG = object() + self.read_buffer = Queue(self.batch_size) + self.write_buffer = Queue(self.batch_size) + self.thread = thread + + @classmethod + def from_config_path(cls, + config_path: str, + checkpoint_path: str, + batch_size: int = 1, + thread: int = 1, + device: Optional[str] = 'cpu'): + """Initialize a segmentor from config file. + + Args: + config_path (str): Config file path. + checkpoint_path (str): Checkpoint path. + batch_size (int, optional): Batch size. Defaults to 1. + """ + init_default_scope('mmseg') + cfg = Config.fromfile(config_path) + model = MODELS.build(cfg.model) + model.cfg = cfg + load_checkpoint(model, checkpoint_path, map_location='cpu') + model.to(device) + model.eval() + return cls(model, batch_size, thread) + + @classmethod + def from_model(cls, + model: BaseModel, + checkpoint_path: Optional[str] = None, + batch_size: int = 1, + thread: int = 1, + device: Optional[str] = 'cpu'): + """Initialize a segmentor from model. + + Args: + model (BaseModel): The loaded model. + checkpoint_path (Optional[str]): Checkpoint path. + batch_size (int, optional): Batch size. Defaults to 1. + """ + if checkpoint_path is not None: + load_checkpoint(model, checkpoint_path, map_location='cpu') + model.to(device) + return cls(model, batch_size, thread) + + def read(self, + image: RSImage, + window_size: Tuple[int, int], + strides: Tuple[int, int] = (0, 0)): + """Load image data to read buffer. + + Args: + image (RSImage): The image to read. + window_size (Tuple[int, int]): The size of the sliding window. + strides (Tuple[int, int], optional): The stride of the sliding + window. Defaults to (0, 0). 
+ """ + image.create_grids(window_size, strides) + for grid in image.grids: + self.read_buffer.put([grid, image.read(grid=grid)]) + self.read_buffer.put(self.END_FLAG) + + def inference(self): + """Inference image data from read buffer and put the result to write + buffer.""" + while True: + item = self.read_buffer.get() + if item == self.END_FLAG: + self.read_buffer.put(self.END_FLAG) + self.write_buffer.put(item) + break + data, _ = _preprare_data(item[1], self.model) + with torch.no_grad(): + result = self.model.test_step(data) + item[1] = result[0].pred_sem_seg.cpu().data.numpy()[0] + self.write_buffer.put(item) + self.read_buffer.task_done() + + def write(self, image: RSImage, output_path: Optional[str] = None): + """Write image data from write buffer. + + Args: + image (RSImage): The image to write. + output_path (Optional[str], optional): The path to save the + segmentation map. Defaults to None. + """ + seg_map = image.create_seg_map(output_path) + while True: + item = self.write_buffer.get() + if item == self.END_FLAG: + break + seg_map.write(data=item[1], grid=item[0]) + self.write_buffer.task_done() + + def run(self, + image: RSImage, + window_size: Tuple[int, int], + strides: Tuple[int, int] = (0, 0), + output_path: Optional[str] = None): + """Run inference with multi-threading. + + Args: + image (RSImage): The image to inference. + window_size (Tuple[int, int]): The size of the sliding window. + strides (Tuple[int, int], optional): The stride of the sliding + window. Defaults to (0, 0). + output_path (Optional[str], optional): The path to save the + segmentation map. Defaults to None. + """ + read_thread = threading.Thread( + target=self.read, args=(image, window_size, strides)) + read_thread.start() + inference_threads = [] + for _ in range(self.thread): + inference_thread = threading.Thread(target=self.inference) + inference_thread.start() + inference_threads.append(inference_thread) + write_thread = threading.Thread( + target=self.write, args=(image, output_path)) + write_thread.start() + read_thread.join() + for inference_thread in inference_threads: + inference_thread.join() + write_thread.join() diff --git a/mmseg/apis/utils.py b/mmseg/apis/utils.py new file mode 100644 index 0000000000..4cf8775660 --- /dev/null +++ b/mmseg/apis/utils.py @@ -0,0 +1,41 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from collections import defaultdict +from typing import Sequence, Union + +import numpy as np +from mmengine.dataset import Compose +from mmengine.model import BaseModel + +ImageType = Union[str, np.ndarray, Sequence[str], Sequence[np.ndarray]] + + +def _preprare_data(imgs: ImageType, model: BaseModel): + + cfg = model.cfg + for t in cfg.test_pipeline: + if t.get('type') == 'LoadAnnotations': + cfg.test_pipeline.remove(t) + + is_batch = True + if not isinstance(imgs, (list, tuple)): + imgs = [imgs] + is_batch = False + + if isinstance(imgs[0], np.ndarray): + cfg.test_pipeline[0]['type'] = 'LoadImageFromNDArray' + + # TODO: Consider using the singleton pattern to avoid building + # a pipeline for each inference + pipeline = Compose(cfg.test_pipeline) + + data = defaultdict(list) + for img in imgs: + if isinstance(img, np.ndarray): + data_ = dict(img=img) + else: + data_ = dict(img_path=img) + data_ = pipeline(data_) + data['inputs'].append(data_['inputs']) + data['data_samples'].append(data_['data_samples']) + + return data, is_batch diff --git a/mmseg/datasets/__init__.py b/mmseg/datasets/__init__.py index 626f0e3982..d29bcc1126 100644 --- a/mmseg/datasets/__init__.py +++ b/mmseg/datasets/__init__.py @@ -1,30 +1,66 @@ # Copyright (c) OpenMMLab. All rights reserved. -from mmengine.dataset import ConcatDataset, RepeatDataset - -from mmseg.registry import DATASETS, TRANSFORMS +# yapf: disable from .ade import ADE20KDataset +from .basesegdataset import BaseCDDataset, BaseSegDataset +from .bdd100k import BDD100KDataset from .chase_db1 import ChaseDB1Dataset from .cityscapes import CityscapesDataset from .coco_stuff import COCOStuffDataset -from .custom import CustomDataset from .dark_zurich import DarkZurichDataset from .dataset_wrappers import MultiImageMixDataset +from .decathlon import DecathlonDataset from .drive import DRIVEDataset +from .dsdl import DSDLSegDataset from .hrf import HRFDataset from .isaid import iSAIDDataset from .isprs import ISPRSDataset +from .levir import LEVIRCDDataset +from .lip import LIPDataset from .loveda import LoveDADataset +from .mapillary import MapillaryDataset_v1, MapillaryDataset_v2 from .night_driving import NightDrivingDataset +from .nyu import NYUDataset from .pascal_context import PascalContextDataset, PascalContextDataset59 from .potsdam import PotsdamDataset +from .refuge import REFUGEDataset from .stare import STAREDataset +from .synapse import SynapseDataset +# yapf: disable +from .transforms import (CLAHE, AdjustGamma, Albu, BioMedical3DPad, + BioMedical3DRandomCrop, BioMedical3DRandomFlip, + BioMedicalGaussianBlur, BioMedicalGaussianNoise, + BioMedicalRandomGamma, ConcatCDInput, GenerateEdge, + LoadAnnotations, LoadBiomedicalAnnotation, + LoadBiomedicalData, LoadBiomedicalImageFromFile, + LoadImageFromNDArray, LoadMultipleRSImageFromFile, + LoadSingleRSImageFromFile, PackSegInputs, + PhotoMetricDistortion, RandomCrop, RandomCutOut, + RandomMosaic, RandomRotate, RandomRotFlip, Rerange, + ResizeShortestEdge, ResizeToMultiple, RGB2Gray, + SegRescale) from .voc import PascalVOCDataset +from .zero_mould_v1 import ZeroMouldV1Dataset +from .zero_mould_v2 import ZeroMouldV2Dataset +# yapf: enable __all__ = [ - 'CustomDataset', 'ConcatDataset', 'RepeatDataset', 'DATASETS', - 'TRANSFORMS', 'CityscapesDataset', 'PascalVOCDataset', 'ADE20KDataset', + 'BaseSegDataset', 'BioMedical3DRandomCrop', 'BioMedical3DRandomFlip', + 'CityscapesDataset', 'PascalVOCDataset', 'ADE20KDataset', 'PascalContextDataset', 'PascalContextDataset59', 'ChaseDB1Dataset', 
     'DRIVEDataset', 'HRFDataset', 'STAREDataset', 'DarkZurichDataset',
     'NightDrivingDataset', 'COCOStuffDataset', 'LoveDADataset',
-    'MultiImageMixDataset', 'iSAIDDataset', 'ISPRSDataset', 'PotsdamDataset'
+    'MultiImageMixDataset', 'iSAIDDataset', 'ISPRSDataset', 'PotsdamDataset',
+    'LoadAnnotations', 'RandomCrop', 'SegRescale', 'PhotoMetricDistortion',
+    'RandomRotate', 'AdjustGamma', 'CLAHE', 'Rerange', 'RGB2Gray',
+    'RandomCutOut', 'RandomMosaic', 'PackSegInputs', 'ResizeToMultiple',
+    'LoadImageFromNDArray', 'LoadBiomedicalImageFromFile',
+    'LoadBiomedicalAnnotation', 'LoadBiomedicalData', 'GenerateEdge',
+    'DecathlonDataset', 'LIPDataset', 'ResizeShortestEdge',
+    'BioMedicalGaussianNoise', 'BioMedicalGaussianBlur',
+    'BioMedicalRandomGamma', 'BioMedical3DPad', 'RandomRotFlip',
+    'SynapseDataset', 'REFUGEDataset', 'MapillaryDataset_v1',
+    'MapillaryDataset_v2', 'Albu', 'LEVIRCDDataset',
+    'LoadMultipleRSImageFromFile', 'LoadSingleRSImageFromFile',
+    'ConcatCDInput', 'BaseCDDataset', 'DSDLSegDataset', 'BDD100KDataset',
+    'NYUDataset'
 ]
diff --git a/mmseg/datasets/ade.py b/mmseg/datasets/ade.py
index 740bd1dd2e..e9bdae7421 100644
--- a/mmseg/datasets/ade.py
+++ b/mmseg/datasets/ade.py
@@ -1,10 +1,10 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from mmseg.registry import DATASETS
-from .custom import CustomDataset
+from .basesegdataset import BaseSegDataset
 
 
 @DATASETS.register_module()
-class ADE20KDataset(CustomDataset):
+class ADE20KDataset(BaseSegDataset):
     """ADE20K dataset.
 
     In segmentation map annotation for ADE20K, 0 stands for background, which
@@ -80,9 +80,13 @@ class ADE20KDataset(CustomDataset):
                  [184, 255, 0], [0, 133, 255], [255, 214, 0], [25, 194, 194],
                  [102, 255, 0], [92, 0, 255]])
 
-    def __init__(self, **kwargs) -> None:
+    def __init__(self,
+                 img_suffix='.jpg',
+                 seg_map_suffix='.png',
+                 reduce_zero_label=True,
+                 **kwargs) -> None:
         super().__init__(
-            img_suffix='.jpg',
-            seg_map_suffix='.png',
-            reduce_zero_label=True,
+            img_suffix=img_suffix,
+            seg_map_suffix=seg_map_suffix,
+            reduce_zero_label=reduce_zero_label,
             **kwargs)
diff --git a/mmseg/datasets/basesegdataset.py b/mmseg/datasets/basesegdataset.py
new file mode 100644
index 0000000000..9c4668c1f5
--- /dev/null
+++ b/mmseg/datasets/basesegdataset.py
@@ -0,0 +1,552 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import os.path as osp
+from typing import Callable, Dict, List, Optional, Sequence, Union
+
+import mmengine
+import mmengine.fileio as fileio
+import numpy as np
+from mmengine.dataset import BaseDataset, Compose
+
+from mmseg.registry import DATASETS
+
+
+@DATASETS.register_module()
+class BaseSegDataset(BaseDataset):
+    """Custom dataset for semantic segmentation. An example of file structure
+    is as follows.
+
+    .. code-block:: none
+
+        ├── data
+        │   ├── my_dataset
+        │   │   ├── img_dir
+        │   │   │   ├── train
+        │   │   │   │   ├── xxx{img_suffix}
+        │   │   │   │   ├── yyy{img_suffix}
+        │   │   │   │   ├── zzz{img_suffix}
+        │   │   │   ├── val
+        │   │   ├── ann_dir
+        │   │   │   ├── train
+        │   │   │   │   ├── xxx{seg_map_suffix}
+        │   │   │   │   ├── yyy{seg_map_suffix}
+        │   │   │   │   ├── zzz{seg_map_suffix}
+        │   │   │   ├── val
+
+    An img/gt_semantic_seg pair of BaseSegDataset should share the same stem
+    and differ only in suffix. A valid img/gt_semantic_seg filename pair
+    should be like ``xxx{img_suffix}`` and ``xxx{seg_map_suffix}`` (the
+    extension is also included in the suffix). If split is given, then
+    ``xxx`` is specified in the txt file. Otherwise, all files in
+    ``img_dir/`` and ``ann_dir`` will be loaded.
+    Please refer to ``docs/en/tutorials/new_dataset.md`` for more details.
+
+
+    Args:
+        ann_file (str): Annotation file path. Defaults to ''.
+        metainfo (dict, optional): Meta information for the dataset, such as
+            the classes to load. Defaults to None.
+        data_root (str, optional): The root directory for ``data_prefix`` and
+            ``ann_file``. Defaults to None.
+        data_prefix (dict, optional): Prefix for training data. Defaults to
+            dict(img_path='', seg_map_path='').
+        img_suffix (str): Suffix of images. Default: '.jpg'
+        seg_map_suffix (str): Suffix of segmentation maps. Default: '.png'
+        filter_cfg (dict, optional): Config for filtering data. Defaults to
+            None.
+        indices (int or Sequence[int], optional): Support using only the
+            first few samples of the annotation file to facilitate
+            training/testing on a smaller dataset. Defaults to None, which
+            means using all ``data_infos``.
+        serialize_data (bool, optional): Whether to hold memory using
+            serialized objects; when enabled, data loader workers can use
+            shared RAM from the master process instead of making a copy.
+            Defaults to True.
+        pipeline (list, optional): Processing pipeline. Defaults to [].
+        test_mode (bool, optional): ``test_mode=True`` means in test phase.
+            Defaults to False.
+        lazy_init (bool, optional): Whether to load annotations during
+            instantiation. In some cases, such as visualization, only the
+            meta information of the dataset is needed, so the annotation
+            file does not have to be loaded. ``BaseDataset`` can skip loading
+            annotations to save time by setting ``lazy_init=True``.
+            Defaults to False.
+        max_refetch (int, optional): The maximum number of extra cycles to
+            get a valid image if ``BaseDataset.prepare_data`` returns a None
+            img. Defaults to 1000.
+        ignore_index (int): The label index to be ignored. Default: 255
+        reduce_zero_label (bool): Whether to mark label zero as ignored.
+            Defaults to False.
+        backend_args (dict, optional): Arguments to instantiate a file
+            backend. See
+            https://mmengine.readthedocs.io/en/latest/api/fileio.html
+            for details. Defaults to None.
+            Notes: mmcv>=2.0.0rc4, mmengine>=0.2.0 required.
+    """
+    METAINFO: dict = dict()
+
+    def __init__(self,
+                 ann_file: str = '',
+                 img_suffix='.jpg',
+                 seg_map_suffix='.png',
+                 metainfo: Optional[dict] = None,
+                 data_root: Optional[str] = None,
+                 data_prefix: dict = dict(img_path='', seg_map_path=''),
+                 filter_cfg: Optional[dict] = None,
+                 indices: Optional[Union[int, Sequence[int]]] = None,
+                 serialize_data: bool = True,
+                 pipeline: List[Union[dict, Callable]] = [],
+                 test_mode: bool = False,
+                 lazy_init: bool = False,
+                 max_refetch: int = 1000,
+                 ignore_index: int = 255,
+                 reduce_zero_label: bool = False,
+                 backend_args: Optional[dict] = None) -> None:
+
+        self.img_suffix = img_suffix
+        self.seg_map_suffix = seg_map_suffix
+        self.ignore_index = ignore_index
+        self.reduce_zero_label = reduce_zero_label
+        self.backend_args = backend_args.copy() if backend_args else None
+
+        self.data_root = data_root
+        self.data_prefix = copy.copy(data_prefix)
+        self.ann_file = ann_file
+        self.filter_cfg = copy.deepcopy(filter_cfg)
+        self._indices = indices
+        self.serialize_data = serialize_data
+        self.test_mode = test_mode
+        self.max_refetch = max_refetch
+        self.data_list: List[dict] = []
+        self.data_bytes: np.ndarray
+
+        # Set meta information.
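Taken together, these arguments mean a dataset is usually declared in a few lines. A hypothetical instantiation (the paths, classes and empty pipeline below are invented for illustration):

.. code-block:: python

    from mmseg.datasets import BaseSegDataset

    dataset = BaseSegDataset(
        data_root='data/my_dataset',  # invented path
        data_prefix=dict(img_path='img_dir/train',
                         seg_map_path='ann_dir/train'),
        img_suffix='.jpg',
        seg_map_suffix='.png',
        metainfo=dict(classes=('background', 'foreground')),
        pipeline=[],     # transforms omitted in this sketch
        lazy_init=True)  # skip scanning files until full_init() is called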
+        self._metainfo = self._load_metainfo(copy.deepcopy(metainfo))
+
+        # Get label map for custom classes
+        new_classes = self._metainfo.get('classes', None)
+        self.label_map = self.get_label_map(new_classes)
+        self._metainfo.update(
+            dict(
+                label_map=self.label_map,
+                reduce_zero_label=self.reduce_zero_label))
+
+        # Update palette based on label map or generate palette
+        # if it is not defined
+        updated_palette = self._update_palette()
+        self._metainfo.update(dict(palette=updated_palette))
+
+        # Join paths.
+        if self.data_root is not None:
+            self._join_prefix()
+
+        # Build pipeline.
+        self.pipeline = Compose(pipeline)
+        # Fully initialize the dataset.
+        if not lazy_init:
+            self.full_init()
+
+        if test_mode:
+            assert self._metainfo.get('classes') is not None, \
+                'dataset metainfo `classes` should be specified when testing'
+
+    @classmethod
+    def get_label_map(cls,
+                      new_classes: Optional[Sequence] = None
+                      ) -> Union[Dict, None]:
+        """Get the label mapping.
+
+        The ``label_map`` is a dictionary whose keys are the old label ids
+        and whose values are the new label ids; it is used for changing
+        pixel labels in load_annotations. ``label_map`` is not None if and
+        only if the old classes in cls.METAINFO differ from the new classes
+        in self._metainfo and neither of them is None.
+
+        Args:
+            new_classes (list, tuple, optional): The new class names from
+                metainfo. Defaults to None.
+
+        Returns:
+            dict, optional: The mapping from old classes in cls.METAINFO to
+                new classes in self._metainfo
+        """
+        old_classes = cls.METAINFO.get('classes', None)
+        if (new_classes is not None and old_classes is not None
+                and list(new_classes) != list(old_classes)):
+
+            label_map = {}
+            if not set(new_classes).issubset(cls.METAINFO['classes']):
+                raise ValueError(
+                    f'new classes {new_classes} is not a '
+                    f'subset of classes {old_classes} in METAINFO.')
+            for i, c in enumerate(old_classes):
+                if c not in new_classes:
+                    label_map[i] = 255
+                else:
+                    label_map[i] = new_classes.index(c)
+            return label_map
+        else:
+            return None
+
+    def _update_palette(self) -> list:
+        """Update palette after loading metainfo.
+
+        If the length of the palette equals the number of classes, just
+        return the palette. If the palette is not defined, generate a random
+        one. If the classes are customized, return the matching subset of
+        the palette.
+
+        Returns:
+            Sequence: Palette for current dataset.
+        """
+        palette = self._metainfo.get('palette', [])
+        classes = self._metainfo.get('classes', [])
+        # palette matches classes
+        if len(palette) == len(classes):
+            return palette
+
+        if len(palette) == 0:
+            # Get random state before set seed, and restore
+            # random state later.
+            # It will prevent loss of randomness, as the palette
+            # may be different in each iteration if not specified.
+            # See: https://github.com/open-mmlab/mmdetection/issues/5844
+            state = np.random.get_state()
+            np.random.seed(42)
+            # random palette
+            new_palette = np.random.randint(
+                0, 255, size=(len(classes), 3)).tolist()
+            np.random.set_state(state)
+        elif len(palette) >= len(classes) and self.label_map is not None:
+            new_palette = []
+            # return subset of palette
+            for old_id, new_id in sorted(
+                    self.label_map.items(), key=lambda x: x[1]):
+                if new_id != 255:
+                    new_palette.append(palette[old_id])
+            new_palette = type(palette)(new_palette)
+        else:
+            raise ValueError('palette does not match classes '
+                             f'as metainfo is {self._metainfo}.')
+        return new_palette
+
+    def load_data_list(self) -> List[dict]:
+        """Load annotation from directory or annotation file.
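To make the two helpers above concrete, here is a small standalone replication of the mapping rule and the palette sub-selection (class names and colors are invented):

.. code-block:: python

    old_classes = ('sky', 'road', 'car', 'person')  # cls.METAINFO['classes']
    new_classes = ('road', 'person')                # customized subset

    label_map = {}
    for i, c in enumerate(old_classes):
        # classes dropped from the customized list map to the ignore index
        label_map[i] = new_classes.index(c) if c in new_classes else 255
    print(label_map)  # {0: 255, 1: 0, 2: 255, 3: 1}

    palette = [[70, 130, 180], [128, 64, 128], [0, 0, 142], [220, 20, 60]]
    # keep only the colors of surviving classes, ordered by their new ids
    new_palette = [
        palette[old_id]
        for old_id, new_id in sorted(label_map.items(), key=lambda x: x[1])
        if new_id != 255
    ]
    print(new_palette)  # [[128, 64, 128], [220, 20, 60]]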
+
+        Returns:
+            list[dict]: All data info of dataset.
+        """
+        data_list = []
+        img_dir = self.data_prefix.get('img_path', None)
+        ann_dir = self.data_prefix.get('seg_map_path', None)
+        if not osp.isdir(self.ann_file) and self.ann_file:
+            assert osp.isfile(self.ann_file), \
+                f'Failed to load `ann_file` {self.ann_file}'
+            lines = mmengine.list_from_file(
+                self.ann_file, backend_args=self.backend_args)
+            for line in lines:
+                img_name = line.strip()
+                data_info = dict(
+                    img_path=osp.join(img_dir, img_name + self.img_suffix))
+                if ann_dir is not None:
+                    seg_map = img_name + self.seg_map_suffix
+                    data_info['seg_map_path'] = osp.join(ann_dir, seg_map)
+                data_info['label_map'] = self.label_map
+                data_info['reduce_zero_label'] = self.reduce_zero_label
+                data_info['seg_fields'] = []
+                data_list.append(data_info)
+        else:
+            _suffix_len = len(self.img_suffix)
+            for img in fileio.list_dir_or_file(
+                    dir_path=img_dir,
+                    list_dir=False,
+                    suffix=self.img_suffix,
+                    recursive=True,
+                    backend_args=self.backend_args):
+                data_info = dict(img_path=osp.join(img_dir, img))
+                if ann_dir is not None:
+                    seg_map = img[:-_suffix_len] + self.seg_map_suffix
+                    data_info['seg_map_path'] = osp.join(ann_dir, seg_map)
+                data_info['label_map'] = self.label_map
+                data_info['reduce_zero_label'] = self.reduce_zero_label
+                data_info['seg_fields'] = []
+                data_list.append(data_info)
+        data_list = sorted(data_list, key=lambda x: x['img_path'])
+        return data_list
+
+
+@DATASETS.register_module()
+class BaseCDDataset(BaseDataset):
+    """Custom dataset for change detection. An example of file structure is
+    as follows.
+
+    .. code-block:: none
+
+        ├── data
+        │   ├── my_dataset
+        │   │   ├── img_dir
+        │   │   │   ├── train
+        │   │   │   │   ├── xxx{img_suffix}
+        │   │   │   │   ├── yyy{img_suffix}
+        │   │   │   │   ├── zzz{img_suffix}
+        │   │   │   ├── val
+        │   │   ├── img_dir2
+        │   │   │   ├── train
+        │   │   │   │   ├── xxx{img_suffix}
+        │   │   │   │   ├── yyy{img_suffix}
+        │   │   │   │   ├── zzz{img_suffix}
+        │   │   │   ├── val
+        │   │   ├── ann_dir
+        │   │   │   ├── train
+        │   │   │   │   ├── xxx{seg_map_suffix}
+        │   │   │   │   ├── yyy{seg_map_suffix}
+        │   │   │   │   ├── zzz{seg_map_suffix}
+        │   │   │   ├── val
+
+    The image names in img_dir and img_dir2 should be consistent.
+    An img/gt_semantic_seg pair of BaseCDDataset should share the same stem
+    and differ only in suffix. A valid img/gt_semantic_seg filename pair
+    should be like ``xxx{img_suffix}`` and ``xxx{seg_map_suffix}`` (the
+    extension is also included in the suffix). If split is given, then
+    ``xxx`` is specified in the txt file. Otherwise, all files in
+    ``img_dir/`` and ``ann_dir`` will be loaded.
+    Please refer to ``docs/en/tutorials/new_dataset.md`` for more details.
+
+
+    Args:
+        ann_file (str): Annotation file path. Defaults to ''.
+        metainfo (dict, optional): Meta information for the dataset, such as
+            the classes to load. Defaults to None.
+        data_root (str, optional): The root directory for ``data_prefix`` and
+            ``ann_file``. Defaults to None.
+        data_prefix (dict, optional): Prefix for training data. Defaults to
+            dict(img_path='', img_path2='', seg_map_path='').
+        img_suffix (str): Suffix of the first temporal images.
+            Default: '.jpg'
+        img_suffix2 (str): Suffix of the second temporal images.
+            Default: '.jpg'
+        seg_map_suffix (str): Suffix of segmentation maps. Default: '.png'
+        filter_cfg (dict, optional): Config for filtering data. Defaults to
+            None.
+        indices (int or Sequence[int], optional): Support using only the
+            first few samples of the annotation file to facilitate
+            training/testing on a smaller dataset. Defaults to None, which
+            means using all ``data_infos``.
+        serialize_data (bool, optional): Whether to hold memory using
+            serialized objects; when enabled, data loader workers can use
+            shared RAM from the master process instead of making a copy.
+            Defaults to True.
+        pipeline (list, optional): Processing pipeline. Defaults to [].
+        test_mode (bool, optional): ``test_mode=True`` means in test phase.
+            Defaults to False.
+        lazy_init (bool, optional): Whether to load annotations during
+            instantiation. In some cases, such as visualization, only the
+            meta information of the dataset is needed, so the annotation
+            file does not have to be loaded. ``BaseDataset`` can skip loading
+            annotations to save time by setting ``lazy_init=True``.
+            Defaults to False.
+        max_refetch (int, optional): The maximum number of extra cycles to
+            get a valid image if ``BaseDataset.prepare_data`` returns a None
+            img. Defaults to 1000.
+        ignore_index (int): The label index to be ignored. Default: 255
+        reduce_zero_label (bool): Whether to mark label zero as ignored.
+            Defaults to False.
+        backend_args (dict, optional): Arguments to instantiate a file
+            backend. See
+            https://mmengine.readthedocs.io/en/latest/api/fileio.html
+            for details. Defaults to None.
+            Notes: mmcv>=2.0.0rc4, mmengine>=0.2.0 required.
+    """
+    METAINFO: dict = dict()
+
+    def __init__(self,
+                 ann_file: str = '',
+                 img_suffix='.jpg',
+                 img_suffix2='.jpg',
+                 seg_map_suffix='.png',
+                 metainfo: Optional[dict] = None,
+                 data_root: Optional[str] = None,
+                 data_prefix: dict = dict(
+                     img_path='', img_path2='', seg_map_path=''),
+                 filter_cfg: Optional[dict] = None,
+                 indices: Optional[Union[int, Sequence[int]]] = None,
+                 serialize_data: bool = True,
+                 pipeline: List[Union[dict, Callable]] = [],
+                 test_mode: bool = False,
+                 lazy_init: bool = False,
+                 max_refetch: int = 1000,
+                 ignore_index: int = 255,
+                 reduce_zero_label: bool = False,
+                 backend_args: Optional[dict] = None) -> None:
+
+        self.img_suffix = img_suffix
+        self.img_suffix2 = img_suffix2
+        self.seg_map_suffix = seg_map_suffix
+        self.ignore_index = ignore_index
+        self.reduce_zero_label = reduce_zero_label
+        self.backend_args = backend_args.copy() if backend_args else None
+
+        self.data_root = data_root
+        self.data_prefix = copy.copy(data_prefix)
+        self.ann_file = ann_file
+        self.filter_cfg = copy.deepcopy(filter_cfg)
+        self._indices = indices
+        self.serialize_data = serialize_data
+        self.test_mode = test_mode
+        self.max_refetch = max_refetch
+        self.data_list: List[dict] = []
+        self.data_bytes: np.ndarray
+
+        # Set meta information.
+        self._metainfo = self._load_metainfo(copy.deepcopy(metainfo))
+
+        # Get label map for custom classes
+        new_classes = self._metainfo.get('classes', None)
+        self.label_map = self.get_label_map(new_classes)
+        self._metainfo.update(
+            dict(
+                label_map=self.label_map,
+                reduce_zero_label=self.reduce_zero_label))
+
+        # Update palette based on label map or generate palette
+        # if it is not defined
+        updated_palette = self._update_palette()
+        self._metainfo.update(dict(palette=updated_palette))
+
+        # Join paths.
+        if self.data_root is not None:
+            self._join_prefix()
+
+        # Build pipeline.
+        self.pipeline = Compose(pipeline)
+        # Fully initialize the dataset.
+        if not lazy_init:
+            self.full_init()
+
+        if test_mode:
+            assert self._metainfo.get('classes') is not None, \
+                'dataset metainfo `classes` should be specified when testing'
+
+    @classmethod
+    def get_label_map(cls,
+                      new_classes: Optional[Sequence] = None
+                      ) -> Union[Dict, None]:
+        """Get the label mapping.
+
+        The ``label_map`` is a dictionary whose keys are the old label ids
+        and whose values are the new label ids; it is used for changing
+        pixel labels in load_annotations. ``label_map`` is not None if and
+        only if the old classes in cls.METAINFO differ from the new classes
+        in self._metainfo and neither of them is None.
+
+        Args:
+            new_classes (list, tuple, optional): The new class names from
+                metainfo. Defaults to None.
+
+        Returns:
+            dict, optional: The mapping from old classes in cls.METAINFO to
+                new classes in self._metainfo
+        """
+        old_classes = cls.METAINFO.get('classes', None)
+        if (new_classes is not None and old_classes is not None
+                and list(new_classes) != list(old_classes)):
+
+            label_map = {}
+            if not set(new_classes).issubset(cls.METAINFO['classes']):
+                raise ValueError(
+                    f'new classes {new_classes} is not a '
+                    f'subset of classes {old_classes} in METAINFO.')
+            for i, c in enumerate(old_classes):
+                if c not in new_classes:
+                    label_map[i] = 255
+                else:
+                    label_map[i] = new_classes.index(c)
+            return label_map
+        else:
+            return None
+
+    def _update_palette(self) -> list:
+        """Update palette after loading metainfo.
+
+        If the length of the palette equals the number of classes, just
+        return the palette. If the palette is not defined, generate a random
+        one. If the classes are customized, return the matching subset of
+        the palette.
+
+        Returns:
+            Sequence: Palette for current dataset.
+        """
+        palette = self._metainfo.get('palette', [])
+        classes = self._metainfo.get('classes', [])
+        # palette matches classes
+        if len(palette) == len(classes):
+            return palette
+
+        if len(palette) == 0:
+            # Get random state before set seed, and restore
+            # random state later.
+            # It will prevent loss of randomness, as the palette
+            # may be different in each iteration if not specified.
+            # See: https://github.com/open-mmlab/mmdetection/issues/5844
+            state = np.random.get_state()
+            np.random.seed(42)
+            # random palette
+            new_palette = np.random.randint(
+                0, 255, size=(len(classes), 3)).tolist()
+            np.random.set_state(state)
+        elif len(palette) >= len(classes) and self.label_map is not None:
+            new_palette = []
+            # return subset of palette
+            for old_id, new_id in sorted(
+                    self.label_map.items(), key=lambda x: x[1]):
+                if new_id != 255:
+                    new_palette.append(palette[old_id])
+            new_palette = type(palette)(new_palette)
+        else:
+            raise ValueError('palette does not match classes '
+                             f'as metainfo is {self._metainfo}.')
+        return new_palette
+
+    def load_data_list(self) -> List[dict]:
+        """Load annotation from directory or annotation file.
+
+        Returns:
+            list[dict]: All data info of dataset.
+        """
+        data_list = []
+        img_dir = self.data_prefix.get('img_path', None)
+        img_dir2 = self.data_prefix.get('img_path2', None)
+        ann_dir = self.data_prefix.get('seg_map_path', None)
+        if osp.isfile(self.ann_file):
+            lines = mmengine.list_from_file(
+                self.ann_file, backend_args=self.backend_args)
+            for line in lines:
+                img_name = line.strip()
+                if '.'
in osp.basename(img_name): + img_name, img_ext = osp.splitext(img_name) + self.img_suffix = img_ext + self.img_suffix2 = img_ext + data_info = dict( + img_path=osp.join(img_dir, img_name + self.img_suffix), + img_path2=osp.join(img_dir2, img_name + self.img_suffix2)) + + if ann_dir is not None: + seg_map = img_name + self.seg_map_suffix + data_info['seg_map_path'] = osp.join(ann_dir, seg_map) + data_info['label_map'] = self.label_map + data_info['reduce_zero_label'] = self.reduce_zero_label + data_info['seg_fields'] = [] + data_list.append(data_info) + else: + for img in fileio.list_dir_or_file( + dir_path=img_dir, + list_dir=False, + suffix=self.img_suffix, + recursive=True, + backend_args=self.backend_args): + if '.' in osp.basename(img): + img, img_ext = osp.splitext(img) + self.img_suffix = img_ext + self.img_suffix2 = img_ext + data_info = dict( + img_path=osp.join(img_dir, img + self.img_suffix), + img_path2=osp.join(img_dir2, img + self.img_suffix2)) + if ann_dir is not None: + seg_map = img + self.seg_map_suffix + data_info['seg_map_path'] = osp.join(ann_dir, seg_map) + data_info['label_map'] = self.label_map + data_info['reduce_zero_label'] = self.reduce_zero_label + data_info['seg_fields'] = [] + data_list.append(data_info) + data_list = sorted(data_list, key=lambda x: x['img_path']) + return data_list diff --git a/mmseg/datasets/bdd100k.py b/mmseg/datasets/bdd100k.py new file mode 100644 index 0000000000..8ae70b5cb2 --- /dev/null +++ b/mmseg/datasets/bdd100k.py @@ -0,0 +1,30 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from mmseg.datasets.basesegdataset import BaseSegDataset +from mmseg.registry import DATASETS + + +@DATASETS.register_module() +class BDD100KDataset(BaseSegDataset): + METAINFO = dict( + classes=('road', 'sidewalk', 'building', 'wall', 'fence', 'pole', + 'traffic light', 'traffic sign', 'vegetation', 'terrain', + 'sky', 'person', 'rider', 'car', 'truck', 'bus', 'train', + 'motorcycle', 'bicycle'), + palette=[[128, 64, 128], [244, 35, 232], [70, 70, 70], [102, 102, 156], + [190, 153, 153], [153, 153, 153], [250, 170, + 30], [220, 220, 0], + [107, 142, 35], [152, 251, 152], [70, 130, 180], + [220, 20, 60], [255, 0, 0], [0, 0, 142], [0, 0, 70], + [0, 60, 100], [0, 80, 100], [0, 0, 230], [119, 11, 32]]) + + def __init__(self, + img_suffix='.jpg', + seg_map_suffix='.png', + reduce_zero_label=False, + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, + **kwargs) diff --git a/mmseg/datasets/chase_db1.py b/mmseg/datasets/chase_db1.py index 5ef2c3cb60..626ddf75e9 100644 --- a/mmseg/datasets/chase_db1.py +++ b/mmseg/datasets/chase_db1.py @@ -1,11 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. +import mmengine.fileio as fileio from mmseg.registry import DATASETS -from .custom import CustomDataset +from .basesegdataset import BaseSegDataset @DATASETS.register_module() -class ChaseDB1Dataset(CustomDataset): +class ChaseDB1Dataset(BaseSegDataset): """Chase_db1 dataset. 
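``BaseCDDataset`` differs from ``BaseSegDataset`` mainly in carrying a second image root, so each data_info pairs ``img_path`` with an ``img_path2`` that shares the same stem. A hypothetical change-detection setup (directory and class names invented):

.. code-block:: python

    from mmseg.datasets import BaseCDDataset

    cd_dataset = BaseCDDataset(
        data_root='data/my_cd_dataset',  # invented path
        data_prefix=dict(
            img_path='img_dir/train',    # first acquisition
            img_path2='img_dir2/train',  # second acquisition, same file names
            seg_map_path='ann_dir/train'),
        img_suffix='.png',
        img_suffix2='.png',
        seg_map_suffix='.png',
        metainfo=dict(classes=('unchanged', 'changed')),
        pipeline=[],
        lazy_init=True)  # avoid scanning the (invented) directories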
In segmentation map annotation for Chase_db1, 0 stands for background, @@ -17,10 +18,15 @@ class ChaseDB1Dataset(CustomDataset): classes=('background', 'vessel'), palette=[[120, 120, 120], [6, 230, 230]]) - def __init__(self, **kwargs) -> None: + def __init__(self, + img_suffix='.png', + seg_map_suffix='_1stHO.png', + reduce_zero_label=False, + **kwargs) -> None: super().__init__( - img_suffix='.png', - seg_map_suffix='_1stHO.png', - reduce_zero_label=False, + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, **kwargs) - assert self.file_client.exists(self.data_prefix['img_list']) + assert fileio.exists( + self.data_prefix['img_path'], backend_args=self.backend_args) diff --git a/mmseg/datasets/cityscapes.py b/mmseg/datasets/cityscapes.py index c2caa8ccc3..f494d62424 100644 --- a/mmseg/datasets/cityscapes.py +++ b/mmseg/datasets/cityscapes.py @@ -1,10 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. from mmseg.registry import DATASETS -from .custom import CustomDataset +from .basesegdataset import BaseSegDataset @DATASETS.register_module() -class CityscapesDataset(CustomDataset): +class CityscapesDataset(BaseSegDataset): """Cityscapes dataset. The ``img_suffix`` is fixed to '_leftImg8bit.png' and ``seg_map_suffix`` is diff --git a/mmseg/datasets/coco_stuff.py b/mmseg/datasets/coco_stuff.py index b53408bd48..1e1574d970 100644 --- a/mmseg/datasets/coco_stuff.py +++ b/mmseg/datasets/coco_stuff.py @@ -1,10 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. from mmseg.registry import DATASETS -from .custom import CustomDataset +from .basesegdataset import BaseSegDataset @DATASETS.register_module() -class COCOStuffDataset(CustomDataset): +class COCOStuffDataset(BaseSegDataset): """COCO-Stuff dataset. In segmentation map annotation for COCO-Stuff, Train-IDs of the 10k version @@ -91,6 +91,9 @@ class COCOStuffDataset(CustomDataset): [192, 192, 0], [128, 64, 96], [192, 32, 64], [192, 64, 128], [64, 192, 96], [64, 160, 64], [64, 64, 0]]) - def __init__(self, **kwargs) -> None: + def __init__(self, + img_suffix='.jpg', + seg_map_suffix='_labelTrainIds.png', + **kwargs) -> None: super().__init__( - img_suffix='.jpg', seg_map_suffix='_labelTrainIds.png', **kwargs) + img_suffix=img_suffix, seg_map_suffix=seg_map_suffix, **kwargs) diff --git a/mmseg/datasets/custom.py b/mmseg/datasets/custom.py deleted file mode 100644 index e526b450b0..0000000000 --- a/mmseg/datasets/custom.py +++ /dev/null @@ -1,267 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy -import os.path as osp -from typing import Callable, Dict, List, Optional, Sequence, Union - -import mmcv -import numpy as np -from mmengine.dataset import BaseDataset, Compose - -from mmseg.registry import DATASETS - - -@DATASETS.register_module() -class CustomDataset(BaseDataset): - """Custom dataset for semantic segmentation. An example of file structure - is as followed. - - .. code-block:: none - - ├── data - │ ├── my_dataset - │ │ ├── img_dir - │ │ │ ├── train - │ │ │ │ ├── xxx{img_suffix} - │ │ │ │ ├── yyy{img_suffix} - │ │ │ │ ├── zzz{img_suffix} - │ │ │ ├── val - │ │ ├── ann_dir - │ │ │ ├── train - │ │ │ │ ├── xxx{seg_map_suffix} - │ │ │ │ ├── yyy{seg_map_suffix} - │ │ │ │ ├── zzz{seg_map_suffix} - │ │ │ ├── val - - The img/gt_semantic_seg pair of CustomDataset should be of the same - except suffix. A valid img/gt_semantic_seg filename pair should be like - ``xxx{img_suffix}`` and ``xxx{seg_map_suffix}`` (extension is also included - in the suffix). 
If split is given, then ``xxx`` is specified in txt file. - Otherwise, all files in ``img_dir/``and ``ann_dir`` will be loaded. - Please refer to ``docs/en/tutorials/new_dataset.md`` for more details. - - - Args: - ann_file (str): Annotation file path. Defaults to ''. - metainfo (dict, optional): Meta information for dataset, such as - specify classes to load. Defaults to None. - data_root (str, optional): The root directory for ``data_prefix`` and - ``ann_file``. Defaults to None. - data_prefix (dict, optional): Prefix for training data. Defaults to - dict(img_path=None, seg_path=None). - img_suffix (str): Suffix of images. Default: '.jpg' - seg_map_suffix (str): Suffix of segmentation maps. Default: '.png' - filter_cfg (dict, optional): Config for filter data. Defaults to None. - indices (int or Sequence[int], optional): Support using first few - data in annotation file to facilitate training/testing on a smaller - dataset. Defaults to None which means using all ``data_infos``. - serialize_data (bool, optional): Whether to hold memory using - serialized objects, when enabled, data loader workers can use - shared RAM from master process instead of making a copy. Defaults - to True. - pipeline (list, optional): Processing pipeline. Defaults to []. - test_mode (bool, optional): ``test_mode=True`` means in test phase. - Defaults to False. - lazy_init (bool, optional): Whether to load annotation during - instantiation. In some cases, such as visualization, only the meta - information of the dataset is needed, which is not necessary to - load annotation file. ``Basedataset`` can skip load annotations to - save time by set ``lazy_init=False``. Defaults to False. - max_refetch (int, optional): If ``Basedataset.prepare_data`` get a - None img. The maximum extra number of cycles to get a valid - image. Defaults to 1000. - ignore_index (int): The label index to be ignored. Default: 255 - reduce_zero_label (bool): Whether to mark label zero as ignored. - Default to False. - file_client_args (dict): Arguments to instantiate a FileClient. - See :class:`mmcv.fileio.FileClient` for details. - Defaults to ``dict(backend='disk')``. - """ - METAINFO: dict = dict() - - def __init__( - self, - ann_file: str = '', - img_suffix='.jpg', - seg_map_suffix='.png', - metainfo: Optional[dict] = None, - data_root: Optional[str] = None, - data_prefix: dict = dict(img_path=None, seg_map_path=None), - filter_cfg: Optional[dict] = None, - indices: Optional[Union[int, Sequence[int]]] = None, - serialize_data: bool = True, - pipeline: List[Union[dict, Callable]] = [], - test_mode: bool = False, - lazy_init: bool = False, - max_refetch: int = 1000, - ignore_index: int = 255, - reduce_zero_label: bool = False, - file_client_args: dict = dict(backend='disk') - ) -> None: - - self.img_suffix = img_suffix - self.seg_map_suffix = seg_map_suffix - self.ignore_index = ignore_index - self.reduce_zero_label = reduce_zero_label - self.file_client_args = file_client_args - self.file_client = mmcv.FileClient.infer_client(self.file_client_args) - - self.data_root = data_root - self.data_prefix = copy.copy(data_prefix) - self.ann_file = ann_file - self.filter_cfg = copy.deepcopy(filter_cfg) - self._indices = indices - self.serialize_data = serialize_data - self.test_mode = test_mode - self.max_refetch = max_refetch - self.data_list: List[dict] = [] - self.data_bytes: np.ndarray - - # Set meta information. 
- self._metainfo = self._load_metainfo(copy.deepcopy(metainfo)) - - # Get label map for custom classes - new_classes = self._metainfo.get('classes', None) - self.label_map = self.get_label_map(new_classes) - self._metainfo.update( - dict( - label_map=self.label_map, - reduce_zero_label=self.reduce_zero_label)) - - # Update palette based on label map or generate palette - # if it is not defined - updated_palette = self._update_palette() - self._metainfo.update(dict(palette=updated_palette)) - if test_mode: - assert self._metainfo.get('classes') is not None, \ - 'dataset metainfo `classes` should be specified when testing' - - # Join paths. - if self.data_root is not None: - self._join_prefix() - - # Build pipeline. - self.pipeline = Compose(pipeline) - # Full initialize the dataset. - if not lazy_init: - self.full_init() - - @classmethod - def get_label_map(cls, - new_classes: Optional[Sequence] = None - ) -> Union[Dict, None]: - """Require label mapping. - - The ``label_map`` is a dictionary, its keys are the old label ids and - its values are the new label ids, and is used for changing pixel - labels in load_annotations. If and only if old classes in cls.METAINFO - is not equal to new classes in self._metainfo and nether of them is not - None, `label_map` is not None. - - Args: - new_classes (list, tuple, optional): The new classes name from - metainfo. Default to None. - - - Returns: - dict, optional: The mapping from old classes in cls.METAINFO to - new classes in self._metainfo - """ - old_classes = cls.METAINFO.get('classes', None) - if (new_classes is not None and old_classes is not None - and list(new_classes) != list(old_classes)): - - label_map = {} - if not set(new_classes).issubset(cls.METAINFO['classes']): - raise ValueError( - f'new classes {new_classes} is not a ' - f'subset of classes {old_classes} in METAINFO.') - for i, c in enumerate(old_classes): - if c not in new_classes: - label_map[i] = -1 - else: - label_map[i] = new_classes.index(c) - return label_map - else: - return None - - def _update_palette(self) -> list: - """Update palette after loading metainfo. - - If length of palette is equal to classes, just return the palette. - If palette is not defined, it will randomly generate a palette. - If classes is updated by customer, it will return the subset of - palette. - - Returns: - Sequence: Palette for current dataset. - """ - palette = self._metainfo.get('palette', []) - classes = self._metainfo.get('classes', []) - # palette does match classes - if len(palette) == len(classes): - return palette - - if len(palette) == 0: - # Get random state before set seed, and restore - # random state later. - # It will prevent loss of randomness, as the palette - # may be different in each iteration if not specified. - # See: https://github.com/open-mmlab/mmdetection/issues/5844 - state = np.random.get_state() - np.random.seed(42) - # random palette - new_palette = np.random.randint( - 0, 255, size=(len(classes), 3)).tolist() - np.random.set_state(state) - elif len(palette) >= len(classes) and self.label_map is not None: - new_palette = [] - # return subset of palette - for old_id, new_id in sorted( - self.label_map.items(), key=lambda x: x[1]): - if new_id != -1: - new_palette.append(palette[old_id]) - new_palette = type(palette)(new_palette) - else: - raise ValueError('palette does not match classes ' - f'as metainfo is {self._metainfo}.') - return new_palette - - def load_data_list(self) -> List[dict]: - """Load annotation from directory or annotation file. 
-
-        Returns:
-            list[dict]: All data info of dataset.
-        """
-        data_list = []
-        img_dir = self.data_prefix.get('img_path', None)
-        ann_dir = self.data_prefix.get('seg_map_path', None)
-        if osp.isfile(self.ann_file):
-            lines = mmcv.list_from_file(
-                self.ann_file, file_client_args=self.file_client_args)
-            for line in lines:
-                img_name = line.strip()
-                data_info = dict(
-                    img_path=osp.join(img_dir, img_name + self.img_suffix))
-                if ann_dir is not None:
-                    seg_map = img_name + self.seg_map_suffix
-                    data_info['seg_map_path'] = osp.join(ann_dir, seg_map)
-                data_info['label_map'] = self.label_map
-                data_info['reduce_zero_label'] = self.reduce_zero_label
-                data_info['seg_fields'] = []
-                data_list.append(data_info)
-        else:
-            for img in self.file_client.list_dir_or_file(
-                    dir_path=img_dir,
-                    list_dir=False,
-                    suffix=self.img_suffix,
-                    recursive=True):
-                data_info = dict(img_path=osp.join(img_dir, img))
-                if ann_dir is not None:
-                    seg_map = img.replace(self.img_suffix, self.seg_map_suffix)
-                    data_info['seg_map_path'] = osp.join(ann_dir, seg_map)
-                data_info['label_map'] = self.label_map
-                data_info['reduce_zero_label'] = self.reduce_zero_label
-                data_info['seg_fields'] = []
-                data_list.append(data_info)
-        data_list = sorted(data_list, key=lambda x: x['img_path'])
-        return data_list
diff --git a/mmseg/datasets/dark_zurich.py b/mmseg/datasets/dark_zurich.py
index c59249a023..9b5393fa9e 100644
--- a/mmseg/datasets/dark_zurich.py
+++ b/mmseg/datasets/dark_zurich.py
@@ -7,8 +7,9 @@ class DarkZurichDataset(CityscapesDataset):
     """DarkZurichDataset dataset."""
 
-    def __init__(self, **kwargs) -> None:
+    def __init__(self,
+                 img_suffix='_rgb_anon.png',
+                 seg_map_suffix='_gt_labelTrainIds.png',
+                 **kwargs) -> None:
         super().__init__(
-            img_suffix='_rgb_anon.png',
-            seg_map_suffix='_gt_labelTrainIds.png',
-            **kwargs)
+            img_suffix=img_suffix, seg_map_suffix=seg_map_suffix, **kwargs)
diff --git a/mmseg/datasets/dataset_wrappers.py b/mmseg/datasets/dataset_wrappers.py
index 57136e33f1..082c116ff4 100644
--- a/mmseg/datasets/dataset_wrappers.py
+++ b/mmseg/datasets/dataset_wrappers.py
@@ -105,12 +105,12 @@ def __getitem__(self, idx):
                     transform_type in self._skip_type_keys:
                 continue
 
-            if hasattr(transform, 'get_indexes'):
-                indexes = transform.get_indexes(self.dataset)
-                if not isinstance(indexes, collections.abc.Sequence):
-                    indexes = [indexes]
+            if hasattr(transform, 'get_indices'):
+                indices = transform.get_indices(self.dataset)
+                if not isinstance(indices, collections.abc.Sequence):
+                    indices = [indices]
                 mix_results = [
-                    copy.deepcopy(self.dataset[index]) for index in indexes
+                    copy.deepcopy(self.dataset[index]) for index in indices
                 ]
                 results['mix_results'] = mix_results
diff --git a/mmseg/datasets/decathlon.py b/mmseg/datasets/decathlon.py
new file mode 100644
index 0000000000..26aa4ef0d7
--- /dev/null
+++ b/mmseg/datasets/decathlon.py
@@ -0,0 +1,96 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import copy
+import os.path as osp
+from typing import List
+
+from mmengine.fileio import load
+
+from mmseg.registry import DATASETS
+from .basesegdataset import BaseSegDataset
+
+
+@DATASETS.register_module()
+class DecathlonDataset(BaseSegDataset):
+    """Dataset for the Decathlon dataset.
+
+    The dataset.json format is shown as follows
+
+    .. code-block:: none
+
+        {
+            "name": "BRATS",
+            "tensorImageSize": "4D",
+            "modality":
+            {
+                "0": "FLAIR",
+                "1": "T1w",
+                "2": "t1gd",
+                "3": "T2w"
+            },
+            "labels": {
+                "0": "background",
+                "1": "edema",
+                "2": "non-enhancing tumor",
+                "3": "enhancing tumour"
+            },
+            "numTraining": 484,
+            "numTest": 266,
+            "training":
+            [
+                {
+                    "image": "./imagesTr/BRATS_306.nii.gz",
+                    "label": "./labelsTr/BRATS_306.nii.gz"
+                    ...
+                }
+            ],
+            "test":
+            [
+                "./imagesTs/BRATS_557.nii.gz"
+                ...
+            ]
+        }
+    """
+
+    def load_data_list(self) -> List[dict]:
+        """Load annotation from directory or annotation file.
+
+        Returns:
+            list[dict]: All data info of dataset.
+        """
+        # `self.ann_file` denotes the absolute annotation file path if
+        # `self.root=None` or relative path if `self.root=/path/to/data/`.
+        annotations = load(self.ann_file)
+        if not isinstance(annotations, dict):
+            raise TypeError(f'The annotations loaded from annotation file '
+                            f'should be a dict, but got {type(annotations)}!')
+        raw_data_list = annotations[
+            'training'] if not self.test_mode else annotations['test']
+        data_list = []
+        for raw_data_info in raw_data_list:
+            # Strip the leading './' (via `[2:]`) from the relative paths,
+            # as it would break loading from cloud storage.
+            if isinstance(raw_data_info, dict):
+                data_info = dict(
+                    img_path=osp.join(self.data_root,
+                                      raw_data_info['image'][2:]))
+                data_info['seg_map_path'] = osp.join(
+                    self.data_root, raw_data_info['label'][2:])
+            else:
+                data_info = dict(
+                    img_path=osp.join(self.data_root, raw_data_info[2:]))
+            data_info['label_map'] = self.label_map
+            data_info['reduce_zero_label'] = self.reduce_zero_label
+            data_info['seg_fields'] = []
+            data_list.append(data_info)
+        annotations.pop('training')
+        annotations.pop('test')
+
+        metainfo = copy.deepcopy(annotations)
+        metainfo['classes'] = [*metainfo['labels'].values()]
+        # Meta information loaded from the annotation file will not override
+        # the existing meta information from `BaseDataset.METAINFO` and the
+        # `metainfo` argument of the constructor.
+        for k, v in metainfo.items():
+            self._metainfo.setdefault(k, v)
+
+        return data_list
diff --git a/mmseg/datasets/drive.py b/mmseg/datasets/drive.py
index 4d78f2dfae..76c0160a6b 100644
--- a/mmseg/datasets/drive.py
+++ b/mmseg/datasets/drive.py
@@ -1,11 +1,12 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+import mmengine.fileio as fileio
+
 from mmseg.registry import DATASETS
-from .custom import CustomDataset
+from .basesegdataset import BaseSegDataset
 
 
 @DATASETS.register_module()
-class DRIVEDataset(CustomDataset):
+class DRIVEDataset(BaseSegDataset):
     """DRIVE dataset.
 
     In segmentation map annotation for DRIVE, 0 stands for background, which is
@@ -17,10 +18,15 @@ class DRIVEDataset(CustomDataset):
         classes=('background', 'vessel'),
         palette=[[120, 120, 120], [6, 230, 230]])
 
-    def __init__(self, **kwargs) -> None:
+    def __init__(self,
+                 img_suffix='.png',
+                 seg_map_suffix='_manual1.png',
+                 reduce_zero_label=False,
+                 **kwargs) -> None:
         super().__init__(
-            img_suffix='.png',
-            seg_map_suffix='_manual1.png',
-            reduce_zero_label=False,
+            img_suffix=img_suffix,
+            seg_map_suffix=seg_map_suffix,
+            reduce_zero_label=reduce_zero_label,
             **kwargs)
-        assert self.file_client.exists(self.data_prefix['img_path'])
+        assert fileio.exists(
+            self.data_prefix['img_path'], backend_args=self.backend_args)
diff --git a/mmseg/datasets/dsdl.py b/mmseg/datasets/dsdl.py
new file mode 100644
index 0000000000..bf7e4e61b5
--- /dev/null
+++ b/mmseg/datasets/dsdl.py
@@ -0,0 +1,116 @@
+# Copyright (c) OpenMMLab. All rights reserved.
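The ``[2:]`` slice in ``DecathlonDataset.load_data_list`` strips the leading './' from the relative paths stored in dataset.json before joining them onto ``data_root`` (the slice was moved inside the join in the test branch above for the same reason). A tiny replication with made-up paths:

.. code-block:: python

    import os.path as osp

    data_root = 'data/decathlon'  # invented root directory
    raw = {'image': './imagesTr/BRATS_306.nii.gz',
           'label': './labelsTr/BRATS_306.nii.gz'}
    img_path = osp.join(data_root, raw['image'][2:])  # drop the leading './'
    seg_path = osp.join(data_root, raw['label'][2:])
    print(img_path)  # data/decathlon/imagesTr/BRATS_306.nii.gz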
+import os +from typing import Dict, List, Optional, Sequence, Union + +from mmseg.registry import DATASETS +from .basesegdataset import BaseSegDataset + +try: + from dsdl.dataset import DSDLDataset +except ImportError: + DSDLDataset = None + + +@DATASETS.register_module() +class DSDLSegDataset(BaseSegDataset): + """Dataset for dsdl segmentation. + + Args: + specific_key_path(dict): Path of specific key which can not + be loaded by it's field name. + pre_transform(dict): pre-transform functions before loading. + used_labels(sequence): list of actual used classes in train steps, + this must be subset of class domain. + """ + + METAINFO = {} + + def __init__(self, + specific_key_path: Dict = {}, + pre_transform: Dict = {}, + used_labels: Optional[Sequence] = None, + **kwargs) -> None: + + if DSDLDataset is None: + raise RuntimeError( + 'Package dsdl is not installed. Please run "pip install dsdl".' + ) + self.used_labels = used_labels + + loc_config = dict(type='LocalFileReader', working_dir='') + if kwargs.get('data_root'): + kwargs['ann_file'] = os.path.join(kwargs['data_root'], + kwargs['ann_file']) + required_fields = ['Image', 'LabelMap'] + + self.dsdldataset = DSDLDataset( + dsdl_yaml=kwargs['ann_file'], + location_config=loc_config, + required_fields=required_fields, + specific_key_path=specific_key_path, + transform=pre_transform, + ) + BaseSegDataset.__init__(self, **kwargs) + + def load_data_list(self) -> List[Dict]: + """Load data info from a dsdl yaml file named as ``self.ann_file`` + + Returns: + List[dict]: A list of data list. + """ + + if self.used_labels: + self._metainfo['classes'] = tuple(self.used_labels) + self.label_map = self.get_label_map(self.used_labels) + else: + self._metainfo['classes'] = tuple(['background'] + + self.dsdldataset.class_names) + data_list = [] + + for i, data in enumerate(self.dsdldataset): + datainfo = dict( + img_path=os.path.join(self.data_prefix['img_path'], + data['Image'][0].location), + seg_map_path=os.path.join(self.data_prefix['seg_map_path'], + data['LabelMap'][0].location), + label_map=self.label_map, + reduce_zero_label=self.reduce_zero_label, + seg_fields=[], + ) + data_list.append(datainfo) + + return data_list + + def get_label_map(self, + new_classes: Optional[Sequence] = None + ) -> Union[Dict, None]: + """Require label mapping. + + The ``label_map`` is a dictionary, its keys are the old label ids and + its values are the new label ids, and is used for changing pixel + labels in load_annotations. If and only if old classes in class_dom + is not equal to new classes in args and nether of them is not + None, `label_map` is not None. + Args: + new_classes (list, tuple, optional): The new classes name from + metainfo. Default to None. + Returns: + dict, optional: The mapping from old classes to new classes. + """ + old_classes = ['background'] + self.dsdldataset.class_names + if (new_classes is not None and old_classes is not None + and list(new_classes) != list(old_classes)): + + label_map = {} + if not set(new_classes).issubset(old_classes): + raise ValueError( + f'new classes {new_classes} is not a ' + f'subset of classes {old_classes} in class_dom.') + for i, c in enumerate(old_classes): + if c not in new_classes: + label_map[i] = 255 + else: + label_map[i] = new_classes.index(c) + return label_map + else: + return None diff --git a/mmseg/datasets/hrf.py b/mmseg/datasets/hrf.py index 996e3c2b94..fd669cce26 100644 --- a/mmseg/datasets/hrf.py +++ b/mmseg/datasets/hrf.py @@ -1,11 +1,12 @@ # Copyright (c) OpenMMLab. 
All rights reserved. +import mmengine.fileio as fileio from mmseg.registry import DATASETS -from .custom import CustomDataset +from .basesegdataset import BaseSegDataset @DATASETS.register_module() -class HRFDataset(CustomDataset): +class HRFDataset(BaseSegDataset): """HRF dataset. In segmentation map annotation for HRF, 0 stands for background, which is @@ -17,10 +18,15 @@ class HRFDataset(CustomDataset): classes=('background', 'vessel'), palette=[[120, 120, 120], [6, 230, 230]]) - def __init__(self, **kwargs) -> None: + def __init__(self, + img_suffix='.png', + seg_map_suffix='.png', + reduce_zero_label=False, + **kwargs) -> None: super().__init__( - img_suffix='.png', - seg_map_suffix='.png', - reduce_zero_label=False, + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, **kwargs) - assert self.file_client.exists(self.data_prefix['img_path']) + assert fileio.exists( + self.data_prefix['img_path'], backend_args=self.backend_args) diff --git a/mmseg/datasets/isaid.py b/mmseg/datasets/isaid.py index 02a4184922..61942ec1ea 100644 --- a/mmseg/datasets/isaid.py +++ b/mmseg/datasets/isaid.py @@ -1,10 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. +import mmengine.fileio as fileio + from mmseg.registry import DATASETS -from .custom import CustomDataset +from .basesegdataset import BaseSegDataset @DATASETS.register_module() -class iSAIDDataset(CustomDataset): +class iSAIDDataset(BaseSegDataset): """ iSAID: A Large-scale Dataset for Instance Segmentation in Aerial Images In segmentation map annotation for iSAID dataset, which is included in 16 categories. ``reduce_zero_label`` is fixed to False. The @@ -23,10 +25,15 @@ class iSAIDDataset(CustomDataset): [0, 0, 127], [0, 0, 191], [0, 0, 255], [0, 191, 127], [0, 127, 191], [0, 127, 255], [0, 100, 155]]) - def __init__(self, **kwargs) -> None: + def __init__(self, + img_suffix='.png', + seg_map_suffix='_instance_color_RGB.png', + ignore_index=255, + **kwargs) -> None: super().__init__( - img_suffix='.png', - seg_map_suffix='_instance_color_RGB.png', - ignore_index=255, + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + ignore_index=ignore_index, **kwargs) - assert self.file_client.exists(self.data_prefix['img_path']) + assert fileio.exists( + self.data_prefix['img_path'], backend_args=self.backend_args) diff --git a/mmseg/datasets/isprs.py b/mmseg/datasets/isprs.py index 888ea47628..30af53c569 100644 --- a/mmseg/datasets/isprs.py +++ b/mmseg/datasets/isprs.py @@ -1,10 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. from mmseg.registry import DATASETS -from .custom import CustomDataset +from .basesegdataset import BaseSegDataset @DATASETS.register_module() -class ISPRSDataset(CustomDataset): +class ISPRSDataset(BaseSegDataset): """ISPRS dataset. In segmentation map annotation for ISPRS, 0 is the ignore index. 
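All of the converted classes remain registered in ``DATASETS``, so config-driven construction keeps working unchanged. A sketch of building one of them from a config dict (paths invented; ``lazy_init=True`` avoids touching the non-existent files):

.. code-block:: python

    from mmseg.registry import DATASETS

    cfg = dict(
        type='ISPRSDataset',
        data_root='data/isprs',  # invented path
        data_prefix=dict(img_path='img_dir/train',
                         seg_map_path='ann_dir/train'),
        pipeline=[],
        lazy_init=True)
    dataset = DATASETS.build(cfg)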
@@ -17,9 +17,13 @@ class ISPRSDataset(CustomDataset):
         palette=[[255, 255, 255], [0, 0, 255], [0, 255, 255], [0, 255, 0],
                  [255, 255, 0], [255, 0, 0]])
 
-    def __init__(self, **kwargs) -> None:
+    def __init__(self,
+                 img_suffix='.png',
+                 seg_map_suffix='.png',
+                 reduce_zero_label=True,
+                 **kwargs) -> None:
         super().__init__(
-            img_suffix='.png',
-            seg_map_suffix='.png',
-            reduce_zero_label=True,
+            img_suffix=img_suffix,
+            seg_map_suffix=seg_map_suffix,
+            reduce_zero_label=reduce_zero_label,
             **kwargs)
diff --git a/mmseg/datasets/levir.py b/mmseg/datasets/levir.py
new file mode 100644
index 0000000000..f467481bad
--- /dev/null
+++ b/mmseg/datasets/levir.py
@@ -0,0 +1,31 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+
+from mmseg.registry import DATASETS
+from .basesegdataset import BaseCDDataset
+
+
+@DATASETS.register_module()
+class LEVIRCDDataset(BaseCDDataset):
+    """LEVIR-CD dataset for change detection.
+
+    The ``img_suffix``, ``img_suffix2`` and ``seg_map_suffix`` are all fixed
+    to '.png', and ``reduce_zero_label`` defaults to False.
+    """
+
+    METAINFO = dict(
+        classes=('background', 'changed'),
+        palette=[[0, 0, 0], [255, 255, 255]])
+
+    def __init__(self,
+                 img_suffix='.png',
+                 img_suffix2='.png',
+                 seg_map_suffix='.png',
+                 reduce_zero_label=False,
+                 **kwargs) -> None:
+        super().__init__(
+            img_suffix=img_suffix,
+            img_suffix2=img_suffix2,
+            seg_map_suffix=seg_map_suffix,
+            reduce_zero_label=reduce_zero_label,
+            **kwargs)
diff --git a/mmseg/datasets/lip.py b/mmseg/datasets/lip.py
new file mode 100644
index 0000000000..3a32a193af
--- /dev/null
+++ b/mmseg/datasets/lip.py
@@ -0,0 +1,47 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmseg.registry import DATASETS
+from .basesegdataset import BaseSegDataset
+
+
+@DATASETS.register_module()
+class LIPDataset(BaseSegDataset):
+    """LIP dataset.
+
+    The ``img_suffix`` is fixed to '.jpg' and ``seg_map_suffix`` is fixed to
+    '.png'.
+    """
+    METAINFO = dict(
+        classes=('Background', 'Hat', 'Hair', 'Glove', 'Sunglasses',
+                 'UpperClothes', 'Dress', 'Coat', 'Socks', 'Pants',
+                 'Jumpsuits', 'Scarf', 'Skirt', 'Face', 'Left-arm',
+                 'Right-arm', 'Left-leg', 'Right-leg', 'Left-shoe',
+                 'Right-shoe'),
+        palette=(
+            [0, 0, 0],
+            [128, 0, 0],
+            [255, 0, 0],
+            [0, 85, 0],
+            [170, 0, 51],
+            [255, 85, 0],
+            [0, 0, 85],
+            [0, 119, 221],
+            [85, 85, 0],
+            [0, 85, 85],
+            [85, 51, 0],
+            [52, 86, 128],
+            [0, 128, 0],
+            [0, 0, 255],
+            [51, 170, 221],
+            [0, 255, 255],
+            [85, 255, 170],
+            [170, 255, 85],
+            [255, 255, 0],
+            [255, 170, 0],
+        ))
+
+    def __init__(self,
+                 img_suffix='.jpg',
+                 seg_map_suffix='.png',
+                 **kwargs) -> None:
+        super().__init__(
+            img_suffix=img_suffix, seg_map_suffix=seg_map_suffix, **kwargs)
diff --git a/mmseg/datasets/loveda.py b/mmseg/datasets/loveda.py
index 00f7881cd7..5c16db503a 100644
--- a/mmseg/datasets/loveda.py
+++ b/mmseg/datasets/loveda.py
@@ -1,10 +1,10 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 from mmseg.registry import DATASETS
-from .custom import CustomDataset
+from .basesegdataset import BaseSegDataset
 
 
 @DATASETS.register_module()
-class LoveDADataset(CustomDataset):
+class LoveDADataset(BaseSegDataset):
     """LoveDA dataset.
 
     In segmentation map annotation for LoveDA, 0 is the ignore index.
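Datasets such as ADE20K, ISPRS and LoveDA pass ``reduce_zero_label=True``; the actual remapping is carried out later by the ``LoadAnnotations`` transform, essentially as sketched below (label values invented):

.. code-block:: python

    import numpy as np

    gt = np.array([[0, 1, 2],
                   [3, 0, 1]], dtype=np.uint8)
    gt[gt == 0] = 255    # label 0 becomes the ignore index
    gt = gt - 1          # remaining ids shift down by one
    gt[gt == 254] = 255  # keep ignored pixels at 255 after the shift
    print(gt)
    # [[255   0   1]
    #  [  2 255   0]]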
@@ -17,9 +17,13 @@ class LoveDADataset(CustomDataset): palette=[[255, 255, 255], [255, 0, 0], [255, 255, 0], [0, 0, 255], [159, 129, 183], [0, 255, 0], [255, 195, 128]]) - def __init__(self, **kwargs) -> None: + def __init__(self, + img_suffix='.png', + seg_map_suffix='.png', + reduce_zero_label=True, + **kwargs) -> None: super().__init__( - img_suffix='.png', - seg_map_suffix='.png', - reduce_zero_label=True, + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, **kwargs) diff --git a/mmseg/datasets/mapillary.py b/mmseg/datasets/mapillary.py new file mode 100644 index 0000000000..6c2947338e --- /dev/null +++ b/mmseg/datasets/mapillary.py @@ -0,0 +1,176 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmseg.registry import DATASETS +from .basesegdataset import BaseSegDataset + + +@DATASETS.register_module() +class MapillaryDataset_v1(BaseSegDataset): + """Mapillary Vistas Dataset. + + Dataset paper link: + http://ieeexplore.ieee.org/document/8237796/ + + v1.2 contain 66 object classes. + (37 instance-specific) + + v2.0 contain 124 object classes. + (70 instance-specific, 46 stuff, 8 void or crowd). + + The ``img_suffix`` is fixed to '.jpg' and ``seg_map_suffix`` is + fixed to '.png' for Mapillary Vistas Dataset. + """ + METAINFO = dict( + classes=('Bird', 'Ground Animal', 'Curb', 'Fence', 'Guard Rail', + 'Barrier', 'Wall', 'Bike Lane', 'Crosswalk - Plain', + 'Curb Cut', 'Parking', 'Pedestrian Area', 'Rail Track', + 'Road', 'Service Lane', 'Sidewalk', 'Bridge', 'Building', + 'Tunnel', 'Person', 'Bicyclist', 'Motorcyclist', + 'Other Rider', 'Lane Marking - Crosswalk', + 'Lane Marking - General', 'Mountain', 'Sand', 'Sky', 'Snow', + 'Terrain', 'Vegetation', 'Water', 'Banner', 'Bench', + 'Bike Rack', 'Billboard', 'Catch Basin', 'CCTV Camera', + 'Fire Hydrant', 'Junction Box', 'Mailbox', 'Manhole', + 'Phone Booth', 'Pothole', 'Street Light', 'Pole', + 'Traffic Sign Frame', 'Utility Pole', 'Traffic Light', + 'Traffic Sign (Back)', 'Traffic Sign (Front)', 'Trash Can', + 'Bicycle', 'Boat', 'Bus', 'Car', 'Caravan', 'Motorcycle', + 'On Rails', 'Other Vehicle', 'Trailer', 'Truck', + 'Wheeled Slow', 'Car Mount', 'Ego Vehicle', 'Unlabeled'), + palette=[[165, 42, 42], [0, 192, 0], [196, 196, 196], [190, 153, 153], + [180, 165, 180], [90, 120, 150], [102, 102, 156], + [128, 64, 255], [140, 140, 200], [170, 170, 170], + [250, 170, 160], [96, 96, 96], + [230, 150, 140], [128, 64, 128], [110, 110, 110], + [244, 35, 232], [150, 100, 100], [70, 70, 70], [150, 120, 90], + [220, 20, 60], [255, 0, 0], [255, 0, 100], [255, 0, 200], + [200, 128, 128], [255, 255, 255], [64, 170, + 64], [230, 160, 50], + [70, 130, 180], [190, 255, 255], [152, 251, 152], + [107, 142, 35], [0, 170, 30], [255, 255, 128], [250, 0, 30], + [100, 140, 180], [220, 220, 220], [220, 128, 128], + [222, 40, 40], [100, 170, 30], [40, 40, 40], [33, 33, 33], + [100, 128, 160], [142, 0, 0], [70, 100, 150], [210, 170, 100], + [153, 153, 153], [128, 128, 128], [0, 0, 80], [250, 170, 30], + [192, 192, 192], [220, 220, 0], [140, 140, 20], [119, 11, 32], + [150, 0, 255], [0, 60, 100], [0, 0, 142], [0, 0, 90], + [0, 0, 230], [0, 80, 100], [128, 64, 64], [0, 0, 110], + [0, 0, 70], [0, 0, 192], [32, 32, 32], [120, 10, + 10], [0, 0, 0]]) + + def __init__(self, + img_suffix='.jpg', + seg_map_suffix='.png', + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, seg_map_suffix=seg_map_suffix, **kwargs) + + +@DATASETS.register_module() +class MapillaryDataset_v2(BaseSegDataset): + """Mapillary 
Vistas Dataset. + + Dataset paper link: + http://ieeexplore.ieee.org/document/8237796/ + + v1.2 contain 66 object classes. + (37 instance-specific) + + v2.0 contain 124 object classes. + (70 instance-specific, 46 stuff, 8 void or crowd). + + The ``img_suffix`` is fixed to '.jpg' and ``seg_map_suffix`` is + fixed to '.png' for Mapillary Vistas Dataset. + """ + METAINFO = dict( + classes=( + 'Bird', 'Ground Animal', 'Ambiguous Barrier', 'Concrete Block', + 'Curb', 'Fence', 'Guard Rail', 'Barrier', 'Road Median', + 'Road Side', 'Lane Separator', 'Temporary Barrier', 'Wall', + 'Bike Lane', 'Crosswalk - Plain', 'Curb Cut', 'Driveway', + 'Parking', 'Parking Aisle', 'Pedestrian Area', 'Rail Track', + 'Road', 'Road Shoulder', 'Service Lane', 'Sidewalk', + 'Traffic Island', 'Bridge', 'Building', 'Garage', 'Tunnel', + 'Person', 'Person Group', 'Bicyclist', 'Motorcyclist', + 'Other Rider', 'Lane Marking - Dashed Line', + 'Lane Marking - Straight Line', 'Lane Marking - Zigzag Line', + 'Lane Marking - Ambiguous', 'Lane Marking - Arrow (Left)', + 'Lane Marking - Arrow (Other)', 'Lane Marking - Arrow (Right)', + 'Lane Marking - Arrow (Split Left or Straight)', + 'Lane Marking - Arrow (Split Right or Straight)', + 'Lane Marking - Arrow (Straight)', 'Lane Marking - Crosswalk', + 'Lane Marking - Give Way (Row)', + 'Lane Marking - Give Way (Single)', + 'Lane Marking - Hatched (Chevron)', + 'Lane Marking - Hatched (Diagonal)', 'Lane Marking - Other', + 'Lane Marking - Stop Line', 'Lane Marking - Symbol (Bicycle)', + 'Lane Marking - Symbol (Other)', 'Lane Marking - Text', + 'Lane Marking (only) - Dashed Line', + 'Lane Marking (only) - Crosswalk', 'Lane Marking (only) - Other', + 'Lane Marking (only) - Test', 'Mountain', 'Sand', 'Sky', 'Snow', + 'Terrain', 'Vegetation', 'Water', 'Banner', 'Bench', 'Bike Rack', + 'Catch Basin', 'CCTV Camera', 'Fire Hydrant', 'Junction Box', + 'Mailbox', 'Manhole', 'Parking Meter', 'Phone Booth', 'Pothole', + 'Signage - Advertisement', 'Signage - Ambiguous', 'Signage - Back', + 'Signage - Information', 'Signage - Other', 'Signage - Store', + 'Street Light', 'Pole', 'Pole Group', 'Traffic Sign Frame', + 'Utility Pole', 'Traffic Cone', 'Traffic Light - General (Single)', + 'Traffic Light - Pedestrians', 'Traffic Light - General (Upright)', + 'Traffic Light - General (Horizontal)', 'Traffic Light - Cyclists', + 'Traffic Light - Other', 'Traffic Sign - Ambiguous', + 'Traffic Sign (Back)', 'Traffic Sign - Direction (Back)', + 'Traffic Sign - Direction (Front)', 'Traffic Sign (Front)', + 'Traffic Sign - Parking', 'Traffic Sign - Temporary (Back)', + 'Traffic Sign - Temporary (Front)', 'Trash Can', 'Bicycle', 'Boat', + 'Bus', 'Car', 'Caravan', 'Motorcycle', 'On Rails', 'Other Vehicle', + 'Trailer', 'Truck', 'Vehicle Group', 'Wheeled Slow', 'Water Valve', + 'Car Mount', 'Dynamic', 'Ego Vehicle', 'Ground', 'Static', + 'Unlabeled'), + palette=[[165, 42, 42], [0, 192, 0], [250, 170, 31], [250, 170, 32], + [196, 196, 196], [190, 153, 153], [180, 165, 180], + [90, 120, 150], [250, 170, 33], [250, 170, 34], + [128, 128, 128], [250, 170, 35], [102, 102, 156], + [128, 64, 255], [140, 140, 200], [170, 170, 170], + [250, 170, 36], [250, 170, 160], [250, 170, 37], [96, 96, 96], + [230, 150, 140], [128, 64, 128], [110, 110, 110], + [110, 110, 110], [244, 35, 232], [128, 196, + 128], [150, 100, 100], + [70, 70, 70], [150, 150, 150], [150, 120, 90], [220, 20, 60], + [220, 20, 60], [255, 0, 0], [255, 0, 100], [255, 0, 200], + [255, 255, 255], [255, 255, 255], [250, 170, 29], + [250, 170, 28], [250, 
170, 26], [250, 170, + 25], [250, 170, 24], + [250, 170, 22], [250, 170, 21], [250, 170, + 20], [255, 255, 255], + [250, 170, 19], [250, 170, 18], [250, 170, + 12], [250, 170, 11], + [255, 255, 255], [255, 255, 255], [250, 170, 16], + [250, 170, 15], [250, 170, 15], [255, 255, 255], + [255, 255, 255], [255, 255, 255], [255, 255, 255], + [64, 170, 64], [230, 160, 50], + [70, 130, 180], [190, 255, 255], [152, 251, 152], + [107, 142, 35], [0, 170, 30], [255, 255, 128], [250, 0, 30], + [100, 140, 180], [220, 128, 128], [222, 40, + 40], [100, 170, 30], + [40, 40, 40], [33, 33, 33], [100, 128, 160], [20, 20, 255], + [142, 0, 0], [70, 100, 150], [250, 171, 30], [250, 172, 30], + [250, 173, 30], [250, 174, 30], [250, 175, + 30], [250, 176, 30], + [210, 170, 100], [153, 153, 153], [153, 153, 153], + [128, 128, 128], [0, 0, 80], [210, 60, 60], [250, 170, 30], + [250, 170, 30], [250, 170, 30], [250, 170, + 30], [250, 170, 30], + [250, 170, 30], [192, 192, 192], [192, 192, 192], + [192, 192, 192], [220, 220, 0], [220, 220, 0], [0, 0, 196], + [192, 192, 192], [220, 220, 0], [140, 140, 20], [119, 11, 32], + [150, 0, 255], [0, 60, 100], [0, 0, 142], [0, 0, 90], + [0, 0, 230], [0, 80, 100], [128, 64, 64], [0, 0, 110], + [0, 0, 70], [0, 0, 142], [0, 0, 192], [170, 170, 170], + [32, 32, 32], [111, 74, 0], [120, 10, 10], [81, 0, 81], + [111, 111, 0], [0, 0, 0]]) + + def __init__(self, + img_suffix='.jpg', + seg_map_suffix='.png', + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, seg_map_suffix=seg_map_suffix, **kwargs) diff --git a/mmseg/datasets/night_driving.py b/mmseg/datasets/night_driving.py index 5e542194f7..3ead91ec77 100644 --- a/mmseg/datasets/night_driving.py +++ b/mmseg/datasets/night_driving.py @@ -7,8 +7,9 @@ class NightDrivingDataset(CityscapesDataset): """NightDrivingDataset dataset.""" - def __init__(self, **kwargs) -> None: + def __init__(self, + img_suffix='_leftImg8bit.png', + seg_map_suffix='_gtCoarse_labelTrainIds.png', + **kwargs) -> None: super().__init__( - img_suffix='_leftImg8bit.png', - seg_map_suffix='_gtCoarse_labelTrainIds.png', - **kwargs) + img_suffix=img_suffix, seg_map_suffix=seg_map_suffix, **kwargs) diff --git a/mmseg/datasets/nyu.py b/mmseg/datasets/nyu.py new file mode 100644 index 0000000000..fcfda46647 --- /dev/null +++ b/mmseg/datasets/nyu.py @@ -0,0 +1,123 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from typing import List + +import mmengine.fileio as fileio + +from mmseg.registry import DATASETS +from .basesegdataset import BaseSegDataset + + +@DATASETS.register_module() +class NYUDataset(BaseSegDataset): + """NYU depth estimation dataset. The file structure should be as follows. + + .. code-block:: none + + ├── data + │ ├── nyu + │ │ ├── images + │ │ │ ├── train + │ │ │ │ ├── scene_xxx.jpg + │ │ │ │ ├── ... + │ │ │ ├── test + │ │ ├── annotations + │ │ │ ├── train + │ │ │ │ ├── scene_xxx.png + │ │ │ │ ├── ... + │ │ │ ├── test + + Args: + ann_file (str): Annotation file path. Defaults to ''. + metainfo (dict, optional): Meta information for dataset, such as + specify classes to load. Defaults to None. + data_root (str, optional): The root directory for ``data_prefix`` and + ``ann_file``. Defaults to None. + data_prefix (dict, optional): Prefix for training data. Defaults to + dict(img_path='images', depth_map_path='annotations'). + img_suffix (str): Suffix of images. Default: '.jpg' + seg_map_suffix (str): Suffix of segmentation maps. Default: '.png' + filter_cfg (dict, optional): Config for filtering data. Defaults to None.
+ indices (int or Sequence[int], optional): Support using first few + data in annotation file to facilitate training/testing on a smaller + dataset. Defaults to None which means using all ``data_infos``. + serialize_data (bool, optional): Whether to hold memory using + serialized objects, when enabled, data loader workers can use + shared RAM from master process instead of making a copy. Defaults + to True. + pipeline (list, optional): Processing pipeline. Defaults to []. + test_mode (bool, optional): ``test_mode=True`` means in test phase. + Defaults to False. + lazy_init (bool, optional): Whether to load annotation during + instantiation. In some cases, such as visualization, only the meta + information of the dataset is needed, which is not necessary to + load annotation file. ``Basedataset`` can skip load annotations to + save time by set ``lazy_init=True``. Defaults to False. + max_refetch (int, optional): If ``Basedataset.prepare_data`` gets a + None img, the maximum extra number of cycles to get a valid + image. Defaults to 1000. + ignore_index (int): The label index to be ignored. Default: 255 + reduce_zero_label (bool): Whether to mark label zero as ignored. + Defaults to False. + backend_args (dict, Optional): Arguments to instantiate a file backend. + See https://mmengine.readthedocs.io/en/latest/api/fileio.htm + for details. Defaults to None. + Notes: mmcv>=2.0.0rc4, mmengine>=0.2.0 required. + """ + METAINFO = dict( + classes=('printer_room', 'bathroom', 'living_room', 'study', + 'conference_room', 'study_room', 'kitchen', 'home_office', + 'bedroom', 'dinette', 'playroom', 'indoor_balcony', + 'laundry_room', 'basement', 'excercise_room', 'foyer', + 'home_storage', 'cafe', 'furniture_store', 'office_kitchen', + 'student_lounge', 'dining_room', 'reception_room', + 'computer_lab', 'classroom', 'office', 'bookstore')) + + def __init__(self, + data_prefix=dict( + img_path='images', depth_map_path='annotations'), + img_suffix='.jpg', + depth_map_suffix='.png', + **kwargs) -> None: + super().__init__( + data_prefix=data_prefix, + img_suffix=img_suffix, + seg_map_suffix=depth_map_suffix, + **kwargs) + + def _get_category_id_from_filename(self, image_fname: str) -> int: + """Retrieve the category ID from the given image filename.""" + image_fname = osp.basename(image_fname) + position = image_fname.find(next(filter(str.isdigit, image_fname)), 0) + category_name = image_fname[:position - 1] + if category_name not in self._metainfo['classes']: + return -1 + else: + return self._metainfo['classes'].index(category_name) + + def load_data_list(self) -> List[dict]: + """Load annotation from directory or annotation file. + + Returns: + list[dict]: All data info of dataset. + """ + data_list = [] + img_dir = self.data_prefix.get('img_path', None) + ann_dir = self.data_prefix.get('depth_map_path', None) + + _suffix_len = len(self.img_suffix) + for img in fileio.list_dir_or_file( + dir_path=img_dir, + list_dir=False, + suffix=self.img_suffix, + recursive=True, + backend_args=self.backend_args): + data_info = dict(img_path=osp.join(img_dir, img)) + if ann_dir is not None: + depth_map = img[:-_suffix_len] + self.seg_map_suffix + data_info['depth_map_path'] = osp.join(ann_dir, depth_map) + data_info['seg_fields'] = [] + data_info['category_id'] = self._get_category_id_from_filename(img) + data_list.append(data_info) + data_list = sorted(data_list, key=lambda x: x['img_path']) + return data_list
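For reviewers, a minimal sketch of how the new ``NYUDataset`` could be exercised once this lands; the ``data/nyu`` layout follows the docstring above, and the empty pipeline and paths are illustrative placeholders, not part of this diff:

```python
from mmseg.datasets import NYUDataset

# Assumes the data/nyu layout from the docstring exists locally.
dataset = NYUDataset(
    data_root='data/nyu',
    data_prefix=dict(img_path='images/train',
                     depth_map_path='annotations/train'),
    pipeline=[])  # a real config would add loading/packing transforms

info = dataset.get_data_info(0)
# Each entry carries the image path, the matching depth map path, and
# the scene category id parsed from the filename prefix.
print(info['img_path'], info['depth_map_path'], info['category_id'])
```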
diff --git a/mmseg/datasets/pascal_context.py b/mmseg/datasets/pascal_context.py index 2c0fae457b..82d00a9b30 100644 --- a/mmseg/datasets/pascal_context.py +++ b/mmseg/datasets/pascal_context.py @@ -1,12 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. -import os.path as osp +import mmengine.fileio as fileio from mmseg.registry import DATASETS -from .custom import CustomDataset +from .basesegdataset import BaseSegDataset @DATASETS.register_module() -class PascalContextDataset(CustomDataset): +class PascalContextDataset(BaseSegDataset): """PascalContext dataset. In segmentation map annotation for PascalContext, 0 stands for background, @@ -45,25 +45,31 @@ class PascalContextDataset(CustomDataset): [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255], [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255]]) - def __init__(self, ann_file: str, **kwargs) -> None: + def __init__(self, + ann_file='', + img_suffix='.jpg', + seg_map_suffix='.png', + reduce_zero_label=False, + **kwargs) -> None: super().__init__( - img_suffix='.jpg', - seg_map_suffix='.png', + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, ann_file=ann_file, - reduce_zero_label=False, + reduce_zero_label=reduce_zero_label, **kwargs) - assert self.file_client.exists( - self.data_prefix['img_path']) and osp.isfile(self.ann_file) + assert fileio.exists(self.data_prefix['img_path'], self.backend_args) @DATASETS.register_module() -class PascalContextDataset59(CustomDataset): +class PascalContextDataset59(BaseSegDataset): """PascalContext dataset. In segmentation map annotation for PascalContext, 0 stands for background, which is included in 60 categories. ``reduce_zero_label`` is fixed to - False. The ``img_suffix`` is fixed to '.jpg' and ``seg_map_suffix`` is + True. The ``img_suffix`` is fixed to '.jpg' and ``seg_map_suffix`` is fixed to '.png'. + Note: If the background is 255 and the ids of categories are from 0 to 58, + ``reduce_zero_label`` needs to be set to False. Args: ann_file (str): Annotation file path.
@@ -95,12 +101,16 @@ class PascalContextDataset59(CustomDataset): [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255], [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255]]) - def __init__(self, ann_file: str, **kwargs): + def __init__(self, + ann_file='', + img_suffix='.jpg', + seg_map_suffix='.png', + reduce_zero_label=True, + **kwargs): super().__init__( - img_suffix='.jpg', - seg_map_suffix='.png', + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, ann_file=ann_file, - reduce_zero_label=True, + reduce_zero_label=reduce_zero_label, **kwargs) - assert self.file_client.exists( - self.data_prefix['img_path']) and osp.isfile(self.ann_file) + assert fileio.exists(self.data_prefix['img_path'], self.backend_args) diff --git a/mmseg/datasets/potsdam.py b/mmseg/datasets/potsdam.py index 65e23ecb4b..6892de3dd2 100644 --- a/mmseg/datasets/potsdam.py +++ b/mmseg/datasets/potsdam.py @@ -1,10 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. from mmseg.registry import DATASETS -from .custom import CustomDataset +from .basesegdataset import BaseSegDataset @DATASETS.register_module() -class PotsdamDataset(CustomDataset): +class PotsdamDataset(BaseSegDataset): """ISPRS Potsdam dataset. In segmentation map annotation for Potsdam dataset, 0 is the ignore index. @@ -17,9 +17,13 @@ class PotsdamDataset(CustomDataset): palette=[[255, 255, 255], [0, 0, 255], [0, 255, 255], [0, 255, 0], [255, 255, 0], [255, 0, 0]]) - def __init__(self, **kwargs) -> None: + def __init__(self, + img_suffix='.png', + seg_map_suffix='.png', + reduce_zero_label=True, + **kwargs) -> None: super().__init__( - img_suffix='.png', - seg_map_suffix='.png', - reduce_zero_label=True, + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, **kwargs) diff --git a/mmseg/datasets/refuge.py b/mmseg/datasets/refuge.py new file mode 100644 index 0000000000..4016a825a3 --- /dev/null +++ b/mmseg/datasets/refuge.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import mmengine.fileio as fileio + +from mmseg.registry import DATASETS +from .basesegdataset import BaseSegDataset + + +@DATASETS.register_module() +class REFUGEDataset(BaseSegDataset): + """REFUGE dataset. + + In segmentation map annotation for REFUGE, 0 stands for background, which + is not included in 2 categories. ``reduce_zero_label`` is fixed to False. + The ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is fixed to + '.png'. + """ + METAINFO = dict( + classes=('background', 'Optic Cup', 'Optic Disc'), + palette=[[120, 120, 120], [6, 230, 230], [56, 59, 120]]) + + def __init__(self, **kwargs) -> None: + super().__init__( + img_suffix='.png', + seg_map_suffix='.png', + reduce_zero_label=False, + **kwargs) + assert fileio.exists( + self.data_prefix['img_path'], backend_args=self.backend_args)
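A side benefit of threading the suffixes through ``__init__`` across these datasets is that configs can now override them instead of relying on hard-coded values. A hypothetical example (the ``.tif`` export and the paths are made up for illustration):

```python
# Hypothetical: Potsdam tiles exported as '.tif' instead of '.png'.
# Before this change the suffixes were fixed inside the class.
train_dataset = dict(
    type='PotsdamDataset',
    data_root='data/potsdam',
    img_suffix='.tif',        # overrides the '.png' default
    seg_map_suffix='.png',
    reduce_zero_label=True,
    data_prefix=dict(img_path='img_dir/train',
                     seg_map_path='ann_dir/train'))
```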
diff --git a/mmseg/datasets/stare.py b/mmseg/datasets/stare.py index 53defc433c..1b997bb785 100644 --- a/mmseg/datasets/stare.py +++ b/mmseg/datasets/stare.py @@ -1,10 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. +import mmengine.fileio as fileio + from mmseg.registry import DATASETS -from .custom import CustomDataset +from .basesegdataset import BaseSegDataset @DATASETS.register_module() -class STAREDataset(CustomDataset): +class STAREDataset(BaseSegDataset): """STARE dataset. In segmentation map annotation for STARE, 0 stands for background, which is @@ -16,10 +18,15 @@ class STAREDataset(CustomDataset): classes=('background', 'vessel'), palette=[[120, 120, 120], [6, 230, 230]]) - def __init__(self, **kwargs) -> None: + def __init__(self, + img_suffix='.png', + seg_map_suffix='.ah.png', + reduce_zero_label=False, + **kwargs) -> None: super().__init__( - img_suffix='.png', - seg_map_suffix='.ah.png', - reduce_zero_label=False, + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, **kwargs) - assert self.file_client.exists(self.data_prefix['img_path']) + assert fileio.exists( + self.data_prefix['img_path'], backend_args=self.backend_args) diff --git a/mmseg/datasets/synapse.py b/mmseg/datasets/synapse.py new file mode 100644 index 0000000000..6f83b64150 --- /dev/null +++ b/mmseg/datasets/synapse.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmseg.registry import DATASETS +from .basesegdataset import BaseSegDataset + + +@DATASETS.register_module() +class SynapseDataset(BaseSegDataset): + """Synapse dataset. + + Before preprocessing, the Synapse dataset contains 13 foreground + categories in total, not including background. After preprocessing, 8 + foreground categories are kept while the other 5 foreground categories are + handled as background. The ``img_suffix`` is fixed to '.jpg' and + ``seg_map_suffix`` is fixed to '.png'. + """ + METAINFO = dict( + classes=('background', 'aorta', 'gallbladder', 'left_kidney', + 'right_kidney', 'liver', 'pancreas', 'spleen', 'stomach'), + palette=[[0, 0, 0], [0, 0, 255], [0, 255, 0], [255, 0, 0], + [0, 255, 255], [255, 0, 255], [255, 255, 0], [60, 255, 255], + [240, 240, 240]]) + + def __init__(self, + img_suffix='.jpg', + seg_map_suffix='.png', + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, seg_map_suffix=seg_map_suffix, **kwargs)
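For completeness, a config fragment using the new ``SynapseDataset``; the paths and pipeline are illustrative placeholders only:

```python
# Hypothetical config fragment; 'data/synapse' is a placeholder path.
train_dataloader = dict(
    batch_size=2,
    dataset=dict(
        type='SynapseDataset',
        data_root='data/synapse',
        data_prefix=dict(
            img_path='img_dir/train', seg_map_path='ann_dir/train'),
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(type='LoadAnnotations'),
            dict(type='PackSegInputs'),
        ]))
```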
diff --git a/mmseg/datasets/transforms/__init__.py b/mmseg/datasets/transforms/__init__.py index ec6d679163..125f070818 100644 --- a/mmseg/datasets/transforms/__init__.py +++ b/mmseg/datasets/transforms/__init__.py @@ -1,21 +1,30 @@ # Copyright (c) OpenMMLab. All rights reserved. -from mmcv.transforms import (LoadImageFromFile, MultiScaleFlipAug, Normalize, - Pad, RandomChoiceResize, RandomFlip, RandomResize, - Resize) - -from .compose import Compose -from .formatting import (ImageToTensor, PackSegInputs, ToDataContainer, - Transpose) -from .loading import LoadAnnotations -from .transforms import (CLAHE, AdjustGamma, PhotoMetricDistortion, RandomCrop, - RandomCutOut, RandomMosaic, RandomRotate, Rerange, - RGB2Gray, SegRescale) +from .formatting import PackSegInputs +from .loading import (LoadAnnotations, LoadBiomedicalAnnotation, + LoadBiomedicalData, LoadBiomedicalImageFromFile, + LoadDepthAnnotation, LoadImageFromNDArray, + LoadMultipleRSImageFromFile, LoadSingleRSImageFromFile) +# yapf: disable +from .transforms import (CLAHE, AdjustGamma, Albu, BioMedical3DPad, + BioMedical3DRandomCrop, BioMedical3DRandomFlip, + BioMedicalGaussianBlur, BioMedicalGaussianNoise, + BioMedicalRandomGamma, ConcatCDInput, GenerateEdge, + PhotoMetricDistortion, RandomCrop, RandomCutOut, + RandomDepthMix, RandomFlip, RandomMosaic, + RandomRotate, RandomRotFlip, Rerange, Resize, + ResizeShortestEdge, ResizeToMultiple, RGB2Gray, + SegRescale) +# yapf: enable __all__ = [ - 'Compose', 'ImageToTensor', 'ToDataContainer', 'Transpose', - 'LoadAnnotations', 'LoadImageFromFile', 'RandomFlip', 'Pad', 'RandomCrop', - 'Normalize', 'SegRescale', 'PhotoMetricDistortion', 'RandomRotate', - 'AdjustGamma', 'CLAHE', 'Rerange', 'RGB2Gray', 'RandomCutOut', - 'RandomMosaic', 'PackSegInputs', 'Resize', 'RandomResize', - 'RandomChoiceResize', 'MultiScaleFlipAug' + 'LoadAnnotations', 'RandomCrop', 'BioMedical3DRandomCrop', 'SegRescale', + 'PhotoMetricDistortion', 'RandomRotate', 'AdjustGamma', 'CLAHE', 'Rerange', + 'RGB2Gray', 'RandomCutOut', 'RandomMosaic', 'PackSegInputs', + 'ResizeToMultiple', 'LoadImageFromNDArray', 'LoadBiomedicalImageFromFile', + 'LoadBiomedicalAnnotation', 'LoadBiomedicalData', 'GenerateEdge', + 'ResizeShortestEdge', 'BioMedicalGaussianNoise', 'BioMedicalGaussianBlur', + 'BioMedical3DRandomFlip', 'BioMedicalRandomGamma', 'BioMedical3DPad', + 'RandomRotFlip', 'Albu', 'LoadSingleRSImageFromFile', 'ConcatCDInput', + 'LoadMultipleRSImageFromFile', 'LoadDepthAnnotation', 'RandomDepthMix', + 'RandomFlip', 'Resize' ] diff --git a/mmseg/datasets/transforms/compose.py b/mmseg/datasets/transforms/compose.py deleted file mode 100644 index 5bfaa7046c..0000000000 --- a/mmseg/datasets/transforms/compose.py +++ /dev/null @@ -1,50 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import collections - -from mmseg.registry import TRANSFORMS - - -@TRANSFORMS.register_module() -class Compose(object): - """Compose multiple transforms sequentially. - - Args: - transforms (Sequence[dict | callable]): Sequence of transform object or - config dict to be composed. - """ - - def __init__(self, transforms): - assert isinstance(transforms, collections.abc.Sequence) - self.transforms = [] - for transform in transforms: - if isinstance(transform, dict): - transform = TRANSFORMS.build(transform) - self.transforms.append(transform) - elif callable(transform): - self.transforms.append(transform) - else: - raise TypeError('transform must be callable or a dict') - - def __call__(self, data): - """Call function to apply transforms sequentially. - - Args: - data (dict): A result dict contains the data to transform. - - Returns: - dict: Transformed data.
- """ - - for t in self.transforms: - data = t(data) - if data is None: - return None - return data - - def __repr__(self): - format_string = self.__class__.__name__ + '(' - for t in self.transforms: - format_string += '\n' - format_string += f' {t}' - format_string += '\n)' - return format_string diff --git a/mmseg/datasets/transforms/formatting.py b/mmseg/datasets/transforms/formatting.py index 6f4c9318a2..bd250551e9 100644 --- a/mmseg/datasets/transforms/formatting.py +++ b/mmseg/datasets/transforms/formatting.py @@ -1,12 +1,13 @@ # Copyright (c) OpenMMLab. All rights reserved. +import warnings + import numpy as np -from mmcv.parallel import DataContainer as DC from mmcv.transforms import to_tensor from mmcv.transforms.base import BaseTransform -from mmengine.data import PixelData +from mmengine.structures import PixelData -from mmseg.data import SegDataSample from mmseg.registry import TRANSFORMS +from mmseg.structures import SegDataSample @TRANSFORMS.register_module() @@ -43,7 +44,7 @@ class PackSegInputs(BaseTransform): def __init__(self, meta_keys=('img_path', 'seg_map_path', 'ori_shape', 'img_shape', 'pad_shape', 'scale_factor', 'flip', - 'flip_direction')): + 'flip_direction', 'reduce_zero_label')): self.meta_keys = meta_keys def transform(self, results: dict) -> dict: @@ -64,22 +65,44 @@ def transform(self, results: dict) -> dict: img = results['img'] if len(img.shape) < 3: img = np.expand_dims(img, -1) - img = np.ascontiguousarray(img.transpose(2, 0, 1)) - packed_results['inputs'] = to_tensor(img) + if not img.flags.c_contiguous: + img = to_tensor(np.ascontiguousarray(img.transpose(2, 0, 1))) + else: + img = img.transpose(2, 0, 1) + img = to_tensor(img).contiguous() + packed_results['inputs'] = img data_sample = SegDataSample() if 'gt_seg_map' in results: - gt_sem_seg_data = dict( - data=to_tensor(results['gt_seg_map'][None, - ...].astype(np.int64))) + if len(results['gt_seg_map'].shape) == 2: + data = to_tensor(results['gt_seg_map'][None, + ...].astype(np.int64)) + else: + warnings.warn('Please check your ground truth ' + 'segmentation map: it is usually 2D, but got ' + f'{results["gt_seg_map"].shape}') + data = to_tensor(results['gt_seg_map'].astype(np.int64)) + gt_sem_seg_data = dict(data=data) data_sample.gt_sem_seg = PixelData(**gt_sem_seg_data) + if 'gt_edge_map' in results: + gt_edge_data = dict( + data=to_tensor(results['gt_edge_map'][None, + ...].astype(np.int64))) + data_sample.set_data(dict(gt_edge_map=PixelData(**gt_edge_data))) + + if 'gt_depth_map' in results: + gt_depth_data = dict( + data=to_tensor(results['gt_depth_map'][None, ...])) + data_sample.set_data(dict(gt_depth_map=PixelData(**gt_depth_data))) + img_meta = {} for key in self.meta_keys: if key in results: img_meta[key] = results[key] data_sample.set_metainfo(img_meta) - packed_results['data_sample'] = data_sample + packed_results['data_samples'] = data_sample return packed_results @@ -87,114 +110,3 @@ def __repr__(self) -> str: repr_str = self.__class__.__name__ repr_str += f'(meta_keys={self.meta_keys})' return repr_str - - -@TRANSFORMS.register_module() -class ImageToTensor(object): - """Convert image to :obj:`torch.Tensor` by given keys. - - The dimension order of input image is (H, W, C). The pipeline will convert - it to (C, H, W). If only 2 dimension (H, W) is given, the output would be - (1, H, W). - - Args: - keys (Sequence[str]): Key of images to be converted to Tensor.
- """ - - def __init__(self, keys): - self.keys = keys - - def __call__(self, results): - """Call function to convert image in results to :obj:`torch.Tensor` and - transpose the channel order. - - Args: - results (dict): Result dict contains the image data to convert. - - Returns: - dict: The result dict contains the image converted - to :obj:`torch.Tensor` and transposed to (C, H, W) order. - """ - - for key in self.keys: - img = results[key] - if len(img.shape) < 3: - img = np.expand_dims(img, -1) - results[key] = to_tensor(img.transpose(2, 0, 1)) - return results - - def __repr__(self): - return self.__class__.__name__ + f'(keys={self.keys})' - - -@TRANSFORMS.register_module() -class Transpose(object): - """Transpose some results by given keys. - - Args: - keys (Sequence[str]): Keys of results to be transposed. - order (Sequence[int]): Order of transpose. - """ - - def __init__(self, keys, order): - self.keys = keys - self.order = order - - def __call__(self, results): - """Call function to convert image in results to :obj:`torch.Tensor` and - transpose the channel order. - - Args: - results (dict): Result dict contains the image data to convert. - - Returns: - dict: The result dict contains the image converted - to :obj:`torch.Tensor` and transposed to (C, H, W) order. - """ - - for key in self.keys: - results[key] = results[key].transpose(self.order) - return results - - def __repr__(self): - return self.__class__.__name__ + \ - f'(keys={self.keys}, order={self.order})' - - -@TRANSFORMS.register_module() -class ToDataContainer(object): - """Convert results to :obj:`mmcv.DataContainer` by given fields. - - Args: - fields (Sequence[dict]): Each field is a dict like - ``dict(key='xxx', **kwargs)``. The ``key`` in result will - be converted to :obj:`mmcv.DataContainer` with ``**kwargs``. - Default: ``(dict(key='img', stack=True), - dict(key='gt_semantic_seg'))``. - """ - - def __init__(self, - fields=(dict(key='img', - stack=True), dict(key='gt_semantic_seg'))): - self.fields = fields - - def __call__(self, results): - """Call function to convert data in results to - :obj:`mmcv.DataContainer`. - - Args: - results (dict): Result dict contains the data to convert. - - Returns: - dict: The result dict contains the data converted to - :obj:`mmcv.DataContainer`. - """ - - for field in self.fields: - field = field.copy() - key = field.pop('key') - results[key] = DC(results[key], **field) - return results - - def __repr__(self): - return self.__class__.__name__ + f'(fields={self.fields})' diff --git a/mmseg/datasets/transforms/loading.py b/mmseg/datasets/transforms/loading.py index fbdaaca31d..438b5527f0 100644 --- a/mmseg/datasets/transforms/loading.py +++ b/mmseg/datasets/transforms/loading.py @@ -1,11 +1,21 @@ # Copyright (c) OpenMMLab. All rights reserved. import warnings +from typing import Dict, Optional, Union import mmcv +import mmengine.fileio as fileio import numpy as np +from mmcv.transforms import BaseTransform from mmcv.transforms import LoadAnnotations as MMCV_LoadAnnotations +from mmcv.transforms import LoadImageFromFile from mmseg.registry import TRANSFORMS +from mmseg.utils import datafrombytes + +try: + from osgeo import gdal +except ImportError: + gdal = None @TRANSFORMS.register_module() @@ -49,15 +59,16 @@ class LoadAnnotations(MMCV_LoadAnnotations): argument for :func:``mmcv.imfrombytes``. See :fun:``mmcv.imfrombytes`` for details. Defaults to 'pillow'. - file_client_args (dict): Arguments to instantiate a FileClient. - See :class:``mmcv.fileio.FileClient`` for details. 
- Defaults to ``dict(backend='disk')``. + backend_args (dict): Arguments to instantiate a file backend. + See https://mmengine.readthedocs.io/en/latest/api/fileio.htm + for details. Defaults to None. + Notes: mmcv>=2.0.0rc4, mmengine>=0.2.0 required. """ def __init__( self, reduce_zero_label=None, - file_client_args=dict(backend='disk'), + backend_args=None, imdecode_backend='pillow', ) -> None: super().__init__( @@ -66,14 +77,13 @@ def __init__( with_seg=True, with_keypoints=False, imdecode_backend=imdecode_backend, - file_client_args=file_client_args) + backend_args=backend_args) self.reduce_zero_label = reduce_zero_label if self.reduce_zero_label is not None: warnings.warn('`reduce_zero_label` will be deprecated, ' 'if you would like to ignore the zero label, please ' 'set `reduce_zero_label=True` when dataset ' 'initialized') - self.file_client_args = file_client_args.copy() self.imdecode_backend = imdecode_backend def _load_seg_map(self, results: dict) -> None: @@ -86,19 +96,12 @@ def _load_seg_map(self, results: dict) -> None: dict: The dict contains loaded semantic segmentation annotations. """ - img_bytes = self.file_client.get(results['seg_map_path']) + img_bytes = fileio.get( + results['seg_map_path'], backend_args=self.backend_args) gt_semantic_seg = mmcv.imfrombytes( img_bytes, flag='unchanged', backend=self.imdecode_backend).squeeze().astype(np.uint8) - # modify if custom classes - if results.get('label_map', None) is not None: - # Add deep copy to solve bug of repeatedly - # replace `gt_semantic_seg`, which is reported in - # https://github.com/open-mmlab/mmsegmentation/pull/1445/ - gt_semantic_seg_copy = gt_semantic_seg.copy() - for old_id, new_id in results['label_map'].items(): - gt_semantic_seg[gt_semantic_seg_copy == old_id] = new_id # reduce zero_label if self.reduce_zero_label is None: self.reduce_zero_label = results['reduce_zero_label'] @@ -111,12 +114,591 @@ def _load_seg_map(self, results: dict) -> None: gt_semantic_seg[gt_semantic_seg == 0] = 255 gt_semantic_seg = gt_semantic_seg - 1 gt_semantic_seg[gt_semantic_seg == 254] = 255 + # modify if custom classes + if results.get('label_map', None) is not None: + # Add deep copy to solve bug of repeatedly + # replace `gt_semantic_seg`, which is reported in + # https://github.com/open-mmlab/mmsegmentation/pull/1445/ + gt_semantic_seg_copy = gt_semantic_seg.copy() + for old_id, new_id in results['label_map'].items(): + gt_semantic_seg[gt_semantic_seg_copy == old_id] = new_id results['gt_seg_map'] = gt_semantic_seg results['seg_fields'].append('gt_seg_map') def __repr__(self) -> str: repr_str = self.__class__.__name__ - repr_str += f'(reduce_zero_label={self.reduce_zero_label},' - repr_str += f"imdecode_backend='{self.imdecode_backend}')" - repr_str += f'file_client_args={self.file_client_args})' + repr_str += f'(reduce_zero_label={self.reduce_zero_label}, ' + repr_str += f"imdecode_backend='{self.imdecode_backend}', " + repr_str += f'backend_args={self.backend_args})' + return repr_str + + +@TRANSFORMS.register_module() +class LoadImageFromNDArray(LoadImageFromFile): + """Load an image from ``results['img']``. + + Similar with :obj:`LoadImageFromFile`, but the image has been loaded as + :obj:`np.ndarray` in ``results['img']``. Can be used when loading image + from webcam. + + Required Keys: + + - img + + Modified Keys: + + - img + - img_path + - img_shape + - ori_shape + + Args: + to_float32 (bool): Whether to convert the loaded image to a float32 + numpy array. If set to False, the loaded image is an uint8 array. 
+ Defaults to False. + """ + + def transform(self, results: dict) -> dict: + """Transform function to add image meta information. + + Args: + results (dict): Result dict with Webcam read image in + ``results['img']``. + + Returns: + dict: The dict contains loaded image and meta information. + """ + + img = results['img'] + if self.to_float32: + img = img.astype(np.float32) + + results['img_path'] = None + results['img'] = img + results['img_shape'] = img.shape[:2] + results['ori_shape'] = img.shape[:2] + return results + + +@TRANSFORMS.register_module() +class LoadBiomedicalImageFromFile(BaseTransform): + """Load a biomedical image from file. + + Required Keys: + + - img_path + + Added Keys: + + - img (np.ndarray): Biomedical image with shape (N, Z, Y, X) by default, + N is the number of modalities, and data type is float32 + if set to_float32 = True, or float64 if decode_backend is 'nifti' and + to_float32 is False. + - img_shape + - ori_shape + + Args: + decode_backend (str): The data decoding backend type. Options are + 'numpy' and 'nifti', and there is a convention that when backend is + 'nifti' the axis of data loaded is XYZ, and when backend is + 'numpy', the axis is ZYX. The data will be transposed if the + backend is 'nifti'. Defaults to 'nifti'. + to_xyz (bool): Whether transpose data from Z, Y, X to X, Y, Z. + Defaults to False. + to_float32 (bool): Whether to convert the loaded image to a float32 + numpy array. If set to False, the loaded image is a float64 array. + Defaults to True. + backend_args (dict, Optional): Arguments to instantiate a file backend. + See https://mmengine.readthedocs.io/en/latest/api/fileio.htm + for details. Defaults to None. + Notes: mmcv>=2.0.0rc4, mmengine>=0.2.0 required. + """ + + def __init__(self, + decode_backend: str = 'nifti', + to_xyz: bool = False, + to_float32: bool = True, + backend_args: Optional[dict] = None) -> None: + self.decode_backend = decode_backend + self.to_xyz = to_xyz + self.to_float32 = to_float32 + self.backend_args = backend_args.copy() if backend_args else None + + def transform(self, results: Dict) -> Dict: + """Functions to load image. + + Args: + results (dict): Result dict from :obj:``mmcv.BaseDataset``. + + Returns: + dict: The dict contains loaded image and meta information. + """ + + filename = results['img_path'] + + data_bytes = fileio.get(filename, self.backend_args) + img = datafrombytes(data_bytes, backend=self.decode_backend) + + if self.to_float32: + img = img.astype(np.float32) + + if len(img.shape) == 3: + img = img[None, ...] + + if self.decode_backend == 'nifti': + img = img.transpose(0, 3, 2, 1) + + if self.to_xyz: + img = img.transpose(0, 3, 2, 1) + + results['img'] = img + results['img_shape'] = img.shape[1:] + results['ori_shape'] = img.shape[1:] + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f"decode_backend='{self.decode_backend}', " + f'to_xyz={self.to_xyz}, ' + f'to_float32={self.to_float32}, ' + f'backend_args={self.backend_args})') + return repr_str + + +@TRANSFORMS.register_module() +class LoadBiomedicalAnnotation(BaseTransform): + """Load ``seg_map`` annotation provided by biomedical dataset. + + The annotation format is as follows: + + ..
code-block:: python + + { + 'gt_seg_map': np.ndarray (X, Y, Z) or (Z, Y, X) + } + + Required Keys: + + - seg_map_path + + Added Keys: + + - gt_seg_map (np.ndarray): Biomedical seg map with shape (Z, Y, X) by + default, and data type is float32 if set to_float32 = True, or + float64 if decode_backend is 'nifti' and to_float32 is False. + + Args: + decode_backend (str): The data decoding backend type. Options are + 'numpy' and 'nifti', and there is a convention that when backend is + 'nifti' the axis of data loaded is XYZ, and when backend is + 'numpy', the axis is ZYX. The data will be transposed if the + backend is 'nifti'. Defaults to 'nifti'. + to_xyz (bool): Whether transpose data from Z, Y, X to X, Y, Z. + Defaults to False. + to_float32 (bool): Whether to convert the loaded seg map to a float32 + numpy array. If set to False, the loaded image is a float64 array. + Defaults to True. + backend_args (dict, Optional): Arguments to instantiate a file backend. + See :class:`mmengine.fileio` for details. + Defaults to None. + Notes: mmcv>=2.0.0rc4, mmengine>=0.2.0 required. + """ + + def __init__(self, + decode_backend: str = 'nifti', + to_xyz: bool = False, + to_float32: bool = True, + backend_args: Optional[dict] = None) -> None: + super().__init__() + self.decode_backend = decode_backend + self.to_xyz = to_xyz + self.to_float32 = to_float32 + self.backend_args = backend_args.copy() if backend_args else None + + def transform(self, results: Dict) -> Dict: + """Functions to load image. + + Args: + results (dict): Result dict from :obj:``mmcv.BaseDataset``. + + Returns: + dict: The dict contains loaded image and meta information. + """ + data_bytes = fileio.get(results['seg_map_path'], self.backend_args) + gt_seg_map = datafrombytes(data_bytes, backend=self.decode_backend) + + if self.to_float32: + gt_seg_map = gt_seg_map.astype(np.float32) + + if self.decode_backend == 'nifti': + gt_seg_map = gt_seg_map.transpose(2, 1, 0) + + if self.to_xyz: + gt_seg_map = gt_seg_map.transpose(2, 1, 0) + + results['gt_seg_map'] = gt_seg_map + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f"decode_backend='{self.decode_backend}', " + f'to_xyz={self.to_xyz}, ' + f'to_float32={self.to_float32}, ' + f'backend_args={self.backend_args})') + return repr_str
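A sketch of how the two biomedical loaders above might chain in a pipeline; the NIfTI layout, the crop transform arguments, and the crop shape are assumptions for illustration, not mandated by this diff:

```python
# Hypothetical pipeline for a NIfTI-based 3D dataset; both loaders keep
# the (N, Z, Y, X) / (Z, Y, X) axis conventions from the docstrings.
train_pipeline = [
    dict(type='LoadBiomedicalImageFromFile', decode_backend='nifti'),
    dict(type='LoadBiomedicalAnnotation', decode_backend='nifti'),
    dict(type='BioMedical3DRandomCrop', crop_shape=(64, 128, 128)),
    dict(type='PackSegInputs'),
]
```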
+ + +@TRANSFORMS.register_module() +class LoadBiomedicalData(BaseTransform): + """Load a biomedical image and annotation from file. + + The loading data format is as follows: + + .. code-block:: python + + { + 'img': np.ndarray data[:-1, X, Y, Z] + 'seg_map': np.ndarray data[-1, X, Y, Z] + } + + + Required Keys: + + - img_path + + Added Keys: + + - img (np.ndarray): Biomedical image with shape (N, Z, Y, X) by default, + N is the number of modalities. + - gt_seg_map (np.ndarray, optional): Biomedical seg map with shape + (Z, Y, X) by default. + - img_shape + - ori_shape + + Args: + with_seg (bool): Whether to parse and load the semantic segmentation + annotation. Defaults to False. + decode_backend (str): The data decoding backend type. Options are + 'numpy' and 'nifti', and there is a convention that when backend is + 'nifti' the axis of data loaded is XYZ, and when backend is + 'numpy', the axis is ZYX. The data will be transposed if the + backend is 'nifti'. Defaults to 'numpy'. + to_xyz (bool): Whether transpose data from Z, Y, X to X, Y, Z. + Defaults to False. + backend_args (dict, Optional): Arguments to instantiate a file backend. + See https://mmengine.readthedocs.io/en/latest/api/fileio.htm + for details. Defaults to None. + Notes: mmcv>=2.0.0rc4, mmengine>=0.2.0 required. + """ + + def __init__(self, + with_seg=False, + decode_backend: str = 'numpy', + to_xyz: bool = False, + backend_args: Optional[dict] = None) -> None: # noqa + self.with_seg = with_seg + self.decode_backend = decode_backend + self.to_xyz = to_xyz + self.backend_args = backend_args.copy() if backend_args else None + + def transform(self, results: Dict) -> Dict: + """Functions to load image. + + Args: + results (dict): Result dict from :obj:``mmcv.BaseDataset``. + + Returns: + dict: The dict contains loaded image and meta information. + """ + data_bytes = fileio.get(results['img_path'], self.backend_args) + data = datafrombytes(data_bytes, backend=self.decode_backend) + # img is 4D data (N, X, Y, Z), N is the number of protocols + img = data[:-1, :] + + if self.decode_backend == 'nifti': + img = img.transpose(0, 3, 2, 1) + + if self.to_xyz: + img = img.transpose(0, 3, 2, 1) + + results['img'] = img + results['img_shape'] = img.shape[1:] + results['ori_shape'] = img.shape[1:] + + if self.with_seg: + gt_seg_map = data[-1, :] + if self.decode_backend == 'nifti': + gt_seg_map = gt_seg_map.transpose(2, 1, 0) + + if self.to_xyz: + gt_seg_map = gt_seg_map.transpose(2, 1, 0) + results['gt_seg_map'] = gt_seg_map + return results + + def __repr__(self) -> str: + repr_str = (f'{self.__class__.__name__}(' + f'with_seg={self.with_seg}, ' + f"decode_backend='{self.decode_backend}', " + f'to_xyz={self.to_xyz}, ' + f'backend_args={self.backend_args})') + return repr_str + + +@TRANSFORMS.register_module() +class InferencerLoader(BaseTransform): + """Load an image from ``results['img']``. + + Similar with :obj:`LoadImageFromFile`, but the image has been loaded as + :obj:`np.ndarray` in ``results['img']``. Can be used when loading image + from webcam. + + Required Keys: + + - img + + Modified Keys: + + - img + - img_path + - img_shape + - ori_shape + + Args: + to_float32 (bool): Whether to convert the loaded image to a float32 + numpy array. If set to False, the loaded image is an uint8 array. + Defaults to False. + """ + + def __init__(self, **kwargs) -> None: + super().__init__() + self.from_file = TRANSFORMS.build( + dict(type='LoadImageFromFile', **kwargs)) + self.from_ndarray = TRANSFORMS.build( + dict(type='LoadImageFromNDArray', **kwargs)) + + def transform(self, single_input: Union[str, np.ndarray, dict]) -> dict: + """Transform function to add image meta information. + + Args: + single_input (str or np.ndarray or dict): The path of an image, + an image array, or a raw result dict. + + Returns: + dict: The dict contains loaded image and meta information. + """ + if isinstance(single_input, str): + inputs = dict(img_path=single_input) + elif isinstance(single_input, np.ndarray): + inputs = dict(img=single_input) + elif isinstance(single_input, dict): + inputs = single_input + else: + raise NotImplementedError + + if 'img' in inputs: + return self.from_ndarray(inputs) + return self.from_file(inputs)
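``InferencerLoader`` simply dispatches on the input type; a small sketch of the ndarray route (building the transform directly here is for illustration only):

```python
import numpy as np

from mmseg.registry import TRANSFORMS

# Build the loader through the registry, as an inferencer would.
loader = TRANSFORMS.build(dict(type='InferencerLoader'))

# An in-memory frame (e.g. from a webcam) takes the ndarray route and
# comes back as the same result dict LoadImageFromFile would produce.
frame = np.zeros((480, 640, 3), dtype=np.uint8)
results = loader(frame)
print(results['img_shape'], results['ori_shape'])  # (480, 640) (480, 640)
```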
+ + +@TRANSFORMS.register_module() +class LoadSingleRSImageFromFile(BaseTransform): + """Load a Remote Sensing image from file. + + Required Keys: + + - img_path + + Modified Keys: + + - img + - img_shape + - ori_shape + + Args: + to_float32 (bool): Whether to convert the loaded image to a float32 + numpy array. If set to False, the loaded image is a float64 array. + Defaults to True. + """ + + def __init__(self, to_float32: bool = True): + self.to_float32 = to_float32 + + if gdal is None: + raise RuntimeError('gdal is not installed') + + def transform(self, results: Dict) -> Dict: + """Functions to load image. + + Args: + results (dict): Result dict from :obj:``mmcv.BaseDataset``. + + Returns: + dict: The dict contains loaded image and meta information. + """ + + filename = results['img_path'] + ds = gdal.Open(filename) + if ds is None: + raise Exception(f'Unable to open file: {filename}') + img = np.einsum('ijk->jki', ds.ReadAsArray()) + + if self.to_float32: + img = img.astype(np.float32) + + results['img'] = img + results['img_shape'] = img.shape[:2] + results['ori_shape'] = img.shape[:2] + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'to_float32={self.to_float32})') + return repr_str + + +@TRANSFORMS.register_module() +class LoadMultipleRSImageFromFile(BaseTransform): + """Load two Remote Sensing images from file. + + Required Keys: + + - img_path + - img_path2 + + Modified Keys: + + - img + - img2 + - img_shape + - ori_shape + + Args: + to_float32 (bool): Whether to convert the loaded image to a float32 + numpy array. If set to False, the loaded image is a float64 array. + Defaults to True. + """ + + def __init__(self, to_float32: bool = True): + if gdal is None: + raise RuntimeError('gdal is not installed') + self.to_float32 = to_float32 + + def transform(self, results: Dict) -> Dict: + """Functions to load image. + + Args: + results (dict): Result dict from :obj:``mmcv.BaseDataset``. + + Returns: + dict: The dict contains loaded image and meta information. + """ + + filename = results['img_path'] + filename2 = results['img_path2'] + + ds = gdal.Open(filename) + ds2 = gdal.Open(filename2) + + if ds is None: + raise Exception(f'Unable to open file: {filename}') + if ds2 is None: + raise Exception(f'Unable to open file: {filename2}') + + img = np.einsum('ijk->jki', ds.ReadAsArray()) + img2 = np.einsum('ijk->jki', ds2.ReadAsArray()) + + if self.to_float32: + img = img.astype(np.float32) + img2 = img2.astype(np.float32) + + if img.shape != img2.shape: + raise Exception(f'Image shapes do not match:' + f' {img.shape} vs {img2.shape}') + + results['img'] = img + results['img2'] = img2 + results['img_shape'] = img.shape[:2] + results['ori_shape'] = img.shape[:2] + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f'to_float32={self.to_float32})') + return repr_str + + +@TRANSFORMS.register_module() +class LoadDepthAnnotation(BaseTransform): + """Load ``depth_map`` annotation provided by depth estimation dataset. + + The annotation format is as follows: + + .. code-block:: python + + { + 'gt_depth_map': np.ndarray [Y, X] + } + + Required Keys: + + - depth_map_path + + Added Keys: + + - gt_depth_map (np.ndarray): Depth map with shape (Y, X) by + default, and data type is float32 if set to_float32 = True. + - depth_rescale_factor (float): The rescale factor of depth map, which + can be used to recover the original value of depth map. + + Args: + decode_backend (str): The data decoding backend type. Options are + 'numpy', 'nifti', and 'cv2'. Defaults to 'cv2'. + to_float32 (bool): Whether to convert the loaded depth map to a float32 + numpy array. If set to False, the loaded image is a uint16 array. + Defaults to True. + depth_rescale_factor (float): Factor to rescale the depth value to + limit the range. Defaults to 1.0. + backend_args (dict, Optional): Arguments to instantiate a file backend.
+ See :class:`mmengine.fileio` for details. + Defaults to None. + Notes: mmcv>=2.0.0rc4, mmengine>=0.2.0 required. + """ + + def __init__(self, + decode_backend: str = 'cv2', + to_float32: bool = True, + depth_rescale_factor: float = 1.0, + backend_args: Optional[dict] = None) -> None: + super().__init__() + self.decode_backend = decode_backend + self.to_float32 = to_float32 + self.depth_rescale_factor = depth_rescale_factor + self.backend_args = backend_args.copy() if backend_args else None + + def transform(self, results: Dict) -> Dict: + """Functions to load depth map. + + Args: + results (dict): Result dict from :obj:``mmcv.BaseDataset``. + + Returns: + dict: The dict contains loaded depth map. + """ + data_bytes = fileio.get(results['depth_map_path'], self.backend_args) + gt_depth_map = datafrombytes(data_bytes, backend=self.decode_backend) + + if self.to_float32: + gt_depth_map = gt_depth_map.astype(np.float32) + + gt_depth_map *= self.depth_rescale_factor + results['gt_depth_map'] = gt_depth_map + results['seg_fields'].append('gt_depth_map') + results['depth_rescale_factor'] = self.depth_rescale_factor + return results + + def __repr__(self): + repr_str = (f'{self.__class__.__name__}(' + f"decode_backend='{self.decode_backend}', " + f'to_float32={self.to_float32}, ' + f'backend_args={self.backend_args})') return repr_str diff --git a/mmseg/datasets/transforms/transforms.py b/mmseg/datasets/transforms/transforms.py index 52c61953b6..082ae5b440 100644 --- a/mmseg/datasets/transforms/transforms.py +++ b/mmseg/datasets/transforms/transforms.py @@ -1,21 +1,49 @@ # Copyright (c) OpenMMLab. All rights reserved. import copy -from typing import Sequence, Tuple, Union +import inspect +import warnings +from typing import Dict, List, Optional, Sequence, Tuple, Union +import cv2 import mmcv +import mmengine import numpy as np +from mmcv.transforms import RandomFlip as MMCV_RandomFlip +from mmcv.transforms import Resize as MMCV_Resize from mmcv.transforms.base import BaseTransform from mmcv.transforms.utils import cache_randomness -from mmcv.utils import is_tuple_of +from mmengine.utils import is_tuple_of from numpy import random +from scipy.ndimage import gaussian_filter +from mmseg.datasets.dataset_wrappers import MultiImageMixDataset from mmseg.registry import TRANSFORMS +try: + import albumentations + from albumentations import Compose + ALBU_INSTALLED = True +except ImportError: + albumentations = None + Compose = None + ALBU_INSTALLED = False + @TRANSFORMS.register_module() -class ResizeToMultiple(object): +class ResizeToMultiple(BaseTransform): """Resize images & seg to multiple of divisor. + Required Keys: + + - img + - gt_seg_map + + Modified Keys: + + - img + - img_shape + - pad_shape + Args: size_divisor (int): images and gt seg maps need to resize to multiple of size_divisor. Default: 32. @@ -27,7 +55,7 @@ def __init__(self, size_divisor=32, interpolation=None): self.size_divisor = size_divisor self.interpolation = interpolation - def __call__(self, results): + def transform(self, results: dict) -> dict: """Call function to resize images, semantic segmentation map to multiple of size divisor. @@ -47,8 +75,8 @@ def __call__(self, results): if self.interpolation else 'bilinear') results['img'] = img - results['img_shape'] = img.shape - results['pad_shape'] = img.shape + results['img_shape'] = img.shape[:2] + results['pad_shape'] = img.shape[:2] # Align segmentation map to multiple of size divisor. 
for key in results.get('seg_fields', []): @@ -70,9 +98,17 @@ def __repr__(self): @TRANSFORMS.register_module() -class Rerange(object): +class Rerange(BaseTransform): """Rerange the image pixel value. + Required Keys: + + - img + + Modified Keys: + + - img + Args: min_value (float or int): Minimum value of the reranged image. Default: 0. @@ -87,7 +123,7 @@ def __init__(self, min_value=0, max_value=255): self.min_value = min_value self.max_value = max_value - def __call__(self, results): + def transform(self, results: dict) -> dict: """Call function to rerange images. Args: @@ -116,12 +152,20 @@ def __repr__(self): @TRANSFORMS.register_module() -class CLAHE(object): +class CLAHE(BaseTransform): """Use CLAHE method to process the image. See `ZUIDERVELD,K. Contrast Limited Adaptive Histogram Equalization[J]. Graphics Gems, 1994:474-485.` for more information. + Required Keys: + + - img + + Modified Keys: + + - img + Args: clip_limit (float): Threshold for contrast limiting. Default: 40.0. tile_grid_size (tuple[int]): Size of grid for histogram equalization. @@ -136,7 +180,7 @@ def __init__(self, clip_limit=40.0, tile_grid_size=(8, 8)): assert len(tile_grid_size) == 2 self.tile_grid_size = tile_grid_size - def __call__(self, results): + def transform(self, results: dict) -> dict: """Call function to Use CLAHE method process images. Args: @@ -155,7 +199,7 @@ def __call__(self, results): def __repr__(self): repr_str = self.__class__.__name__ - repr_str += f'(clip_limit={self.clip_limit}, '\ + repr_str += f'(clip_limit={self.clip_limit}, ' \ f'tile_grid_size={self.tile_grid_size})' return repr_str @@ -167,13 +211,13 @@ class RandomCrop(BaseTransform): Required Keys: - img - - gt_semantic_seg + - gt_seg_map Modified Keys: - img - img_shape - - gt_semantic_seg + - gt_seg_map Args: @@ -283,9 +327,9 @@ def transform(self, results: dict) -> dict: # crop semantic seg for key in results.get('seg_fields', []): results[key] = self.crop(results[key], crop_bbox) - img_shape = img.shape + results['img'] = img - results['img_shape'] = img_shape + results['img_shape'] = img.shape[:2] return results def __repr__(self): @@ -293,9 +337,19 @@ def __repr__(self): @TRANSFORMS.register_module() -class RandomRotate(object): +class RandomRotate(BaseTransform): """Rotate the image & seg. + Required Keys: + + - img + - gt_seg_map + + Modified Keys: + + - img + - gt_seg_map + Args: prob (float): The rotation probability. degree (float, tuple[float]): Range of degrees to select from. If @@ -332,7 +386,12 @@ def __init__(self, self.center = center self.auto_bound = auto_bound - def __call__(self, results): + @cache_randomness + def generate_degree(self): + return np.random.rand() < self.prob, np.random.uniform( + min(*self.degree), max(*self.degree)) + + def transform(self, results: dict) -> dict: """Call function to rotate image, semantic segmentation maps. Args: @@ -342,8 +401,7 @@ def __call__(self, results): dict: Rotated results. """ - rotate = True if np.random.rand() < self.prob else False - degree = np.random.uniform(min(*self.degree), max(*self.degree)) + rotate, degree = self.generate_degree() if rotate: # rotate image results['img'] = mmcv.imrotate( @@ -376,9 +434,18 @@ def __repr__(self): @TRANSFORMS.register_module() -class RGB2Gray(object): +class RGB2Gray(BaseTransform): """Convert RGB image to grayscale image. 
+ Required Keys: + + - img + + Modified Keys: + + - img + - img_shape + This transform calculate the weighted mean of input image channels with ``weights`` and then expand the channels to ``out_channels``. When ``out_channels`` is None, the number of output channels is the same as @@ -399,7 +466,7 @@ def __init__(self, out_channels=None, weights=(0.299, 0.587, 0.114)): assert isinstance(item, (float, int)) self.weights = weights - def __call__(self, results): + def transform(self, results: dict) -> dict: """Call function to convert RGB image to grayscale image. Args: @@ -431,9 +498,17 @@ def __repr__(self): @TRANSFORMS.register_module() -class AdjustGamma(object): +class AdjustGamma(BaseTransform): """Using gamma correction to process the image. + Required Keys: + + - img + + Modified Keys: + + - img + Args: gamma (float or int): Gamma value used in gamma correction. Default: 1.0. @@ -447,7 +522,7 @@ def __init__(self, gamma=1.0): self.table = np.array([(i / 255.0)**inv_gamma * 255 for i in np.arange(256)]).astype('uint8') - def __call__(self, results): + def transform(self, results: dict) -> dict: """Call function to process the image with gamma correction. Args: @@ -467,9 +542,17 @@ def __repr__(self): @TRANSFORMS.register_module() -class SegRescale(object): +class SegRescale(BaseTransform): """Rescale semantic segmentation maps. + Required Keys: + + - gt_seg_map + + Modified Keys: + + - gt_seg_map + Args: scale_factor (float): The scale factor of the final output. """ @@ -477,7 +560,7 @@ class SegRescale(object): def __init__(self, scale_factor=1): self.scale_factor = scale_factor - def __call__(self, results): + def transform(self, results: dict) -> dict: """Call function to scale the semantic segmentation map. Args: @@ -667,11 +750,22 @@ def __repr__(self): @TRANSFORMS.register_module() -class RandomCutOut(object): +class RandomCutOut(BaseTransform): """CutOut operation. Randomly drop some regions of image used in `Cutout `_. + + Required Keys: + + - img + - gt_seg_map + + Modified Keys: + + - img + - gt_seg_map + Args: prob (float): cutout probability. n_holes (int | tuple[int, int]): Number of regions to be dropped. 
@@ -721,16 +815,38 @@ def __init__(self, if not isinstance(self.candidates, list): self.candidates = [self.candidates] - def __call__(self, results): + @cache_randomness + def do_cutout(self): + return np.random.rand() < self.prob + + @cache_randomness + def generate_patches(self, results): + cutout = self.do_cutout() + + h, w, _ = results['img'].shape + if cutout: + n_holes = np.random.randint(self.n_holes[0], self.n_holes[1] + 1) + else: + n_holes = 0 + x1_lst = [] + y1_lst = [] + index_lst = [] + for _ in range(n_holes): + x1_lst.append(np.random.randint(0, w)) + y1_lst.append(np.random.randint(0, h)) + index_lst.append(np.random.randint(0, len(self.candidates))) + return cutout, n_holes, x1_lst, y1_lst, index_lst + + def transform(self, results: dict) -> dict: """Call function to drop some regions of image.""" - cutout = True if np.random.rand() < self.prob else False + cutout, n_holes, x1_lst, y1_lst, index_lst = self.generate_patches( + results) if cutout: h, w, c = results['img'].shape - n_holes = np.random.randint(self.n_holes[0], self.n_holes[1] + 1) - for _ in range(n_holes): - x1 = np.random.randint(0, w) - y1 = np.random.randint(0, h) - index = np.random.randint(0, len(self.candidates)) + for i in range(n_holes): + x1 = x1_lst[i] + y1 = y1_lst[i] + index = index_lst[i] if not self.with_ratio: cutout_w, cutout_h = self.candidates[index] else: @@ -759,7 +875,231 @@ def __repr__(self): @TRANSFORMS.register_module() -class RandomMosaic(object): +class RandomRotFlip(BaseTransform): + """Rotate and flip the image & seg or just rotate the image & seg. + + Required Keys: + + - img + - gt_seg_map + + Modified Keys: + + - img + - gt_seg_map + + Args: + rotate_prob (float): The probability of rotate image. + flip_prob (float): The probability of rotate&flip image. + degree (float, tuple[float]): Range of degrees to select from. If + degree is a number instead of tuple like (min, max), + the range of degree will be (``-degree``, ``+degree``) + """ + + def __init__(self, rotate_prob=0.5, flip_prob=0.5, degree=(-20, 20)): + self.rotate_prob = rotate_prob + self.flip_prob = flip_prob + assert 0 <= rotate_prob <= 1 and 0 <= flip_prob <= 1 + if isinstance(degree, (float, int)): + assert degree > 0, f'degree {degree} should be positive' + self.degree = (-degree, degree) + else: + self.degree = degree + assert len(self.degree) == 2, f'degree {self.degree} should be a ' \ + f'tuple of (min, max)' + + def random_rot_flip(self, results: dict) -> dict: + k = np.random.randint(0, 4) + results['img'] = np.rot90(results['img'], k) + for key in results.get('seg_fields', []): + results[key] = np.rot90(results[key], k) + axis = np.random.randint(0, 2) + results['img'] = np.flip(results['img'], axis=axis).copy() + for key in results.get('seg_fields', []): + results[key] = np.flip(results[key], axis=axis).copy() + return results + + def random_rotate(self, results: dict) -> dict: + angle = np.random.uniform(min(*self.degree), max(*self.degree)) + results['img'] = mmcv.imrotate(results['img'], angle=angle) + for key in results.get('seg_fields', []): + results[key] = mmcv.imrotate(results[key], angle=angle) + return results + + def transform(self, results: dict) -> dict: + """Call function to rotate or rotate & flip image, semantic + segmentation maps. + + Args: + results (dict): Result dict from loading pipeline. + + Returns: + dict: Rotated or rotated & flipped results. 
+ """ + rotate_flag = 0 + if random.random() < self.rotate_prob: + results = self.random_rotate(results) + rotate_flag = 1 + if random.random() < self.flip_prob and rotate_flag == 0: + results = self.random_rot_flip(results) + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(rotate_prob={self.rotate_prob}, ' \ + f'flip_prob={self.flip_prob}, ' \ + f'degree={self.degree})' + return repr_str + + +@TRANSFORMS.register_module() +class RandomFlip(MMCV_RandomFlip): + """Flip the image & bbox & segmentation map. Added or Updated + keys: flip, flip_direction, img, gt_bboxes, gt_seg_map, and gt_depth_map. + There are 3 flip modes: + + - ``prob`` is float, ``direction`` is string: the image will be + ``direction``ly flipped with probability of ``prob`` . + E.g., ``prob=0.5``, ``direction='horizontal'``, + then image will be horizontally flipped with probability of 0.5. + + - ``prob`` is float, ``direction`` is list of string: the image will + be ``direction[i]``ly flipped with probability of + ``prob/len(direction)``. + E.g., ``prob=0.5``, ``direction=['horizontal', 'vertical']``, + then image will be horizontally flipped with probability of 0.25, + vertically with probability of 0.25. + + - ``prob`` is list of float, ``direction`` is list of string: + given ``len(prob) == len(direction)``, the image will + be ``direction[i]``ly flipped with probability of ``prob[i]``. + E.g., ``prob=[0.3, 0.5]``, ``direction=['horizontal', + 'vertical']``, then image will be horizontally flipped with + probability of 0.3, vertically with probability of 0.5. + + Required Keys: + + - img + - gt_bboxes (optional) + - gt_seg_map (optional) + - gt_depth_map (optional) + + Modified Keys: + + - img + - gt_bboxes (optional) + - gt_seg_map (optional) + - gt_depth_map (optional) + + Added Keys: + + - flip + - flip_direction + - swap_seg_labels (optional) + + Args: + prob (float | list[float], optional): The flipping probability. + Defaults to None. + direction(str | list[str]): The flipping direction. Options + If input is a list, the length must equal ``prob``. Each + element in ``prob`` indicates the flip probability of + corresponding direction. Defaults to 'horizontal'. + swap_seg_labels (list, optional): The label pair need to be swapped + for ground truth, like 'left arm' and 'right arm' need to be + swapped after horizontal flipping. For example, ``[(1, 5)]``, + where 1/5 is the label of the left/right arm. Defaults to None. + """ + + def _flip(self, results: dict) -> None: + """Flip images, bounding boxes and semantic segmentation map.""" + # flip image + results['img'] = mmcv.imflip( + results['img'], direction=results['flip_direction']) + + img_shape = results['img'].shape[:2] + + # flip bboxes + if results.get('gt_bboxes', None) is not None: + results['gt_bboxes'] = self._flip_bbox(results['gt_bboxes'], + img_shape, + results['flip_direction']) + + # flip seg map + for key in results.get('seg_fields', []): + if results.get(key, None) is not None: + results[key] = self._flip_seg_map( + results[key], direction=results['flip_direction']).copy() + results['swap_seg_labels'] = self.swap_seg_labels + + +@TRANSFORMS.register_module() +class Resize(MMCV_Resize): + """Resize images & seg & depth map. + + This transform resizes the input image according to ``scale`` or + ``scale_factor``. Seg map, depth map and other relative annotations are + then resized with the same scale factor. + if ``scale`` and ``scale_factor`` are both set, it will use ``scale`` to + resize. 
+
+    Required Keys:
+
+    - img
+    - gt_seg_map (optional)
+    - gt_depth_map (optional)
+
+    Modified Keys:
+
+    - img
+    - gt_seg_map
+    - gt_depth_map
+
+    Added Keys:
+
+    - scale
+    - scale_factor
+    - keep_ratio
+
+    Args:
+        scale (int or tuple): Image scale for resizing. Defaults to None.
+        scale_factor (float or tuple[float]): Scale factors for resizing.
+            Defaults to None.
+        keep_ratio (bool): Whether to keep the aspect ratio when resizing the
+            image. Defaults to False.
+        clip_object_border (bool): Whether to clip the objects
+            outside the border of the image. In some dataset like MOT17, the
+            gt bboxes are allowed to cross the border of images. Therefore,
+            we don't need to clip the gt bboxes in these cases. Defaults to
+            True.
+        backend (str): Image resize backend, choices are 'cv2' and 'pillow'.
+            These two backends generate slightly different results. Defaults
+            to 'cv2'.
+        interpolation (str): Interpolation method, accepted values are
+            "nearest", "bilinear", "bicubic", "area", "lanczos" for 'cv2'
+            backend, "nearest", "bilinear" for 'pillow' backend. Defaults
+            to 'bilinear'.
+    """
+
+    def _resize_seg(self, results: dict) -> None:
+        """Resize semantic segmentation map with ``results['scale']``."""
+        for seg_key in results.get('seg_fields', []):
+            if results.get(seg_key, None) is not None:
+                if self.keep_ratio:
+                    gt_seg = mmcv.imrescale(
+                        results[seg_key],
+                        results['scale'],
+                        interpolation='nearest',
+                        backend=self.backend)
+                else:
+                    gt_seg = mmcv.imresize(
+                        results[seg_key],
+                        results['scale'],
+                        interpolation='nearest',
+                        backend=self.backend)
+                results[seg_key] = gt_seg
+
+
+@TRANSFORMS.register_module()
+class RandomMosaic(BaseTransform):
     """Mosaic augmentation. Given 4 images, mosaic transform combines them
     into one output image. The output image is composed of the parts from
     each sub-image.
@@ -789,6 +1129,19 @@ class RandomMosaic(object):
        sample another 3 images from the custom dataset.
     3. Sub image will be cropped if image is larger than mosaic patch

+    Required Keys:
+
+    - img
+    - gt_seg_map
+    - mix_results
+
+    Modified Keys:
+
+    - img
+    - img_shape
+    - ori_shape
+    - gt_seg_map
+
     Args:
         prob (float): mosaic probability.
         img_scale (Sequence[int]): Image size after mosaic pipeline of
@@ -815,7 +1168,11 @@ def __init__(self,
         self.pad_val = pad_val
         self.seg_pad_val = seg_pad_val

-    def __call__(self, results):
+    @cache_randomness
+    def do_mosaic(self):
+        return np.random.rand() < self.prob
+
+    def transform(self, results: dict) -> dict:
         """Call function to make a mosaic of image.

         Args:
@@ -824,26 +1181,35 @@ def __call__(self, results):
         Returns:
             dict: Result dict with mosaic transformed.
         """
-        mosaic = True if np.random.rand() < self.prob else False
+        mosaic = self.do_mosaic()

         if mosaic:
             results = self._mosaic_transform_img(results)
             results = self._mosaic_transform_seg(results)

         return results

-    def get_indexes(self, dataset):
-        """Call function to collect indexes.
+    def get_indices(self, dataset: MultiImageMixDataset) -> list:
+        """Call function to collect indices.

         Args:
             dataset (:obj:`MultiImageMixDataset`): The dataset.

         Returns:
-            list: indexes.
+            list: indices.
""" - indexes = [random.randint(0, len(dataset)) for _ in range(3)] - return indexes + indices = [random.randint(0, len(dataset)) for _ in range(3)] + return indices - def _mosaic_transform_img(self, results): + @cache_randomness + def generate_mosaic_center(self): + # mosaic center x, y + center_x = int( + random.uniform(*self.center_ratio_range) * self.img_scale[1]) + center_y = int( + random.uniform(*self.center_ratio_range) * self.img_scale[0]) + return center_x, center_y + + def _mosaic_transform_img(self, results: dict) -> dict: """Mosaic transform function. Args: @@ -855,8 +1221,9 @@ def _mosaic_transform_img(self, results): assert 'mix_results' in results if len(results['img'].shape) == 3: + c = results['img'].shape[2] mosaic_img = np.full( - (int(self.img_scale[0] * 2), int(self.img_scale[1] * 2), 3), + (int(self.img_scale[0] * 2), int(self.img_scale[1] * 2), c), self.pad_val, dtype=results['img'].dtype) else: @@ -866,10 +1233,7 @@ def _mosaic_transform_img(self, results): dtype=results['img'].dtype) # mosaic center x, y - self.center_x = int( - random.uniform(*self.center_ratio_range) * self.img_scale[1]) - self.center_y = int( - random.uniform(*self.center_ratio_range) * self.img_scale[0]) + self.center_x, self.center_y = self.generate_mosaic_center() center_position = (self.center_x, self.center_y) loc_strs = ('top_left', 'top_right', 'bottom_left', 'bottom_right') @@ -902,7 +1266,7 @@ def _mosaic_transform_img(self, results): return results - def _mosaic_transform_seg(self, results): + def _mosaic_transform_seg(self, results: dict) -> dict: """Mosaic transform function for label annotations. Args: @@ -946,14 +1310,15 @@ def _mosaic_transform_seg(self, results): x1_c, y1_c, x2_c, y2_c = crop_coord # crop and paste image - mosaic_seg[y1_p:y2_p, x1_p:x2_p] = gt_seg_i[y1_c:y2_c, - x1_c:x2_c] + mosaic_seg[y1_p:y2_p, x1_p:x2_p] = \ + gt_seg_i[y1_c:y2_c, x1_c:x2_c] results[key] = mosaic_seg return results - def _mosaic_combine(self, loc, center_position_xy, img_shape_wh): + def _mosaic_combine(self, loc: str, center_position_xy: Sequence[float], + img_shape_wh: Sequence[int]) -> tuple: """Calculate global coordinate of mosaic image and local coordinate of cropped sub-image. @@ -1023,3 +1388,1127 @@ def __repr__(self): repr_str += f'pad_val={self.pad_val}, ' repr_str += f'seg_pad_val={self.pad_val})' return repr_str + + +@TRANSFORMS.register_module() +class GenerateEdge(BaseTransform): + """Generate Edge for CE2P approach. + + Edge will be used to calculate loss of + `CE2P `_. + + Modified from https://github.com/liutinglt/CE2P/blob/master/dataset/target_generation.py # noqa:E501 + + Required Keys: + + - img_shape + - gt_seg_map + + Added Keys: + - gt_edge_map (np.ndarray, uint8): The edge annotation generated from the + seg map by extracting border between different semantics. + + Args: + edge_width (int): The width of edge. Default to 3. + ignore_index (int): Index that will be ignored. Default to 255. + """ + + def __init__(self, edge_width: int = 3, ignore_index: int = 255) -> None: + super().__init__() + self.edge_width = edge_width + self.ignore_index = ignore_index + + def transform(self, results: Dict) -> Dict: + """Call function to generate edge from segmentation map. + + Args: + results (dict): Result dict. + + Returns: + dict: Result dict with edge mask. 
+        """
+        h, w = results['img_shape']
+        edge = np.zeros((h, w), dtype=np.uint8)
+        seg_map = results['gt_seg_map']
+
+        # down
+        edge_down = edge[1:h, :]
+        edge_down[(seg_map[1:h, :] != seg_map[:h - 1, :])
+                  & (seg_map[1:h, :] != self.ignore_index) &
+                  (seg_map[:h - 1, :] != self.ignore_index)] = 1
+        # left
+        edge_left = edge[:, :w - 1]
+        edge_left[(seg_map[:, :w - 1] != seg_map[:, 1:w])
+                  & (seg_map[:, :w - 1] != self.ignore_index) &
+                  (seg_map[:, 1:w] != self.ignore_index)] = 1
+        # up_left
+        edge_upleft = edge[:h - 1, :w - 1]
+        edge_upleft[(seg_map[:h - 1, :w - 1] != seg_map[1:h, 1:w])
+                    & (seg_map[:h - 1, :w - 1] != self.ignore_index) &
+                    (seg_map[1:h, 1:w] != self.ignore_index)] = 1
+        # up_right
+        edge_upright = edge[:h - 1, 1:w]
+        edge_upright[(seg_map[:h - 1, 1:w] != seg_map[1:h, :w - 1])
+                     & (seg_map[:h - 1, 1:w] != self.ignore_index) &
+                     (seg_map[1:h, :w - 1] != self.ignore_index)] = 1
+
+        kernel = cv2.getStructuringElement(cv2.MORPH_RECT,
+                                           (self.edge_width, self.edge_width))
+        edge = cv2.dilate(edge, kernel)
+
+        results['gt_edge_map'] = edge
+        results['edge_width'] = self.edge_width
+
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(edge_width={self.edge_width}, '
+        repr_str += f'ignore_index={self.ignore_index})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class ResizeShortestEdge(BaseTransform):
+    """Resize the image and mask while keeping the aspect ratio unchanged.
+
+    Modified from https://github.com/facebookresearch/detectron2/blob/main/detectron2/data/transforms/augmentation_impl.py#L130  # noqa:E501
+    Copyright (c) Facebook, Inc. and its affiliates.
+    Licensed under the Apache-2.0 License
+
+    This transform attempts to scale the shorter edge to the given
+    `scale`, as long as the longer edge does not exceed `max_size`.
+    If `max_size` is reached, then downscale so that the longer
+    edge does not exceed `max_size`.
+
+    Required Keys:
+
+    - img
+    - gt_seg_map (optional)
+
+    Modified Keys:
+
+    - img
+    - img_shape
+    - gt_seg_map (optional)
+
+    Added Keys:
+
+    - scale
+    - scale_factor
+    - keep_ratio
+
+    Args:
+        scale (Union[int, Tuple[int, int]]): The target short edge length.
+            If it's a tuple, the min value will be selected as the short
+            edge length.
+        max_size (int): The maximum allowed longest edge length.
+    """
+
+    def __init__(self, scale: Union[int, Tuple[int, int]],
+                 max_size: int) -> None:
+        super().__init__()
+        self.scale = scale
+        self.max_size = max_size
+
+        # Create an empty Resize object
+        self.resize = TRANSFORMS.build({
+            'type': 'Resize',
+            'scale': 0,
+            'keep_ratio': True
+        })
+
+    def _get_output_shape(self, img, short_edge_length) -> Tuple[int, int]:
+        """Compute the target image shape with the given `short_edge_length`.
+
+        Args:
+            img (np.ndarray): The input image.
+            short_edge_length (Union[int, Tuple[int, int]]): The target short
+                edge length. If it's a tuple, the min value will be selected
+                as the short edge length.
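+
+        Example (illustrative numbers, not taken from the code): with
+        ``scale=512``, ``max_size=1024`` and an input of ``(h, w) =
+        (512, 2048)``, the short edge maps to 512, which would make the
+        long edge 2048; both are then rescaled by ``1024 / 2048``,
+        giving an output of ``(h, w) = (256, 1024)``.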
+        """
+        h, w = img.shape[:2]
+        if isinstance(short_edge_length, int):
+            size = short_edge_length * 1.0
+        elif isinstance(short_edge_length, tuple):
+            size = min(short_edge_length) * 1.0
+        scale = size / min(h, w)
+        if h < w:
+            new_h, new_w = size, scale * w
+        else:
+            new_h, new_w = scale * h, size
+
+        if max(new_h, new_w) > self.max_size:
+            scale = self.max_size * 1.0 / max(new_h, new_w)
+            new_h *= scale
+            new_w *= scale
+
+        new_h = int(new_h + 0.5)
+        new_w = int(new_w + 0.5)
+        return (new_w, new_h)
+
+    def transform(self, results: Dict) -> Dict:
+        self.resize.scale = self._get_output_shape(results['img'], self.scale)
+        return self.resize(results)
+
+
+@TRANSFORMS.register_module()
+class BioMedical3DRandomCrop(BaseTransform):
+    """Crop the input patch for medical image & segmentation mask.
+
+    Required Keys:
+
+    - img (np.ndarray): Biomedical image with shape (N, Z, Y, X),
+        N is the number of modalities, and data type is float32.
+    - gt_seg_map (np.ndarray, optional): Biomedical semantic segmentation mask
+        with shape (Z, Y, X).
+
+    Modified Keys:
+
+    - img
+    - img_shape
+    - gt_seg_map (optional)
+
+    Args:
+        crop_shape (Union[int, Tuple[int, int, int]]): Expected size after
+            cropping with the format of (z, y, x). If set to an integer,
+            then cropping width and height are equal to this integer.
+        keep_foreground (bool): If keep_foreground is True, it will sample a
+            voxel of foreground classes randomly, and will take it as the
+            center of the crop bounding-box. Default to True.
+    """
+
+    def __init__(self,
+                 crop_shape: Union[int, Tuple[int, int, int]],
+                 keep_foreground: bool = True):
+        super().__init__()
+        assert isinstance(crop_shape, int) or (
+            isinstance(crop_shape, tuple) and len(crop_shape) == 3
+        ), ('The expected crop_shape is an integer, or a tuple containing '
+            'three integers')
+
+        if isinstance(crop_shape, int):
+            crop_shape = (crop_shape, crop_shape, crop_shape)
+        assert crop_shape[0] > 0 and crop_shape[1] > 0 and crop_shape[2] > 0
+        self.crop_shape = crop_shape
+        self.keep_foreground = keep_foreground
+
+    def random_sample_location(self, seg_map: np.ndarray) -> Optional[np.ndarray]:
+        """Sample a foreground voxel when ``keep_foreground`` is True.
+
+        Args:
+            seg_map (np.ndarray): gt seg map.
+
+        Returns:
+            np.ndarray | None: Coordinates of a selected foreground voxel,
+                or None if the map contains no foreground.
+        """
+        num_samples = 10000
+        # at least 1% of the class voxels need to be selected,
+        # otherwise it may be too sparse
+        min_percent_coverage = 0.01
+        class_locs = {}
+        foreground_classes = []
+        all_classes = np.unique(seg_map)
+        for c in all_classes:
+            if c == 0:
+                # register an empty list for background class 0, so that
+                # class_locs is not an empty dict when the mask contains
+                # only background
+                class_locs[c] = []
+            else:
+                all_locs = np.argwhere(seg_map == c)
+                target_num_samples = min(num_samples, len(all_locs))
+                target_num_samples = max(
+                    target_num_samples,
+                    int(np.ceil(len(all_locs) * min_percent_coverage)))
+
+                selected = all_locs[np.random.choice(
+                    len(all_locs), target_num_samples, replace=False)]
+                class_locs[c] = selected
+                foreground_classes.append(c)
+
+        selected_voxel = None
+        if len(foreground_classes) > 0:
+            selected_class = np.random.choice(foreground_classes)
+            voxels_of_that_class = class_locs[selected_class]
+            selected_voxel = voxels_of_that_class[np.random.choice(
+                len(voxels_of_that_class))]
+
+        return selected_voxel
+
+    def random_generate_crop_bbox(self, margin_z: int, margin_y: int,
+                                  margin_x: int) -> tuple:
+        """Randomly get a crop bounding box.
+
+        Args:
+            margin_z (int): Maximum sampling margin along the z axis.
+            margin_y (int): Maximum sampling margin along the y axis.
+            margin_x (int): Maximum sampling margin along the x axis.
+
+        Returns:
+            tuple: Coordinates of the cropped image.
+        """
+        offset_z = np.random.randint(0, margin_z + 1)
+        offset_y = np.random.randint(0, margin_y + 1)
+        offset_x = np.random.randint(0, margin_x + 1)
+        crop_z1, crop_z2 = offset_z, offset_z + self.crop_shape[0]
+        crop_y1, crop_y2 = offset_y, offset_y + self.crop_shape[1]
+        crop_x1, crop_x2 = offset_x, offset_x + self.crop_shape[2]
+
+        return crop_z1, crop_z2, crop_y1, crop_y2, crop_x1, crop_x2
+
+    def generate_margin(self, results: dict) -> tuple:
+        """Generate margin of crop bounding-box.
+
+        If keep_foreground is True, it will sample a voxel of foreground
+        classes randomly, and will take it as the center of the bounding-box,
+        and return the margin between the bounding-box and the image.
+        If keep_foreground is False, it will return the difference between
+        the crop shape and the image shape.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            tuple: The margin for 3 dimensions of crop bounding-box and image.
+        """
+
+        seg_map = results['gt_seg_map']
+        if self.keep_foreground:
+            selected_voxel = self.random_sample_location(seg_map)
+            if selected_voxel is None:
+                # this only happens if some image does not contain
+                # foreground voxels at all
+                warnings.warn(f'case does not contain any foreground classes'
+                              f': {results["img_path"]}')
+                margin_z = max(seg_map.shape[0] - self.crop_shape[0], 0)
+                margin_y = max(seg_map.shape[1] - self.crop_shape[1], 0)
+                margin_x = max(seg_map.shape[2] - self.crop_shape[2], 0)
+            else:
+                margin_z = max(0, selected_voxel[0] - self.crop_shape[0] // 2)
+                margin_y = max(0, selected_voxel[1] - self.crop_shape[1] // 2)
+                margin_x = max(0, selected_voxel[2] - self.crop_shape[2] // 2)
+                margin_z = max(
+                    0, min(seg_map.shape[0] - self.crop_shape[0], margin_z))
+                margin_y = max(
+                    0, min(seg_map.shape[1] - self.crop_shape[1], margin_y))
+                margin_x = max(
+                    0, min(seg_map.shape[2] - self.crop_shape[2], margin_x))
+        else:
+            margin_z = max(seg_map.shape[0] - self.crop_shape[0], 0)
+            margin_y = max(seg_map.shape[1] - self.crop_shape[1], 0)
+            margin_x = max(seg_map.shape[2] - self.crop_shape[2], 0)
+
+        return margin_z, margin_y, margin_x
+
+    def crop(self, img: np.ndarray, crop_bbox: tuple) -> np.ndarray:
+        """Crop from ``img``.
+
+        Args:
+            img (np.ndarray): Original input image.
+            crop_bbox (tuple): Coordinates of the cropped image.
+
+        Returns:
+            np.ndarray: The cropped image.
+        """
+        crop_z1, crop_z2, crop_y1, crop_y2, crop_x1, crop_x2 = crop_bbox
+        if len(img.shape) == 3:
+            # crop seg map
+            img = img[crop_z1:crop_z2, crop_y1:crop_y2, crop_x1:crop_x2]
+        else:
+            # crop image
+            assert len(img.shape) == 4
+            img = img[:, crop_z1:crop_z2, crop_y1:crop_y2, crop_x1:crop_x2]
+        return img
+
+    def transform(self, results: dict) -> dict:
+        """Transform function to randomly crop images, semantic segmentation
+        maps.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Randomly cropped results, 'img_shape' key in result dict is
+                updated according to crop size.
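+
+        Examples:
+            >>> # A hypothetical toy volume: one modality, (N, Z, Y, X).
+            >>> import numpy as np
+            >>> crop = BioMedical3DRandomCrop(crop_shape=(8, 8, 8))
+            >>> results = dict(
+            ...     img=np.zeros((1, 16, 16, 16), dtype=np.float32),
+            ...     gt_seg_map=np.ones((16, 16, 16), dtype=np.uint8),
+            ...     img_path='demo.nii.gz')
+            >>> results = crop.transform(results)
+            >>> results['img'].shape
+            (1, 8, 8, 8)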
+ """ + margin = self.generate_margin(results) + crop_bbox = self.random_generate_crop_bbox(*margin) + + # crop the image + img = results['img'] + results['img'] = self.crop(img, crop_bbox) + results['img_shape'] = results['img'].shape[1:] + + # crop semantic seg + seg_map = results['gt_seg_map'] + results['gt_seg_map'] = self.crop(seg_map, crop_bbox) + + return results + + def __repr__(self): + return self.__class__.__name__ + f'(crop_shape={self.crop_shape})' + + +@TRANSFORMS.register_module() +class BioMedicalGaussianNoise(BaseTransform): + """Add random Gaussian noise to image. + + Modified from https://github.com/MIC-DKFZ/batchgenerators/blob/7651ece69faf55263dd582a9f5cbd149ed9c3ad0/batchgenerators/transforms/noise_transforms.py#L53 # noqa:E501 + + Copyright (c) German Cancer Research Center (DKFZ) + Licensed under the Apache License, Version 2.0 + + Required Keys: + + - img (np.ndarray): Biomedical image with shape (N, Z, Y, X), + N is the number of modalities, and data type is float32. + + Modified Keys: + + - img + + Args: + prob (float): Probability to add Gaussian noise for + each sample. Default to 0.1. + mean (float): Mean or “centre” of the distribution. Default to 0.0. + std (float): Standard deviation of distribution. Default to 0.1. + """ + + def __init__(self, + prob: float = 0.1, + mean: float = 0.0, + std: float = 0.1) -> None: + super().__init__() + assert 0.0 <= prob <= 1.0 and std >= 0.0 + self.prob = prob + self.mean = mean + self.std = std + + def transform(self, results: Dict) -> Dict: + """Call function to add random Gaussian noise to image. + + Args: + results (dict): Result dict. + + Returns: + dict: Result dict with random Gaussian noise. + """ + if np.random.rand() < self.prob: + rand_std = np.random.uniform(0, self.std) + noise = np.random.normal( + self.mean, rand_std, size=results['img'].shape) + # noise is float64 array, convert to the results['img'].dtype + noise = noise.astype(results['img'].dtype) + results['img'] = results['img'] + noise + return results + + def __repr__(self): + repr_str = self.__class__.__name__ + repr_str += f'(prob={self.prob}, ' + repr_str += f'mean={self.mean}, ' + repr_str += f'std={self.std})' + return repr_str + + +@TRANSFORMS.register_module() +class BioMedicalGaussianBlur(BaseTransform): + """Add Gaussian blur with random sigma to image. + + Modified from https://github.com/MIC-DKFZ/batchgenerators/blob/7651ece69faf55263dd582a9f5cbd149ed9c3ad0/batchgenerators/transforms/noise_transforms.py#L81 # noqa:E501 + + Copyright (c) German Cancer Research Center (DKFZ) + Licensed under the Apache License, Version 2.0 + + Required Keys: + + - img (np.ndarray): Biomedical image with shape (N, Z, Y, X), + N is the number of modalities, and data type is float32. + + Modified Keys: + + - img + + Args: + sigma_range (Tuple[float, float]|float): range to randomly + select sigma value. Default to (0.5, 1.0). + prob (float): Probability to apply Gaussian blur + for each sample. Default to 0.2. + prob_per_channel (float): Probability to apply Gaussian blur + for each channel (axis N of the image). Default to 0.5. + different_sigma_per_channel (bool): whether to use different + sigma for each channel (axis N of the image). Default to True. + different_sigma_per_axis (bool): whether to use different + sigma for axis Z, X and Y of the image. Default to True. 
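+
+    Examples:
+        >>> # A hypothetical sketch: blur a 2-modality toy volume with the
+        >>> # transform applied deterministically (prob=1).
+        >>> import numpy as np
+        >>> blur = BioMedicalGaussianBlur(prob=1.0, prob_per_channel=1.0)
+        >>> img = np.random.rand(2, 8, 16, 16).astype(np.float32)
+        >>> results = blur.transform(dict(img=img))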
+    """
+
+    def __init__(self,
+                 sigma_range: Tuple[float, float] = (0.5, 1.0),
+                 prob: float = 0.2,
+                 prob_per_channel: float = 0.5,
+                 different_sigma_per_channel: bool = True,
+                 different_sigma_per_axis: bool = True) -> None:
+        super().__init__()
+        assert 0.0 <= prob <= 1.0
+        assert 0.0 <= prob_per_channel <= 1.0
+        assert isinstance(sigma_range, Sequence) and len(sigma_range) == 2
+        self.sigma_range = sigma_range
+        self.prob = prob
+        self.prob_per_channel = prob_per_channel
+        self.different_sigma_per_channel = different_sigma_per_channel
+        self.different_sigma_per_axis = different_sigma_per_axis
+
+    def _get_valid_sigma(self, value_range) -> Tuple[float, ...]:
+        """Ensure ``value_range`` is either a single value or a sequence of
+        two values. If it is a sequence, generate a random value in
+        ``[value_range[0], value_range[1]]`` by uniform sampling.
+
+        Modified from https://github.com/MIC-DKFZ/batchgenerators/blob/7651ece69faf55263dd582a9f5cbd149ed9c3ad0/batchgenerators/augmentations/utils.py#L625  # noqa:E501
+
+        Args:
+            value_range (tuple|list|float|int): the input value range
+        """
+        if isinstance(value_range, (list, tuple)):
+            if value_range[0] == value_range[1]:
+                value = value_range[0]
+            else:
+                orig_type = type(value_range[0])
+                value = np.random.uniform(value_range[0], value_range[1])
+                value = orig_type(value)
+        else:
+            # scalar input: use the value directly, as the docstring states
+            value = value_range
+        return value
+
+    def _gaussian_blur(self, data_sample: np.ndarray) -> np.ndarray:
+        """Randomly generate sigma and apply Gaussian blur to the data.
+
+        Args:
+            data_sample (np.ndarray): data sample with multiple modalities,
+                the data shape is (N, Z, Y, X)
+        """
+        sigma = None
+        for c in range(data_sample.shape[0]):
+            if np.random.rand() < self.prob_per_channel:
+                # if no `sigma` is generated, generate one
+                # if `self.different_sigma_per_channel` is True,
+                # re-generate random sigma for each channel
+                if sigma is None or self.different_sigma_per_channel:
+                    if not self.different_sigma_per_axis:
+                        sigma = self._get_valid_sigma(self.sigma_range)
+                    else:
+                        sigma = [
+                            self._get_valid_sigma(self.sigma_range)
+                            for _ in data_sample.shape[1:]
+                        ]
+                # apply gaussian filter with `sigma`
+                data_sample[c] = gaussian_filter(
+                    data_sample[c], sigma, order=0)
+        return data_sample
+
+    def transform(self, results: Dict) -> Dict:
+        """Call function to add random Gaussian blur to image.
+
+        Args:
+            results (dict): Result dict.
+
+        Returns:
+            dict: Result dict with random Gaussian blur applied.
+        """
+        if np.random.rand() < self.prob:
+            results['img'] = self._gaussian_blur(results['img'])
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(prob={self.prob}, '
+        repr_str += f'prob_per_channel={self.prob_per_channel}, '
+        repr_str += f'sigma_range={self.sigma_range}, '
+        repr_str += 'different_sigma_per_channel=' \
+                    f'{self.different_sigma_per_channel}, '
+        repr_str += 'different_sigma_per_axis=' \
+                    f'{self.different_sigma_per_axis})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class BioMedicalRandomGamma(BaseTransform):
+    """Using random gamma correction to process the biomedical image.
+
+    Modified from
+    https://github.com/MIC-DKFZ/batchgenerators/blob/master/batchgenerators/transforms/color_transforms.py#L132  # noqa:E501
+    With licence: Apache 2.0
+
+    Required Keys:
+
+    - img (np.ndarray): Biomedical image with shape (N, Z, Y, X),
+        N is the number of modalities, and data type is float32.
+
+    Modified Keys:
+
+    - img
+
+    Args:
+        prob (float): The probability to perform this transform. Default: 0.5.
+        gamma_range (Tuple[float]): Range of gamma values. Default: (0.5, 2).
+        invert_image (bool): Whether to invert the image before applying gamma
+            augmentation. Default: False.
+        per_channel (bool): Whether to perform the transform on each channel
+            individually. Default: False.
+        retain_stats (bool): Gamma transformation will alter the mean and std
+            of the data in the patch. If retain_stats=True, the data will be
+            transformed to match the mean and standard deviation before gamma
+            augmentation. Default: False.
+    """
+
+    def __init__(self,
+                 prob: float = 0.5,
+                 gamma_range: Tuple[float] = (0.5, 2),
+                 invert_image: bool = False,
+                 per_channel: bool = False,
+                 retain_stats: bool = False):
+        assert 0 <= prob <= 1
+        assert isinstance(gamma_range, tuple) and len(gamma_range) == 2
+        assert isinstance(invert_image, bool)
+        assert isinstance(per_channel, bool)
+        assert isinstance(retain_stats, bool)
+        self.prob = prob
+        self.gamma_range = gamma_range
+        self.invert_image = invert_image
+        self.per_channel = per_channel
+        self.retain_stats = retain_stats
+
+    @cache_randomness
+    def _do_gamma(self):
+        """Whether to adjust the gamma of the image."""
+        return np.random.rand() < self.prob
+
+    def _adjust_gamma(self, img: np.ndarray):
+        """Gamma adjustment for image.
+
+        Args:
+            img (np.ndarray): Input image before gamma adjustment.
+
+        Returns:
+            np.ndarray: Image after gamma adjustment.
+        """
+
+        if self.invert_image:
+            img = -img
+
+        def _do_adjust(img):
+            if retain_stats_here:
+                img_mean = img.mean()
+                img_std = img.std()
+            if np.random.random() < 0.5 and self.gamma_range[0] < 1:
+                gamma = np.random.uniform(self.gamma_range[0], 1)
+            else:
+                gamma = np.random.uniform(
+                    max(self.gamma_range[0], 1), self.gamma_range[1])
+            img_min = img.min()
+            img_range = img.max() - img_min  # range
+            img = np.power(((img - img_min) / float(img_range + 1e-7)),
+                           gamma) * img_range + img_min
+            if retain_stats_here:
+                img = img - img.mean()
+                img = img / (img.std() + 1e-8) * img_std
+                img = img + img_mean
+            return img
+
+        if not self.per_channel:
+            retain_stats_here = self.retain_stats
+            img = _do_adjust(img)
+        else:
+            # set the flag before the per-channel loop as well, so that the
+            # closure above does not read an undefined name
+            retain_stats_here = self.retain_stats
+            for c in range(img.shape[0]):
+                img[c] = _do_adjust(img[c])
+        if self.invert_image:
+            img = -img
+        return img
+
+    def transform(self, results: dict) -> dict:
+        """Call function to perform random gamma correction.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Result dict with random gamma correction performed.
+        """
+        if self._do_gamma():
+            results['img'] = self._adjust_gamma(results['img'])
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(prob={self.prob}, '
+        repr_str += f'gamma_range={self.gamma_range}, '
+        repr_str += f'invert_image={self.invert_image}, '
+        repr_str += f'per_channel={self.per_channel}, '
+        repr_str += f'retain_stats={self.retain_stats})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class BioMedical3DPad(BaseTransform):
+    """Pad the biomedical 3d image & biomedical 3d semantic segmentation maps.
+
+    Required Keys:
+
+    - img (np.ndarray): Biomedical image with shape (N, Z, Y, X) by default,
+        N is the number of modalities.
+    - gt_seg_map (np.ndarray, optional): Biomedical seg map with shape
+        (Z, Y, X) by default.
+
+    Modified Keys:
+
+    - img (np.ndarray): Biomedical image with shape (N, Z, Y, X) by default,
+        N is the number of modalities.
+    - gt_seg_map (np.ndarray, optional): Biomedical seg map with shape
+        (Z, Y, X) by default.
+
+    Added Keys:
+
+    - pad_shape (Tuple[int, int, int]): The padded shape.
+
+    Args:
+        pad_shape (Tuple[int, int, int]): Fixed padding size.
+            Expected padding shape (Z, Y, X).
+        pad_val (float): Padding value for biomedical image.
+            The padding mode is set to "constant". The value
+            to be filled in padding area. Default: 0.
+        seg_pad_val (int): Padding value for biomedical 3d semantic
+            segmentation maps. The padding mode is set to "constant".
+            The value to be filled in padding area. Default: 0.
+    """
+
+    def __init__(self,
+                 pad_shape: Tuple[int, int, int],
+                 pad_val: float = 0.,
+                 seg_pad_val: int = 0) -> None:
+
+        # check pad_shape
+        assert pad_shape is not None
+        if isinstance(pad_shape, tuple):
+            assert len(pad_shape) == 3
+
+        self.pad_shape = pad_shape
+        self.pad_val = pad_val
+        self.seg_pad_val = seg_pad_val
+
+    def _pad_img(self, results: dict) -> None:
+        """Pad images according to ``self.pad_shape``.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: The dict contains the padded image and shape
+                information.
+        """
+        padded_img = self._to_pad(
+            results['img'], pad_shape=self.pad_shape, pad_val=self.pad_val)
+
+        results['img'] = padded_img
+        results['pad_shape'] = padded_img.shape[1:]
+
+    def _pad_seg(self, results: dict) -> None:
+        """Pad semantic segmentation map according to ``self.pad_shape`` if
+        ``gt_seg_map`` is not None in results dict.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            None: The padded gt seg map is updated in ``results`` in place.
+        """
+        if results.get('gt_seg_map', None) is not None:
+            pad_gt_seg = self._to_pad(
+                results['gt_seg_map'][None, ...],
+                pad_shape=results['pad_shape'],
+                pad_val=self.seg_pad_val)
+            # drop the dummy modality axis that was added above
+            results['gt_seg_map'] = pad_gt_seg[0]
+
+    @staticmethod
+    def _to_pad(img: np.ndarray,
+                pad_shape: Tuple[int, int, int],
+                pad_val: Union[int, float] = 0) -> np.ndarray:
+        """Pad the given 3d image to a certain shape with specified padding
+        value.
+
+        Args:
+            img (ndarray): Biomedical image with shape (N, Z, Y, X)
+                to be padded. N is the number of modalities.
+            pad_shape (Tuple[int,int,int]): Expected padding shape (Z, Y, X).
+            pad_val (float, int): Values to be filled in padding areas
+                and the padding_mode is set to 'constant'. Default: 0.
+
+        Returns:
+            ndarray: The padded image.
+        """
+        # compute pad width
+        d = max(pad_shape[0] - img.shape[1], 0)
+        pad_d = (d // 2, d - d // 2)
+        h = max(pad_shape[1] - img.shape[2], 0)
+        pad_h = (h // 2, h - h // 2)
+        w = max(pad_shape[2] - img.shape[3], 0)
+        pad_w = (w // 2, w - w // 2)
+
+        pad_list = [(0, 0), pad_d, pad_h, pad_w]
+
+        img = np.pad(img, pad_list, mode='constant', constant_values=pad_val)
+        return img
+
+    def transform(self, results: dict) -> dict:
+        """Call function to pad images, semantic segmentation maps.
+
+        Args:
+            results (dict): Result dict from loading pipeline.
+
+        Returns:
+            dict: Updated result dict.
+        """
+        self._pad_img(results)
+        self._pad_seg(results)
+
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(pad_shape={self.pad_shape}, '
+        repr_str += f'pad_val={self.pad_val}, '
+        repr_str += f'seg_pad_val={self.seg_pad_val})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class BioMedical3DRandomFlip(BaseTransform):
+    """Flip biomedical 3D images and segmentations.
+
+    Modified from https://github.com/MIC-DKFZ/batchgenerators/blob/master/batchgenerators/transforms/spatial_transforms.py  # noqa:E501
+
+    Copyright 2021 Division of
+    Medical Image Computing, German Cancer Research Center (DKFZ) and Applied
+    Computer Vision Lab, Helmholtz Imaging Platform.
+    Licensed under the Apache-2.0 License.
+
+    Required Keys:
+
+    - img (np.ndarray): Biomedical image with shape (N, Z, Y, X) by default,
+        N is the number of modalities.
+    - gt_seg_map (np.ndarray, optional): Biomedical seg map with shape
+        (Z, Y, X) by default.
+
+    Modified Keys:
+
+    - img (np.ndarray): Biomedical image with shape (N, Z, Y, X) by default,
+        N is the number of modalities.
+    - gt_seg_map (np.ndarray, optional): Biomedical seg map with shape
+        (Z, Y, X) by default.
+
+    Added Keys:
+
+    - do_flip
+    - flip_axes
+
+    Args:
+        prob (float): Flipping probability.
+        axes (Tuple[int, ...]): Flipping axes with order 'ZXY'.
+        swap_label_pairs (Optional[List[Tuple[int, int]]]):
+            The segmentation label pairs that are swapped when flipping.
+    """
+
+    def __init__(self,
+                 prob: float,
+                 axes: Tuple[int, ...],
+                 swap_label_pairs: Optional[List[Tuple[int, int]]] = None):
+        self.prob = prob
+        self.axes = axes
+        self.swap_label_pairs = swap_label_pairs
+        assert 0 <= prob <= 1
+        if axes is not None:
+            assert max(axes) <= 2
+
+    @staticmethod
+    def _flip(img, direction: Tuple[bool, bool, bool]) -> np.ndarray:
+        if direction[0]:
+            img[:, :] = img[:, ::-1]
+        if direction[1]:
+            img[:, :, :] = img[:, :, ::-1]
+        if direction[2]:
+            img[:, :, :, :] = img[:, :, :, ::-1]
+        return img
+
+    def _do_flip(self, img: np.ndarray) -> Tuple[bool, bool, bool]:
+        """Call function to determine which axis to flip.
+
+        Args:
+            img (np.ndarray): Image or segmentation map array.
+        Returns:
+            tuple: Flip action, whether to flip on the z, x, and y axes.
+        """
+        flip_c, flip_x, flip_y = False, False, False
+        if self.axes is not None:
+            flip_c = 0 in self.axes and np.random.rand() < self.prob
+            flip_x = 1 in self.axes and np.random.rand() < self.prob
+            if len(img.shape) == 4:
+                flip_y = 2 in self.axes and np.random.rand() < self.prob
+        return flip_c, flip_x, flip_y
+
+    def _swap_label(self, seg: np.ndarray) -> np.ndarray:
+        out = seg.copy()
+        for first, second in self.swap_label_pairs:
+            first_area = (seg == first)
+            second_area = (seg == second)
+            out[first_area] = second
+            out[second_area] = first
+        return out
+
+    def transform(self, results: Dict) -> Dict:
+        """Call function to flip and swap pair labels.
+
+        Args:
+            results (dict): Result dict.
+        Returns:
+            dict: Flipped results, 'do_flip', 'flip_axes' keys are added into
+                result dict.
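+
+        Examples:
+            >>> # A hypothetical sketch with deterministic flips (prob=1);
+            >>> # after the call, 'do_flip' and 'flip_axes' are set in the
+            >>> # result dict.
+            >>> import numpy as np
+            >>> flip = BioMedical3DRandomFlip(prob=1., axes=(0, 1, 2))
+            >>> results = dict(
+            ...     img=np.zeros((1, 4, 5, 6), dtype=np.float32),
+            ...     gt_seg_map=np.zeros((4, 5, 6), dtype=np.uint8))
+            >>> results = flip.transform(results)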
+        """
+        # get actual flipped axis
+        if 'do_flip' not in results:
+            results['do_flip'] = self._do_flip(results['img'])
+        if 'flip_axes' not in results:
+            results['flip_axes'] = self.axes
+        # flip image
+        results['img'] = self._flip(
+            results['img'], direction=results['do_flip'])
+        # flip seg
+        if results['gt_seg_map'] is not None:
+            if results['gt_seg_map'].shape != results['img'].shape:
+                results['gt_seg_map'] = results['gt_seg_map'][None, :]
+            results['gt_seg_map'] = self._flip(
+                results['gt_seg_map'], direction=results['do_flip'])
+            results['gt_seg_map'] = results['gt_seg_map'].squeeze()
+        # swap label pairs
+        if self.swap_label_pairs is not None:
+            results['gt_seg_map'] = self._swap_label(results['gt_seg_map'])
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(prob={self.prob}, axes={self.axes}, ' \
+                    f'swap_label_pairs={self.swap_label_pairs})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class Albu(BaseTransform):
+    """Albumentation augmentation. Adds custom transformations from the
+    Albumentations library. Please visit
+    `https://albumentations.readthedocs.io` to get more information. An
+    example of ``transforms`` is as follows:
+
+    .. code-block::
+
+        [
+            dict(
+                type='ShiftScaleRotate',
+                shift_limit=0.0625,
+                scale_limit=0.0,
+                rotate_limit=0,
+                interpolation=1,
+                p=0.5),
+            dict(
+                type='RandomBrightnessContrast',
+                brightness_limit=[0.1, 0.3],
+                contrast_limit=[0.1, 0.3],
+                p=0.2),
+            dict(type='ChannelShuffle', p=0.1),
+            dict(
+                type='OneOf',
+                transforms=[
+                    dict(type='Blur', blur_limit=3, p=1.0),
+                    dict(type='MedianBlur', blur_limit=3, p=1.0)
+                ],
+                p=0.1),
+        ]
+
+    Args:
+        transforms (list[dict]): A list of albu transformations.
+        keymap (dict): Contains {'input key': 'albumentation-style key'}.
+        update_pad_shape (bool): Whether to update padding shape according to
+            the output shape of the last transform.
+    """
+
+    def __init__(self,
+                 transforms: List[dict],
+                 keymap: Optional[dict] = None,
+                 update_pad_shape: bool = False):
+        if not ALBU_INSTALLED:
+            raise ImportError(
+                'albumentations is not installed, '
+                'we suggest installing albumentations via '
+                '"pip install albumentations>=0.3.2 --no-binary qudida,albumentations"'  # noqa
+            )
+
+        # Args will be modified later, copying it will be safer
+        transforms = copy.deepcopy(transforms)
+
+        self.transforms = transforms
+        self.keymap = keymap
+        self.update_pad_shape = update_pad_shape
+
+        self.aug = Compose([self.albu_builder(t) for t in self.transforms])
+
+        if not keymap:
+            self.keymap_to_albu = {'img': 'image', 'gt_seg_map': 'mask'}
+        else:
+            self.keymap_to_albu = copy.deepcopy(keymap)
+        self.keymap_back = {v: k for k, v in self.keymap_to_albu.items()}
+
+    def albu_builder(self, cfg: dict) -> object:
+        """Build a callable object from a dict containing albu arguments.
+
+        Args:
+            cfg (dict): Config dict. It should at least contain the key
+                "type".
+
+        Returns:
+            Callable: A callable object.
+        """
+
+        assert isinstance(cfg, dict) and 'type' in cfg
+        args = cfg.copy()
+
+        obj_type = args.pop('type')
+        if mmengine.is_str(obj_type):
+            if not ALBU_INSTALLED:
+                raise ImportError(
+                    'albumentations is not installed, '
+                    'we suggest installing albumentations via '
+                    '"pip install albumentations>=0.3.2 --no-binary qudida,albumentations"'  # noqa
+                )
+            obj_cls = getattr(albumentations, obj_type)
+        elif inspect.isclass(obj_type):
+            obj_cls = obj_type
+        else:
+            raise TypeError(
+                f'type must be a valid type or str, but got {type(obj_type)}')
+
+        if 'transforms' in args:
+            args['transforms'] = [
+                self.albu_builder(t) for t in args['transforms']
+            ]
+
+        return obj_cls(**args)
+
+    @staticmethod
+    def mapper(d: dict, keymap: dict):
+        """Dictionary mapper.
+
+        Renames keys according to the keymap provided.
+        Args:
+            d (dict): old dict
+            keymap (dict): {'old_key': 'new_key'}
+        Returns:
+            dict: new dict.
+        """
+
+        updated_dict = {}
+        for k in d.keys():
+            new_k = keymap.get(k, k)
+            updated_dict[new_k] = d[k]
+        return updated_dict
+
+    def transform(self, results):
+        # dict to albumentations format
+        results = self.mapper(results, self.keymap_to_albu)
+
+        # Convert to RGB since Albumentations works with RGB images
+        results['image'] = cv2.cvtColor(results['image'], cv2.COLOR_BGR2RGB)
+
+        results = self.aug(**results)
+
+        # Convert back to BGR
+        results['image'] = cv2.cvtColor(results['image'], cv2.COLOR_RGB2BGR)
+
+        # back to the original format
+        results = self.mapper(results, self.keymap_back)
+
+        # update final shape
+        if self.update_pad_shape:
+            results['pad_shape'] = results['img'].shape
+
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__ + f'(transforms={self.transforms})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class ConcatCDInput(BaseTransform):
+    """Concat images for change detection.
+
+    Required Keys:
+
+    - img
+    - img2
+
+    Args:
+        input_keys (tuple): Input image keys for change detection.
+            Default: ('img', 'img2').
+    """
+
+    def __init__(self, input_keys=('img', 'img2')):
+        self.input_keys = input_keys
+
+    def transform(self, results: dict) -> dict:
+        img = []
+        for input_key in self.input_keys:
+            img.append(results.pop(input_key))
+        results['img'] = np.concatenate(img, axis=2)
+        return results
+
+    def __repr__(self):
+        repr_str = self.__class__.__name__
+        repr_str += f'(input_keys={self.input_keys})'
+        return repr_str
+
+
+@TRANSFORMS.register_module()
+class RandomDepthMix(BaseTransform):
+    """This class implements the RandomDepthMix transform.
+
+    Args:
+        prob (float): Probability of applying the transformation.
+            Defaults to 0.25.
+        mix_scale_ratio (float): Ratio to scale the mix width.
+            Defaults to 0.75.
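+
+    Example:
+        A minimal pipeline entry (hypothetical placement; the transform
+        expects ``img_shape`` and ``gt_depth_map`` to be present in the
+        results):
+
+        >>> pipeline = [dict(type='RandomDepthMix', prob=0.25)]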
+ """ + + def __init__( + self, + prob: float = 0.25, + mix_scale_ratio: float = 0.75, + ): + super().__init__() + + self.prob = prob + self.mix_scale_ratio = mix_scale_ratio + + def transform(self, results: dict) -> dict: + if random.random() > self.prob: + return results + + h, w = results['img_shape'][:2] + left = int(w * random.random()) + width_ratio = self.mix_scale_ratio * random.random() + width = int(max(1, (w - left) * width_ratio)) + + img = results['img'] + depth_rescale_factor = results.get('depth_rescale_factor', 1) + depth_map = results['gt_depth_map'] / depth_rescale_factor + + if img.ndim == 3: + for c in range(img.shape[-1]): + img[:, left:left + width, c] = depth_map[:, left:left + width] + elif img.ndim == 2: + img[:, left:left + width] = depth_map[:, left:left + width] + else: + raise ValueError(f'Invalid image shape ({img.shape})') + + results['img'] = img + return results diff --git a/mmseg/datasets/voc.py b/mmseg/datasets/voc.py index 4848d17b26..5e5d6025c0 100644 --- a/mmseg/datasets/voc.py +++ b/mmseg/datasets/voc.py @@ -1,12 +1,14 @@ # Copyright (c) OpenMMLab. All rights reserved. import os.path as osp +import mmengine.fileio as fileio + from mmseg.registry import DATASETS -from .custom import CustomDataset +from .basesegdataset import BaseSegDataset @DATASETS.register_module() -class PascalVOCDataset(CustomDataset): +class PascalVOCDataset(BaseSegDataset): """Pascal VOC dataset. Args: @@ -24,11 +26,15 @@ class PascalVOCDataset(CustomDataset): [0, 64, 0], [128, 64, 0], [0, 192, 0], [128, 192, 0], [0, 64, 128]]) - def __init__(self, ann_file, **kwargs) -> None: + def __init__(self, + ann_file, + img_suffix='.jpg', + seg_map_suffix='.png', + **kwargs) -> None: super().__init__( - img_suffix='.jpg', - seg_map_suffix='.png', + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, ann_file=ann_file, **kwargs) - assert self.file_client.exists( - self.data_prefix['img_path']) and osp.isfile(self.ann_file) + assert fileio.exists(self.data_prefix['img_path'], + self.backend_args) and osp.isfile(self.ann_file) diff --git a/mmseg/datasets/zero_mould_v1.py b/mmseg/datasets/zero_mould_v1.py new file mode 100644 index 0000000000..2a738f1ac1 --- /dev/null +++ b/mmseg/datasets/zero_mould_v1.py @@ -0,0 +1,37 @@ +import os.path as osp + +import mmengine.fileio as fileio + +from mmseg.registry import DATASETS +from mmseg.datasets.basesegdataset import BaseSegDataset + +@DATASETS.register_module() +class ZeroMouldV1Dataset(BaseSegDataset): + + METAINFO = dict( + classes=( + 'background', + 'correct-coloured', + 'correct-uncoloured', + 'wrong-uncoloured', + 'idk' + ), + palette=[ + [0, 0, 0], + [255, 0, 0], + [0, 200, 100], + [255, 225, 0], + [0, 0, 255] + ] + ) + + def __init__(self, + img_suffix='.jpg', + seg_map_suffix='.ome.tiff', + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + **kwargs) + assert fileio.exists(self.data_prefix['img_path'], + self.backend_args)# and osp.isfile(self.ann_file) diff --git a/mmseg/datasets/zero_mould_v2.py b/mmseg/datasets/zero_mould_v2.py new file mode 100644 index 0000000000..d64298beb3 --- /dev/null +++ b/mmseg/datasets/zero_mould_v2.py @@ -0,0 +1,25 @@ +import os.path as osp + +import mmengine.fileio as fileio + +from mmseg.registry import DATASETS +from mmseg.datasets.basesegdataset import BaseSegDataset + +@DATASETS.register_module() +class ZeroMouldV2Dataset(BaseSegDataset): + + METAINFO = dict( + classes=('background', 'correct-coloured'), + palette=[[0, 0, 0], [128, 128, 128]] + ) + + def 
__init__(self, + img_suffix='.jpg', + seg_map_suffix='.ome.tiff', + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + **kwargs) + assert fileio.exists(self.data_prefix['img_path'], + self.backend_args)# and osp.isfile(self.ann_file) diff --git a/mmseg/engine/__init__.py b/mmseg/engine/__init__.py new file mode 100644 index 0000000000..98139a0047 --- /dev/null +++ b/mmseg/engine/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .hooks import SegVisualizationHook +from .optimizers import (ForceDefaultOptimWrapperConstructor, + LayerDecayOptimizerConstructor, + LearningRateDecayOptimizerConstructor) +from .schedulers import PolyLRRatio + +__all__ = [ + 'LearningRateDecayOptimizerConstructor', 'LayerDecayOptimizerConstructor', + 'SegVisualizationHook', 'PolyLRRatio', + 'ForceDefaultOptimWrapperConstructor' +] diff --git a/mmseg/engine/hooks/__init__.py b/mmseg/engine/hooks/__init__.py new file mode 100644 index 0000000000..c6048088a7 --- /dev/null +++ b/mmseg/engine/hooks/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .visualization_hook import SegVisualizationHook + +__all__ = ['SegVisualizationHook'] diff --git a/mmseg/engine/hooks/visualization_hook.py b/mmseg/engine/hooks/visualization_hook.py new file mode 100644 index 0000000000..ea238c6969 --- /dev/null +++ b/mmseg/engine/hooks/visualization_hook.py @@ -0,0 +1,97 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import warnings +from typing import Optional, Sequence + +import mmcv +import mmengine.fileio as fileio +from mmengine.hooks import Hook +from mmengine.runner import Runner +from mmengine.visualization import Visualizer + +from mmseg.registry import HOOKS +from mmseg.structures import SegDataSample + + +@HOOKS.register_module() +class SegVisualizationHook(Hook): + """Segmentation Visualization Hook. Used to visualize validation and + testing process prediction results. + + In the testing phase: + + 1. If ``show`` is True, it means that only the prediction results are + visualized without storing data, so ``vis_backends`` needs to + be excluded. + + Args: + draw (bool): whether to draw prediction results. If it is False, + it means that no drawing will be done. Defaults to False. + interval (int): The interval of visualization. Defaults to 50. + show (bool): Whether to display the drawn image. Default to False. + wait_time (float): The interval of show (s). Defaults to 0. + backend_args (dict, Optional): Arguments to instantiate a file backend. + See https://mmengine.readthedocs.io/en/latest/api/fileio.htm + for details. Defaults to None. + Notes: mmcv>=2.0.0rc4, mmengine>=0.2.0 required. + """ + + def __init__(self, + draw: bool = False, + interval: int = 50, + show: bool = False, + wait_time: float = 0., + backend_args: Optional[dict] = None): + self._visualizer: Visualizer = Visualizer.get_current_instance() + self.interval = interval + self.show = show + if self.show: + # No need to think about vis backends. + self._visualizer._vis_backends = {} + warnings.warn('The show is True, it means that only ' + 'the prediction results are visualized ' + 'without storing data, so vis_backends ' + 'needs to be excluded.') + + self.wait_time = wait_time + self.backend_args = backend_args.copy() if backend_args else None + self.draw = draw + if not self.draw: + warnings.warn('The draw is False, it means that the ' + 'hook for visualization will not take ' + 'effect. 
The results will NOT be '
                          'visualized or stored.')

    def _after_iter(self,
                    runner: Runner,
                    batch_idx: int,
                    data_batch: dict,
                    outputs: Sequence[SegDataSample],
                    mode: str = 'val') -> None:
        """Run after every ``self.interval`` validation iterations.

        Args:
            runner (:obj:`Runner`): The runner of the validation process.
            batch_idx (int): The index of the current batch in the val loop.
            data_batch (dict): Data from dataloader.
            outputs (Sequence[:obj:`SegDataSample`]): Outputs from model.
            mode (str): Current mode of runner. Defaults to 'val'.
        """
        if self.draw is False or mode == 'train':
            return

        if self.every_n_inner_iters(batch_idx, self.interval):
            for output in outputs:
                img_path = output.img_path
                img_bytes = fileio.get(
                    img_path, backend_args=self.backend_args)
                img = mmcv.imfrombytes(img_bytes, channel_order='rgb')
                window_name = f'{mode}_{osp.basename(img_path)}'

                self._visualizer.add_datasample(
                    window_name,
                    img,
                    data_sample=output,
                    show=self.show,
                    wait_time=self.wait_time,
                    step=runner.iter)
diff --git a/mmseg/engine/optimizers/__init__.py b/mmseg/engine/optimizers/__init__.py
index 4fbf4ecfcd..e4cf58741f 100644
--- a/mmseg/engine/optimizers/__init__.py
+++ b/mmseg/engine/optimizers/__init__.py
@@ -1,7 +1,9 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+from .force_default_constructor import ForceDefaultOptimWrapperConstructor
 from .layer_decay_optimizer_constructor import (
     LayerDecayOptimizerConstructor, LearningRateDecayOptimizerConstructor)

 __all__ = [
-    'LearningRateDecayOptimizerConstructor', 'LayerDecayOptimizerConstructor'
+    'LearningRateDecayOptimizerConstructor', 'LayerDecayOptimizerConstructor',
+    'ForceDefaultOptimWrapperConstructor'
 ]
diff --git a/mmseg/engine/optimizers/force_default_constructor.py b/mmseg/engine/optimizers/force_default_constructor.py
new file mode 100644
index 0000000000..12c642ad41
--- /dev/null
+++ b/mmseg/engine/optimizers/force_default_constructor.py
@@ -0,0 +1,255 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import logging
+from typing import List, Optional, Union
+
+import torch
+import torch.nn as nn
+from mmengine.logging import print_log
+from mmengine.optim import DefaultOptimWrapperConstructor
+from mmengine.utils.dl_utils import mmcv_full_available
+from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm, _InstanceNorm
+from torch.nn import GroupNorm, LayerNorm
+
+from mmseg.registry import OPTIM_WRAPPER_CONSTRUCTORS
+
+
+@OPTIM_WRAPPER_CONSTRUCTORS.register_module()
+class ForceDefaultOptimWrapperConstructor(DefaultOptimWrapperConstructor):
+    """Default constructor with forced optimizer settings.
+
+    This constructor extends the default constructor to add an option for
+    forcing default optimizer settings. This is useful for ensuring that
+    certain parameters or layers strictly adhere to pre-defined default
+    settings, regardless of any custom settings specified.
+
+    By default, each parameter shares the same optimizer settings, and we
+    provide an argument ``paramwise_cfg`` to specify parameter-wise settings.
+    It is a dict and may contain various fields like 'custom_keys',
+    'bias_lr_mult', etc., as well as the additional field
+    `force_default_settings` which allows for enforcing default settings on
+    optimizer parameters.
+
+    - ``custom_keys`` (dict): Specified parameters-wise settings by keys. 
If + one of the keys in ``custom_keys`` is a substring of the name of one + parameter, then the setting of the parameter will be specified by + ``custom_keys[key]`` and other setting like ``bias_lr_mult`` etc. will + be ignored. It should be noted that the aforementioned ``key`` is the + longest key that is a substring of the name of the parameter. If there + are multiple matched keys with the same length, then the key with lower + alphabet order will be chosen. + ``custom_keys[key]`` should be a dict and may contain fields ``lr_mult`` + and ``decay_mult``. See Example 2 below. + - ``bias_lr_mult`` (float): It will be multiplied to the learning + rate for all bias parameters (except for those in normalization + layers and offset layers of DCN). + - ``bias_decay_mult`` (float): It will be multiplied to the weight + decay for all bias parameters (except for those in + normalization layers, depthwise conv layers, offset layers of DCN). + - ``norm_decay_mult`` (float): It will be multiplied to the weight + decay for all weight and bias parameters of normalization + layers. + - ``flat_decay_mult`` (float): It will be multiplied to the weight + decay for all one-dimensional parameters + - ``dwconv_decay_mult`` (float): It will be multiplied to the weight + decay for all weight and bias parameters of depthwise conv + layers. + - ``dcn_offset_lr_mult`` (float): It will be multiplied to the learning + rate for parameters of offset layer in the deformable convs + of a model. + - ``bypass_duplicate`` (bool): If true, the duplicate parameters + would not be added into optimizer. Defaults to False. + - ``force_default_settings`` (bool): If true, this will override any + custom settings defined by ``custom_keys`` and enforce the use of + default settings for optimizer parameters like ``bias_lr_mult``. + This is particularly useful when you want to ensure that certain layers + or parameters adhere strictly to the pre-defined default settings. + + Note: + + 1. If the option ``dcn_offset_lr_mult`` is used, the constructor will + override the effect of ``bias_lr_mult`` in the bias of offset layer. + So be careful when using both ``bias_lr_mult`` and + ``dcn_offset_lr_mult``. If you wish to apply both of them to the offset + layer in deformable convs, set ``dcn_offset_lr_mult`` to the original + ``dcn_offset_lr_mult`` * ``bias_lr_mult``. + + 2. If the option ``dcn_offset_lr_mult`` is used, the constructor will + apply it to all the DCN layers in the model. So be careful when the + model contains multiple DCN layers in places other than backbone. + + 3. When the option ``force_default_settings`` is true, it will override + any custom settings provided in ``custom_keys``. This ensures that the + default settings for the optimizer parameters are used. + + Args: + optim_wrapper_cfg (dict): The config dict of the optimizer wrapper. + + Required fields of ``optim_wrapper_cfg`` are + + - ``type``: class name of the OptimizerWrapper + - ``optimizer``: The configuration of optimizer. + + Optional fields of ``optim_wrapper_cfg`` are + + - any arguments of the corresponding optimizer wrapper type, + e.g., accumulative_counts, clip_grad, etc. + + Required fields of ``optimizer`` are + + - `type`: class name of the optimizer. + + Optional fields of ``optimizer`` are + + - any arguments of the corresponding optimizer type, e.g., + lr, weight_decay, momentum, etc. + + paramwise_cfg (dict, optional): Parameter-wise options. 
+
+    Example 1:
+        >>> model = torch.nn.modules.Conv1d(1, 1, 1)
+        >>> optim_wrapper_cfg = dict(
+        >>>     type='OptimWrapper', optimizer=dict(type='SGD', lr=0.01,
+        >>>     momentum=0.9, weight_decay=0.0001))
+        >>> paramwise_cfg = dict(norm_decay_mult=0.)
+        >>> optim_wrapper_builder = ForceDefaultOptimWrapperConstructor(
+        >>>     optim_wrapper_cfg, paramwise_cfg)
+        >>> optim_wrapper = optim_wrapper_builder(model)
+
+    Example 2:
+        >>> # assume the model has attributes model.backbone and
+        >>> # model.cls_head
+        >>> optim_wrapper_cfg = dict(type='OptimWrapper', optimizer=dict(
+        >>>     type='SGD', lr=0.01, weight_decay=0.95))
+        >>> paramwise_cfg = dict(custom_keys={
+        >>>     'backbone': dict(lr_mult=0.1, decay_mult=0.9)})
+        >>> optim_wrapper_builder = ForceDefaultOptimWrapperConstructor(
+        >>>     optim_wrapper_cfg, paramwise_cfg)
+        >>> optim_wrapper = optim_wrapper_builder(model)
+        >>> # Then the `lr` and `weight_decay` for model.backbone are
+        >>> # (0.01 * 0.1, 0.95 * 0.9). `lr` and `weight_decay` for
+        >>> # model.cls_head are (0.01, 0.95).
+    """
+
+    def add_params(self,
+                   params: List[dict],
+                   module: nn.Module,
+                   prefix: str = '',
+                   is_dcn_module: Optional[Union[int, float]] = None) -> None:
+        """Add all parameters of module to the params list.
+
+        The parameters of the given module will be added to the list of param
+        groups, with specific rules defined by paramwise_cfg.
+
+        Args:
+            params (list[dict]): A list of param groups, it will be modified
+                in place.
+            module (nn.Module): The module to be added.
+            prefix (str): The prefix of the module.
+            is_dcn_module (int|float|None): If the current module is a
+                submodule of DCN, `is_dcn_module` will be passed to
+                control conv_offset layer's learning rate. Defaults to None.
+        """
+        # get param-wise options
+        custom_keys = self.paramwise_cfg.get('custom_keys', {})
+        # sort the keys alphabetically first, then by decreasing length, so
+        # that the longest matching key wins and ties break alphabetically
+        sorted_keys = sorted(sorted(custom_keys.keys()), key=len, reverse=True)
+
+        bias_lr_mult = self.paramwise_cfg.get('bias_lr_mult', None)
+        bias_decay_mult = self.paramwise_cfg.get('bias_decay_mult', None)
+        norm_decay_mult = self.paramwise_cfg.get('norm_decay_mult', None)
+        dwconv_decay_mult = self.paramwise_cfg.get('dwconv_decay_mult', None)
+        flat_decay_mult = self.paramwise_cfg.get('flat_decay_mult', None)
+        bypass_duplicate = self.paramwise_cfg.get('bypass_duplicate', False)
+        dcn_offset_lr_mult = self.paramwise_cfg.get('dcn_offset_lr_mult', None)
+        force_default_settings = self.paramwise_cfg.get(
+            'force_default_settings', False)
+
+        # special rules for norm layers and depth-wise conv layers
+        is_norm = isinstance(module,
+                             (_BatchNorm, _InstanceNorm, GroupNorm, LayerNorm))
+        is_dwconv = (
+            isinstance(module, torch.nn.Conv2d)
+            and module.in_channels == module.groups)
+
+        for name, param in module.named_parameters(recurse=False):
+            param_group = {'params': [param]}
+            if bypass_duplicate and self._is_in(param_group, params):
+                print_log(
+                    f'{prefix} is duplicate. It is skipped since '
+                    f'bypass_duplicate={bypass_duplicate}',
+                    logger='current',
+                    level=logging.WARNING)
+                continue
+            if not param.requires_grad:
+                params.append(param_group)
+                continue
+
+            # if the parameter matches one of the custom keys, ignore other
+            # rules
+            is_custom = False
+            for key in sorted_keys:
+                if key in f'{prefix}.{name}':
+                    is_custom = True
+                    lr_mult = custom_keys[key].get('lr_mult', 1.)
+                    param_group['lr'] = self.base_lr * lr_mult
+                    if self.base_wd is not None:
+                        decay_mult = custom_keys[key].get('decay_mult', 1.)
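+                        # scale the default weight decay by this key's
+                        # ``decay_mult`` (``base_wd`` is known to be set
+                        # here because of the check above)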
+ param_group['weight_decay'] = self.base_wd * decay_mult + # add custom settings to param_group + for k, v in custom_keys[key].items(): + param_group[k] = v + break + + if not is_custom or force_default_settings: + # bias_lr_mult affects all bias parameters + # except for norm.bias dcn.conv_offset.bias + if name == 'bias' and not ( + is_norm or is_dcn_module) and bias_lr_mult is not None: + param_group['lr'] = self.base_lr * bias_lr_mult + + if (prefix.find('conv_offset') != -1 and is_dcn_module + and dcn_offset_lr_mult is not None + and isinstance(module, torch.nn.Conv2d)): + # deal with both dcn_offset's bias & weight + param_group['lr'] = self.base_lr * dcn_offset_lr_mult + + # apply weight decay policies + if self.base_wd is not None: + # norm decay + if is_norm and norm_decay_mult is not None: + param_group[ + 'weight_decay'] = self.base_wd * norm_decay_mult + # bias lr and decay + elif (name == 'bias' and not is_dcn_module + and bias_decay_mult is not None): + param_group[ + 'weight_decay'] = self.base_wd * bias_decay_mult + # depth-wise conv + elif is_dwconv and dwconv_decay_mult is not None: + param_group[ + 'weight_decay'] = self.base_wd * dwconv_decay_mult + # flatten parameters except dcn offset + elif (param.ndim == 1 and not is_dcn_module + and flat_decay_mult is not None): + param_group[ + 'weight_decay'] = self.base_wd * flat_decay_mult + params.append(param_group) + for key, value in param_group.items(): + if key == 'params': + continue + full_name = f'{prefix}.{name}' if prefix else name + print_log( + f'paramwise_options -- {full_name}:{key}={value}', + logger='current') + + if mmcv_full_available(): + from mmcv.ops import DeformConv2d, ModulatedDeformConv2d + is_dcn_module = isinstance(module, + (DeformConv2d, ModulatedDeformConv2d)) + else: + is_dcn_module = False + for child_name, child_mod in module.named_children(): + child_prefix = f'{prefix}.{child_name}' if prefix else child_name + self.add_params( + params, + child_mod, + prefix=child_prefix, + is_dcn_module=is_dcn_module) diff --git a/mmseg/engine/optimizers/layer_decay_optimizer_constructor.py b/mmseg/engine/optimizers/layer_decay_optimizer_constructor.py index e614ad4084..fdae3ca698 100644 --- a/mmseg/engine/optimizers/layer_decay_optimizer_constructor.py +++ b/mmseg/engine/optimizers/layer_decay_optimizer_constructor.py @@ -204,5 +204,4 @@ def __init__(self, optim_wrapper_cfg, paramwise_cfg): warnings.warn('DeprecationWarning: Layer_decay_rate will ' 'be deleted, please use decay_rate instead.') paramwise_cfg['decay_rate'] = paramwise_cfg.pop('layer_decay_rate') - super(LayerDecayOptimizerConstructor, - self).__init__(optim_wrapper_cfg, paramwise_cfg) + super().__init__(optim_wrapper_cfg, paramwise_cfg) diff --git a/mmseg/engine/schedulers/__init__.py b/mmseg/engine/schedulers/__init__.py new file mode 100644 index 0000000000..3cd3f62113 --- /dev/null +++ b/mmseg/engine/schedulers/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .poly_ratio_scheduler import PolyLRRatio + +__all__ = ['PolyLRRatio'] diff --git a/mmseg/engine/schedulers/poly_ratio_scheduler.py b/mmseg/engine/schedulers/poly_ratio_scheduler.py new file mode 100644 index 0000000000..057203acc9 --- /dev/null +++ b/mmseg/engine/schedulers/poly_ratio_scheduler.py @@ -0,0 +1,62 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
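+# A worked sketch of the decay step implemented in ``_get_value`` below
+# (the numbers are illustrative, not taken from any config): with
+# ``power=1``, ``total_iters=10`` and ``last_step=1``, the per-step ratio
+# is ``1 - 1 / (10 - 1 + 1) = 0.9``, i.e. each step moves the current
+# value 10% of the remaining way toward ``eta_min``.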
+from typing import Optional
+
+from mmengine.optim.scheduler import PolyLR
+
+from mmseg.registry import PARAM_SCHEDULERS
+
+
+@PARAM_SCHEDULERS.register_module()
+class PolyLRRatio(PolyLR):
+    """Implements polynomial learning rate decay with ratio.
+
+    This scheduler adjusts the learning rate of each parameter group
+    following a polynomial decay equation. The decay can occur in
+    conjunction with external parameter adjustments made outside this
+    scheduler.
+
+    Args:
+        optimizer (Optimizer or OptimWrapper): Wrapped optimizer.
+        eta_min (float): Minimum learning rate at the end of scheduling.
+            Defaults to 0.
+        eta_min_ratio (float, optional): The ratio of the minimum parameter
+            value to the base parameter value. Either `eta_min` or
+            `eta_min_ratio` should be specified. Defaults to None.
+        power (float): The power of the polynomial. Defaults to 1.0.
+        begin (int): Step at which to start updating the parameters.
+            Defaults to 0.
+        end (int): Step at which to stop updating the parameters.
+            Defaults to INF.
+        last_step (int): The index of last step. Used for resume without
+            state dict. Defaults to -1.
+        by_epoch (bool): Whether the scheduled parameters are updated by
+            epochs. Defaults to True.
+        verbose (bool): Whether to print the value for each update.
+            Defaults to False.
+    """
+
+    def __init__(self, eta_min_ratio: Optional[float] = None, *args,
+                 **kwargs):
+        super().__init__(*args, **kwargs)
+
+        self.eta_min_ratio = eta_min_ratio
+
+    def _get_value(self):
+        """Compute value using chainable form of the scheduler."""
+
+        if self.last_step == 0:
+            return [
+                group[self.param_name] for group in self.optimizer.param_groups
+            ]
+
+        param_groups_value = []
+        for base_value, param_group in zip(self.base_values,
+                                           self.optimizer.param_groups):
+            eta_min = self.eta_min if self.eta_min_ratio is None else \
+                base_value * self.eta_min_ratio
+            step_ratio = (1 - 1 /
+                          (self.total_iters - self.last_step + 1))**self.power
+            step_value = (param_group[self.param_name] -
+                          eta_min) * step_ratio + eta_min
+            param_groups_value.append(step_value)
+
+        return param_groups_value
diff --git a/mmseg/evaluation/__init__.py b/mmseg/evaluation/__init__.py
new file mode 100644
index 0000000000..82b3a8d68d
--- /dev/null
+++ b/mmseg/evaluation/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .metrics import CityscapesMetric, DepthMetric, IoUMetric
+
+__all__ = ['IoUMetric', 'CityscapesMetric', 'DepthMetric']
diff --git a/mmseg/evaluation/metrics/__init__.py b/mmseg/evaluation/metrics/__init__.py
new file mode 100644
index 0000000000..848d4713dc
--- /dev/null
+++ b/mmseg/evaluation/metrics/__init__.py
@@ -0,0 +1,6 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .citys_metric import CityscapesMetric
+from .depth_metric import DepthMetric
+from .iou_metric import IoUMetric
+
+__all__ = ['IoUMetric', 'CityscapesMetric', 'DepthMetric']
diff --git a/mmseg/evaluation/metrics/citys_metric.py b/mmseg/evaluation/metrics/citys_metric.py
new file mode 100644
index 0000000000..32984653c3
--- /dev/null
+++ b/mmseg/evaluation/metrics/citys_metric.py
@@ -0,0 +1,158 @@
+# Copyright (c) OpenMMLab. All rights reserved.
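NOTE (editorial): the chainable update in `_get_value` above rewrites the closed-form polynomial decay as a per-step recurrence, value_t = (value_{t-1} - eta_min) * (1 - 1 / (T - t + 1))**power + eta_min, so values adjusted outside the scheduler keep decaying consistently. A small self-contained sketch of that recurrence, with all numbers assumed for illustration:

    base_value, eta_min_ratio, power, total_iters = 0.01, 0.1, 1.0, 100
    eta_min = base_value * eta_min_ratio  # ratio form of the floor
    value = base_value
    for last_step in range(1, total_iters + 1):
        step_ratio = (1 - 1 / (total_iters - last_step + 1))**power
        value = (value - eta_min) * step_ratio + eta_min
    print(value)  # reaches eta_min (0.001) exactly at the last step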
+import os.path as osp
+import shutil
+from collections import OrderedDict
+from typing import Dict, Optional, Sequence
+
+try:
+    import cityscapesscripts.evaluation.evalPixelLevelSemanticLabeling as CSEval  # noqa
+    import cityscapesscripts.helpers.labels as CSLabels
+except ImportError:
+    CSLabels = None
+    CSEval = None
+
+import numpy as np
+from mmengine.dist import is_main_process, master_only
+from mmengine.evaluator import BaseMetric
+from mmengine.logging import MMLogger, print_log
+from mmengine.utils import mkdir_or_exist
+from PIL import Image
+
+from mmseg.registry import METRICS
+
+
+@METRICS.register_module()
+class CityscapesMetric(BaseMetric):
+    """Cityscapes evaluation metric.
+
+    Args:
+        output_dir (str): The directory for output prediction.
+        ignore_index (int): Index that will be ignored in evaluation.
+            Default: 255.
+        format_only (bool): Only format the results without performing
+            evaluation. It is useful when you want to format the result
+            to a specific format and submit it to the test server.
+            Defaults to False.
+        keep_results (bool): Whether to keep the results. When ``format_only``
+            is True, ``keep_results`` must be True. Defaults to False.
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+        prefix (str, optional): The prefix that will be added in the metric
+            names to disambiguate homonymous metrics of different evaluators.
+            If prefix is not provided in the argument, self.default_prefix
+            will be used instead. Defaults to None.
+    """
+
+    def __init__(self,
+                 output_dir: str,
+                 ignore_index: int = 255,
+                 format_only: bool = False,
+                 keep_results: bool = False,
+                 collect_device: str = 'cpu',
+                 prefix: Optional[str] = None,
+                 **kwargs) -> None:
+        super().__init__(collect_device=collect_device, prefix=prefix)
+        if CSEval is None:
+            raise ImportError('Please run "pip install cityscapesscripts" to '
+                              'install cityscapesscripts first.')
+        self.output_dir = output_dir
+        self.ignore_index = ignore_index
+
+        self.format_only = format_only
+        if format_only:
+            assert keep_results, (
+                'When format_only is True, the results must be kept, so '
+                f'keep_results must be True, but got {keep_results}')
+        self.keep_results = keep_results
+        self.prefix = prefix
+        if is_main_process():
+            mkdir_or_exist(self.output_dir)
+
+    @master_only
+    def __del__(self) -> None:
+        """Clean up."""
+        if not self.keep_results:
+            shutil.rmtree(self.output_dir)
+
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """Process one batch of data and data_samples.
+
+        The processed results should be stored in ``self.results``, which will
+        be used to compute the metrics when all batches have been processed.
+
+        Args:
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of outputs from the model.
+ """ + mkdir_or_exist(self.output_dir) + + for data_sample in data_samples: + pred_label = data_sample['pred_sem_seg']['data'][0].cpu().numpy() + # when evaluating with official cityscapesscripts, + # labelIds should be used + pred_label = self._convert_to_label_id(pred_label) + basename = osp.splitext(osp.basename(data_sample['img_path']))[0] + png_filename = osp.abspath( + osp.join(self.output_dir, f'{basename}.png')) + output = Image.fromarray(pred_label.astype(np.uint8)).convert('P') + output.save(png_filename) + if self.format_only: + # format_only always for test dataset without ground truth + gt_filename = '' + else: + # when evaluating with official cityscapesscripts, + # **_gtFine_labelIds.png is used + gt_filename = data_sample['seg_map_path'].replace( + 'labelTrainIds.png', 'labelIds.png') + self.results.append((png_filename, gt_filename)) + + def compute_metrics(self, results: list) -> Dict[str, float]: + """Compute the metrics from processed results. + + Args: + results (list): Testing results of the dataset. + + Returns: + dict[str: float]: Cityscapes evaluation results. + """ + logger: MMLogger = MMLogger.get_current_instance() + if self.format_only: + logger.info(f'results are saved to {osp.dirname(self.output_dir)}') + return OrderedDict() + + msg = 'Evaluating in Cityscapes style' + if logger is None: + msg = '\n' + msg + print_log(msg, logger=logger) + + eval_results = dict() + print_log( + f'Evaluating results under {self.output_dir} ...', logger=logger) + + CSEval.args.evalInstLevelScore = True + CSEval.args.predictionPath = osp.abspath(self.output_dir) + CSEval.args.evalPixelAccuracy = True + CSEval.args.JSONOutput = False + + pred_list, gt_list = zip(*results) + metric = dict() + eval_results.update( + CSEval.evaluateImgLists(pred_list, gt_list, CSEval.args)) + metric['averageScoreCategories'] = eval_results[ + 'averageScoreCategories'] + metric['averageScoreInstCategories'] = eval_results[ + 'averageScoreInstCategories'] + return metric + + @staticmethod + def _convert_to_label_id(result): + """Convert trainId to id for cityscapes.""" + if isinstance(result, str): + result = np.load(result) + result_copy = result.copy() + for trainId, label in CSLabels.trainId2label.items(): + result_copy[result == trainId] = label.id + + return result_copy diff --git a/mmseg/evaluation/metrics/depth_metric.py b/mmseg/evaluation/metrics/depth_metric.py new file mode 100644 index 0000000000..621d4a31c9 --- /dev/null +++ b/mmseg/evaluation/metrics/depth_metric.py @@ -0,0 +1,212 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from collections import OrderedDict, defaultdict +from typing import Dict, List, Optional, Sequence + +import cv2 +import numpy as np +import torch +from mmengine.dist import is_main_process +from mmengine.evaluator import BaseMetric +from mmengine.logging import MMLogger, print_log +from mmengine.utils import mkdir_or_exist +from prettytable import PrettyTable +from torch import Tensor + +from mmseg.registry import METRICS + + +@METRICS.register_module() +class DepthMetric(BaseMetric): + """Depth estimation evaluation metric. + + Args: + depth_metrics (List[str], optional): List of metrics to compute. If + not specified, defaults to all metrics in self.METRICS. + min_depth_eval (float): Minimum depth value for evaluation. + Defaults to 0.0. + max_depth_eval (float): Maximum depth value for evaluation. + Defaults to infinity. + crop_type (str, optional): Specifies the type of cropping to be used + during evaluation. 
This option can affect how the evaluation mask
+            is generated. Currently, 'nyu_crop' is supported, but other
+            types can be added in the future. Defaults to None, in which
+            case no cropping is applied.
+        depth_scale_factor (float): Factor to scale the depth values.
+            Defaults to 1.0.
+        collect_device (str): Device name used for collecting results from
+            different ranks during distributed training. Must be 'cpu' or
+            'gpu'. Defaults to 'cpu'.
+        output_dir (str): The directory for output prediction. Defaults to
+            None.
+        format_only (bool): Only format the results without performing
+            evaluation. It is useful when you want to save the result
+            to a specific format and submit it to the test server.
+            Defaults to False.
+        prefix (str, optional): The prefix that will be added in the metric
+            names to disambiguate homonymous metrics of different evaluators.
+            If prefix is not provided in the argument, self.default_prefix
+            will be used instead. Defaults to None.
+    """
+    METRICS = ('d1', 'd2', 'd3', 'abs_rel', 'sq_rel', 'rmse', 'rmse_log',
+               'log10', 'silog')
+
+    def __init__(self,
+                 depth_metrics: Optional[List[str]] = None,
+                 min_depth_eval: float = 0.0,
+                 max_depth_eval: float = float('inf'),
+                 crop_type: Optional[str] = None,
+                 depth_scale_factor: float = 1.0,
+                 collect_device: str = 'cpu',
+                 output_dir: Optional[str] = None,
+                 format_only: bool = False,
+                 prefix: Optional[str] = None,
+                 **kwargs) -> None:
+        super().__init__(collect_device=collect_device, prefix=prefix)
+
+        if depth_metrics is None:
+            self.metrics = self.METRICS
+        elif isinstance(depth_metrics, (tuple, list)):
+            for metric in depth_metrics:
+                assert metric in self.METRICS, f'the metric {metric} is not ' \
+                    f'supported. Please use metrics in {self.METRICS}'
+            self.metrics = depth_metrics
+
+        # Validate crop_type, if provided
+        assert crop_type in [
+            None, 'nyu_crop'
+        ], (f'Invalid value for crop_type: {crop_type}. Supported values are '
+            'None or \'nyu_crop\'.')
+        self.crop_type = crop_type
+        self.min_depth_eval = min_depth_eval
+        self.max_depth_eval = max_depth_eval
+        self.output_dir = output_dir
+        if self.output_dir and is_main_process():
+            mkdir_or_exist(self.output_dir)
+        self.format_only = format_only
+        self.depth_scale_factor = depth_scale_factor
+
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """Process one batch of data and data_samples.
+
+        The processed results should be stored in ``self.results``, which will
+        be used to compute the metrics when all batches have been processed.
+
+        Args:
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of outputs from the model.
+        """
+        for data_sample in data_samples:
+            pred_label = data_sample['pred_depth_map']['data'].squeeze()
+            # format_only always for test dataset without ground truth
+            if not self.format_only:
+                gt_depth = data_sample['gt_depth_map']['data'].squeeze().to(
+                    pred_label)
+
+                eval_mask = self._get_eval_mask(gt_depth)
+                self.results.append(
+                    (gt_depth[eval_mask], pred_label[eval_mask]))
+            # format_result
+            if self.output_dir is not None:
+                basename = osp.splitext(osp.basename(
+                    data_sample['img_path']))[0]
+                png_filename = osp.abspath(
+                    osp.join(self.output_dir, f'{basename}.png'))
+                output_mask = pred_label.cpu().numpy(
+                ) * self.depth_scale_factor
+
+                cv2.imwrite(png_filename, output_mask.astype(np.uint16),
+                            [cv2.IMWRITE_PNG_COMPRESSION, 0])
+
+    def _get_eval_mask(self, gt_depth: Tensor):
+        """Generates an evaluation mask based on ground truth depth and
+        cropping.
+
+        Args:
+            gt_depth (Tensor): Ground truth depth map.
+
+        Returns:
+            Tensor: Boolean mask where evaluation should be performed.
+        """
+        valid_mask = torch.logical_and(gt_depth > self.min_depth_eval,
+                                       gt_depth < self.max_depth_eval)
+
+        if self.crop_type == 'nyu_crop':
+            # this implementation is adapted from
+            # https://github.com/zhyever/Monocular-Depth-Estimation-Toolbox/blob/main/depth/datasets/nyu.py  # noqa
+            crop_mask = torch.zeros_like(valid_mask)
+            crop_mask[45:471, 41:601] = 1
+        else:
+            crop_mask = torch.ones_like(valid_mask)
+
+        eval_mask = torch.logical_and(valid_mask, crop_mask)
+        return eval_mask
+
+    @staticmethod
+    def _calc_all_metrics(gt_depth, pred_depth):
+        """Compute all evaluation metrics for one pair of ground truth and
+        prediction; the per-sample values are averaged in
+        ``compute_metrics``."""
+        assert gt_depth.shape == pred_depth.shape
+
+        thresh = torch.max((gt_depth / pred_depth), (pred_depth / gt_depth))
+        diff = pred_depth - gt_depth
+        diff_log = torch.log(pred_depth) - torch.log(gt_depth)
+
+        d1 = torch.sum(thresh < 1.25).float() / len(thresh)
+        d2 = torch.sum(thresh < 1.25**2).float() / len(thresh)
+        d3 = torch.sum(thresh < 1.25**3).float() / len(thresh)
+
+        abs_rel = torch.mean(torch.abs(diff) / gt_depth)
+        sq_rel = torch.mean(torch.pow(diff, 2) / gt_depth)
+
+        rmse = torch.sqrt(torch.mean(torch.pow(diff, 2)))
+        rmse_log = torch.sqrt(torch.mean(torch.pow(diff_log, 2)))
+
+        log10 = torch.mean(
+            torch.abs(torch.log10(pred_depth) - torch.log10(gt_depth)))
+        silog = torch.sqrt(
+            torch.pow(diff_log, 2).mean() -
+            0.5 * torch.pow(diff_log.mean(), 2))
+
+        return {
+            'd1': d1.item(),
+            'd2': d2.item(),
+            'd3': d3.item(),
+            'abs_rel': abs_rel.item(),
+            'sq_rel': sq_rel.item(),
+            'rmse': rmse.item(),
+            'rmse_log': rmse_log.item(),
+            'log10': log10.item(),
+            'silog': silog.item()
+        }
+
+    def compute_metrics(self, results: list) -> Dict[str, float]:
+        """Compute the metrics from processed results.
+
+        Args:
+            results (list): The processed results of each batch.
+
+        Returns:
+            Dict[str, float]: The computed metrics. The keys are the names of
+            the metrics, and the values are the corresponding results. The
+            keys are identical to self.metrics.
+        """
+        logger: MMLogger = MMLogger.get_current_instance()
+        if self.format_only:
+            logger.info(f'results are saved to {osp.dirname(self.output_dir)}')
+            return OrderedDict()
+
+        metrics = defaultdict(list)
+        for gt_depth, pred_depth in results:
+            for key, value in self._calc_all_metrics(gt_depth,
+                                                     pred_depth).items():
+                metrics[key].append(value)
+        metrics = {k: sum(metrics[k]) / len(metrics[k]) for k in self.metrics}
+
+        table_data = PrettyTable()
+        for key, val in metrics.items():
+            table_data.add_column(key, [round(val, 5)])
+
+        print_log('results:', logger=logger)
+        print_log('\n' + table_data.get_string(), logger=logger)
+
+        return metrics
diff --git a/mmseg/metrics/iou_metric.py b/mmseg/evaluation/metrics/iou_metric.py
similarity index 78%
rename from mmseg/metrics/iou_metric.py
rename to mmseg/evaluation/metrics/iou_metric.py
index c5bf28d6fb..16014c7400 100644
--- a/mmseg/metrics/iou_metric.py
+++ b/mmseg/evaluation/metrics/iou_metric.py
@@ -1,11 +1,15 @@
 # Copyright (c) OpenMMLab. All rights reserved.
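NOTE (editorial): for readers skimming the metric definitions in `_calc_all_metrics` above, this is what the main quantities evaluate to on a toy, already-masked pair of depth vectors (a sketch, not part of the diff):

    import torch

    gt = torch.tensor([1.0, 2.0, 4.0])
    pred = torch.tensor([1.1, 1.8, 4.4])
    thresh = torch.max(gt / pred, pred / gt)
    d1 = (thresh < 1.25).float().mean()           # threshold accuracy, here 1.0
    abs_rel = (torch.abs(pred - gt) / gt).mean()  # mean abs relative error, ~0.1
    rmse = torch.sqrt(((pred - gt) ** 2).mean())
    diff_log = torch.log(pred) - torch.log(gt)
    silog = torch.sqrt((diff_log ** 2).mean() - 0.5 * diff_log.mean() ** 2)
    print(d1.item(), abs_rel.item(), rmse.item(), silog.item())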
+import os.path as osp
 from collections import OrderedDict
 from typing import Dict, List, Optional, Sequence
 
 import numpy as np
 import torch
+from mmengine.dist import is_main_process
 from mmengine.evaluator import BaseMetric
 from mmengine.logging import MMLogger, print_log
+from mmengine.utils import mkdir_or_exist
+from PIL import Image
 from prettytable import PrettyTable
 
 from mmseg.registry import METRICS
@@ -27,6 +31,12 @@ class IoUMetric(BaseMetric):
         collect_device (str): Device name used for collecting results from
             different ranks during distributed training. Must be 'cpu' or
             'gpu'. Defaults to 'cpu'.
+        output_dir (str): The directory for output prediction. Defaults to
+            None.
+        format_only (bool): Only format the results without performing
+            evaluation. It is useful when you want to save the result
+            to a specific format and submit it to the test server.
+            Defaults to False.
         prefix (str, optional): The prefix that will be added in the metric
             names to disambiguate homonymous metrics of different evaluators.
             If prefix is not provided in the argument, self.default_prefix
@@ -39,33 +49,55 @@ def __init__(self,
                  nan_to_num: Optional[int] = None,
                  beta: int = 1,
                  collect_device: str = 'cpu',
-                 prefix: Optional[str] = None) -> None:
+                 output_dir: Optional[str] = None,
+                 format_only: bool = False,
+                 prefix: Optional[str] = None,
+                 **kwargs) -> None:
         super().__init__(collect_device=collect_device, prefix=prefix)
 
         self.ignore_index = ignore_index
         self.metrics = iou_metrics
         self.nan_to_num = nan_to_num
         self.beta = beta
+        self.output_dir = output_dir
+        if self.output_dir and is_main_process():
+            mkdir_or_exist(self.output_dir)
+        self.format_only = format_only
 
-    def process(self, data_batch: Sequence[dict],
-                predictions: Sequence[dict]) -> None:
-        """Process one batch of data and predictions.
+    def process(self, data_batch: dict, data_samples: Sequence[dict]) -> None:
+        """Process one batch of data and data_samples.
 
         The processed results should be stored in ``self.results``, which will
-        be used to computed the metrics when all batches have been processed.
+        be used to compute the metrics when all batches have been processed.
 
         Args:
-            data_batch (Sequence[dict]): A batch of data from the dataloader.
-            predictions (Sequence[dict]): A batch of outputs from the model.
+            data_batch (dict): A batch of data from the dataloader.
+            data_samples (Sequence[dict]): A batch of outputs from the model.
         """
         num_classes = len(self.dataset_meta['classes'])
-        for data, pred in zip(data_batch, predictions):
-            pred_label = pred['pred_sem_seg']['data'].squeeze()
-            label = data['data_sample']['gt_sem_seg']['data'].squeeze().to(
-                pred_label)
-            self.results.append(
-                self.intersect_and_union(pred_label, label, num_classes,
-                                         self.ignore_index))
+        for data_sample in data_samples:
+            pred_label = data_sample['pred_sem_seg']['data'].squeeze()
+            # format_only always for test dataset without ground truth
+            if not self.format_only:
+                label = data_sample['gt_sem_seg']['data'].squeeze().to(
+                    pred_label)
+                self.results.append(
+                    self.intersect_and_union(pred_label, label, num_classes,
+                                             self.ignore_index))
+            # format_result
+            if self.output_dir is not None:
+                basename = osp.splitext(osp.basename(
+                    data_sample['img_path']))[0]
+                png_filename = osp.abspath(
+                    osp.join(self.output_dir, f'{basename}.png'))
+                output_mask = pred_label.cpu().numpy()
+                # The index range of official ADE20k dataset is from 0 to 150.
+                # But the index range of output is from 0 to 149.
+                # That is because we set reduce_zero_label=True.
+ if data_sample.get('reduce_zero_label', False): + output_mask = output_mask + 1 + output = Image.fromarray(output_mask.astype(np.uint8)) + output.save(png_filename) def compute_metrics(self, results: list) -> Dict[str, float]: """Compute the metrics from processed results. @@ -80,7 +112,9 @@ def compute_metrics(self, results: list) -> Dict[str, float]: mRecall. """ logger: MMLogger = MMLogger.get_current_instance() - + if self.format_only: + logger.info(f'results are saved to {osp.dirname(self.output_dir)}') + return OrderedDict() # convert list of tuples to tuple of lists, e.g. # [(A_1, B_1, C_1, D_1), ..., (A_n, B_n, C_n, D_n)] to # ([A_1, ..., A_n], ..., [D_1, ..., D_n]) @@ -214,7 +248,7 @@ def f_score(precision, recall, beta=1): metrics = [metrics] allowed_metrics = ['mIoU', 'mDice', 'mFscore'] if not set(metrics).issubset(set(allowed_metrics)): - raise KeyError('metrics {} is not supported'.format(metrics)) + raise KeyError(f'metrics {metrics} is not supported') all_acc = total_area_intersect.sum() / total_area_label.sum() ret_metrics = OrderedDict({'aAcc': all_acc}) diff --git a/mmseg/metrics/__init__.py b/mmseg/metrics/__init__.py deleted file mode 100644 index aec08bb071..0000000000 --- a/mmseg/metrics/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .citys_metric import CitysMetric -from .iou_metric import IoUMetric - -__all__ = ['IoUMetric', 'CitysMetric'] diff --git a/mmseg/metrics/citys_metric.py b/mmseg/metrics/citys_metric.py deleted file mode 100644 index 73516e7786..0000000000 --- a/mmseg/metrics/citys_metric.py +++ /dev/null @@ -1,147 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import os.path as osp -from typing import Dict, List, Optional, Sequence - -import mmcv -import numpy as np -from mmengine.evaluator import BaseMetric -from mmengine.logging import MMLogger, print_log -from PIL import Image - -from mmseg.registry import METRICS - - -@METRICS.register_module() -class CitysMetric(BaseMetric): - """Cityscapes evaluation metric. - - Args: - ignore_index (int): Index that will be ignored in evaluation. - Default: 255. - citys_metrics (list[str] | str): Metrics to be evaluated, - Default: ['cityscapes']. - to_label_id (bool): whether convert output to label_id for - submission. Default: True. - suffix (str): The filename prefix of the png files. - If the prefix is "somepath/xxx", the png files will be - named "somepath/xxx.png". Default: '.format_cityscapes'. - collect_device (str): Device name used for collecting results from - different ranks during distributed training. Must be 'cpu' or - 'gpu'. Defaults to 'cpu'. - prefix (str, optional): The prefix that will be added in the metric - names to disambiguate homonymous metrics of different evaluators. - If prefix is not provided in the argument, self.default_prefix - will be used instead. Defaults to None. - """ - - def __init__(self, - ignore_index: int = 255, - citys_metrics: List[str] = ['cityscapes'], - to_label_id: bool = True, - suffix: str = '.format_cityscapes', - collect_device: str = 'cpu', - prefix: Optional[str] = None) -> None: - super().__init__(collect_device=collect_device, prefix=prefix) - - self.ignore_index = ignore_index - self.metrics = citys_metrics - assert self.metrics[0] == 'cityscapes' - self.to_label_id = to_label_id - self.suffix = suffix - - def process(self, data_batch: Sequence[dict], - predictions: Sequence[dict]) -> None: - """Process one batch of data and predictions. 
- - The processed results should be stored in ``self.results``, which will - be used to computed the metrics when all batches have been processed. - - Args: - data_batch (Sequence[dict]): A batch of data from the dataloader. - predictions (Sequence[dict]): A batch of outputs from the model. - """ - mmcv.mkdir_or_exist(self.suffix) - - for pred in predictions: - pred_label = pred['pred_sem_seg']['data'][0].cpu().numpy() - # results2img - if self.to_label_id: - pred_label = self._convert_to_label_id(pred_label) - basename = osp.splitext(osp.basename(pred['img_path']))[0] - png_filename = osp.join(self.suffix, f'{basename}.png') - output = Image.fromarray(pred_label.astype(np.uint8)).convert('P') - import cityscapesscripts.helpers.labels as CSLabels - palette = np.zeros((len(CSLabels.id2label), 3), dtype=np.uint8) - for label_id, label in CSLabels.id2label.items(): - palette[label_id] = label.color - output.putpalette(palette) - output.save(png_filename) - - ann_dir = osp.join( - data_batch[0]['data_sample']['seg_map_path'].split('val')[0], - 'val') - self.results.append(ann_dir) - - def compute_metrics(self, results: list) -> Dict[str, float]: - """Compute the metrics from processed results. - - Args: - results (list): Testing results of the dataset. - logger (logging.Logger | str | None): Logger used for printing - related information during evaluation. Default: None. - imgfile_prefix (str | None): The prefix of output image file - - Returns: - dict[str: float]: Cityscapes evaluation results. - """ - logger: MMLogger = MMLogger.get_current_instance() - try: - import cityscapesscripts.evaluation.evalPixelLevelSemanticLabeling as CSEval # noqa - except ImportError: - raise ImportError('Please run "pip install cityscapesscripts" to ' - 'install cityscapesscripts first.') - msg = 'Evaluating in Cityscapes style' - - if logger is None: - msg = '\n' + msg - print_log(msg, logger=logger) - - result_dir = self.suffix - - eval_results = dict() - print_log(f'Evaluating results under {result_dir} ...', logger=logger) - - CSEval.args.evalInstLevelScore = True - CSEval.args.predictionPath = osp.abspath(result_dir) - CSEval.args.evalPixelAccuracy = True - CSEval.args.JSONOutput = False - - seg_map_list = [] - pred_list = [] - ann_dir = results[0] - # when evaluating with official cityscapesscripts, - # **_gtFine_labelIds.png is used - for seg_map in mmcv.scandir( - ann_dir, 'gtFine_labelIds.png', recursive=True): - seg_map_list.append(osp.join(ann_dir, seg_map)) - pred_list.append(CSEval.getPrediction(CSEval.args, seg_map)) - metric = dict() - eval_results.update( - CSEval.evaluateImgLists(pred_list, seg_map_list, CSEval.args)) - metric['averageScoreCategories'] = eval_results[ - 'averageScoreCategories'] - metric['averageScoreInstCategories'] = eval_results[ - 'averageScoreInstCategories'] - return metric - - @staticmethod - def _convert_to_label_id(result): - """Convert trainId to id for cityscapes.""" - if isinstance(result, str): - result = np.load(result) - import cityscapesscripts.helpers.labels as CSLabels - result_copy = result.copy() - for trainId, label in CSLabels.trainId2label.items(): - result_copy[result == trainId] = label.id - - return result_copy diff --git a/mmseg/models/__init__.py b/mmseg/models/__init__.py index 7a520fb2fa..a98951283c 100644 --- a/mmseg/models/__init__.py +++ b/mmseg/models/__init__.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. 
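NOTE (editorial): before the model-side changes below, a short sketch of the per-batch statistics that `IoUMetric` accumulates through `intersect_and_union` (an illustrative reimplementation under assumed semantics, not the code in this diff):

    import torch

    def toy_intersect_and_union(pred, label, num_classes, ignore_index=255):
        mask = label != ignore_index
        pred, label = pred[mask], label[mask]
        intersect = pred[pred == label]
        area_intersect = torch.bincount(intersect, minlength=num_classes)
        area_pred = torch.bincount(pred, minlength=num_classes)
        area_label = torch.bincount(label, minlength=num_classes)
        return area_intersect, area_pred + area_label - area_intersect

    pred = torch.tensor([0, 0, 1, 1])
    label = torch.tensor([0, 1, 1, 255])
    inter, union = toy_intersect_and_union(pred, label, num_classes=2)
    print((inter / union.clamp(min=1)).tolist())  # per-class IoU: [0.5, 0.5]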
+from .assigners import *  # noqa: F401,F403
 from .backbones import *  # noqa: F401,F403
 from .builder import (BACKBONES, HEADS, LOSSES, SEGMENTORS, build_backbone,
                       build_head, build_loss, build_segmentor)
@@ -7,6 +8,7 @@
 from .losses import *  # noqa: F401,F403
 from .necks import *  # noqa: F401,F403
 from .segmentors import *  # noqa: F401,F403
+from .text_encoder import *  # noqa: F401,F403
 
 __all__ = [
     'BACKBONES', 'HEADS', 'LOSSES', 'SEGMENTORS', 'build_backbone',
diff --git a/mmseg/models/assigners/__init__.py b/mmseg/models/assigners/__init__.py
new file mode 100644
index 0000000000..d49b1b18b9
--- /dev/null
+++ b/mmseg/models/assigners/__init__.py
@@ -0,0 +1,12 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .base_assigner import BaseAssigner
+from .hungarian_assigner import HungarianAssigner
+from .match_cost import ClassificationCost, CrossEntropyLossCost, DiceCost
+
+__all__ = [
+    'BaseAssigner',
+    'HungarianAssigner',
+    'ClassificationCost',
+    'CrossEntropyLossCost',
+    'DiceCost',
+]
diff --git a/mmseg/models/assigners/base_assigner.py b/mmseg/models/assigners/base_assigner.py
new file mode 100644
index 0000000000..97895cdac2
--- /dev/null
+++ b/mmseg/models/assigners/base_assigner.py
@@ -0,0 +1,18 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import ABCMeta, abstractmethod
+from typing import Optional
+
+from mmengine.structures import InstanceData
+
+
+class BaseAssigner(metaclass=ABCMeta):
+    """Base assigner that assigns masks to ground truth class labels."""
+
+    @abstractmethod
+    def assign(self,
+               pred_instances: InstanceData,
+               gt_instances: InstanceData,
+               gt_instances_ignore: Optional[InstanceData] = None,
+               **kwargs):
+        """Assign masks to either a ground truth class label or a negative
+        label."""
diff --git a/mmseg/models/assigners/hungarian_assigner.py b/mmseg/models/assigners/hungarian_assigner.py
new file mode 100644
index 0000000000..28868f0a04
--- /dev/null
+++ b/mmseg/models/assigners/hungarian_assigner.py
@@ -0,0 +1,86 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Union
+
+import torch
+from mmengine import ConfigDict
+from mmengine.structures import InstanceData
+from scipy.optimize import linear_sum_assignment
+from torch.cuda.amp import autocast
+
+from mmseg.registry import TASK_UTILS
+from .base_assigner import BaseAssigner
+
+
+@TASK_UTILS.register_module()
+class HungarianAssigner(BaseAssigner):
+    """Computes one-to-one matching between prediction masks and ground truth.
+
+    This class uses bipartite matching-based assignment to compute an
+    assignment between the prediction masks and the ground truth. The
+    assignment result is based on the weighted sum of match costs. The
+    Hungarian algorithm is used to calculate the best matching with the
+    minimum cost. The prediction masks that are not matched are classified
+    as background.
+
+    Args:
+        match_costs (ConfigDict|List[ConfigDict]): Match cost configs.
+    """
+
+    def __init__(
+        self, match_costs: Union[List[Union[dict, ConfigDict]], dict,
+                                 ConfigDict]
+    ) -> None:
+
+        if isinstance(match_costs, dict):
+            match_costs = [match_costs]
+        elif isinstance(match_costs, list):
+            assert len(match_costs) > 0, \
+                'match_costs must not be an empty list.'
+
+        self.match_costs = [
+            TASK_UTILS.build(match_cost) for match_cost in match_costs
+        ]
+
+    def assign(self, pred_instances: InstanceData, gt_instances: InstanceData,
+               **kwargs):
+        """Computes one-to-one matching based on the weighted costs.
+
+        This method assigns each query prediction to a ground truth or
+        background. The assignment first calculates the cost for each
+        category assigned to each query mask, and then uses the
+        Hungarian algorithm to calculate the minimum cost as the best
+        match.
+
+        Args:
+            pred_instances (InstanceData): Instances of model
+                predictions. It includes "masks", with shape
+                (n, h, w) or (n, l), and "cls", with shape
+                (n, num_classes+1).
+            gt_instances (InstanceData): Ground truth of instance
+                annotations. It includes "labels", with shape (k, ),
+                and "masks", with shape (k, h, w) or (k, l).
+
+        Returns:
+            matched_query_inds (Tensor): The indexes of matched queries.
+            matched_label_inds (Tensor): The indexes of matched labels.
+        """
+        # compute weighted cost
+        cost_list = []
+        with autocast(enabled=False):
+            for match_cost in self.match_costs:
+                cost = match_cost(
+                    pred_instances=pred_instances, gt_instances=gt_instances)
+                cost_list.append(cost)
+            cost = torch.stack(cost_list).sum(dim=0)
+
+        device = cost.device
+        # do Hungarian matching on CPU using linear_sum_assignment
+        cost = cost.detach().cpu()
+        if linear_sum_assignment is None:
+            raise ImportError('Please run "pip install scipy" '
+                              'to install scipy first.')
+
+        matched_query_inds, matched_label_inds = linear_sum_assignment(cost)
+        matched_query_inds = torch.from_numpy(matched_query_inds).to(device)
+        matched_label_inds = torch.from_numpy(matched_label_inds).to(device)
+
+        return matched_query_inds, matched_label_inds
diff --git a/mmseg/models/assigners/match_cost.py b/mmseg/models/assigners/match_cost.py
new file mode 100644
index 0000000000..560df85290
--- /dev/null
+++ b/mmseg/models/assigners/match_cost.py
@@ -0,0 +1,231 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from abc import abstractmethod
+from typing import Union
+
+import torch
+import torch.nn.functional as F
+from mmengine.structures import InstanceData
+from torch import Tensor
+
+from mmseg.registry import TASK_UTILS
+
+
+class BaseMatchCost:
+    """Base match cost class.
+
+    Args:
+        weight (Union[float, int]): Cost weight. Defaults to 1.
+    """
+
+    def __init__(self, weight: Union[float, int] = 1.) -> None:
+        self.weight = weight
+
+    @abstractmethod
+    def __call__(self, pred_instances: InstanceData,
+                 gt_instances: InstanceData, **kwargs) -> Tensor:
+        """Compute match cost.
+
+        Args:
+            pred_instances (InstanceData): Instances of model predictions.
+                It often includes "labels" and "scores".
+            gt_instances (InstanceData): Ground truth of instance
+                annotations. It usually includes "labels".
+
+        Returns:
+            Tensor: Match Cost matrix of shape (num_preds, num_gts).
+        """
+        pass
+
+
+@TASK_UTILS.register_module()
+class ClassificationCost(BaseMatchCost):
+    """ClsSoftmaxCost.
+
+    Args:
+        weight (Union[float, int]): Cost weight. Defaults to 1.
+
+    Examples:
+        >>> from mmseg.models.assigners import ClassificationCost
+        >>> import torch
+        >>> self = ClassificationCost()
+        >>> cls_pred = torch.rand(4, 3)
+        >>> gt_labels = torch.tensor([0, 1, 2])
+        >>> self(cls_pred, gt_labels)
+        tensor([[-0.3430, -0.3525, -0.3045],
+                [-0.3077, -0.2931, -0.3992],
+                [-0.3664, -0.3455, -0.2881],
+                [-0.3343, -0.2701, -0.3956]])
+    """
+
+    def __init__(self, weight: Union[float, int] = 1) -> None:
+        super().__init__(weight=weight)
+
+    def __call__(self, pred_instances: InstanceData,
+                 gt_instances: InstanceData, **kwargs) -> Tensor:
+        """Compute match cost.
+
+        Args:
+            pred_instances (InstanceData): "scores" inside is
+                predicted classification logits, of shape
+                (num_queries, num_class).
+            gt_instances (InstanceData): "labels" inside should have
+                shape (num_gt, ).
+
+        Returns:
+            Tensor: Match Cost matrix of shape (num_preds, num_gts).
+        """
+        assert hasattr(pred_instances, 'scores'), \
+            "pred_instances must contain 'scores'"
+        assert hasattr(gt_instances, 'labels'), \
+            "gt_instances must contain 'labels'"
+        pred_scores = pred_instances.scores
+        gt_labels = gt_instances.labels
+
+        pred_scores = pred_scores.softmax(-1)
+        cls_cost = -pred_scores[:, gt_labels]
+
+        return cls_cost * self.weight
+
+
+@TASK_UTILS.register_module()
+class DiceCost(BaseMatchCost):
+    """Cost of mask assignments based on dice losses.
+
+    Args:
+        pred_act (bool): Whether to apply sigmoid to mask_pred.
+            Defaults to False.
+        eps (float): Defaults to 1e-3.
+        naive_dice (bool): If True, use the naive dice loss in which the
+            denominator uses the first power of the terms. If False, use
+            the second power, as adopted by K-Net and SOLO.
+            Defaults to True.
+        weight (Union[float, int]): Cost weight. Defaults to 1.
+    """
+
+    def __init__(self,
+                 pred_act: bool = False,
+                 eps: float = 1e-3,
+                 naive_dice: bool = True,
+                 weight: Union[float, int] = 1.) -> None:
+        super().__init__(weight=weight)
+        self.pred_act = pred_act
+        self.eps = eps
+        self.naive_dice = naive_dice
+
+    def _binary_mask_dice_loss(self, mask_preds: Tensor,
+                               gt_masks: Tensor) -> Tensor:
+        """
+        Args:
+            mask_preds (Tensor): Mask prediction in shape (num_queries, *).
+            gt_masks (Tensor): Ground truth in shape (num_gt, *)
+                store 0 or 1, 0 for negative class and 1 for
+                positive class.
+
+        Returns:
+            Tensor: Dice cost matrix in shape (num_queries, num_gt).
+        """
+        mask_preds = mask_preds.flatten(1)
+        gt_masks = gt_masks.flatten(1).float()
+        numerator = 2 * torch.einsum('nc,mc->nm', mask_preds, gt_masks)
+        if self.naive_dice:
+            denominator = mask_preds.sum(-1)[:, None] + \
+                gt_masks.sum(-1)[None, :]
+        else:
+            denominator = mask_preds.pow(2).sum(1)[:, None] + \
+                gt_masks.pow(2).sum(1)[None, :]
+        loss = 1 - (numerator + self.eps) / (denominator + self.eps)
+        return loss
+
+    def __call__(self, pred_instances: InstanceData,
+                 gt_instances: InstanceData, **kwargs) -> Tensor:
+        """Compute match cost.
+
+        Args:
+            pred_instances (InstanceData): Predicted instances which
+                must contain "masks".
+            gt_instances (InstanceData): Ground truth which must contain
+                "masks".
+
+        Returns:
+            Tensor: Match Cost matrix of shape (num_preds, num_gts).
+        """
+        assert hasattr(pred_instances, 'masks'), \
+            "pred_instances must contain 'masks'"
+        assert hasattr(gt_instances, 'masks'), \
+            "gt_instances must contain 'masks'"
+        pred_masks = pred_instances.masks
+        gt_masks = gt_instances.masks
+
+        if self.pred_act:
+            pred_masks = pred_masks.sigmoid()
+        dice_cost = self._binary_mask_dice_loss(pred_masks, gt_masks)
+        return dice_cost * self.weight
+
+
+@TASK_UTILS.register_module()
+class CrossEntropyLossCost(BaseMatchCost):
+    """CrossEntropyLossCost.
+
+    Args:
+        use_sigmoid (bool): Whether the prediction uses sigmoid
+            or softmax. Defaults to True.
+        weight (Union[float, int]): Cost weight. Defaults to 1.
+    """
+
+    def __init__(self,
+                 use_sigmoid: bool = True,
+                 weight: Union[float, int] = 1.)
-> None: + super().__init__(weight=weight) + self.use_sigmoid = use_sigmoid + + def _binary_cross_entropy(self, cls_pred: Tensor, + gt_labels: Tensor) -> Tensor: + """ + Args: + cls_pred (Tensor): The prediction with shape (num_queries, 1, *) or + (num_queries, *). + gt_labels (Tensor): The learning label of prediction with + shape (num_gt, *). + + Returns: + Tensor: Cross entropy cost matrix in shape (num_queries, num_gt). + """ + cls_pred = cls_pred.flatten(1).float() + gt_labels = gt_labels.flatten(1).float() + n = cls_pred.shape[1] + pos = F.binary_cross_entropy_with_logits( + cls_pred, torch.ones_like(cls_pred), reduction='none') + neg = F.binary_cross_entropy_with_logits( + cls_pred, torch.zeros_like(cls_pred), reduction='none') + cls_cost = torch.einsum('nc,mc->nm', pos, gt_labels) + \ + torch.einsum('nc,mc->nm', neg, 1 - gt_labels) + cls_cost = cls_cost / n + + return cls_cost + + def __call__(self, pred_instances: InstanceData, + gt_instances: InstanceData, **kwargs) -> Tensor: + """Compute match cost. + + Args: + pred_instances (:obj:`InstanceData`): Predicted instances which + must contain ``masks``. + gt_instances (:obj:`InstanceData`): Ground truth which must contain + ``masks``. + + Returns: + Tensor: Match Cost matrix of shape (num_preds, num_gts). + """ + assert hasattr(pred_instances, 'masks'), \ + "pred_instances must contain 'masks'" + assert hasattr(gt_instances, 'masks'), \ + "gt_instances must contain 'masks'" + pred_masks = pred_instances.masks + gt_masks = gt_instances.masks + if self.use_sigmoid: + cls_cost = self._binary_cross_entropy(pred_masks, gt_masks) + else: + raise NotImplementedError + + return cls_cost * self.weight diff --git a/mmseg/models/backbones/__init__.py b/mmseg/models/backbones/__init__.py index bda42bb692..784d3dfdb7 100644 --- a/mmseg/models/backbones/__init__.py +++ b/mmseg/models/backbones/__init__.py @@ -3,6 +3,7 @@ from .bisenetv1 import BiSeNetV1 from .bisenetv2 import BiSeNetV2 from .cgnet import CGNet +from .ddrnet import DDRNet from .erfnet import ERFNet from .fast_scnn import FastSCNN from .hrnet import HRNet @@ -11,6 +12,8 @@ from .mit import MixVisionTransformer from .mobilenet_v2 import MobileNetV2 from .mobilenet_v3 import MobileNetV3 +from .mscan import MSCAN +from .pidnet import PIDNet from .resnest import ResNeSt from .resnet import ResNet, ResNetV1c, ResNetV1d from .resnext import ResNeXt @@ -20,11 +23,13 @@ from .twins import PCPVT, SVT from .unet import UNet from .vit import VisionTransformer +from .vpd import VPD __all__ = [ 'ResNet', 'ResNetV1c', 'ResNetV1d', 'ResNeXt', 'HRNet', 'FastSCNN', 'ResNeSt', 'MobileNetV2', 'UNet', 'CGNet', 'MobileNetV3', 'VisionTransformer', 'SwinTransformer', 'MixVisionTransformer', 'BiSeNetV1', 'BiSeNetV2', 'ICNet', 'TIMMBackbone', 'ERFNet', 'PCPVT', - 'SVT', 'STDCNet', 'STDCContextPathNet', 'BEiT', 'MAE' + 'SVT', 'STDCNet', 'STDCContextPathNet', 'BEiT', 'MAE', 'PIDNet', 'MSCAN', + 'DDRNet', 'VPD' ] diff --git a/mmseg/models/backbones/beit.py b/mmseg/models/backbones/beit.py index 3b2d1413df..e5da71e729 100644 --- a/mmseg/models/backbones/beit.py +++ b/mmseg/models/backbones/beit.py @@ -7,9 +7,11 @@ import torch.nn.functional as F from mmcv.cnn import build_norm_layer from mmcv.cnn.bricks.drop import build_dropout -from mmcv.cnn.utils.weight_init import (constant_init, kaiming_init, +from mmengine.model import BaseModule, ModuleList +from mmengine.model.weight_init import (constant_init, kaiming_init, trunc_normal_) -from mmcv.runner import BaseModule, ModuleList, _load_checkpoint +from 
mmengine.runner.checkpoint import _load_checkpoint +from scipy import interpolate from torch.nn.modules.batchnorm import _BatchNorm from torch.nn.modules.utils import _pair as to_2tuple @@ -17,11 +19,6 @@ from ..utils import PatchEmbed from .vit import TransformerEncoderLayer as VisionTransformerEncoderLayer -try: - from scipy import interpolate -except ImportError: - interpolate = None - class BEiTAttention(BaseModule): """Window based multi-head self-attention (W-MSA) module with relative @@ -193,7 +190,7 @@ def __init__(self, init_values=None): attn_cfg.update(dict(window_size=window_size, qk_scale=None)) - super(BEiTTransformerEncoderLayer, self).__init__( + super().__init__( embed_dims=embed_dims, num_heads=num_heads, feedforward_channels=feedforward_channels, @@ -213,9 +210,9 @@ def __init__(self, self.drop_path = build_dropout( dropout_layer) if dropout_layer else nn.Identity() self.gamma_1 = nn.Parameter( - init_values * torch.ones((embed_dims)), requires_grad=True) + init_values * torch.ones(embed_dims), requires_grad=True) self.gamma_2 = nn.Parameter( - init_values * torch.ones((embed_dims)), requires_grad=True) + init_values * torch.ones(embed_dims), requires_grad=True) def build_attn(self, attn_cfg): self.attn = BEiTAttention(**attn_cfg) @@ -286,7 +283,7 @@ def __init__(self, pretrained=None, init_values=0.1, init_cfg=None): - super(BEiT, self).__init__(init_cfg=init_cfg) + super().__init__(init_cfg=init_cfg) if isinstance(img_size, int): img_size = to_2tuple(img_size) elif isinstance(img_size, tuple): @@ -504,7 +501,7 @@ def _init_weights(m): state_dict = self.resize_rel_pos_embed(checkpoint) self.load_state_dict(state_dict, False) elif self.init_cfg is not None: - super(BEiT, self).init_weights() + super().init_weights() else: # We only implement the 'jax_impl' initialization implemented at # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py#L353 # noqa: E501 @@ -550,7 +547,7 @@ def forward(self, inputs): return tuple(outs) def train(self, mode=True): - super(BEiT, self).train(mode) + super().train(mode) if mode and self.norm_eval: for m in self.modules(): if isinstance(m, nn.LayerNorm): diff --git a/mmseg/models/backbones/bisenetv1.py b/mmseg/models/backbones/bisenetv1.py index 4eded90764..ca58bf9c59 100644 --- a/mmseg/models/backbones/bisenetv1.py +++ b/mmseg/models/backbones/bisenetv1.py @@ -2,10 +2,10 @@ import torch import torch.nn as nn from mmcv.cnn import ConvModule -from mmcv.runner import BaseModule +from mmengine.model import BaseModule -from mmseg.ops import resize from mmseg.registry import MODELS +from ..utils import resize class SpatialPath(BaseModule): @@ -29,7 +29,7 @@ def __init__(self, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'), init_cfg=None): - super(SpatialPath, self).__init__(init_cfg=init_cfg) + super().__init__(init_cfg=init_cfg) assert len(num_channels) == 4, 'Length of input channels \ of Spatial Path must be 4!' @@ -98,7 +98,7 @@ def __init__(self, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'), init_cfg=None): - super(AttentionRefinementModule, self).__init__(init_cfg=init_cfg) + super().__init__(init_cfg=init_cfg) self.conv_layer = ConvModule( in_channels=in_channels, out_channels=out_channel, @@ -152,7 +152,7 @@ def __init__(self, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'), init_cfg=None): - super(ContextPath, self).__init__(init_cfg=init_cfg) + super().__init__(init_cfg=init_cfg) assert len(context_channels) == 3, 'Length of input channels \ of Context Path must be 3!' 
@@ -228,7 +228,7 @@ def __init__(self, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'), init_cfg=None): - super(FeatureFusionModule, self).__init__(init_cfg=init_cfg) + super().__init__(init_cfg=init_cfg) self.conv1 = ConvModule( in_channels=in_channels, out_channels=out_channels, @@ -304,7 +304,7 @@ def __init__(self, act_cfg=dict(type='ReLU'), init_cfg=None): - super(BiSeNetV1, self).__init__(init_cfg=init_cfg) + super().__init__(init_cfg=init_cfg) assert len(spatial_channels) == 4, 'Length of input channels \ of Spatial Path must be 4!' diff --git a/mmseg/models/backbones/bisenetv2.py b/mmseg/models/backbones/bisenetv2.py index 50693a4f27..32aa49822f 100644 --- a/mmseg/models/backbones/bisenetv2.py +++ b/mmseg/models/backbones/bisenetv2.py @@ -3,10 +3,10 @@ import torch.nn as nn from mmcv.cnn import (ConvModule, DepthwiseSeparableConvModule, build_activation_layer, build_norm_layer) -from mmcv.runner import BaseModule +from mmengine.model import BaseModule -from mmseg.ops import resize from mmseg.registry import MODELS +from ..utils import resize class DetailBranch(BaseModule): @@ -37,7 +37,7 @@ def __init__(self, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'), init_cfg=None): - super(DetailBranch, self).__init__(init_cfg=init_cfg) + super().__init__(init_cfg=init_cfg) detail_branch = [] for i in range(len(detail_channels)): if i == 0: @@ -126,7 +126,7 @@ def __init__(self, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'), init_cfg=None): - super(StemBlock, self).__init__(init_cfg=init_cfg) + super().__init__(init_cfg=init_cfg) self.conv_first = ConvModule( in_channels=in_channels, @@ -207,7 +207,7 @@ def __init__(self, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'), init_cfg=None): - super(GELayer, self).__init__(init_cfg=init_cfg) + super().__init__(init_cfg=init_cfg) mid_channel = in_channels * exp_ratio self.conv1 = ConvModule( in_channels=in_channels, @@ -326,7 +326,7 @@ def __init__(self, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'), init_cfg=None): - super(CEBlock, self).__init__(init_cfg=init_cfg) + super().__init__(init_cfg=init_cfg) self.in_channels = in_channels self.out_channels = out_channels self.gap = nn.Sequential( @@ -385,7 +385,7 @@ def __init__(self, in_channels=3, exp_ratio=6, init_cfg=None): - super(SemanticBranch, self).__init__(init_cfg=init_cfg) + super().__init__(init_cfg=init_cfg) self.in_channels = in_channels self.semantic_channels = semantic_channels self.semantic_stages = [] @@ -458,7 +458,7 @@ def __init__(self, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'), init_cfg=None): - super(BGALayer, self).__init__(init_cfg=init_cfg) + super().__init__(init_cfg=init_cfg) self.out_channels = out_channels self.align_corners = align_corners self.detail_dwconv = nn.Sequential( @@ -594,7 +594,7 @@ def __init__(self, dict( type='Constant', val=1, layer=['_BatchNorm', 'GroupNorm']) ] - super(BiSeNetV2, self).__init__(init_cfg=init_cfg) + super().__init__(init_cfg=init_cfg) self.in_channels = in_channels self.out_indices = out_indices self.detail_channels = detail_channels diff --git a/mmseg/models/backbones/cgnet.py b/mmseg/models/backbones/cgnet.py index a3da0a2aea..b74b494f53 100644 --- a/mmseg/models/backbones/cgnet.py +++ b/mmseg/models/backbones/cgnet.py @@ -5,8 +5,8 @@ import torch.nn as nn import torch.utils.checkpoint as cp from mmcv.cnn import ConvModule, build_conv_layer, build_norm_layer -from mmcv.runner import BaseModule -from mmcv.utils.parrots_wrapper import _BatchNorm +from mmengine.model import BaseModule +from 
mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm from mmseg.registry import MODELS @@ -25,7 +25,7 @@ class GlobalContextExtractor(nn.Module): """ def __init__(self, channel, reduction=16, with_cp=False): - super(GlobalContextExtractor, self).__init__() + super().__init__() self.channel = channel self.reduction = reduction assert reduction >= 1 and channel >= reduction @@ -87,7 +87,7 @@ def __init__(self, norm_cfg=dict(type='BN', requires_grad=True), act_cfg=dict(type='PReLU'), with_cp=False): - super(ContextGuidedBlock, self).__init__() + super().__init__() self.with_cp = with_cp self.downsample = downsample @@ -172,7 +172,7 @@ class InputInjection(nn.Module): """Downsampling module for CGNet.""" def __init__(self, num_downsampling): - super(InputInjection, self).__init__() + super().__init__() self.pool = nn.ModuleList() for i in range(num_downsampling): self.pool.append(nn.AvgPool2d(3, stride=2, padding=1)) @@ -230,7 +230,7 @@ def __init__(self, pretrained=None, init_cfg=None): - super(CGNet, self).__init__(init_cfg) + super().__init__(init_cfg) assert not (init_cfg and pretrained), \ 'init_cfg and pretrained cannot be setting at the same time' @@ -364,7 +364,7 @@ def forward(self, x): def train(self, mode=True): """Convert the model into training mode will keeping the normalization layer freezed.""" - super(CGNet, self).train(mode) + super().train(mode) if mode and self.norm_eval: for m in self.modules(): # trick: eval have effect on BatchNorm only diff --git a/mmseg/models/backbones/ddrnet.py b/mmseg/models/backbones/ddrnet.py new file mode 100644 index 0000000000..4508aade82 --- /dev/null +++ b/mmseg/models/backbones/ddrnet.py @@ -0,0 +1,222 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn +from mmcv.cnn import ConvModule, build_norm_layer +from mmengine.model import BaseModule + +from mmseg.models.utils import DAPPM, BasicBlock, Bottleneck, resize +from mmseg.registry import MODELS +from mmseg.utils import OptConfigType + + +@MODELS.register_module() +class DDRNet(BaseModule): + """DDRNet backbone. + + This backbone is the implementation of `Deep Dual-resolution Networks for + Real-time and Accurate Semantic Segmentation of Road Scenes + `_. + Modified from https://github.com/ydhongHIT/DDRNet. + + Args: + in_channels (int): Number of input image channels. Default: 3. + channels: (int): The base channels of DDRNet. Default: 32. + ppm_channels (int): The channels of PPM module. Default: 128. + align_corners (bool): align_corners argument of F.interpolate. + Default: False. + norm_cfg (dict): Config dict to build norm layer. + Default: dict(type='BN', requires_grad=True). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU', inplace=True). + init_cfg (dict, optional): Initialization config dict. + Default: None. 
+ """ + + def __init__(self, + in_channels: int = 3, + channels: int = 32, + ppm_channels: int = 128, + align_corners: bool = False, + norm_cfg: OptConfigType = dict(type='BN', requires_grad=True), + act_cfg: OptConfigType = dict(type='ReLU', inplace=True), + init_cfg: OptConfigType = None): + super().__init__(init_cfg) + + self.in_channels = in_channels + self.ppm_channels = ppm_channels + + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.align_corners = align_corners + + # stage 0-2 + self.stem = self._make_stem_layer(in_channels, channels, num_blocks=2) + self.relu = nn.ReLU() + + # low resolution(context) branch + self.context_branch_layers = nn.ModuleList() + for i in range(3): + self.context_branch_layers.append( + self._make_layer( + block=BasicBlock if i < 2 else Bottleneck, + inplanes=channels * 2**(i + 1), + planes=channels * 8 if i > 0 else channels * 4, + num_blocks=2 if i < 2 else 1, + stride=2)) + + # bilateral fusion + self.compression_1 = ConvModule( + channels * 4, + channels * 2, + kernel_size=1, + norm_cfg=self.norm_cfg, + act_cfg=None) + self.down_1 = ConvModule( + channels * 2, + channels * 4, + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=None) + + self.compression_2 = ConvModule( + channels * 8, + channels * 2, + kernel_size=1, + norm_cfg=self.norm_cfg, + act_cfg=None) + self.down_2 = nn.Sequential( + ConvModule( + channels * 2, + channels * 4, + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + channels * 4, + channels * 8, + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=None)) + + # high resolution(spatial) branch + self.spatial_branch_layers = nn.ModuleList() + for i in range(3): + self.spatial_branch_layers.append( + self._make_layer( + block=BasicBlock if i < 2 else Bottleneck, + inplanes=channels * 2, + planes=channels * 2, + num_blocks=2 if i < 2 else 1, + )) + + self.spp = DAPPM( + channels * 16, ppm_channels, channels * 4, num_scales=5) + + def _make_stem_layer(self, in_channels, channels, num_blocks): + layers = [ + ConvModule( + in_channels, + channels, + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + channels, + channels, + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + ] + + layers.extend([ + self._make_layer(BasicBlock, channels, channels, num_blocks), + nn.ReLU(), + self._make_layer( + BasicBlock, channels, channels * 2, num_blocks, stride=2), + nn.ReLU(), + ]) + + return nn.Sequential(*layers) + + def _make_layer(self, block, inplanes, planes, num_blocks, stride=1): + downsample = None + if stride != 1 or inplanes != planes * block.expansion: + downsample = nn.Sequential( + nn.Conv2d( + inplanes, + planes * block.expansion, + kernel_size=1, + stride=stride, + bias=False), + build_norm_layer(self.norm_cfg, planes * block.expansion)[1]) + + layers = [ + block( + in_channels=inplanes, + channels=planes, + stride=stride, + downsample=downsample) + ] + inplanes = planes * block.expansion + for i in range(1, num_blocks): + layers.append( + block( + in_channels=inplanes, + channels=planes, + stride=1, + norm_cfg=self.norm_cfg, + act_cfg_out=None if i == num_blocks - 1 else self.act_cfg)) + + return nn.Sequential(*layers) + + def forward(self, x): + """Forward function.""" + out_size = (x.shape[-2] // 8, x.shape[-1] // 8) + + # stage 0-2 + x = self.stem(x) + + # stage3 + x_c = self.context_branch_layers[0](x) + x_s = 
self.spatial_branch_layers[0](x) + comp_c = self.compression_1(self.relu(x_c)) + x_c += self.down_1(self.relu(x_s)) + x_s += resize( + comp_c, + size=out_size, + mode='bilinear', + align_corners=self.align_corners) + if self.training: + temp_context = x_s.clone() + + # stage4 + x_c = self.context_branch_layers[1](self.relu(x_c)) + x_s = self.spatial_branch_layers[1](self.relu(x_s)) + comp_c = self.compression_2(self.relu(x_c)) + x_c += self.down_2(self.relu(x_s)) + x_s += resize( + comp_c, + size=out_size, + mode='bilinear', + align_corners=self.align_corners) + + # stage5 + x_s = self.spatial_branch_layers[2](self.relu(x_s)) + x_c = self.context_branch_layers[2](self.relu(x_c)) + x_c = self.spp(x_c) + x_c = resize( + x_c, + size=out_size, + mode='bilinear', + align_corners=self.align_corners) + + return (temp_context, x_s + x_c) if self.training else x_s + x_c diff --git a/mmseg/models/backbones/erfnet.py b/mmseg/models/backbones/erfnet.py index 7c0da2da2c..2c5ec672a0 100644 --- a/mmseg/models/backbones/erfnet.py +++ b/mmseg/models/backbones/erfnet.py @@ -2,10 +2,10 @@ import torch import torch.nn as nn from mmcv.cnn import build_activation_layer, build_conv_layer, build_norm_layer -from mmcv.runner import BaseModule +from mmengine.model import BaseModule -from mmseg.ops import resize from mmseg.registry import MODELS +from ..utils import resize class DownsamplerBlock(BaseModule): @@ -35,7 +35,7 @@ def __init__(self, norm_cfg=dict(type='BN', eps=1e-3), act_cfg=dict(type='ReLU'), init_cfg=None): - super(DownsamplerBlock, self).__init__(init_cfg=init_cfg) + super().__init__(init_cfg=init_cfg) self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.act_cfg = act_cfg @@ -95,7 +95,7 @@ def __init__(self, norm_cfg=dict(type='BN', eps=1e-3), act_cfg=dict(type='ReLU'), init_cfg=None): - super(NonBottleneck1d, self).__init__(init_cfg=init_cfg) + super().__init__(init_cfg=init_cfg) self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg @@ -168,7 +168,7 @@ def __init__(self, norm_cfg=dict(type='BN', eps=1e-3), act_cfg=dict(type='ReLU'), init_cfg=None): - super(UpsamplerBlock, self).__init__(init_cfg=init_cfg) + super().__init__(init_cfg=init_cfg) self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.act_cfg = act_cfg @@ -242,7 +242,7 @@ def __init__(self, act_cfg=dict(type='ReLU'), init_cfg=None): - super(ERFNet, self).__init__(init_cfg=init_cfg) + super().__init__(init_cfg=init_cfg) assert len(enc_downsample_channels) \ == len(dec_upsample_channels)+1, 'Number of downsample\ block of encoder does not \ diff --git a/mmseg/models/backbones/fast_scnn.py b/mmseg/models/backbones/fast_scnn.py index 3d40e46b88..6ff7a3191d 100644 --- a/mmseg/models/backbones/fast_scnn.py +++ b/mmseg/models/backbones/fast_scnn.py @@ -2,12 +2,11 @@ import torch import torch.nn as nn from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule -from mmcv.runner import BaseModule +from mmengine.model import BaseModule from mmseg.models.decode_heads.psp_head import PPM -from mmseg.ops import resize from mmseg.registry import MODELS -from ..utils import InvertedResidual +from ..utils import InvertedResidual, resize class LearningToDownsample(nn.Module): @@ -37,7 +36,7 @@ def __init__(self, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'), dw_act_cfg=None): - super(LearningToDownsample, self).__init__() + super().__init__() self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.act_cfg = act_cfg @@ -125,7 +124,7 @@ def __init__(self, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'), align_corners=False): - 
super(GlobalFeatureExtractor, self).__init__() + super().__init__() self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.act_cfg = act_cfg @@ -221,7 +220,7 @@ def __init__(self, dwconv_act_cfg=dict(type='ReLU'), conv_act_cfg=None, align_corners=False): - super(FeatureFusionModule, self).__init__() + super().__init__() self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg self.dwconv_act_cfg = dwconv_act_cfg @@ -341,7 +340,7 @@ def __init__(self, dw_act_cfg=None, init_cfg=None): - super(FastSCNN, self).__init__(init_cfg) + super().__init__(init_cfg) if init_cfg is None: self.init_cfg = [ diff --git a/mmseg/models/backbones/hrnet.py b/mmseg/models/backbones/hrnet.py index dbbd2c8a40..2da755e731 100644 --- a/mmseg/models/backbones/hrnet.py +++ b/mmseg/models/backbones/hrnet.py @@ -3,11 +3,11 @@ import torch.nn as nn from mmcv.cnn import build_conv_layer, build_norm_layer -from mmcv.runner import BaseModule, ModuleList, Sequential -from mmcv.utils.parrots_wrapper import _BatchNorm +from mmengine.model import BaseModule, ModuleList, Sequential +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm -from mmseg.ops import Upsample, resize from mmseg.registry import MODELS +from ..utils import Upsample, resize from .resnet import BasicBlock, Bottleneck @@ -30,7 +30,7 @@ def __init__(self, norm_cfg=dict(type='BN', requires_grad=True), block_init_cfg=None, init_cfg=None): - super(HRModule, self).__init__(init_cfg) + super().__init__(init_cfg) self.block_init_cfg = block_init_cfg self._check_branches(num_branches, num_blocks, in_channels, num_channels) @@ -308,7 +308,7 @@ def __init__(self, multiscale_output=True, pretrained=None, init_cfg=None): - super(HRNet, self).__init__(init_cfg) + super().__init__(init_cfg) self.pretrained = pretrained self.zero_init_residual = zero_init_residual @@ -633,7 +633,7 @@ def forward(self, x): def train(self, mode=True): """Convert the model into training mode will keeping the normalization layer freezed.""" - super(HRNet, self).train(mode) + super().train(mode) self._freeze_stages() if mode and self.norm_eval: for m in self.modules(): diff --git a/mmseg/models/backbones/icnet.py b/mmseg/models/backbones/icnet.py index 3cd7037b3f..8ff3448569 100644 --- a/mmseg/models/backbones/icnet.py +++ b/mmseg/models/backbones/icnet.py @@ -2,11 +2,11 @@ import torch import torch.nn as nn from mmcv.cnn import ConvModule -from mmcv.runner import BaseModule +from mmengine.model import BaseModule -from mmseg.ops import resize from mmseg.registry import MODELS from ..decode_heads.psp_head import PPM +from ..utils import resize @MODELS.register_module() @@ -64,7 +64,7 @@ def __init__(self, dict(type='Constant', val=1, layer='_BatchNorm'), dict(type='Normal', mean=0.01, layer='Linear') ] - super(ICNet, self).__init__(init_cfg=init_cfg) + super().__init__(init_cfg=init_cfg) self.align_corners = align_corners self.backbone = MODELS.build(backbone_cfg) diff --git a/mmseg/models/backbones/mae.py b/mmseg/models/backbones/mae.py index 5989364e25..a1f243f085 100644 --- a/mmseg/models/backbones/mae.py +++ b/mmseg/models/backbones/mae.py @@ -3,9 +3,10 @@ import torch import torch.nn as nn -from mmcv.cnn.utils.weight_init import (constant_init, kaiming_init, +from mmengine.model import ModuleList +from mmengine.model.weight_init import (constant_init, kaiming_init, trunc_normal_) -from mmcv.runner import ModuleList, _load_checkpoint +from mmengine.runner.checkpoint import _load_checkpoint from torch.nn.modules.batchnorm import _BatchNorm from mmseg.registry import MODELS @@ -99,7 +100,7 
@@ def __init__(self, pretrained=None, init_values=0.1, init_cfg=None): - super(MAE, self).__init__( + super().__init__( img_size=img_size, patch_size=patch_size, in_channels=in_channels, @@ -185,7 +186,7 @@ def _init_weights(m): state_dict = self.resize_abs_pos_embed(state_dict) self.load_state_dict(state_dict, False) elif self.init_cfg is not None: - super(MAE, self).init_weights() + super().init_weights() else: # We only implement the 'jax_impl' initialization implemented at # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py#L353 # noqa: E501 diff --git a/mmseg/models/backbones/mit.py b/mmseg/models/backbones/mit.py index 83c3bb3caf..66556bdfca 100644 --- a/mmseg/models/backbones/mit.py +++ b/mmseg/models/backbones/mit.py @@ -8,9 +8,9 @@ from mmcv.cnn import Conv2d, build_activation_layer, build_norm_layer from mmcv.cnn.bricks.drop import build_dropout from mmcv.cnn.bricks.transformer import MultiheadAttention -from mmcv.cnn.utils.weight_init import (constant_init, normal_init, +from mmengine.model import BaseModule, ModuleList, Sequential +from mmengine.model.weight_init import (constant_init, normal_init, trunc_normal_init) -from mmcv.runner import BaseModule, ModuleList, Sequential from mmseg.registry import MODELS from ..utils import PatchEmbed, nchw_to_nlc, nlc_to_nchw @@ -44,7 +44,7 @@ def __init__(self, ffn_drop=0., dropout_layer=None, init_cfg=None): - super(MixFFN, self).__init__(init_cfg) + super().__init__(init_cfg) self.embed_dims = embed_dims self.feedforward_channels = feedforward_channels @@ -253,7 +253,7 @@ def __init__(self, batch_first=True, sr_ratio=1, with_cp=False): - super(TransformerEncoderLayer, self).__init__() + super().__init__() # The ret[0] of build_norm_layer is norm name. self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1] @@ -357,7 +357,7 @@ def __init__(self, pretrained=None, init_cfg=None, with_cp=False): - super(MixVisionTransformer, self).__init__(init_cfg=init_cfg) + super().__init__(init_cfg=init_cfg) assert not (init_cfg and pretrained), \ 'init_cfg and pretrained cannot be set at the same time' @@ -433,7 +433,7 @@ def init_weights(self): normal_init( m, mean=0, std=math.sqrt(2.0 / fan_out), bias=0) else: - super(MixVisionTransformer, self).init_weights() + super().init_weights() def forward(self, x): outs = [] diff --git a/mmseg/models/backbones/mobilenet_v2.py b/mmseg/models/backbones/mobilenet_v2.py index 67269182a1..1c21b5df97 100644 --- a/mmseg/models/backbones/mobilenet_v2.py +++ b/mmseg/models/backbones/mobilenet_v2.py @@ -3,7 +3,7 @@ import torch.nn as nn from mmcv.cnn import ConvModule -from mmcv.runner import BaseModule +from mmengine.model import BaseModule from torch.nn.modules.batchnorm import _BatchNorm from mmseg.registry import MODELS @@ -63,7 +63,7 @@ def __init__(self, with_cp=False, pretrained=None, init_cfg=None): - super(MobileNetV2, self).__init__(init_cfg) + super().__init__(init_cfg) self.pretrained = pretrained assert not (init_cfg and pretrained), \ @@ -189,7 +189,7 @@ def _freeze_stages(self): param.requires_grad = False def train(self, mode=True): - super(MobileNetV2, self).train(mode) + super().train(mode) self._freeze_stages() if mode and self.norm_eval: for m in self.modules(): diff --git a/mmseg/models/backbones/mobilenet_v3.py b/mmseg/models/backbones/mobilenet_v3.py index ac73233b0d..1efb6e0974 100644 --- a/mmseg/models/backbones/mobilenet_v3.py +++ b/mmseg/models/backbones/mobilenet_v3.py @@ -1,10 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import warnings -import mmcv from mmcv.cnn import ConvModule from mmcv.cnn.bricks import Conv2dAdaptivePadding -from mmcv.runner import BaseModule +from mmengine.model import BaseModule +from mmengine.utils import is_tuple_of from torch.nn.modules.batchnorm import _BatchNorm from mmseg.registry import MODELS @@ -81,7 +81,7 @@ def __init__(self, with_cp=False, pretrained=None, init_cfg=None): - super(MobileNetV3, self).__init__(init_cfg) + super().__init__(init_cfg) self.pretrained = pretrained assert not (init_cfg and pretrained), \ @@ -104,7 +104,7 @@ def __init__(self, assert arch in self.arch_settings assert isinstance(reduction_factor, int) and reduction_factor > 0 - assert mmcv.is_tuple_of(out_indices, int) + assert is_tuple_of(out_indices, int) for index in out_indices: if index not in range(0, len(self.arch_settings[arch]) + 2): raise ValueError( @@ -175,7 +175,7 @@ def _make_layer(self): act_cfg=dict(type=act), with_cp=self.with_cp) in_channels = out_channels - layer_name = 'layer{}'.format(i + 1) + layer_name = f'layer{i + 1}' self.add_module(layer_name, layer) layers.append(layer_name) @@ -192,7 +192,7 @@ def _make_layer(self): conv_cfg=self.conv_cfg, norm_cfg=self.norm_cfg, act_cfg=dict(type='HSwish')) - layer_name = 'layer{}'.format(len(layer_setting) + 1) + layer_name = f'layer{len(layer_setting) + 1}' self.add_module(layer_name, layer) layers.append(layer_name) @@ -259,7 +259,7 @@ def _freeze_stages(self): param.requires_grad = False def train(self, mode=True): - super(MobileNetV3, self).train(mode) + super().train(mode) self._freeze_stages() if mode and self.norm_eval: for m in self.modules(): diff --git a/mmseg/models/backbones/mscan.py b/mmseg/models/backbones/mscan.py new file mode 100644 index 0000000000..7150cb7a1c --- /dev/null +++ b/mmseg/models/backbones/mscan.py @@ -0,0 +1,467 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Originally from https://github.com/visual-attention-network/segnext +# Licensed under the Apache License, Version 2.0 (the "License") +import math +import warnings + +import torch +import torch.nn as nn +from mmcv.cnn import build_activation_layer, build_norm_layer +from mmcv.cnn.bricks import DropPath +from mmengine.model import BaseModule +from mmengine.model.weight_init import (constant_init, normal_init, + trunc_normal_init) + +from mmseg.registry import MODELS + + +class Mlp(BaseModule): + """Multi Layer Perceptron (MLP) Module. + + Args: + in_features (int): The dimension of input features. + hidden_features (int): The dimension of hidden features. + Defaults: None. + out_features (int): The dimension of output features. + Defaults: None. + act_cfg (dict): Config dict for activation layer in block. + Default: dict(type='GELU'). + drop (float): The number of dropout rate in MLP block. + Defaults: 0.0. 
+ """ + + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_cfg=dict(type='GELU'), + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Conv2d(in_features, hidden_features, 1) + self.dwconv = nn.Conv2d( + hidden_features, + hidden_features, + 3, + 1, + 1, + bias=True, + groups=hidden_features) + self.act = build_activation_layer(act_cfg) + self.fc2 = nn.Conv2d(hidden_features, out_features, 1) + self.drop = nn.Dropout(drop) + + def forward(self, x): + """Forward function.""" + + x = self.fc1(x) + + x = self.dwconv(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + + return x + + +class StemConv(BaseModule): + """Stem Block at the beginning of Semantic Branch. + + Args: + in_channels (int): The dimension of input channels. + out_channels (int): The dimension of output channels. + act_cfg (dict): Config dict for activation layer in block. + Default: dict(type='GELU'). + norm_cfg (dict): Config dict for normalization layer. + Defaults: dict(type='SyncBN', requires_grad=True). + """ + + def __init__(self, + in_channels, + out_channels, + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='SyncBN', requires_grad=True)): + super().__init__() + + self.proj = nn.Sequential( + nn.Conv2d( + in_channels, + out_channels // 2, + kernel_size=(3, 3), + stride=(2, 2), + padding=(1, 1)), + build_norm_layer(norm_cfg, out_channels // 2)[1], + build_activation_layer(act_cfg), + nn.Conv2d( + out_channels // 2, + out_channels, + kernel_size=(3, 3), + stride=(2, 2), + padding=(1, 1)), + build_norm_layer(norm_cfg, out_channels)[1], + ) + + def forward(self, x): + """Forward function.""" + + x = self.proj(x) + _, _, H, W = x.size() + x = x.flatten(2).transpose(1, 2) + return x, H, W + + +class MSCAAttention(BaseModule): + """Attention Module in Multi-Scale Convolutional Attention Module (MSCA). + + Args: + channels (int): The dimension of channels. + kernel_sizes (list): The size of attention + kernel. Defaults: [5, [1, 7], [1, 11], [1, 21]]. + paddings (list): The number of + corresponding padding value in attention module. + Defaults: [2, [0, 3], [0, 5], [0, 10]]. 
+ """ + + def __init__(self, + channels, + kernel_sizes=[5, [1, 7], [1, 11], [1, 21]], + paddings=[2, [0, 3], [0, 5], [0, 10]]): + super().__init__() + self.conv0 = nn.Conv2d( + channels, + channels, + kernel_size=kernel_sizes[0], + padding=paddings[0], + groups=channels) + for i, (kernel_size, + padding) in enumerate(zip(kernel_sizes[1:], paddings[1:])): + kernel_size_ = [kernel_size, kernel_size[::-1]] + padding_ = [padding, padding[::-1]] + conv_name = [f'conv{i}_1', f'conv{i}_2'] + for i_kernel, i_pad, i_conv in zip(kernel_size_, padding_, + conv_name): + self.add_module( + i_conv, + nn.Conv2d( + channels, + channels, + tuple(i_kernel), + padding=i_pad, + groups=channels)) + self.conv3 = nn.Conv2d(channels, channels, 1) + + def forward(self, x): + """Forward function.""" + + u = x.clone() + + attn = self.conv0(x) + + # Multi-Scale Feature extraction + attn_0 = self.conv0_1(attn) + attn_0 = self.conv0_2(attn_0) + + attn_1 = self.conv1_1(attn) + attn_1 = self.conv1_2(attn_1) + + attn_2 = self.conv2_1(attn) + attn_2 = self.conv2_2(attn_2) + + attn = attn + attn_0 + attn_1 + attn_2 + # Channel Mixing + attn = self.conv3(attn) + + # Convolutional Attention + x = attn * u + + return x + + +class MSCASpatialAttention(BaseModule): + """Spatial Attention Module in Multi-Scale Convolutional Attention Module + (MSCA). + + Args: + in_channels (int): The dimension of channels. + attention_kernel_sizes (list): The size of attention + kernel. Defaults: [5, [1, 7], [1, 11], [1, 21]]. + attention_kernel_paddings (list): The number of + corresponding padding value in attention module. + Defaults: [2, [0, 3], [0, 5], [0, 10]]. + act_cfg (dict): Config dict for activation layer in block. + Default: dict(type='GELU'). + """ + + def __init__(self, + in_channels, + attention_kernel_sizes=[5, [1, 7], [1, 11], [1, 21]], + attention_kernel_paddings=[2, [0, 3], [0, 5], [0, 10]], + act_cfg=dict(type='GELU')): + super().__init__() + self.proj_1 = nn.Conv2d(in_channels, in_channels, 1) + self.activation = build_activation_layer(act_cfg) + self.spatial_gating_unit = MSCAAttention(in_channels, + attention_kernel_sizes, + attention_kernel_paddings) + self.proj_2 = nn.Conv2d(in_channels, in_channels, 1) + + def forward(self, x): + """Forward function.""" + + shorcut = x.clone() + x = self.proj_1(x) + x = self.activation(x) + x = self.spatial_gating_unit(x) + x = self.proj_2(x) + x = x + shorcut + return x + + +class MSCABlock(BaseModule): + """Basic Multi-Scale Convolutional Attention Block. It leverage the large- + kernel attention (LKA) mechanism to build both channel and spatial + attention. In each branch, it uses two depth-wise strip convolutions to + approximate standard depth-wise convolutions with large kernels. The kernel + size for each branch is set to 7, 11, and 21, respectively. + + Args: + channels (int): The dimension of channels. + attention_kernel_sizes (list): The size of attention + kernel. Defaults: [5, [1, 7], [1, 11], [1, 21]]. + attention_kernel_paddings (list): The number of + corresponding padding value in attention module. + Defaults: [2, [0, 3], [0, 5], [0, 10]]. + mlp_ratio (float): The ratio of multiple input dimension to + calculate hidden feature in MLP layer. Defaults: 4.0. + drop (float): The number of dropout rate in MLP block. + Defaults: 0.0. + drop_path (float): The ratio of drop paths. + Defaults: 0.0. + act_cfg (dict): Config dict for activation layer in block. + Default: dict(type='GELU'). + norm_cfg (dict): Config dict for normalization layer. 
+ Defaults: dict(type='SyncBN', requires_grad=True). + """ + + def __init__(self, + channels, + attention_kernel_sizes=[5, [1, 7], [1, 11], [1, 21]], + attention_kernel_paddings=[2, [0, 3], [0, 5], [0, 10]], + mlp_ratio=4., + drop=0., + drop_path=0., + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='SyncBN', requires_grad=True)): + super().__init__() + self.norm1 = build_norm_layer(norm_cfg, channels)[1] + self.attn = MSCASpatialAttention(channels, attention_kernel_sizes, + attention_kernel_paddings, act_cfg) + self.drop_path = DropPath( + drop_path) if drop_path > 0. else nn.Identity() + self.norm2 = build_norm_layer(norm_cfg, channels)[1] + mlp_hidden_channels = int(channels * mlp_ratio) + self.mlp = Mlp( + in_features=channels, + hidden_features=mlp_hidden_channels, + act_cfg=act_cfg, + drop=drop) + layer_scale_init_value = 1e-2 + self.layer_scale_1 = nn.Parameter( + layer_scale_init_value * torch.ones(channels), requires_grad=True) + self.layer_scale_2 = nn.Parameter( + layer_scale_init_value * torch.ones(channels), requires_grad=True) + + def forward(self, x, H, W): + """Forward function.""" + + B, N, C = x.shape + x = x.permute(0, 2, 1).view(B, C, H, W) + x = x + self.drop_path( + self.layer_scale_1.unsqueeze(-1).unsqueeze(-1) * + self.attn(self.norm1(x))) + x = x + self.drop_path( + self.layer_scale_2.unsqueeze(-1).unsqueeze(-1) * + self.mlp(self.norm2(x))) + x = x.view(B, C, N).permute(0, 2, 1) + return x + + +class OverlapPatchEmbed(BaseModule): + """Image to Patch Embedding. + + Args: + patch_size (int): The patch size. + Defaults: 7. + stride (int): Stride of the convolutional layer. + Default: 4. + in_channels (int): The number of input channels. + Defaults: 3. + embed_dims (int): The dimensions of embedding. + Defaults: 768. + norm_cfg (dict): Config dict for normalization layer. + Defaults: dict(type='SyncBN', requires_grad=True). + """ + + def __init__(self, + patch_size=7, + stride=4, + in_channels=3, + embed_dim=768, + norm_cfg=dict(type='SyncBN', requires_grad=True)): + super().__init__() + + self.proj = nn.Conv2d( + in_channels, + embed_dim, + kernel_size=patch_size, + stride=stride, + padding=patch_size // 2) + self.norm = build_norm_layer(norm_cfg, embed_dim)[1] + + def forward(self, x): + """Forward function.""" + + x = self.proj(x) + _, _, H, W = x.shape + x = self.norm(x) + + x = x.flatten(2).transpose(1, 2) + + return x, H, W + + +@MODELS.register_module() +class MSCAN(BaseModule): + """SegNeXt Multi-Scale Convolutional Attention Network (MCSAN) backbone. + + This backbone is the implementation of `SegNeXt: Rethinking + Convolutional Attention Design for Semantic + Segmentation `_. + Inspiration from https://github.com/visual-attention-network/segnext. + + Args: + in_channels (int): The number of input channels. Defaults: 3. + embed_dims (list[int]): Embedding dimension. + Defaults: [64, 128, 256, 512]. + mlp_ratios (list[int]): Ratio of mlp hidden dim to embedding dim. + Defaults: [4, 4, 4, 4]. + drop_rate (float): Dropout rate. Defaults: 0. + drop_path_rate (float): Stochastic depth rate. Defaults: 0. + depths (list[int]): Depths of each Swin Transformer stage. + Default: [3, 4, 6, 3]. + num_stages (int): MSCAN stages. Default: 4. + attention_kernel_sizes (list): Size of attention kernel in + Attention Module (Figure 2(b) of original paper). + Defaults: [5, [1, 7], [1, 11], [1, 21]]. + attention_kernel_paddings (list): Size of attention paddings + in Attention Module (Figure 2(b) of original paper). + Defaults: [2, [0, 3], [0, 5], [0, 10]]. 
+ norm_cfg (dict): Config of norm layers. + Defaults: dict(type='SyncBN', requires_grad=True). + pretrained (str, optional): model pretrained path. + Default: None. + init_cfg (dict or list[dict], optional): Initialization config dict. + Default: None. + """ + + def __init__(self, + in_channels=3, + embed_dims=[64, 128, 256, 512], + mlp_ratios=[4, 4, 4, 4], + drop_rate=0., + drop_path_rate=0., + depths=[3, 4, 6, 3], + num_stages=4, + attention_kernel_sizes=[5, [1, 7], [1, 11], [1, 21]], + attention_kernel_paddings=[2, [0, 3], [0, 5], [0, 10]], + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='SyncBN', requires_grad=True), + pretrained=None, + init_cfg=None): + super().__init__(init_cfg=init_cfg) + + assert not (init_cfg and pretrained), \ + 'init_cfg and pretrained cannot be set at the same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + elif pretrained is not None: + raise TypeError('pretrained must be a str or None') + + self.depths = depths + self.num_stages = num_stages + + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + cur = 0 + + for i in range(num_stages): + if i == 0: + patch_embed = StemConv(3, embed_dims[0], norm_cfg=norm_cfg) + else: + patch_embed = OverlapPatchEmbed( + patch_size=7 if i == 0 else 3, + stride=4 if i == 0 else 2, + in_channels=in_channels if i == 0 else embed_dims[i - 1], + embed_dim=embed_dims[i], + norm_cfg=norm_cfg) + + block = nn.ModuleList([ + MSCABlock( + channels=embed_dims[i], + attention_kernel_sizes=attention_kernel_sizes, + attention_kernel_paddings=attention_kernel_paddings, + mlp_ratio=mlp_ratios[i], + drop=drop_rate, + drop_path=dpr[cur + j], + act_cfg=act_cfg, + norm_cfg=norm_cfg) for j in range(depths[i]) + ]) + norm = nn.LayerNorm(embed_dims[i]) + cur += depths[i] + + setattr(self, f'patch_embed{i + 1}', patch_embed) + setattr(self, f'block{i + 1}', block) + setattr(self, f'norm{i + 1}', norm) + + def init_weights(self): + """Initialize modules of MSCAN.""" + + print('init cfg', self.init_cfg) + if self.init_cfg is None: + for m in self.modules(): + if isinstance(m, nn.Linear): + trunc_normal_init(m, std=.02, bias=0.) + elif isinstance(m, nn.LayerNorm): + constant_init(m, val=1.0, bias=0.) + elif isinstance(m, nn.Conv2d): + fan_out = m.kernel_size[0] * m.kernel_size[ + 1] * m.out_channels + fan_out //= m.groups + normal_init( + m, mean=0, std=math.sqrt(2.0 / fan_out), bias=0) + else: + super().init_weights() + + def forward(self, x): + """Forward function.""" + + B = x.shape[0] + outs = [] + + for i in range(self.num_stages): + patch_embed = getattr(self, f'patch_embed{i + 1}') + block = getattr(self, f'block{i + 1}') + norm = getattr(self, f'norm{i + 1}') + x, H, W = patch_embed(x) + for blk in block: + x = blk(x, H, W) + x = norm(x) + x = x.reshape(B, H, W, -1).permute(0, 3, 1, 2).contiguous() + outs.append(x) + + return outs diff --git a/mmseg/models/backbones/pidnet.py b/mmseg/models/backbones/pidnet.py new file mode 100644 index 0000000000..0b711a3737 --- /dev/null +++ b/mmseg/models/backbones/pidnet.py @@ -0,0 +1,522 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
+from typing import Tuple, Union
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule
+from mmengine.model import BaseModule
+from mmengine.runner import CheckpointLoader
+from torch import Tensor
+
+from mmseg.registry import MODELS
+from mmseg.utils import OptConfigType
+from ..utils import DAPPM, PAPPM, BasicBlock, Bottleneck
+
+
+class PagFM(BaseModule):
+    """Pixel-attention-guided fusion module.
+
+    Args:
+        in_channels (int): The number of input channels.
+        channels (int): The number of channels.
+        after_relu (bool): Whether to use ReLU before attention.
+            Default: False.
+        with_channel (bool): Whether to use channel attention.
+            Default: False.
+        upsample_mode (str): The mode of upsample. Default: 'bilinear'.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='BN').
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='ReLU', inplace=True).
+        init_cfg (dict): Config dict for initialization. Default: None.
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 channels: int,
+                 after_relu: bool = False,
+                 with_channel: bool = False,
+                 upsample_mode: str = 'bilinear',
+                 norm_cfg: OptConfigType = dict(type='BN'),
+                 act_cfg: OptConfigType = dict(type='ReLU', inplace=True),
+                 init_cfg: OptConfigType = None):
+        super().__init__(init_cfg)
+        self.after_relu = after_relu
+        self.with_channel = with_channel
+        self.upsample_mode = upsample_mode
+        self.f_i = ConvModule(
+            in_channels, channels, 1, norm_cfg=norm_cfg, act_cfg=None)
+        self.f_p = ConvModule(
+            in_channels, channels, 1, norm_cfg=norm_cfg, act_cfg=None)
+        if with_channel:
+            self.up = ConvModule(
+                channels, in_channels, 1, norm_cfg=norm_cfg, act_cfg=None)
+        if after_relu:
+            self.relu = MODELS.build(act_cfg)
+
+    def forward(self, x_p: Tensor, x_i: Tensor) -> Tensor:
+        """Forward function.
+
+        Args:
+            x_p (Tensor): The feature map from P branch.
+            x_i (Tensor): The feature map from I branch.
+
+        Returns:
+            Tensor: The feature map with pixel-attention-guided fusion.
+        """
+        if self.after_relu:
+            x_p = self.relu(x_p)
+            x_i = self.relu(x_i)
+
+        f_i = self.f_i(x_i)
+        f_i = F.interpolate(
+            f_i,
+            size=x_p.shape[2:],
+            mode=self.upsample_mode,
+            align_corners=False)
+
+        f_p = self.f_p(x_p)
+
+        if self.with_channel:
+            sigma = torch.sigmoid(self.up(f_p * f_i))
+        else:
+            sigma = torch.sigmoid(torch.sum(f_p * f_i, dim=1).unsqueeze(1))
+
+        x_i = F.interpolate(
+            x_i,
+            size=x_p.shape[2:],
+            mode=self.upsample_mode,
+            align_corners=False)
+
+        out = sigma * x_i + (1 - sigma) * x_p
+        return out
+
+
+class Bag(BaseModule):
+    """Boundary-attention-guided fusion module.
+
+    Args:
+        in_channels (int): The number of input channels.
+        out_channels (int): The number of output channels.
+        kernel_size (int): The kernel size of the convolution. Default: 3.
+        padding (int): The padding of the convolution. Default: 1.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='BN').
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='ReLU', inplace=True).
+        conv_cfg (dict): Config dict for convolution layer.
+            Default: dict(order=('norm', 'act', 'conv')).
+        init_cfg (dict): Config dict for initialization. Default: None.
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 kernel_size: int = 3,
+                 padding: int = 1,
+                 norm_cfg: OptConfigType = dict(type='BN'),
+                 act_cfg: OptConfigType = dict(type='ReLU', inplace=True),
+                 conv_cfg: OptConfigType = dict(order=('norm', 'act', 'conv')),
+                 init_cfg: OptConfigType = None):
+        super().__init__(init_cfg)
+
+        self.conv = ConvModule(
+            in_channels,
+            out_channels,
+            kernel_size,
+            padding=padding,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg,
+            **conv_cfg)
+
+    def forward(self, x_p: Tensor, x_i: Tensor, x_d: Tensor) -> Tensor:
+        """Forward function.
+
+        Args:
+            x_p (Tensor): The feature map from P branch.
+            x_i (Tensor): The feature map from I branch.
+            x_d (Tensor): The feature map from D branch.
+
+        Returns:
+            Tensor: The feature map with boundary-attention-guided fusion.
+        """
+        sigma = torch.sigmoid(x_d)
+        return self.conv(sigma * x_p + (1 - sigma) * x_i)
+
+
+class LightBag(BaseModule):
+    """Light Boundary-attention-guided fusion module.
+
+    Args:
+        in_channels (int): The number of input channels.
+        out_channels (int): The number of output channels.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='BN').
+        act_cfg (dict): Config dict for activation layer. Default: None.
+        init_cfg (dict): Config dict for initialization. Default: None.
+    """
+
+    def __init__(self,
+                 in_channels: int,
+                 out_channels: int,
+                 norm_cfg: OptConfigType = dict(type='BN'),
+                 act_cfg: OptConfigType = None,
+                 init_cfg: OptConfigType = None):
+        super().__init__(init_cfg)
+        self.f_p = ConvModule(
+            in_channels,
+            out_channels,
+            kernel_size=1,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+        self.f_i = ConvModule(
+            in_channels,
+            out_channels,
+            kernel_size=1,
+            norm_cfg=norm_cfg,
+            act_cfg=act_cfg)
+
+    def forward(self, x_p: Tensor, x_i: Tensor, x_d: Tensor) -> Tensor:
+        """Forward function.
+
+        Args:
+            x_p (Tensor): The feature map from P branch.
+            x_i (Tensor): The feature map from I branch.
+            x_d (Tensor): The feature map from D branch.
+
+        Returns:
+            Tensor: The feature map with light boundary-attention-guided
+                fusion.
+        """
+        sigma = torch.sigmoid(x_d)
+
+        f_p = self.f_p((1 - sigma) * x_i + x_p)
+        f_i = self.f_i(x_i + sigma * x_p)
+
+        return f_p + f_i
+
+
+@MODELS.register_module()
+class PIDNet(BaseModule):
+    """PIDNet backbone.
+
+    This backbone is the implementation of `PIDNet: A Real-time Semantic
+    Segmentation Network Inspired from PID Controller
+    <https://arxiv.org/abs/2206.02066>`_.
+    Modified from https://github.com/XuJiacong/PIDNet.
+
+    Licensed under the MIT License.
+
+    Args:
+        in_channels (int): The number of input channels. Default: 3.
+        channels (int): The number of channels in the stem layer. Default: 64.
+        ppm_channels (int): The number of channels in the PPM layer.
+            Default: 96.
+        num_stem_blocks (int): The number of blocks in the stem layer.
+            Default: 2.
+        num_branch_blocks (int): The number of blocks in the branch layer.
+            Default: 3.
+        align_corners (bool): The align_corners argument of F.interpolate.
+            Default: False.
+        norm_cfg (dict): Config dict for normalization layer.
+            Default: dict(type='BN').
+        act_cfg (dict): Config dict for activation layer.
+            Default: dict(type='ReLU', inplace=True).
+        init_cfg (dict): Config dict for initialization. Default: None.
+ """ + + def __init__(self, + in_channels: int = 3, + channels: int = 64, + ppm_channels: int = 96, + num_stem_blocks: int = 2, + num_branch_blocks: int = 3, + align_corners: bool = False, + norm_cfg: OptConfigType = dict(type='BN'), + act_cfg: OptConfigType = dict(type='ReLU', inplace=True), + init_cfg: OptConfigType = None, + **kwargs): + super().__init__(init_cfg) + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.align_corners = align_corners + + # stem layer + self.stem = self._make_stem_layer(in_channels, channels, + num_stem_blocks) + self.relu = nn.ReLU() + + # I Branch + self.i_branch_layers = nn.ModuleList() + for i in range(3): + self.i_branch_layers.append( + self._make_layer( + block=BasicBlock if i < 2 else Bottleneck, + in_channels=channels * 2**(i + 1), + channels=channels * 8 if i > 0 else channels * 4, + num_blocks=num_branch_blocks if i < 2 else 2, + stride=2)) + + # P Branch + self.p_branch_layers = nn.ModuleList() + for i in range(3): + self.p_branch_layers.append( + self._make_layer( + block=BasicBlock if i < 2 else Bottleneck, + in_channels=channels * 2, + channels=channels * 2, + num_blocks=num_stem_blocks if i < 2 else 1)) + self.compression_1 = ConvModule( + channels * 4, + channels * 2, + kernel_size=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None) + self.compression_2 = ConvModule( + channels * 8, + channels * 2, + kernel_size=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None) + self.pag_1 = PagFM(channels * 2, channels) + self.pag_2 = PagFM(channels * 2, channels) + + # D Branch + if num_stem_blocks == 2: + self.d_branch_layers = nn.ModuleList([ + self._make_single_layer(BasicBlock, channels * 2, channels), + self._make_layer(Bottleneck, channels, channels, 1) + ]) + channel_expand = 1 + spp_module = PAPPM + dfm_module = LightBag + act_cfg_dfm = None + else: + self.d_branch_layers = nn.ModuleList([ + self._make_single_layer(BasicBlock, channels * 2, + channels * 2), + self._make_single_layer(BasicBlock, channels * 2, channels * 2) + ]) + channel_expand = 2 + spp_module = DAPPM + dfm_module = Bag + act_cfg_dfm = act_cfg + + self.diff_1 = ConvModule( + channels * 4, + channels * channel_expand, + kernel_size=3, + padding=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None) + self.diff_2 = ConvModule( + channels * 8, + channels * 2, + kernel_size=3, + padding=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None) + + self.spp = spp_module( + channels * 16, ppm_channels, channels * 4, num_scales=5) + self.dfm = dfm_module( + channels * 4, channels * 4, norm_cfg=norm_cfg, act_cfg=act_cfg_dfm) + + self.d_branch_layers.append( + self._make_layer(Bottleneck, channels * 2, channels * 2, 1)) + + def _make_stem_layer(self, in_channels: int, channels: int, + num_blocks: int) -> nn.Sequential: + """Make stem layer. + + Args: + in_channels (int): Number of input channels. + channels (int): Number of output channels. + num_blocks (int): Number of blocks. + + Returns: + nn.Sequential: The stem layer. 
+ """ + + layers = [ + ConvModule( + in_channels, + channels, + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + ConvModule( + channels, + channels, + kernel_size=3, + stride=2, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + ] + + layers.append( + self._make_layer(BasicBlock, channels, channels, num_blocks)) + layers.append(nn.ReLU()) + layers.append( + self._make_layer( + BasicBlock, channels, channels * 2, num_blocks, stride=2)) + layers.append(nn.ReLU()) + + return nn.Sequential(*layers) + + def _make_layer(self, + block: BasicBlock, + in_channels: int, + channels: int, + num_blocks: int, + stride: int = 1) -> nn.Sequential: + """Make layer for PIDNet backbone. + Args: + block (BasicBlock): Basic block. + in_channels (int): Number of input channels. + channels (int): Number of output channels. + num_blocks (int): Number of blocks. + stride (int): Stride of the first block. Default: 1. + + Returns: + nn.Sequential: The Branch Layer. + """ + downsample = None + if stride != 1 or in_channels != channels * block.expansion: + downsample = ConvModule( + in_channels, + channels * block.expansion, + kernel_size=1, + stride=stride, + norm_cfg=self.norm_cfg, + act_cfg=None) + + layers = [block(in_channels, channels, stride, downsample)] + in_channels = channels * block.expansion + for i in range(1, num_blocks): + layers.append( + block( + in_channels, + channels, + stride=1, + act_cfg_out=None if i == num_blocks - 1 else self.act_cfg)) + return nn.Sequential(*layers) + + def _make_single_layer(self, + block: Union[BasicBlock, Bottleneck], + in_channels: int, + channels: int, + stride: int = 1) -> nn.Module: + """Make single layer for PIDNet backbone. + Args: + block (BasicBlock or Bottleneck): Basic block or Bottleneck. + in_channels (int): Number of input channels. + channels (int): Number of output channels. + stride (int): Stride of the first block. Default: 1. + + Returns: + nn.Module + """ + + downsample = None + if stride != 1 or in_channels != channels * block.expansion: + downsample = ConvModule( + in_channels, + channels * block.expansion, + kernel_size=1, + stride=stride, + norm_cfg=self.norm_cfg, + act_cfg=None) + return block( + in_channels, channels, stride, downsample, act_cfg_out=None) + + def init_weights(self): + """Initialize the weights in backbone. + + Since the D branch is not initialized by the pre-trained model, we + initialize it with the same method as the ResNet. + """ + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + if self.init_cfg is not None: + assert 'checkpoint' in self.init_cfg, f'Only support ' \ + f'specify `Pretrained` in ' \ + f'`init_cfg` in ' \ + f'{self.__class__.__name__} ' + ckpt = CheckpointLoader.load_checkpoint( + self.init_cfg['checkpoint'], map_location='cpu') + self.load_state_dict(ckpt, strict=False) + + def forward(self, x: Tensor) -> Union[Tensor, Tuple[Tensor]]: + """Forward function. + + Args: + x (Tensor): Input tensor with shape (B, C, H, W). + + Returns: + Tensor or tuple[Tensor]: If self.training is True, return + tuple[Tensor], else return Tensor. 
+ """ + w_out = x.shape[-1] // 8 + h_out = x.shape[-2] // 8 + + # stage 0-2 + x = self.stem(x) + + # stage 3 + x_i = self.relu(self.i_branch_layers[0](x)) + x_p = self.p_branch_layers[0](x) + x_d = self.d_branch_layers[0](x) + + comp_i = self.compression_1(x_i) + x_p = self.pag_1(x_p, comp_i) + diff_i = self.diff_1(x_i) + x_d += F.interpolate( + diff_i, + size=[h_out, w_out], + mode='bilinear', + align_corners=self.align_corners) + if self.training: + temp_p = x_p.clone() + + # stage 4 + x_i = self.relu(self.i_branch_layers[1](x_i)) + x_p = self.p_branch_layers[1](self.relu(x_p)) + x_d = self.d_branch_layers[1](self.relu(x_d)) + + comp_i = self.compression_2(x_i) + x_p = self.pag_2(x_p, comp_i) + diff_i = self.diff_2(x_i) + x_d += F.interpolate( + diff_i, + size=[h_out, w_out], + mode='bilinear', + align_corners=self.align_corners) + if self.training: + temp_d = x_d.clone() + + # stage 5 + x_i = self.i_branch_layers[2](x_i) + x_p = self.p_branch_layers[2](self.relu(x_p)) + x_d = self.d_branch_layers[2](self.relu(x_d)) + + x_i = self.spp(x_i) + x_i = F.interpolate( + x_i, + size=[h_out, w_out], + mode='bilinear', + align_corners=self.align_corners) + out = self.dfm(x_p, x_i, x_d) + return (temp_p, out, temp_d) if self.training else out diff --git a/mmseg/models/backbones/resnest.py b/mmseg/models/backbones/resnest.py index 519bd97382..3cc380b446 100644 --- a/mmseg/models/backbones/resnest.py +++ b/mmseg/models/backbones/resnest.py @@ -69,7 +69,7 @@ def __init__(self, conv_cfg=None, norm_cfg=dict(type='BN'), dcn=None): - super(SplitAttentionConv2d, self).__init__() + super().__init__() inter_channels = max(in_channels * radix // reduction_factor, 32) self.radix = radix self.groups = groups @@ -174,7 +174,7 @@ def __init__(self, avg_down_stride=True, **kwargs): """Bottleneck block for ResNeSt.""" - super(Bottleneck, self).__init__(inplanes, planes, **kwargs) + super().__init__(inplanes, planes, **kwargs) if groups == 1: width = self.planes @@ -304,7 +304,7 @@ def __init__(self, self.radix = radix self.reduction_factor = reduction_factor self.avg_down_stride = avg_down_stride - super(ResNeSt, self).__init__(**kwargs) + super().__init__(**kwargs) def make_res_layer(self, **kwargs): """Pack all blocks in a stage into a ``ResLayer``.""" diff --git a/mmseg/models/backbones/resnet.py b/mmseg/models/backbones/resnet.py index 9eda906e69..9226c90d85 100644 --- a/mmseg/models/backbones/resnet.py +++ b/mmseg/models/backbones/resnet.py @@ -4,8 +4,8 @@ import torch.nn as nn import torch.utils.checkpoint as cp from mmcv.cnn import build_conv_layer, build_norm_layer, build_plugin_layer -from mmcv.runner import BaseModule -from mmcv.utils.parrots_wrapper import _BatchNorm +from mmengine.model import BaseModule +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm from mmseg.registry import MODELS from ..utils import ResLayer @@ -29,7 +29,7 @@ def __init__(self, dcn=None, plugins=None, init_cfg=None): - super(BasicBlock, self).__init__(init_cfg) + super().__init__(init_cfg) assert dcn is None, 'Not implemented yet.' assert plugins is None, 'Not implemented yet.' 
@@ -118,7 +118,7 @@ def __init__(self, dcn=None, plugins=None, init_cfg=None): - super(Bottleneck, self).__init__(init_cfg) + super().__init__(init_cfg) assert style in ['pytorch', 'caffe'] assert dcn is None or isinstance(dcn, dict) assert plugins is None or isinstance(plugins, list) @@ -418,7 +418,7 @@ def __init__(self, zero_init_residual=True, pretrained=None, init_cfg=None): - super(ResNet, self).__init__(init_cfg) + super().__init__(init_cfg) if depth not in self.arch_settings: raise KeyError(f'invalid depth {depth} for resnet') @@ -676,7 +676,7 @@ def forward(self, x): def train(self, mode=True): """Convert the model into training mode while keep normalization layer freezed.""" - super(ResNet, self).train(mode) + super().train(mode) self._freeze_stages() if mode and self.norm_eval: for m in self.modules(): @@ -696,8 +696,7 @@ class ResNetV1c(ResNet): """ def __init__(self, **kwargs): - super(ResNetV1c, self).__init__( - deep_stem=True, avg_down=False, **kwargs) + super().__init__(deep_stem=True, avg_down=False, **kwargs) @MODELS.register_module() @@ -710,5 +709,4 @@ class ResNetV1d(ResNet): """ def __init__(self, **kwargs): - super(ResNetV1d, self).__init__( - deep_stem=True, avg_down=True, **kwargs) + super().__init__(deep_stem=True, avg_down=True, **kwargs) diff --git a/mmseg/models/backbones/resnext.py b/mmseg/models/backbones/resnext.py index 2f7cacab7b..67a244a12f 100644 --- a/mmseg/models/backbones/resnext.py +++ b/mmseg/models/backbones/resnext.py @@ -23,7 +23,7 @@ def __init__(self, base_width=4, base_channels=64, **kwargs): - super(Bottleneck, self).__init__(inplanes, planes, **kwargs) + super().__init__(inplanes, planes, **kwargs) if groups == 1: width = self.planes @@ -139,7 +139,7 @@ class ResNeXt(ResNet): def __init__(self, groups=1, base_width=4, **kwargs): self.groups = groups self.base_width = base_width - super(ResNeXt, self).__init__(**kwargs) + super().__init__(**kwargs) def make_res_layer(self, **kwargs): """Pack all blocks in a stage into a ``ResLayer``""" diff --git a/mmseg/models/backbones/stdc.py b/mmseg/models/backbones/stdc.py index ece7da172f..758a3c92e0 100644 --- a/mmseg/models/backbones/stdc.py +++ b/mmseg/models/backbones/stdc.py @@ -4,10 +4,10 @@ import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import ConvModule -from mmcv.runner.base_module import BaseModule, ModuleList, Sequential +from mmengine.model import BaseModule, ModuleList, Sequential -from mmseg.ops import resize from mmseg.registry import MODELS +from ..utils import resize from .bisenetv1 import AttentionRefinementModule @@ -35,7 +35,7 @@ def __init__(self, num_convs=4, fusion_type='add', init_cfg=None): - super(STDCModule, self).__init__(init_cfg=init_cfg) + super().__init__(init_cfg=init_cfg) assert num_convs > 1 assert fusion_type in ['add', 'cat'] self.stride = stride @@ -155,7 +155,7 @@ def __init__(self, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'), init_cfg=None): - super(FeatureFusionModule, self).__init__(init_cfg=init_cfg) + super().__init__(init_cfg=init_cfg) channels = out_channels // scale_factor self.conv0 = ConvModule( in_channels, out_channels, 1, norm_cfg=norm_cfg, act_cfg=act_cfg) @@ -240,7 +240,7 @@ def __init__(self, with_final_conv=False, pretrained=None, init_cfg=None): - super(STDCNet, self).__init__(init_cfg=init_cfg) + super().__init__(init_cfg=init_cfg) assert stdc_type in self.arch_settings, \ f'invalid structure {stdc_type} for STDCNet.' 
assert bottleneck_type in ['add', 'cat'],\ @@ -370,7 +370,7 @@ def __init__(self, align_corners=None, norm_cfg=dict(type='BN'), init_cfg=None): - super(STDCContextPathNet, self).__init__(init_cfg=init_cfg) + super().__init__(init_cfg=init_cfg) self.backbone = MODELS.build(backbone_cfg) self.arms = ModuleList() self.convs = ModuleList() diff --git a/mmseg/models/backbones/swin.py b/mmseg/models/backbones/swin.py index ca8a71f0d3..67b28a96e1 100644 --- a/mmseg/models/backbones/swin.py +++ b/mmseg/models/backbones/swin.py @@ -9,12 +9,12 @@ import torch.utils.checkpoint as cp from mmcv.cnn import build_norm_layer from mmcv.cnn.bricks.transformer import FFN, build_dropout -from mmcv.cnn.utils.weight_init import (constant_init, trunc_normal_, - trunc_normal_init) -from mmcv.runner import (BaseModule, CheckpointLoader, ModuleList, - load_state_dict) -from mmcv.utils import to_2tuple from mmengine.logging import print_log +from mmengine.model import BaseModule, ModuleList +from mmengine.model.weight_init import (constant_init, trunc_normal_, + trunc_normal_init) +from mmengine.runner import CheckpointLoader +from mmengine.utils import to_2tuple from mmseg.registry import MODELS from ..utils.embed import PatchEmbed, PatchMerging @@ -326,7 +326,7 @@ def __init__(self, with_cp=False, init_cfg=None): - super(SwinBlock, self).__init__(init_cfg=init_cfg) + super().__init__(init_cfg=init_cfg) self.with_cp = with_cp @@ -561,7 +561,7 @@ def __init__(self, else: raise TypeError('pretrained must be a str or None') - super(SwinTransformer, self).__init__(init_cfg=init_cfg) + super().__init__(init_cfg=init_cfg) num_layers = len(depths) self.out_indices = out_indices @@ -636,7 +636,7 @@ def __init__(self, def train(self, mode=True): """Convert the model into training mode while keep layers freezed.""" - super(SwinTransformer, self).train(mode) + super().train(mode) self._freeze_stages() def _freeze_stages(self): @@ -716,23 +716,25 @@ def init_weights(self): ] for table_key in relative_position_bias_table_keys: table_pretrained = state_dict[table_key] - table_current = self.state_dict()[table_key] - L1, nH1 = table_pretrained.size() - L2, nH2 = table_current.size() - if nH1 != nH2: - print_log(f'Error in loading {table_key}, pass') - elif L1 != L2: - S1 = int(L1**0.5) - S2 = int(L2**0.5) - table_pretrained_resized = F.interpolate( - table_pretrained.permute(1, 0).reshape(1, nH1, S1, S1), - size=(S2, S2), - mode='bicubic') - state_dict[table_key] = table_pretrained_resized.view( - nH2, L2).permute(1, 0).contiguous() + if table_key in self.state_dict(): + table_current = self.state_dict()[table_key] + L1, nH1 = table_pretrained.size() + L2, nH2 = table_current.size() + if nH1 != nH2: + print_log(f'Error in loading {table_key}, pass') + elif L1 != L2: + S1 = int(L1**0.5) + S2 = int(L2**0.5) + table_pretrained_resized = F.interpolate( + table_pretrained.permute(1, 0).reshape( + 1, nH1, S1, S1), + size=(S2, S2), + mode='bicubic') + state_dict[table_key] = table_pretrained_resized.view( + nH2, L2).permute(1, 0).contiguous() # load state_dict - load_state_dict(self, state_dict, strict=False, logger=None) + self.load_state_dict(state_dict, strict=False) def forward(self, x): x, hw_shape = self.patch_embed(x) diff --git a/mmseg/models/backbones/timm_backbone.py b/mmseg/models/backbones/timm_backbone.py index 478e8bdea4..1eef302bdd 100644 --- a/mmseg/models/backbones/timm_backbone.py +++ b/mmseg/models/backbones/timm_backbone.py @@ -4,8 +4,8 @@ except ImportError: timm = None -from mmcv.cnn.bricks.registry import 
NORM_LAYERS -from mmcv.runner import BaseModule +from mmengine.model import BaseModule +from mmengine.registry import MODELS as MMENGINE_MODELS from mmseg.registry import MODELS @@ -37,9 +37,9 @@ def __init__( ): if timm is None: raise RuntimeError('timm is not installed') - super(TIMMBackbone, self).__init__(init_cfg) + super().__init__(init_cfg) if 'norm_layer' in kwargs: - kwargs['norm_layer'] = NORM_LAYERS.get(kwargs['norm_layer']) + kwargs['norm_layer'] = MMENGINE_MODELS.get(kwargs['norm_layer']) self.timm_model = timm.create_model( model_name=model_name, features_only=features_only, diff --git a/mmseg/models/backbones/twins.py b/mmseg/models/backbones/twins.py index ce1faaa219..b6a6eea795 100644 --- a/mmseg/models/backbones/twins.py +++ b/mmseg/models/backbones/twins.py @@ -8,9 +8,9 @@ from mmcv.cnn import build_norm_layer from mmcv.cnn.bricks.drop import build_dropout from mmcv.cnn.bricks.transformer import FFN -from mmcv.cnn.utils.weight_init import (constant_init, normal_init, +from mmengine.model import BaseModule, ModuleList +from mmengine.model.weight_init import (constant_init, normal_init, trunc_normal_init) -from mmcv.runner import BaseModule, ModuleList from torch.nn.modules.batchnorm import _BatchNorm from mmseg.models.backbones.mit import EfficientMultiheadAttention @@ -62,7 +62,7 @@ def __init__(self, norm_cfg=dict(type='LN'), sr_ratio=1, init_cfg=None): - super(GlobalSubsampledAttention, self).__init__( + super().__init__( embed_dims, num_heads, attn_drop=attn_drop, @@ -112,7 +112,7 @@ def __init__(self, norm_cfg=dict(type='LN'), sr_ratio=1., init_cfg=None): - super(GSAEncoderLayer, self).__init__(init_cfg=init_cfg) + super().__init__(init_cfg=init_cfg) self.norm1 = build_norm_layer(norm_cfg, embed_dims, postfix=1)[1] self.attn = GlobalSubsampledAttention( @@ -172,7 +172,7 @@ def __init__(self, proj_drop_rate=0., window_size=1, init_cfg=None): - super(LocallyGroupedSelfAttention, self).__init__(init_cfg=init_cfg) + super().__init__(init_cfg=init_cfg) assert embed_dims % num_heads == 0, f'dim {embed_dims} should be ' \ f'divided by num_heads ' \ @@ -284,7 +284,7 @@ def __init__(self, window_size=1, init_cfg=None): - super(LSAEncoderLayer, self).__init__(init_cfg=init_cfg) + super().__init__(init_cfg=init_cfg) self.norm1 = build_norm_layer(norm_cfg, embed_dims, postfix=1)[1] self.attn = LocallyGroupedSelfAttention(embed_dims, num_heads, @@ -325,7 +325,7 @@ class ConditionalPositionEncoding(BaseModule): """ def __init__(self, in_channels, embed_dims=768, stride=1, init_cfg=None): - super(ConditionalPositionEncoding, self).__init__(init_cfg=init_cfg) + super().__init__(init_cfg=init_cfg) self.proj = nn.Conv2d( in_channels, embed_dims, @@ -401,7 +401,7 @@ def __init__(self, norm_after_stage=False, pretrained=None, init_cfg=None): - super(PCPVT, self).__init__(init_cfg=init_cfg) + super().__init__(init_cfg=init_cfg) assert not (init_cfg and pretrained), \ 'init_cfg and pretrained cannot be set at the same time' if isinstance(pretrained, str): @@ -471,7 +471,7 @@ def __init__(self, def init_weights(self): if self.init_cfg is not None: - super(PCPVT, self).init_weights() + super().init_weights() else: for m in self.modules(): if isinstance(m, nn.Linear): @@ -563,11 +563,11 @@ def __init__(self, norm_after_stage=True, pretrained=None, init_cfg=None): - super(SVT, self).__init__(in_channels, embed_dims, patch_sizes, - strides, num_heads, mlp_ratios, out_indices, - qkv_bias, drop_rate, attn_drop_rate, - drop_path_rate, norm_cfg, depths, sr_ratios, - norm_after_stage, pretrained, 
init_cfg) + super().__init__(in_channels, embed_dims, patch_sizes, strides, + num_heads, mlp_ratios, out_indices, qkv_bias, + drop_rate, attn_drop_rate, drop_path_rate, norm_cfg, + depths, sr_ratios, norm_after_stage, pretrained, + init_cfg) # transformer encoder dpr = [ x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) diff --git a/mmseg/models/backbones/unet.py b/mmseg/models/backbones/unet.py index b07edd5f24..545921db8e 100644 --- a/mmseg/models/backbones/unet.py +++ b/mmseg/models/backbones/unet.py @@ -3,14 +3,12 @@ import torch.nn as nn import torch.utils.checkpoint as cp -from mmcv.cnn import (UPSAMPLE_LAYERS, ConvModule, build_activation_layer, - build_norm_layer) -from mmcv.runner import BaseModule -from mmcv.utils.parrots_wrapper import _BatchNorm +from mmcv.cnn import ConvModule, build_activation_layer, build_norm_layer +from mmengine.model import BaseModule +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm -from mmseg.ops import Upsample from mmseg.registry import MODELS -from ..utils import UpConvBlock +from ..utils import UpConvBlock, Upsample class BasicConvBlock(nn.Module): @@ -55,7 +53,7 @@ def __init__(self, act_cfg=dict(type='ReLU'), dcn=None, plugins=None): - super(BasicConvBlock, self).__init__() + super().__init__() assert dcn is None, 'Not implemented yet.' assert plugins is None, 'Not implemented yet.' @@ -86,7 +84,7 @@ def forward(self, x): return out -@UPSAMPLE_LAYERS.register_module() +@MODELS.register_module() class DeconvModule(nn.Module): """Deconvolution upsample module in decoder for UNet (2X upsample). @@ -114,7 +112,7 @@ def __init__(self, *, kernel_size=4, scale_factor=2): - super(DeconvModule, self).__init__() + super().__init__() assert (kernel_size - scale_factor >= 0) and\ (kernel_size - scale_factor) % 2 == 0,\ @@ -147,7 +145,7 @@ def forward(self, x): return out -@UPSAMPLE_LAYERS.register_module() +@MODELS.register_module() class InterpConv(nn.Module): """Interpolation upsample module in decoder for UNet. 
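
The `unet.py` hunks above retarget `DeconvModule` and `InterpConv` from mmcv's dedicated `UPSAMPLE_LAYERS` registry to the unified `MODELS` registry. A hedged sketch of how a module registered this way is then instantiated from a config dict; `MyUpsample` is a hypothetical name shown only to illustrate the registry pattern and is not defined anywhere in this patch:

    import torch.nn as nn

    from mmseg.registry import MODELS


    @MODELS.register_module()
    class MyUpsample(nn.Module):  # hypothetical example module
        def __init__(self, scale_factor=2):
            super().__init__()
            self.up = nn.Upsample(
                scale_factor=scale_factor,
                mode='bilinear',
                align_corners=False)

        def forward(self, x):
            return self.up(x)


    # 'type' selects the registered class; the remaining keys are passed
    # to its constructor as keyword arguments.
    layer = MODELS.build(dict(type='MyUpsample', scale_factor=2))

Consolidating on one registry is what lets config files refer to upsample layers, backbones, and heads uniformly through `MODELS.build`.
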
@@ -193,7 +191,7 @@ def __init__(self,
                  padding=0,
                  upsample_cfg=dict(
                      scale_factor=2, mode='bilinear', align_corners=False)):
-        super(InterpConv, self).__init__()
+        super().__init__()
         self.with_cp = with_cp
         conv = ConvModule(
@@ -300,7 +298,7 @@ def __init__(self,
                  plugins=None,
                  pretrained=None,
                  init_cfg=None):
-        super(UNet, self).__init__(init_cfg)
+        super().__init__(init_cfg)
         self.pretrained = pretrained
         assert not (init_cfg and pretrained), \
@@ -398,7 +396,7 @@ def __init__(self,
                     act_cfg=act_cfg,
                     dcn=None,
                     plugins=None))
-            self.encoder.append((nn.Sequential(*enc_conv_block)))
+            self.encoder.append(nn.Sequential(*enc_conv_block))
             in_channels = base_channels * 2**i

     def forward(self, x):
@@ -417,7 +415,7 @@ def forward(self, x):
     def train(self, mode=True):
         """Convert the model into training mode while keep normalization layer
         freezed."""
-        super(UNet, self).train(mode)
+        super().train(mode)
         if mode and self.norm_eval:
             for m in self.modules():
                 # trick: eval have effect on BatchNorm only
diff --git a/mmseg/models/backbones/vit.py b/mmseg/models/backbones/vit.py
index 7757d5064a..dd0f688fcc 100644
--- a/mmseg/models/backbones/vit.py
+++ b/mmseg/models/backbones/vit.py
@@ -7,17 +7,16 @@
 import torch.utils.checkpoint as cp
 from mmcv.cnn import build_norm_layer
 from mmcv.cnn.bricks.transformer import FFN, MultiheadAttention
-from mmcv.cnn.utils.weight_init import (constant_init, kaiming_init,
-                                        trunc_normal_)
-from mmcv.runner import (BaseModule, CheckpointLoader, ModuleList,
-                         load_state_dict)
 from mmengine.logging import print_log
+from mmengine.model import BaseModule, ModuleList
+from mmengine.model.weight_init import (constant_init, kaiming_init,
+                                        trunc_normal_)
+from mmengine.runner.checkpoint import CheckpointLoader, load_state_dict
 from torch.nn.modules.batchnorm import _BatchNorm
 from torch.nn.modules.utils import _pair as to_2tuple

-from mmseg.ops import resize
 from mmseg.registry import MODELS
-from ..utils import PatchEmbed
+from ..utils import PatchEmbed, resize


 class TransformerEncoderLayer(BaseModule):
@@ -61,7 +60,7 @@ def __init__(self,
                  attn_cfg=dict(),
                  ffn_cfg=dict(),
                  with_cp=False):
-        super(TransformerEncoderLayer, self).__init__()
+        super().__init__()

         self.norm1_name, norm1 = build_norm_layer(
             norm_cfg, embed_dims, postfix=1)
@@ -133,12 +132,16 @@ class VisionTransformer(BaseModule):
     Args:
         img_size (int | tuple): Input image size. Default: 224.
         patch_size (int): The patch size. Default: 16.
+        patch_pad (str | int | None): The padding method in patch embedding.
+            Default: 'corner'.
         in_channels (int): Number of input channels. Default: 3.
         embed_dims (int): embedding dimension. Default: 768.
         num_layers (int): depth of transformer. Default: 12.
         num_heads (int): number of attention heads. Default: 12.
         mlp_ratio (int): ratio of mlp hidden dim to embedding dim.
             Default: 4.
+        out_origin (bool): Whether to output the original input embedding.
+            Default: False.
         out_indices (list | tuple | int): Output from which stages.
             Default: -1.
         qkv_bias (bool): enable bias for qkv if True. Default: True.
@@ -155,8 +158,12 @@ class VisionTransformer(BaseModule):
             Default: dict(type='LN')
         act_cfg (dict): The activation config for FFNs.
             Default: dict(type='GELU').
+        patch_bias (bool): Whether to use bias in the convolution of the
+            PatchEmbed block. Default: False.
         patch_norm (bool): Whether to add a norm in PatchEmbed Block.
             Default: False.
+        pre_norm (bool): Whether to add a norm before Transformer Layers.
+            Default: False.
         final_norm (bool): Whether to add a additional layer to normalize
            final feature map.
Default: False. interpolate_mode (str): Select the interpolate mode for position @@ -168,6 +175,8 @@ class VisionTransformer(BaseModule): and its variants only. Default: False. with_cp (bool): Use checkpoint or not. Using checkpoint will save some memory while slowing down the training speed. Default: False. + frozen_exclude (List): List of parameters that are not to be frozen. + Default: ["all"], "all" means there are no frozen parameters. pretrained (str, optional): model pretrained path. Default: None. init_cfg (dict or list[dict], optional): Initialization config dict. Default: None. @@ -176,11 +185,13 @@ class VisionTransformer(BaseModule): def __init__(self, img_size=224, patch_size=16, + patch_pad='corner', in_channels=3, embed_dims=768, num_layers=12, num_heads=12, mlp_ratio=4, + out_origin=False, out_indices=-1, qkv_bias=True, drop_rate=0., @@ -191,14 +202,17 @@ def __init__(self, norm_cfg=dict(type='LN'), act_cfg=dict(type='GELU'), patch_norm=False, + patch_bias=False, + pre_norm=False, final_norm=False, interpolate_mode='bicubic', num_fcs=2, norm_eval=False, with_cp=False, + frozen_exclude=['all'], pretrained=None, init_cfg=None): - super(VisionTransformer, self).__init__(init_cfg=init_cfg) + super().__init__(init_cfg=init_cfg) if isinstance(img_size, int): img_size = to_2tuple(img_size) @@ -228,6 +242,8 @@ def __init__(self, self.norm_eval = norm_eval self.with_cp = with_cp self.pretrained = pretrained + self.out_origin = out_origin + self.frozen_exclude = frozen_exclude self.patch_embed = PatchEmbed( in_channels=in_channels, @@ -235,7 +251,8 @@ def __init__(self, conv_type='Conv2d', kernel_size=patch_size, stride=patch_size, - padding='corner', + padding=patch_pad, + bias=patch_bias, norm_cfg=norm_cfg if patch_norm else None, init_cfg=None, ) @@ -249,6 +266,12 @@ def __init__(self, self.pos_embed = nn.Parameter( torch.zeros(1, num_patches + 1, embed_dims)) self.drop_after_pos = nn.Dropout(p=drop_rate) + self.pre_norm = pre_norm + + if self.pre_norm: + self.pre_ln_name, pre_ln = build_norm_layer( + norm_cfg, embed_dims, postfix='_pre') + self.add_module(self.pre_ln_name, pre_ln) if isinstance(out_indices, int): if out_indices == -1: @@ -286,20 +309,36 @@ def __init__(self, norm_cfg, embed_dims, postfix=1) self.add_module(self.norm1_name, norm1) + self._freeze() + + @property + def pre_ln(self): + return getattr(self, self.pre_ln_name) + @property def norm1(self): return getattr(self, self.norm1_name) def init_weights(self): - if (isinstance(self.init_cfg, dict) - and self.init_cfg.get('type') == 'Pretrained'): + if isinstance(self.init_cfg, dict) and \ + self.init_cfg.get('type') in ['Pretrained', 'Pretrained_Part']: checkpoint = CheckpointLoader.load_checkpoint( self.init_cfg['checkpoint'], logger=None, map_location='cpu') - if 'state_dict' in checkpoint: - state_dict = checkpoint['state_dict'] - else: - state_dict = checkpoint + if self.init_cfg.get('type') == 'Pretrained': + if 'state_dict' in checkpoint: + state_dict = checkpoint['state_dict'] + else: + state_dict = checkpoint + + elif self.init_cfg.get('type') == 'Pretrained_Part': + state_dict = checkpoint.copy() + para_prefix = 'image_encoder' + prefix_len = len(para_prefix) + 1 + for k, v in checkpoint.items(): + state_dict.pop(k) + if para_prefix in k: + state_dict[k[prefix_len:]] = v if 'pos_embed' in state_dict.keys(): if self.pos_embed.shape != state_dict['pos_embed'].shape: @@ -316,7 +355,7 @@ def init_weights(self): load_state_dict(self, state_dict, strict=False, logger=None) elif self.init_cfg is not None: - 
super(VisionTransformer, self).init_weights() + super().init_weights() else: # We only implement the 'jax_impl' initialization implemented at # https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/vision_transformer.py#L353 # noqa: E501 @@ -335,8 +374,15 @@ def init_weights(self): elif isinstance(m, (_BatchNorm, nn.GroupNorm, nn.LayerNorm)): constant_init(m, val=1.0, bias=0.) + def _freeze(self): + if 'all' in self.frozen_exclude: + return + for name, param in self.named_parameters(): + if not any([exclude in name for exclude in self.frozen_exclude]): + param.requires_grad = False + def _pos_embeding(self, patched_img, hw_shape, pos_embed): - """Positiong embeding method. + """Positioning embeding method. Resize the pos_embed, if the input image size doesn't match the training size. @@ -410,7 +456,23 @@ def forward(self, inputs): # Remove class token for transformer encoder input x = x[:, 1:] + if self.pre_norm: + x = self.pre_ln(x) + outs = [] + if self.out_origin: + if self.with_cls_token: + # Remove class token and reshape token for decoder head + out = x[:, 1:] + else: + out = x + B, _, C = out.shape + out = out.reshape(B, hw_shape[0], hw_shape[1], + C).permute(0, 3, 1, 2).contiguous() + if self.output_cls_token: + out = [out, x[:, 0]] + outs.append(out) + for i, layer in enumerate(self.layers): x = layer(x) if i == len(self.layers) - 1: @@ -432,7 +494,7 @@ def forward(self, inputs): return tuple(outs) def train(self, mode=True): - super(VisionTransformer, self).train(mode) + super().train(mode) if mode and self.norm_eval: for m in self.modules(): if isinstance(m, nn.LayerNorm): diff --git a/mmseg/models/backbones/vpd.py b/mmseg/models/backbones/vpd.py new file mode 100644 index 0000000000..e0536d31c6 --- /dev/null +++ b/mmseg/models/backbones/vpd.py @@ -0,0 +1,395 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# ------------------------------------------------------------------------------ +# Adapted from https://github.com/wl-zhao/VPD/blob/main/vpd/models.py +# Original licence: MIT License +# ------------------------------------------------------------------------------ + +import math +from typing import List, Optional, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.model import BaseModule +from mmengine.runner import CheckpointLoader, load_checkpoint + +from mmseg.registry import MODELS +from mmseg.utils import ConfigType, OptConfigType + +try: + from ldm.modules.diffusionmodules.util import timestep_embedding + from ldm.util import instantiate_from_config + has_ldm = True +except ImportError: + has_ldm = False + + +def register_attention_control(model, controller): + """Registers a control function to manage attention within a model. + + Args: + model: The model to which attention is to be registered. + controller: The control function responsible for managing attention. + """ + + def ca_forward(self, place_in_unet): + """Custom forward method for attention. + + Args: + self: Reference to the current object. + place_in_unet: The location in UNet (down/mid/up). + + Returns: + The modified forward method. 
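The new `frozen_exclude` argument inverts the usual freezing convention: everything is frozen except parameters whose names match an entry, and the default sentinel `['all']` freezes nothing. A minimal sketch of `_freeze` on a toy module:

```python
import torch.nn as nn


def freeze_except(model: nn.Module, frozen_exclude=('all', )):
    """Freeze every parameter whose name matches no entry in the list."""
    if 'all' in frozen_exclude:
        return  # nothing gets frozen
    for name, param in model.named_parameters():
        if not any(exclude in name for exclude in frozen_exclude):
            param.requires_grad = False


model = nn.Sequential(nn.Linear(4, 4), nn.Linear(4, 2))
freeze_except(model, frozen_exclude=['1'])  # keep only the second layer
assert not model[0].weight.requires_grad
assert model[1].weight.requires_grad
```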
+ """ + + def forward(x, context=None, mask=None): + h = self.heads + is_cross = context is not None + context = context or x # if context is None, use x + + q, k, v = self.to_q(x), self.to_k(context), self.to_v(context) + q, k, v = ( + tensor.view(tensor.shape[0] * h, tensor.shape[1], + tensor.shape[2] // h) for tensor in [q, k, v]) + + sim = torch.matmul(q, k.transpose(-2, -1)) * self.scale + + if mask is not None: + mask = mask.flatten(1).unsqueeze(1).repeat(h, 1, 1) + max_neg_value = -torch.finfo(sim.dtype).max + sim.masked_fill_(~mask, max_neg_value) + + attn = sim.softmax(dim=-1) + attn_mean = attn.view(h, attn.shape[0] // h, + *attn.shape[1:]).mean(0) + controller(attn_mean, is_cross, place_in_unet) + + out = torch.matmul(attn, v) + out = out.view(out.shape[0] // h, out.shape[1], out.shape[2] * h) + return self.to_out(out) + + return forward + + def register_recr(net_, count, place_in_unet): + """Recursive function to register the custom forward method to all + CrossAttention layers. + + Args: + net_: The network layer currently being processed. + count: The current count of layers processed. + place_in_unet: The location in UNet (down/mid/up). + + Returns: + The updated count of layers processed. + """ + if net_.__class__.__name__ == 'CrossAttention': + net_.forward = ca_forward(net_, place_in_unet) + return count + 1 + if hasattr(net_, 'children'): + return sum( + register_recr(child, 0, place_in_unet) + for child in net_.children()) + return count + + cross_att_count = sum( + register_recr(net[1], 0, place) for net, place in [ + (child, 'down') if 'input_blocks' in name else ( + child, 'up') if 'output_blocks' in name else + (child, + 'mid') if 'middle_block' in name else (None, None) # Default case + for name, child in model.diffusion_model.named_children() + ] if net is not None) + + controller.num_att_layers = cross_att_count + + +class AttentionStore: + """A class for storing attention information in the UNet model. + + Attributes: + base_size (int): Base size for storing attention information. + max_size (int): Maximum size for storing attention information. + """ + + def __init__(self, base_size=64, max_size=None): + """Initialize AttentionStore with default or custom sizes.""" + self.reset() + self.base_size = base_size + self.max_size = max_size or (base_size // 2) + self.num_att_layers = -1 + + @staticmethod + def get_empty_store(): + """Returns an empty store for holding attention values.""" + return { + key: [] + for key in [ + 'down_cross', 'mid_cross', 'up_cross', 'down_self', 'mid_self', + 'up_self' + ] + } + + def reset(self): + """Resets the step and attention stores to their initial states.""" + self.cur_step = 0 + self.cur_att_layer = 0 + self.step_store = self.get_empty_store() + self.attention_store = {} + + def forward(self, attn, is_cross: bool, place_in_unet: str): + """Processes a single forward step, storing the attention. + + Args: + attn: The attention tensor. + is_cross (bool): Whether it's cross attention. + place_in_unet (str): The location in UNet (down/mid/up). + + Returns: + The unmodified attention tensor. 
+ """ + key = f"{place_in_unet}_{'cross' if is_cross else 'self'}" + if attn.shape[1] <= (self.max_size)**2: + self.step_store[key].append(attn) + return attn + + def between_steps(self): + """Processes and stores attention information between steps.""" + if not self.attention_store: + self.attention_store = self.step_store + else: + for key in self.attention_store: + self.attention_store[key] = [ + stored + step for stored, step in zip( + self.attention_store[key], self.step_store[key]) + ] + self.step_store = self.get_empty_store() + + def get_average_attention(self): + """Calculates and returns the average attention across all steps.""" + return { + key: [item for item in self.step_store[key]] + for key in self.step_store + } + + def __call__(self, attn, is_cross: bool, place_in_unet: str): + """Allows the class instance to be callable.""" + return self.forward(attn, is_cross, place_in_unet) + + @property + def num_uncond_att_layers(self): + """Returns the number of unconditional attention layers (default is + 0).""" + return 0 + + def step_callback(self, x_t): + """A placeholder for a step callback. + + Returns the input unchanged. + """ + return x_t + + +class UNetWrapper(nn.Module): + """A wrapper for UNet with optional attention mechanisms. + + Args: + unet (nn.Module): The UNet model to wrap + use_attn (bool): Whether to use attention. Defaults to True + base_size (int): Base size for the attention store. Defaults to 512 + max_attn_size (int, optional): Maximum size for the attention store. + Defaults to None + attn_selector (str): The types of attention to use. + Defaults to 'up_cross+down_cross' + """ + + def __init__(self, + unet, + use_attn=True, + base_size=512, + max_attn_size=None, + attn_selector='up_cross+down_cross'): + super().__init__() + + assert has_ldm, 'To use UNetWrapper, please install required ' \ + 'packages via `pip install -r requirements/optional.txt`.' 
+ + self.unet = unet + self.attention_store = AttentionStore( + base_size=base_size // 8, max_size=max_attn_size) + self.attn_selector = attn_selector.split('+') + self.use_attn = use_attn + self.init_sizes(base_size) + if self.use_attn: + register_attention_control(unet, self.attention_store) + + def init_sizes(self, base_size): + """Initialize sizes based on the base size.""" + self.size16 = base_size // 32 + self.size32 = base_size // 16 + self.size64 = base_size // 8 + + def forward(self, x, timesteps=None, context=None, y=None, **kwargs): + """Forward pass through the model.""" + diffusion_model = self.unet.diffusion_model + if self.use_attn: + self.attention_store.reset() + hs, emb, out_list = self._unet_forward(x, timesteps, context, y, + diffusion_model) + if self.use_attn: + self._append_attn_to_output(out_list) + return out_list[::-1] + + def _unet_forward(self, x, timesteps, context, y, diffusion_model): + hs = [] + t_emb = timestep_embedding( + timesteps, diffusion_model.model_channels, repeat_only=False) + emb = diffusion_model.time_embed(t_emb) + h = x.type(diffusion_model.dtype) + for module in diffusion_model.input_blocks: + h = module(h, emb, context) + hs.append(h) + h = diffusion_model.middle_block(h, emb, context) + out_list = [] + for i_out, module in enumerate(diffusion_model.output_blocks): + h = torch.cat([h, hs.pop()], dim=1) + h = module(h, emb, context) + if i_out in [1, 4, 7]: + out_list.append(h) + h = h.type(x.dtype) + out_list.append(h) + return hs, emb, out_list + + def _append_attn_to_output(self, out_list): + avg_attn = self.attention_store.get_average_attention() + attns = {self.size16: [], self.size32: [], self.size64: []} + for k in self.attn_selector: + for up_attn in avg_attn[k]: + size = int(math.sqrt(up_attn.shape[1])) + up_attn = up_attn.transpose(-1, -2).reshape( + *up_attn.shape[:2], size, -1) + attns[size].append(up_attn) + attn16 = torch.stack(attns[self.size16]).mean(0) + attn32 = torch.stack(attns[self.size32]).mean(0) + attn64 = torch.stack(attns[self.size64]).mean(0) if len( + attns[self.size64]) > 0 else None + out_list[1] = torch.cat([out_list[1], attn16], dim=1) + out_list[2] = torch.cat([out_list[2], attn32], dim=1) + if attn64 is not None: + out_list[3] = torch.cat([out_list[3], attn64], dim=1) + + +class TextAdapter(nn.Module): + """A PyTorch Module that serves as a text adapter. + + This module takes text embeddings and adjusts them based on a scaling + factor gamma. + """ + + def __init__(self, text_dim=768): + super().__init__() + self.fc = nn.Sequential( + nn.Linear(text_dim, text_dim), nn.GELU(), + nn.Linear(text_dim, text_dim)) + + def forward(self, texts, gamma): + texts_after = self.fc(texts) + texts = texts + gamma * texts_after + return texts + + +@MODELS.register_module() +class VPD(BaseModule): + """VPD (Visual Perception Diffusion) model. + + .. _`VPD`: https://arxiv.org/abs/2303.02153 + + Args: + diffusion_cfg (dict): Configuration for diffusion model. + class_embed_path (str): Path for class embeddings. + unet_cfg (dict, optional): Configuration for U-Net. + gamma (float, optional): Gamma for text adaptation. Defaults to 1e-4. + class_embed_select (bool, optional): If True, enables class embedding + selection. Defaults to False. + pad_shape (Optional[Union[int, List[int]]], optional): Padding shape. + Defaults to None. + pad_val (Union[int, List[int]], optional): Padding value. + Defaults to 0. + init_cfg (dict, optional): Configuration for network initialization. 
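The `TextAdapter` above is a residual two-layer MLP gated by `gamma`; with VPD's default `gamma=1e-4` the class embeddings are only nudged. A self-contained sketch (the class count and dimension are illustrative):

```python
import torch
import torch.nn as nn

text_dim = 768
fc = nn.Sequential(
    nn.Linear(text_dim, text_dim), nn.GELU(), nn.Linear(text_dim, text_dim))
texts = torch.randn(150, text_dim)         # one embedding per class
gamma = torch.full((text_dim, ), 1e-4)     # matches VPD's default gamma
texts_adapted = texts + gamma * fc(texts)  # tiny residual correction
assert texts_adapted.shape == texts.shape
```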
+ """ + + def __init__(self, + diffusion_cfg: ConfigType, + class_embed_path: str, + unet_cfg: OptConfigType = dict(), + gamma: float = 1e-4, + class_embed_select=False, + pad_shape: Optional[Union[int, List[int]]] = None, + pad_val: Union[int, List[int]] = 0, + init_cfg: OptConfigType = None): + + super().__init__(init_cfg=init_cfg) + + assert has_ldm, 'To use VPD model, please install required packages' \ + ' via `pip install -r requirements/optional.txt`.' + + if pad_shape is not None: + if not isinstance(pad_shape, (list, tuple)): + pad_shape = (pad_shape, pad_shape) + + self.pad_shape = pad_shape + self.pad_val = pad_val + + # diffusion model + diffusion_checkpoint = diffusion_cfg.pop('checkpoint', None) + sd_model = instantiate_from_config(diffusion_cfg) + if diffusion_checkpoint is not None: + load_checkpoint(sd_model, diffusion_checkpoint, strict=False) + + self.encoder_vq = sd_model.first_stage_model + self.unet = UNetWrapper(sd_model.model, **unet_cfg) + + # class embeddings & text adapter + class_embeddings = CheckpointLoader.load_checkpoint(class_embed_path) + text_dim = class_embeddings.size(-1) + self.text_adapter = TextAdapter(text_dim=text_dim) + self.class_embed_select = class_embed_select + if class_embed_select: + class_embeddings = torch.cat( + (class_embeddings, class_embeddings.mean(dim=0, + keepdims=True)), + dim=0) + self.register_buffer('class_embeddings', class_embeddings) + self.gamma = nn.Parameter(torch.ones(text_dim) * gamma) + + def forward(self, x): + """Extract features from images.""" + + # calculate cross-attn map + if self.class_embed_select: + if isinstance(x, (tuple, list)): + x, class_ids = x[:2] + class_ids = class_ids.tolist() + else: + class_ids = [-1] * x.size(0) + class_embeddings = self.class_embeddings[class_ids] + c_crossattn = self.text_adapter(class_embeddings, self.gamma) + c_crossattn = c_crossattn.unsqueeze(1) + else: + class_embeddings = self.class_embeddings + c_crossattn = self.text_adapter(class_embeddings, self.gamma) + c_crossattn = c_crossattn.unsqueeze(0).repeat(x.size(0), 1, 1) + + # pad to required input shape for pretrained diffusion model + if self.pad_shape is not None: + pad_width = max(0, self.pad_shape[1] - x.shape[-1]) + pad_height = max(0, self.pad_shape[0] - x.shape[-2]) + x = F.pad(x, (0, pad_width, 0, pad_height), value=self.pad_val) + + # forward the denoising model + with torch.no_grad(): + latents = self.encoder_vq.encode(x).mode().detach() + t = torch.ones((x.shape[0], ), device=x.device).long() + outs = self.unet(latents, t, context=c_crossattn) + + return outs diff --git a/mmseg/models/data_preprocessor.py b/mmseg/models/data_preprocessor.py index 000baf6a5e..8d32bc647b 100644 --- a/mmseg/models/data_preprocessor.py +++ b/mmseg/models/data_preprocessor.py @@ -1,13 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. from numbers import Number -from typing import List, Optional, Sequence, Tuple +from typing import Any, Dict, List, Optional, Sequence import torch from mmengine.model import BaseDataPreprocessor -from torch import Tensor from mmseg.registry import MODELS -from mmseg.utils import OptSampleList, stack_batch +from mmseg.utils import stack_batch @MODELS.register_module() @@ -49,18 +48,24 @@ class SegDataPreProcessor(BaseDataPreprocessor): rgb_to_bgr (bool): whether to convert image from RGB to RGB. Defaults to False. 
batch_augments (list[dict], optional): Batch-level augmentations + test_cfg (dict, optional): The padding size config in testing, if not + specify, will use `size` and `size_divisor` params as default. + Defaults to None, only supports keys `size` or `size_divisor`. """ - def __init__(self, - mean: Sequence[Number] = None, - std: Sequence[Number] = None, - size: Optional[tuple] = None, - size_divisor: Optional[int] = None, - pad_val: Number = 0, - seg_pad_val: Number = 255, - bgr_to_rgb: bool = False, - rgb_to_bgr: bool = False, - batch_augments: Optional[List[dict]] = None): + def __init__( + self, + mean: Sequence[Number] = None, + std: Sequence[Number] = None, + size: Optional[tuple] = None, + size_divisor: Optional[int] = None, + pad_val: Number = 0, + seg_pad_val: Number = 255, + bgr_to_rgb: bool = False, + rgb_to_bgr: bool = False, + batch_augments: Optional[List[dict]] = None, + test_cfg: dict = None, + ): super().__init__() self.size = size self.size_divisor = size_divisor @@ -87,46 +92,60 @@ def __init__(self, # TODO: support batch augmentations. self.batch_augments = batch_augments - def forward(self, - data: Sequence[dict], - training: bool = False) -> Tuple[Tensor, OptSampleList]: + # Support different padding methods in testing + self.test_cfg = test_cfg + + def forward(self, data: dict, training: bool = False) -> Dict[str, Any]: """Perform normalization、padding and bgr2rgb conversion based on ``BaseDataPreprocessor``. Args: - data (Sequence[dict]): data sampled from dataloader. + data (dict): data sampled from dataloader. training (bool): Whether to enable training time augmentation. Returns: - Tuple[torch.Tensor, Optional[list]]: Data in the same format as the - model input. + Dict: Data in the same format as the model input. """ - inputs, batch_data_samples = self.collate_data(data) - + data = self.cast_data(data) # type: ignore + inputs = data['inputs'] + data_samples = data.get('data_samples', None) # TODO: whether normalize should be after stack_batch if self.channel_conversion and inputs[0].size(0) == 3: inputs = [_input[[2, 1, 0], ...] for _input in inputs] + inputs = [_input.float() for _input in inputs] if self._enable_normalize: inputs = [(_input - self.mean) / self.std for _input in inputs] - else: - inputs = [_input.float() for _input in inputs] if training: - batch_inputs, batch_data_samples = stack_batch( + assert data_samples is not None, ('During training, ', + '`data_samples` must be define.') + inputs, data_samples = stack_batch( inputs=inputs, - batch_data_samples=batch_data_samples, + data_samples=data_samples, size=self.size, size_divisor=self.size_divisor, pad_val=self.pad_val, seg_pad_val=self.seg_pad_val) if self.batch_augments is not None: - inputs, batch_data_samples = self.batch_augments( - inputs, batch_data_samples) - return batch_inputs, batch_data_samples + inputs, data_samples = self.batch_augments( + inputs, data_samples) else: - assert len(inputs) == 1, ( - 'Batch inference is not support currently, ' - 'as the image size might be different in a batch') - return torch.stack(inputs, dim=0), batch_data_samples + img_size = inputs[0].shape[1:] + assert all(input_.shape[1:] == img_size for input_ in inputs), \ + 'The image size in a batch should be the same.' 
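`forward` now consumes the raw dataloader dict and returns a dict, replacing the old `(inputs, data_samples)` tuple. A sketch of the test-time branch without `test_cfg`, using the ImageNet statistics common in the configs:

```python
import torch

mean = torch.tensor([123.675, 116.28, 103.53]).view(-1, 1, 1)
std = torch.tensor([58.395, 57.12, 57.375]).view(-1, 1, 1)
inputs = [torch.randint(0, 256, (3, 512, 512)).float() for _ in range(2)]
inputs = [(img - mean) / std for img in inputs]   # cast + normalize
batch = dict(inputs=torch.stack(inputs, dim=0), data_samples=None)
assert batch['inputs'].shape == (2, 3, 512, 512)
```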
+ # pad images when testing + if self.test_cfg: + inputs, padded_samples = stack_batch( + inputs=inputs, + size=self.test_cfg.get('size', None), + size_divisor=self.test_cfg.get('size_divisor', None), + pad_val=self.pad_val, + seg_pad_val=self.seg_pad_val) + for data_sample, pad_info in zip(data_samples, padded_samples): + data_sample.set_metainfo({**pad_info}) + else: + inputs = torch.stack(inputs, dim=0) + + return dict(inputs=inputs, data_samples=data_samples) diff --git a/mmseg/models/decode_heads/__init__.py b/mmseg/models/decode_heads/__init__.py index 8add7615c2..4229763816 100644 --- a/mmseg/models/decode_heads/__init__.py +++ b/mmseg/models/decode_heads/__init__.py @@ -4,6 +4,7 @@ from .aspp_head import ASPPHead from .cc_head import CCHead from .da_head import DAHead +from .ddr_head import DDRHead from .dm_head import DMHead from .dnl_head import DNLHead from .dpt_head import DPTHead @@ -12,14 +13,19 @@ from .fcn_head import FCNHead from .fpn_head import FPNHead from .gc_head import GCHead +from .ham_head import LightHamHead from .isa_head import ISAHead from .knet_head import IterativeDecodeHead, KernelUpdateHead, KernelUpdator from .lraspp_head import LRASPPHead +from .mask2former_head import Mask2FormerHead +from .maskformer_head import MaskFormerHead from .nl_head import NLHead from .ocr_head import OCRHead +from .pid_head import PIDHead from .point_head import PointHead from .psa_head import PSAHead from .psp_head import PSPHead +from .san_head import SideAdapterCLIPHead from .segformer_head import SegformerHead from .segmenter_mask_head import SegmenterMaskTransformerHead from .sep_aspp_head import DepthwiseSeparableASPPHead @@ -28,6 +34,7 @@ from .setr_up_head import SETRUPHead from .stdc_head import STDCHead from .uper_head import UPerHead +from .vpd_depth_head import VPDDepthHead __all__ = [ 'FCNHead', 'PSPHead', 'ASPPHead', 'PSAHead', 'NLHead', 'GCHead', 'CCHead', @@ -36,5 +43,6 @@ 'PointHead', 'APCHead', 'DMHead', 'LRASPPHead', 'SETRUPHead', 'SETRMLAHead', 'DPTHead', 'SETRMLAHead', 'SegmenterMaskTransformerHead', 'SegformerHead', 'ISAHead', 'STDCHead', 'IterativeDecodeHead', - 'KernelUpdateHead', 'KernelUpdator' + 'KernelUpdateHead', 'KernelUpdator', 'MaskFormerHead', 'Mask2FormerHead', + 'LightHamHead', 'PIDHead', 'DDRHead', 'VPDDepthHead', 'SideAdapterCLIPHead' ] diff --git a/mmseg/models/decode_heads/ann_head.py b/mmseg/models/decode_heads/ann_head.py index 9cc791b261..2b40ef5aa1 100644 --- a/mmseg/models/decode_heads/ann_head.py +++ b/mmseg/models/decode_heads/ann_head.py @@ -17,7 +17,7 @@ class PPMConcat(nn.ModuleList): """ def __init__(self, pool_scales=(1, 3, 6, 8)): - super(PPMConcat, self).__init__( + super().__init__( [nn.AdaptiveAvgPool2d(pool_scale) for pool_scale in pool_scales]) def forward(self, feats): @@ -58,7 +58,7 @@ def __init__(self, low_in_channels, high_in_channels, channels, query_downsample = nn.MaxPool2d(kernel_size=query_scale) else: query_downsample = None - super(SelfAttentionBlock, self).__init__( + super().__init__( key_in_channels=low_in_channels, query_in_channels=high_in_channels, channels=channels, @@ -100,7 +100,7 @@ class AFNB(nn.Module): def __init__(self, low_in_channels, high_in_channels, channels, out_channels, query_scales, key_pool_scales, conv_cfg, norm_cfg, act_cfg): - super(AFNB, self).__init__() + super().__init__() self.stages = nn.ModuleList() for query_scale in query_scales: self.stages.append( @@ -150,7 +150,7 @@ class APNB(nn.Module): def __init__(self, in_channels, channels, out_channels, query_scales, 
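With `test_cfg=dict(size_divisor=32)`, inference inputs are padded to the next multiple of 32 and the padding is written back onto each data sample so predictions can be cropped to the original size. A sketch of the arithmetic (the metainfo key name here is illustrative, not the exact one `stack_batch` emits):

```python
import torch
import torch.nn.functional as F


def pad_to_divisor(img: torch.Tensor, divisor: int = 32):
    h, w = img.shape[-2:]
    pad_h = (divisor - h % divisor) % divisor
    pad_w = (divisor - w % divisor) % divisor
    padded = F.pad(img, (0, pad_w, 0, pad_h))
    return padded, dict(img_padding_size=(0, pad_w, 0, pad_h))


padded, pad_info = pad_to_divisor(torch.rand(3, 500, 375))
assert padded.shape[-2:] == (512, 384)
```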
key_pool_scales, conv_cfg, norm_cfg, act_cfg): - super(APNB, self).__init__() + super().__init__() self.stages = nn.ModuleList() for query_scale in query_scales: self.stages.append( @@ -201,8 +201,7 @@ def __init__(self, query_scales=(1, ), key_pool_scales=(1, 3, 6, 8), **kwargs): - super(ANNHead, self).__init__( - input_transform='multiple_select', **kwargs) + super().__init__(input_transform='multiple_select', **kwargs) assert len(self.in_channels) == 2 low_in_channels, high_in_channels = self.in_channels self.project_channels = project_channels diff --git a/mmseg/models/decode_heads/apc_head.py b/mmseg/models/decode_heads/apc_head.py index 45f4e2850a..728f39659c 100644 --- a/mmseg/models/decode_heads/apc_head.py +++ b/mmseg/models/decode_heads/apc_head.py @@ -4,8 +4,8 @@ import torch.nn.functional as F from mmcv.cnn import ConvModule -from mmseg.ops import resize from mmseg.registry import MODELS +from ..utils import resize from .decode_head import BaseDecodeHead @@ -25,7 +25,7 @@ class ACM(nn.Module): def __init__(self, pool_scale, fusion, in_channels, channels, conv_cfg, norm_cfg, act_cfg): - super(ACM, self).__init__() + super().__init__() self.pool_scale = pool_scale self.fusion = fusion self.in_channels = in_channels @@ -123,7 +123,7 @@ class APCHead(BaseDecodeHead): """ def __init__(self, pool_scales=(1, 2, 3, 6), fusion=True, **kwargs): - super(APCHead, self).__init__(**kwargs) + super().__init__(**kwargs) assert isinstance(pool_scales, (list, tuple)) self.pool_scales = pool_scales self.fusion = fusion diff --git a/mmseg/models/decode_heads/aspp_head.py b/mmseg/models/decode_heads/aspp_head.py index acf9eedfa6..6d7185d7de 100644 --- a/mmseg/models/decode_heads/aspp_head.py +++ b/mmseg/models/decode_heads/aspp_head.py @@ -3,8 +3,8 @@ import torch.nn as nn from mmcv.cnn import ConvModule -from mmseg.ops import resize from mmseg.registry import MODELS +from ..utils import resize from .decode_head import BaseDecodeHead @@ -22,7 +22,7 @@ class ASPPModule(nn.ModuleList): def __init__(self, dilations, in_channels, channels, conv_cfg, norm_cfg, act_cfg): - super(ASPPModule, self).__init__() + super().__init__() self.dilations = dilations self.in_channels = in_channels self.channels = channels @@ -63,7 +63,7 @@ class ASPPHead(BaseDecodeHead): """ def __init__(self, dilations=(1, 6, 12, 18), **kwargs): - super(ASPPHead, self).__init__(**kwargs) + super().__init__(**kwargs) assert isinstance(dilations, (list, tuple)) self.dilations = dilations self.image_pool = nn.Sequential( diff --git a/mmseg/models/decode_heads/cascade_decode_head.py b/mmseg/models/decode_heads/cascade_decode_head.py index 82d6c3af45..fe2bcb9302 100644 --- a/mmseg/models/decode_heads/cascade_decode_head.py +++ b/mmseg/models/decode_heads/cascade_decode_head.py @@ -13,7 +13,7 @@ class BaseCascadeDecodeHead(BaseDecodeHead, metaclass=ABCMeta): :class:`CascadeEncoderDecoder.""" def __init__(self, *args, **kwargs): - super(BaseCascadeDecodeHead, self).__init__(*args, **kwargs) + super().__init__(*args, **kwargs) @abstractmethod def forward(self, inputs, prev_output): diff --git a/mmseg/models/decode_heads/cc_head.py b/mmseg/models/decode_heads/cc_head.py index 03ad3db76a..e9075a2648 100644 --- a/mmseg/models/decode_heads/cc_head.py +++ b/mmseg/models/decode_heads/cc_head.py @@ -26,7 +26,7 @@ def __init__(self, recurrence=2, **kwargs): if CrissCrossAttention is None: raise RuntimeError('Please install mmcv-full for ' 'CrissCrossAttention ops') - super(CCHead, self).__init__(num_convs=2, **kwargs) + 
super().__init__(num_convs=2, **kwargs) self.recurrence = recurrence self.cca = CrissCrossAttention(self.channels) diff --git a/mmseg/models/decode_heads/da_head.py b/mmseg/models/decode_heads/da_head.py index 6a58e256af..d87214365d 100644 --- a/mmseg/models/decode_heads/da_head.py +++ b/mmseg/models/decode_heads/da_head.py @@ -21,7 +21,7 @@ class PAM(_SelfAttentionBlock): """ def __init__(self, in_channels, channels): - super(PAM, self).__init__( + super().__init__( key_in_channels=in_channels, query_in_channels=in_channels, channels=channels, @@ -43,7 +43,7 @@ def __init__(self, in_channels, channels): def forward(self, x): """Forward function.""" - out = super(PAM, self).forward(x, x) + out = super().forward(x, x) out = self.gamma(out) + x return out @@ -53,7 +53,7 @@ class CAM(nn.Module): """Channel Attention Module (CAM)""" def __init__(self): - super(CAM, self).__init__() + super().__init__() self.gamma = Scale(0) def forward(self, x): @@ -86,7 +86,7 @@ class DAHead(BaseDecodeHead): """ def __init__(self, pam_channels, **kwargs): - super(DAHead, self).__init__(**kwargs) + super().__init__(**kwargs) self.pam_channels = pam_channels self.pam_in_conv = ConvModule( self.in_channels, @@ -173,15 +173,12 @@ def loss_by_feat(self, seg_logit: Tuple[Tensor], loss = dict() loss.update( add_prefix( - super(DAHead, self).loss_by_feat(pam_cam_seg_logit, - batch_data_samples), + super().loss_by_feat(pam_cam_seg_logit, batch_data_samples), 'pam_cam')) loss.update( - add_prefix( - super(DAHead, self).loss_by_feat(pam_seg_logit, - batch_data_samples), 'pam')) + add_prefix(super().loss_by_feat(pam_seg_logit, batch_data_samples), + 'pam')) loss.update( - add_prefix( - super(DAHead, self).loss_by_feat(cam_seg_logit, - batch_data_samples), 'cam')) + add_prefix(super().loss_by_feat(cam_seg_logit, batch_data_samples), + 'cam')) return loss diff --git a/mmseg/models/decode_heads/ddr_head.py b/mmseg/models/decode_heads/ddr_head.py new file mode 100644 index 0000000000..ba26d6503c --- /dev/null +++ b/mmseg/models/decode_heads/ddr_head.py @@ -0,0 +1,116 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Tuple, Union + +import torch.nn as nn +from mmcv.cnn import ConvModule, build_activation_layer, build_norm_layer +from torch import Tensor + +from mmseg.models.decode_heads.decode_head import BaseDecodeHead +from mmseg.models.losses import accuracy +from mmseg.models.utils import resize +from mmseg.registry import MODELS +from mmseg.utils import OptConfigType, SampleList + + +@MODELS.register_module() +class DDRHead(BaseDecodeHead): + """Decode head for DDRNet. + + Args: + in_channels (int): Number of input channels. + channels (int): Number of output channels. + num_classes (int): Number of classes. + norm_cfg (dict, optional): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict, optional): Config dict for activation layer. + Default: dict(type='ReLU', inplace=True). 
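`DAHead.loss_by_feat` above merges three per-branch loss dicts by namespacing their keys; the `add_prefix` helper it uses boils down to this sketch:

```python
def add_prefix(inputs: dict, prefix: str) -> dict:
    """Prefix every key with ``'<prefix>.'`` (sketch of the mmseg helper)."""
    return {f'{prefix}.{name}': value for name, value in inputs.items()}


losses = {}
losses.update(add_prefix(dict(loss_ce=0.7, acc_seg=81.2), 'pam_cam'))
losses.update(add_prefix(dict(loss_ce=0.4, acc_seg=77.9), 'pam'))
assert 'pam_cam.loss_ce' in losses and 'pam.acc_seg' in losses
```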
+ """ + + def __init__(self, + in_channels: int, + channels: int, + num_classes: int, + norm_cfg: OptConfigType = dict(type='BN'), + act_cfg: OptConfigType = dict(type='ReLU', inplace=True), + **kwargs): + super().__init__( + in_channels, + channels, + num_classes=num_classes, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + **kwargs) + + self.head = self._make_base_head(self.in_channels, self.channels) + self.aux_head = self._make_base_head(self.in_channels // 2, + self.channels) + self.aux_cls_seg = nn.Conv2d( + self.channels, self.out_channels, kernel_size=1) + + def init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def forward( + self, + inputs: Union[Tensor, + Tuple[Tensor]]) -> Union[Tensor, Tuple[Tensor]]: + if self.training: + c3_feat, c5_feat = inputs + x_c = self.head(c5_feat) + x_c = self.cls_seg(x_c) + x_s = self.aux_head(c3_feat) + x_s = self.aux_cls_seg(x_s) + + return x_c, x_s + else: + x_c = self.head(inputs) + x_c = self.cls_seg(x_c) + return x_c + + def _make_base_head(self, in_channels: int, + channels: int) -> nn.Sequential: + layers = [ + ConvModule( + in_channels, + channels, + kernel_size=3, + padding=1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + order=('norm', 'act', 'conv')), + build_norm_layer(self.norm_cfg, channels)[1], + build_activation_layer(self.act_cfg), + ] + + return nn.Sequential(*layers) + + def loss_by_feat(self, seg_logits: Tuple[Tensor], + batch_data_samples: SampleList) -> dict: + loss = dict() + context_logit, spatial_logit = seg_logits + seg_label = self._stack_batch_gt(batch_data_samples) + + context_logit = resize( + context_logit, + size=seg_label.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + spatial_logit = resize( + spatial_logit, + size=seg_label.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + seg_label = seg_label.squeeze(1) + + loss['loss_context'] = self.loss_decode[0](context_logit, seg_label) + loss['loss_spatial'] = self.loss_decode[1](spatial_logit, seg_label) + loss['acc_seg'] = accuracy( + context_logit, seg_label, ignore_index=self.ignore_index) + + return loss diff --git a/mmseg/models/decode_heads/decode_head.py b/mmseg/models/decode_heads/decode_head.py index 1a3cf3f3a0..179d871fd1 100644 --- a/mmseg/models/decode_heads/decode_head.py +++ b/mmseg/models/decode_heads/decode_head.py @@ -1,17 +1,18 @@ # Copyright (c) OpenMMLab. All rights reserved. +import warnings from abc import ABCMeta, abstractmethod from typing import List, Tuple import torch import torch.nn as nn -from mmcv.runner import BaseModule +from mmengine.model import BaseModule from torch import Tensor -from mmseg.data import build_pixel_sampler -from mmseg.ops import resize +from mmseg.structures import build_pixel_sampler from mmseg.utils import ConfigType, SampleList from ..builder import build_loss from ..losses import accuracy +from ..utils import resize class BaseDecodeHead(BaseModule, metaclass=ABCMeta): @@ -44,6 +45,9 @@ class BaseDecodeHead(BaseModule, metaclass=ABCMeta): in_channels (int|Sequence[int]): Input channels. channels (int): Channels after modules, before conv_seg. num_classes (int): Number of classes. + out_channels (int): Output channels of conv_seg. Default: None. + threshold (float): Threshold for binary segmentation in the case of + `num_classes==1`. Default: None. 
dropout_ratio (float): Ratio of dropout layer. Default: 0.1. conv_cfg (dict|None): Config of conv layers. Default: None. norm_cfg (dict|None): Config of norm layers. Default: None. @@ -82,6 +86,8 @@ def __init__(self, channels, *, num_classes, + out_channels=None, + threshold=None, dropout_ratio=0.1, conv_cfg=None, norm_cfg=None, @@ -97,10 +103,9 @@ def __init__(self, align_corners=False, init_cfg=dict( type='Normal', std=0.01, override=dict(name='conv_seg'))): - super(BaseDecodeHead, self).__init__(init_cfg) + super().__init__(init_cfg) self._init_inputs(in_channels, in_index, input_transform) self.channels = channels - self.num_classes = num_classes self.dropout_ratio = dropout_ratio self.conv_cfg = conv_cfg self.norm_cfg = norm_cfg @@ -110,6 +115,30 @@ def __init__(self, self.ignore_index = ignore_index self.align_corners = align_corners + if out_channels is None: + if num_classes == 2: + warnings.warn('For binary segmentation, we suggest using' + '`out_channels = 1` to define the output' + 'channels of segmentor, and use `threshold`' + 'to convert `seg_logits` into a prediction' + 'applying a threshold') + out_channels = num_classes + + if out_channels != num_classes and out_channels != 1: + raise ValueError( + 'out_channels should be equal to num_classes,' + 'except binary segmentation set out_channels == 1 and' + f'num_classes == 2, but got out_channels={out_channels}' + f'and num_classes={num_classes}') + + if out_channels == 1 and threshold is None: + threshold = 0.3 + warnings.warn('threshold is not defined for binary, and defaults' + 'to 0.3') + self.num_classes = num_classes + self.out_channels = out_channels + self.threshold = threshold + if isinstance(loss_decode, dict): self.loss_decode = build_loss(loss_decode) elif isinstance(loss_decode, (list, tuple)): @@ -125,7 +154,7 @@ def __init__(self, else: self.sampler = None - self.conv_seg = nn.Conv2d(channels, num_classes, kernel_size=1) + self.conv_seg = nn.Conv2d(channels, self.out_channels, kernel_size=1) if dropout_ratio > 0: self.dropout = nn.Dropout2d(dropout_ratio) else: @@ -234,7 +263,7 @@ def loss(self, inputs: Tuple[Tensor], batch_data_samples: SampleList, return losses def predict(self, inputs: Tuple[Tensor], batch_img_metas: List[dict], - test_cfg: ConfigType) -> List[Tensor]: + test_cfg: ConfigType) -> Tensor: """Forward function for prediction. Args: @@ -247,7 +276,7 @@ def predict(self, inputs: Tuple[Tensor], batch_img_metas: List[dict], test_cfg (dict): The testing config. Returns: - List[Tensor]: Outputs segmentation logits map. + Tensor: Outputs segmentation logits map. """ seg_logits = self.forward(inputs) @@ -321,9 +350,17 @@ def predict_by_feat(self, seg_logits: Tensor, Tensor: Outputs segmentation logits map. 
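The new `out_channels`/`threshold` pair encodes binary segmentation: `conv_seg` emits a single channel, and the segmentor thresholds its sigmoid (default 0.3) to produce labels. A sketch of that post-processing convention (the thresholding itself happens in the segmentor, not in this file):

```python
import torch

threshold = 0.3
seg_logits = torch.randn(1, 1, 64, 64)            # conv_seg output, C=1
pred = (seg_logits.sigmoid() > threshold).long()  # 0 = background, 1 = fg
assert set(pred.unique().tolist()) <= {0, 1}
```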
""" + if isinstance(batch_img_metas[0]['img_shape'], torch.Size): + # slide inference + size = batch_img_metas[0]['img_shape'] + elif 'pad_shape' in batch_img_metas[0]: + size = batch_img_metas[0]['pad_shape'][:2] + else: + size = batch_img_metas[0]['img_shape'] + seg_logits = resize( input=seg_logits, - size=batch_img_metas[0]['img_shape'], + size=size, mode='bilinear', align_corners=self.align_corners) return seg_logits diff --git a/mmseg/models/decode_heads/dm_head.py b/mmseg/models/decode_heads/dm_head.py index 30405e3eb2..7694abd8ac 100644 --- a/mmseg/models/decode_heads/dm_head.py +++ b/mmseg/models/decode_heads/dm_head.py @@ -24,7 +24,7 @@ class DCM(nn.Module): def __init__(self, filter_size, fusion, in_channels, channels, conv_cfg, norm_cfg, act_cfg): - super(DCM, self).__init__() + super().__init__() self.filter_size = filter_size self.fusion = fusion self.in_channels = in_channels @@ -105,7 +105,7 @@ class DMHead(BaseDecodeHead): """ def __init__(self, filter_sizes=(1, 3, 5, 7), fusion=False, **kwargs): - super(DMHead, self).__init__(**kwargs) + super().__init__(**kwargs) assert isinstance(filter_sizes, (list, tuple)) self.filter_sizes = filter_sizes self.fusion = fusion diff --git a/mmseg/models/decode_heads/dnl_head.py b/mmseg/models/decode_heads/dnl_head.py index 400a175562..248c118141 100644 --- a/mmseg/models/decode_heads/dnl_head.py +++ b/mmseg/models/decode_heads/dnl_head.py @@ -111,7 +111,7 @@ def __init__(self, mode='embedded_gaussian', temperature=0.05, **kwargs): - super(DNLHead, self).__init__(num_convs=2, **kwargs) + super().__init__(num_convs=2, **kwargs) self.reduction = reduction self.use_scale = use_scale self.mode = mode diff --git a/mmseg/models/decode_heads/dpt_head.py b/mmseg/models/decode_heads/dpt_head.py index 04ade62cfa..d2cfd89daa 100644 --- a/mmseg/models/decode_heads/dpt_head.py +++ b/mmseg/models/decode_heads/dpt_head.py @@ -4,10 +4,10 @@ import torch import torch.nn as nn from mmcv.cnn import ConvModule, Linear, build_activation_layer -from mmcv.runner import BaseModule +from mmengine.model import BaseModule -from mmseg.ops import resize from mmseg.registry import MODELS +from ..utils import resize from .decode_head import BaseDecodeHead @@ -30,7 +30,7 @@ def __init__(self, readout_type='ignore', patch_size=16, init_cfg=None): - super(ReassembleBlocks, self).__init__(init_cfg) + super().__init__(init_cfg) assert readout_type in ['ignore', 'add', 'project'] self.readout_type = readout_type @@ -116,7 +116,7 @@ def __init__(self, stride=1, dilation=1, init_cfg=None): - super(PreActResidualConvUnit, self).__init__(init_cfg) + super().__init__(init_cfg) self.conv1 = ConvModule( in_channels, @@ -168,7 +168,7 @@ def __init__(self, expand=False, align_corners=True, init_cfg=None): - super(FeatureFusionBlock, self).__init__(init_cfg) + super().__init__(init_cfg) self.in_channels = in_channels self.expand = expand @@ -242,7 +242,7 @@ def __init__(self, act_cfg=dict(type='ReLU'), norm_cfg=dict(type='BN'), **kwargs): - super(DPTHead, self).__init__(**kwargs) + super().__init__(**kwargs) self.in_channels = self.in_channels self.expand_channels = expand_channels diff --git a/mmseg/models/decode_heads/ema_head.py b/mmseg/models/decode_heads/ema_head.py index d7923f424e..ab8dbb0c29 100644 --- a/mmseg/models/decode_heads/ema_head.py +++ b/mmseg/models/decode_heads/ema_head.py @@ -30,7 +30,7 @@ class EMAModule(nn.Module): """ def __init__(self, channels, num_bases, num_stages, momentum): - super(EMAModule, self).__init__() + super().__init__() assert num_stages >= 1, 
'num_stages must be at least 1!' self.num_bases = num_bases self.num_stages = num_stages @@ -99,7 +99,7 @@ def __init__(self, concat_input=True, momentum=0.1, **kwargs): - super(EMAHead, self).__init__(**kwargs) + super().__init__(**kwargs) self.ema_channels = ema_channels self.num_bases = num_bases self.num_stages = num_stages diff --git a/mmseg/models/decode_heads/enc_head.py b/mmseg/models/decode_heads/enc_head.py index 1b8eecbff3..ef48fb6995 100644 --- a/mmseg/models/decode_heads/enc_head.py +++ b/mmseg/models/decode_heads/enc_head.py @@ -7,10 +7,10 @@ from mmcv.cnn import ConvModule, build_norm_layer from torch import Tensor -from mmseg.ops import Encoding, resize from mmseg.registry import MODELS from mmseg.utils import ConfigType, SampleList from ..builder import build_loss +from ..utils import Encoding, resize from .decode_head import BaseDecodeHead @@ -26,7 +26,7 @@ class EncModule(nn.Module): """ def __init__(self, in_channels, num_codes, conv_cfg, norm_cfg, act_cfg): - super(EncModule, self).__init__() + super().__init__() self.encoding_project = ConvModule( in_channels, in_channels, @@ -90,8 +90,7 @@ def __init__(self, use_sigmoid=True, loss_weight=0.2), **kwargs): - super(EncHead, self).__init__( - input_transform='multiple_select', **kwargs) + super().__init__(input_transform='multiple_select', **kwargs) self.use_se_loss = use_se_loss self.add_lateral = add_lateral self.num_codes = num_codes @@ -188,8 +187,7 @@ def loss_by_feat(self, seg_logit: Tuple[Tensor], """Compute segmentation and semantic encoding loss.""" seg_logit, se_seg_logit = seg_logit loss = dict() - loss.update( - super(EncHead, self).loss_by_feat(seg_logit, batch_data_samples)) + loss.update(super().loss_by_feat(seg_logit, batch_data_samples)) seg_label = self._stack_batch_gt(batch_data_samples) se_loss = self.loss_se_decode( diff --git a/mmseg/models/decode_heads/fcn_head.py b/mmseg/models/decode_heads/fcn_head.py index 4e3b974a8b..3418018883 100644 --- a/mmseg/models/decode_heads/fcn_head.py +++ b/mmseg/models/decode_heads/fcn_head.py @@ -31,7 +31,7 @@ def __init__(self, self.num_convs = num_convs self.concat_input = concat_input self.kernel_size = kernel_size - super(FCNHead, self).__init__(**kwargs) + super().__init__(**kwargs) if num_convs == 0: assert self.in_channels == self.channels diff --git a/mmseg/models/decode_heads/fpn_head.py b/mmseg/models/decode_heads/fpn_head.py index be92ceed55..25f481fe81 100644 --- a/mmseg/models/decode_heads/fpn_head.py +++ b/mmseg/models/decode_heads/fpn_head.py @@ -3,8 +3,8 @@ import torch.nn as nn from mmcv.cnn import ConvModule -from mmseg.ops import Upsample, resize from mmseg.registry import MODELS +from ..utils import Upsample, resize from .decode_head import BaseDecodeHead @@ -22,8 +22,7 @@ class FPNHead(BaseDecodeHead): """ def __init__(self, feature_strides, **kwargs): - super(FPNHead, self).__init__( - input_transform='multiple_select', **kwargs) + super().__init__(input_transform='multiple_select', **kwargs) assert len(feature_strides) == len(self.in_channels) assert min(feature_strides) == feature_strides[0] self.feature_strides = feature_strides diff --git a/mmseg/models/decode_heads/gc_head.py b/mmseg/models/decode_heads/gc_head.py index e89b92d8b0..14f0ef021c 100644 --- a/mmseg/models/decode_heads/gc_head.py +++ b/mmseg/models/decode_heads/gc_head.py @@ -26,7 +26,7 @@ def __init__(self, pooling_type='att', fusion_types=('channel_add', ), **kwargs): - super(GCHead, self).__init__(num_convs=2, **kwargs) + super().__init__(num_convs=2, **kwargs) self.ratio = 
ratio self.pooling_type = pooling_type self.fusion_types = fusion_types diff --git a/mmseg/models/decode_heads/ham_head.py b/mmseg/models/decode_heads/ham_head.py new file mode 100644 index 0000000000..073d8011b0 --- /dev/null +++ b/mmseg/models/decode_heads/ham_head.py @@ -0,0 +1,255 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Originally from https://github.com/visual-attention-network/segnext +# Licensed under the Apache License, Version 2.0 (the "License") +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmengine.device import get_device + +from mmseg.registry import MODELS +from ..utils import resize +from .decode_head import BaseDecodeHead + + +class Matrix_Decomposition_2D_Base(nn.Module): + """Base class of 2D Matrix Decomposition. + + Args: + MD_S (int): The number of spatial coefficient in + Matrix Decomposition, it may be used for calculation + of the number of latent dimension D in Matrix + Decomposition. Defaults: 1. + MD_R (int): The number of latent dimension R in + Matrix Decomposition. Defaults: 64. + train_steps (int): The number of iteration steps in + Multiplicative Update (MU) rule to solve Non-negative + Matrix Factorization (NMF) in training. Defaults: 6. + eval_steps (int): The number of iteration steps in + Multiplicative Update (MU) rule to solve Non-negative + Matrix Factorization (NMF) in evaluation. Defaults: 7. + inv_t (int): Inverted multiple number to make coefficient + smaller in softmax. Defaults: 100. + rand_init (bool): Whether to initialize randomly. + Defaults: True. + """ + + def __init__(self, + MD_S=1, + MD_R=64, + train_steps=6, + eval_steps=7, + inv_t=100, + rand_init=True): + super().__init__() + + self.S = MD_S + self.R = MD_R + + self.train_steps = train_steps + self.eval_steps = eval_steps + + self.inv_t = inv_t + + self.rand_init = rand_init + + def _build_bases(self, B, S, D, R, device=None): + raise NotImplementedError + + def local_step(self, x, bases, coef): + raise NotImplementedError + + def local_inference(self, x, bases): + # (B * S, D, N)^T @ (B * S, D, R) -> (B * S, N, R) + coef = torch.bmm(x.transpose(1, 2), bases) + coef = F.softmax(self.inv_t * coef, dim=-1) + + steps = self.train_steps if self.training else self.eval_steps + for _ in range(steps): + bases, coef = self.local_step(x, bases, coef) + + return bases, coef + + def compute_coef(self, x, bases, coef): + raise NotImplementedError + + def forward(self, x, return_bases=False): + """Forward Function.""" + B, C, H, W = x.shape + + # (B, C, H, W) -> (B * S, D, N) + D = C // self.S + N = H * W + x = x.view(B * self.S, D, N) + if not self.rand_init and not hasattr(self, 'bases'): + bases = self._build_bases(1, self.S, D, self.R, device=x.device) + self.register_buffer('bases', bases) + + # (S, D, R) -> (B * S, D, R) + if self.rand_init: + bases = self._build_bases(B, self.S, D, self.R, device=x.device) + else: + bases = self.bases.repeat(B, 1, 1) + + bases, coef = self.local_inference(x, bases) + + # (B * S, N, R) + coef = self.compute_coef(x, bases, coef) + + # (B * S, D, R) @ (B * S, N, R)^T -> (B * S, D, N) + x = torch.bmm(bases, coef.transpose(1, 2)) + + # (B * S, D, N) -> (B, C, H, W) + x = x.view(B, C, H, W) + + return x + + +class NMF2D(Matrix_Decomposition_2D_Base): + """Non-negative Matrix Factorization (NMF) module. + + It is inherited from ``Matrix_Decomposition_2D_Base`` module. 
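`Matrix_Decomposition_2D_Base.forward` above factorizes the feature map only after regrouping it: `S` spatial groups of `D = C // S` channels, flattened over the `N = H * W` positions. A shape sketch with the class defaults:

```python
import torch

B, C, H, W = 2, 512, 32, 32
S = 1                       # MD_S default
D, N = C // S, H * W
x = torch.rand(B, C, H, W).view(B * S, D, N)
assert x.shape == (2, 512, 1024)
```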
+ """ + + def __init__(self, args=dict()): + super().__init__(**args) + + self.inv_t = 1 + + def _build_bases(self, B, S, D, R, device=None): + """Build bases in initialization.""" + if device is None: + device = get_device() + bases = torch.rand((B * S, D, R)).to(device) + bases = F.normalize(bases, dim=1) + + return bases + + def local_step(self, x, bases, coef): + """Local step in iteration to renew bases and coefficient.""" + # (B * S, D, N)^T @ (B * S, D, R) -> (B * S, N, R) + numerator = torch.bmm(x.transpose(1, 2), bases) + # (B * S, N, R) @ [(B * S, D, R)^T @ (B * S, D, R)] -> (B * S, N, R) + denominator = coef.bmm(bases.transpose(1, 2).bmm(bases)) + # Multiplicative Update + coef = coef * numerator / (denominator + 1e-6) + + # (B * S, D, N) @ (B * S, N, R) -> (B * S, D, R) + numerator = torch.bmm(x, coef) + # (B * S, D, R) @ [(B * S, N, R)^T @ (B * S, N, R)] -> (B * S, D, R) + denominator = bases.bmm(coef.transpose(1, 2).bmm(coef)) + # Multiplicative Update + bases = bases * numerator / (denominator + 1e-6) + + return bases, coef + + def compute_coef(self, x, bases, coef): + """Compute coefficient.""" + # (B * S, D, N)^T @ (B * S, D, R) -> (B * S, N, R) + numerator = torch.bmm(x.transpose(1, 2), bases) + # (B * S, N, R) @ (B * S, D, R)^T @ (B * S, D, R) -> (B * S, N, R) + denominator = coef.bmm(bases.transpose(1, 2).bmm(bases)) + # multiplication update + coef = coef * numerator / (denominator + 1e-6) + + return coef + + +class Hamburger(nn.Module): + """Hamburger Module. It consists of one slice of "ham" (matrix + decomposition) and two slices of "bread" (linear transformation). + + Args: + ham_channels (int): Input and output channels of feature. + ham_kwargs (dict): Config of matrix decomposition module. + norm_cfg (dict | None): Config of norm layers. + """ + + def __init__(self, + ham_channels=512, + ham_kwargs=dict(), + norm_cfg=None, + **kwargs): + super().__init__() + + self.ham_in = ConvModule( + ham_channels, ham_channels, 1, norm_cfg=None, act_cfg=None) + + self.ham = NMF2D(ham_kwargs) + + self.ham_out = ConvModule( + ham_channels, ham_channels, 1, norm_cfg=norm_cfg, act_cfg=None) + + def forward(self, x): + enjoy = self.ham_in(x) + enjoy = F.relu(enjoy, inplace=True) + enjoy = self.ham(enjoy) + enjoy = self.ham_out(enjoy) + ham = F.relu(x + enjoy, inplace=True) + + return ham + + +@MODELS.register_module() +class LightHamHead(BaseDecodeHead): + """SegNeXt decode head. + + This decode head is the implementation of `SegNeXt: Rethinking + Convolutional Attention Design for Semantic + Segmentation `_. + Inspiration from https://github.com/visual-attention-network/segnext. + + Specifically, LightHamHead is inspired by HamNet from + `Is Attention Better Than Matrix Decomposition? + `. + + Args: + ham_channels (int): input channels for Hamburger. + Defaults: 512. + ham_kwargs (int): kwagrs for Ham. Defaults: dict(). 
+ """ + + def __init__(self, ham_channels=512, ham_kwargs=dict(), **kwargs): + super().__init__(input_transform='multiple_select', **kwargs) + self.ham_channels = ham_channels + + self.squeeze = ConvModule( + sum(self.in_channels), + self.ham_channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + self.hamburger = Hamburger(ham_channels, ham_kwargs, **kwargs) + + self.align = ConvModule( + self.ham_channels, + self.channels, + 1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + def forward(self, inputs): + """Forward function.""" + inputs = self._transform_inputs(inputs) + + inputs = [ + resize( + level, + size=inputs[0].shape[2:], + mode='bilinear', + align_corners=self.align_corners) for level in inputs + ] + + inputs = torch.cat(inputs, dim=1) + # apply a conv block to squeeze feature map + x = self.squeeze(inputs) + # apply hamburger module + x = self.hamburger(x) + + # apply a conv block to align feature map + output = self.align(x) + output = self.cls_seg(output) + return output diff --git a/mmseg/models/decode_heads/isa_head.py b/mmseg/models/decode_heads/isa_head.py index 3769bdff4a..355f215f39 100644 --- a/mmseg/models/decode_heads/isa_head.py +++ b/mmseg/models/decode_heads/isa_head.py @@ -22,7 +22,7 @@ class SelfAttentionBlock(_SelfAttentionBlock): """ def __init__(self, in_channels, channels, conv_cfg, norm_cfg, act_cfg): - super(SelfAttentionBlock, self).__init__( + super().__init__( key_in_channels=in_channels, query_in_channels=in_channels, channels=channels, @@ -51,7 +51,7 @@ def __init__(self, in_channels, channels, conv_cfg, norm_cfg, act_cfg): def forward(self, x): """Forward function.""" - context = super(SelfAttentionBlock, self).forward(x, x) + context = super().forward(x, x) return self.output_project(context) @@ -68,7 +68,7 @@ class ISAHead(BaseDecodeHead): """ def __init__(self, isa_channels, down_factor=(8, 8), **kwargs): - super(ISAHead, self).__init__(**kwargs) + super().__init__(**kwargs) self.down_factor = down_factor self.in_conv = ConvModule( diff --git a/mmseg/models/decode_heads/knet_head.py b/mmseg/models/decode_heads/knet_head.py index 3f7310cb7e..82d3a28076 100644 --- a/mmseg/models/decode_heads/knet_head.py +++ b/mmseg/models/decode_heads/knet_head.py @@ -5,8 +5,7 @@ import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import ConvModule, build_activation_layer, build_norm_layer -from mmcv.cnn.bricks.transformer import (FFN, TRANSFORMER_LAYER, - MultiheadAttention, +from mmcv.cnn.bricks.transformer import (FFN, MultiheadAttention, build_transformer_layer) from mmengine.logging import print_log from torch import Tensor @@ -16,7 +15,7 @@ from mmseg.utils import SampleList -@TRANSFORMER_LAYER.register_module() +@MODELS.register_module() class KernelUpdator(nn.Module): """Dynamic Kernel Updator in Kernel Update Head. 
@@ -49,7 +48,7 @@ def __init__( norm_cfg=dict(type='LN'), act_cfg=dict(type='ReLU', inplace=True), ): - super(KernelUpdator, self).__init__() + super().__init__() self.in_channels = in_channels self.feat_channels = feat_channels self.out_channels_raw = out_channels @@ -214,7 +213,7 @@ def __init__(self, out_channels=256, act_cfg=dict(type='ReLU', inplace=True), norm_cfg=dict(type='LN'))): - super(KernelUpdateHead, self).__init__() + super().__init__() self.num_classes = num_classes self.in_channels = in_channels self.out_channels = out_channels @@ -414,6 +413,9 @@ class IterativeDecodeHead(BaseDecodeHead): def __init__(self, num_stages, kernel_generate_head, kernel_update_head, **kwargs): + # ``IterativeDecodeHead`` would skip initialization of + # ``BaseDecodeHead`` which would be called when building + # ``self.kernel_generate_head``. super(BaseDecodeHead, self).__init__(**kwargs) assert num_stages == len(kernel_update_head) self.num_stages = num_stages @@ -423,6 +425,7 @@ def __init__(self, num_stages, kernel_generate_head, kernel_update_head, self.num_classes = self.kernel_generate_head.num_classes self.input_transform = self.kernel_generate_head.input_transform self.ignore_index = self.kernel_generate_head.ignore_index + self.out_channels = self.num_classes for head_cfg in kernel_update_head: self.kernel_update_head.append(MODELS.build(head_cfg)) diff --git a/mmseg/models/decode_heads/lraspp_head.py b/mmseg/models/decode_heads/lraspp_head.py index 36999f056c..ba2465f275 100644 --- a/mmseg/models/decode_heads/lraspp_head.py +++ b/mmseg/models/decode_heads/lraspp_head.py @@ -1,11 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. import torch import torch.nn as nn -from mmcv import is_tuple_of from mmcv.cnn import ConvModule +from mmengine.utils import is_tuple_of -from mmseg.ops import resize from mmseg.registry import MODELS +from ..utils import resize from .decode_head import BaseDecodeHead @@ -22,7 +22,7 @@ class LRASPPHead(BaseDecodeHead): """ def __init__(self, branch_channels=(32, 64), **kwargs): - super(LRASPPHead, self).__init__(**kwargs) + super().__init__(**kwargs) if self.input_transform != 'multiple_select': raise ValueError('in Lite R-ASPP (LRASPP) head, input_transform ' f'must be \'multiple_select\'. But received ' diff --git a/mmseg/models/decode_heads/mask2former_head.py b/mmseg/models/decode_heads/mask2former_head.py new file mode 100644 index 0000000000..0135af0645 --- /dev/null +++ b/mmseg/models/decode_heads/mask2former_head.py @@ -0,0 +1,163 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.model import BaseModule + +try: + from mmdet.models.dense_heads import \ + Mask2FormerHead as MMDET_Mask2FormerHead +except ModuleNotFoundError: + MMDET_Mask2FormerHead = BaseModule + +from mmengine.structures import InstanceData +from torch import Tensor + +from mmseg.registry import MODELS +from mmseg.structures.seg_data_sample import SegDataSample +from mmseg.utils import ConfigType, SampleList + + +@MODELS.register_module() +class Mask2FormerHead(MMDET_Mask2FormerHead): + """Implements the Mask2Former head. + + See `Mask2Former: Masked-attention Mask Transformer for Universal Image + Segmentation `_ for details. + + Args: + num_classes (int): Number of classes. Default: 150. + align_corners (bool): align_corners argument of F.interpolate. + Default: False. + ignore_index (int): The label index to be ignored. Default: 255. 
+ """ + + def __init__(self, + num_classes, + align_corners=False, + ignore_index=255, + **kwargs): + super().__init__(**kwargs) + + self.num_classes = num_classes + self.align_corners = align_corners + self.out_channels = num_classes + self.ignore_index = ignore_index + + feat_channels = kwargs['feat_channels'] + self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1) + + def _seg_data_to_instance_data(self, batch_data_samples: SampleList): + """Perform forward propagation to convert paradigm from MMSegmentation + to MMDetection to ensure ``MMDET_Mask2FormerHead`` could be called + normally. Specifically, ``batch_gt_instances`` would be added. + + Args: + batch_data_samples (List[:obj:`SegDataSample`]): The Data + Samples. It usually includes information such as + `gt_sem_seg`. + + Returns: + tuple[Tensor]: A tuple contains two lists. + + - batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``labels``, each is + unique ground truth label id of images, with + shape (num_gt, ) and ``masks``, each is ground truth + masks of each instances of a image, shape (num_gt, h, w). + - batch_img_metas (list[dict]): List of image meta information. + """ + batch_img_metas = [] + batch_gt_instances = [] + + for data_sample in batch_data_samples: + batch_img_metas.append(data_sample.metainfo) + gt_sem_seg = data_sample.gt_sem_seg.data + classes = torch.unique( + gt_sem_seg, + sorted=False, + return_inverse=False, + return_counts=False) + + # remove ignored region + gt_labels = classes[classes != self.ignore_index] + + masks = [] + for class_id in gt_labels: + masks.append(gt_sem_seg == class_id) + + if len(masks) == 0: + gt_masks = torch.zeros( + (0, gt_sem_seg.shape[-2], + gt_sem_seg.shape[-1])).to(gt_sem_seg).long() + else: + gt_masks = torch.stack(masks).squeeze(1).long() + + instance_data = InstanceData(labels=gt_labels, masks=gt_masks) + batch_gt_instances.append(instance_data) + return batch_gt_instances, batch_img_metas + + def loss(self, x: Tuple[Tensor], batch_data_samples: SampleList, + train_cfg: ConfigType) -> dict: + """Perform forward propagation and loss calculation of the decoder head + on the features of the upstream network. + + Args: + x (tuple[Tensor]): Multi-level features from the upstream + network, each is a 4D-tensor. + batch_data_samples (List[:obj:`SegDataSample`]): The Data + Samples. It usually includes information such as + `gt_sem_seg`. + train_cfg (ConfigType): Training config. + + Returns: + dict[str, Tensor]: a dictionary of loss components. + """ + # batch SegDataSample to InstanceDataSample + batch_gt_instances, batch_img_metas = self._seg_data_to_instance_data( + batch_data_samples) + + # forward + all_cls_scores, all_mask_preds = self(x, batch_data_samples) + + # loss + losses = self.loss_by_feat(all_cls_scores, all_mask_preds, + batch_gt_instances, batch_img_metas) + + return losses + + def predict(self, x: Tuple[Tensor], batch_img_metas: List[dict], + test_cfg: ConfigType) -> Tuple[Tensor]: + """Test without augmentaton. + + Args: + x (tuple[Tensor]): Multi-level features from the + upstream network, each is a 4D-tensor. + batch_img_metas (List[:obj:`SegDataSample`]): The Data + Samples. It usually includes information such as + `gt_sem_seg`. + test_cfg (ConfigType): Test config. + + Returns: + Tensor: A tensor of segmentation mask. 
+ """ + batch_data_samples = [ + SegDataSample(metainfo=metainfo) for metainfo in batch_img_metas + ] + + all_cls_scores, all_mask_preds = self(x, batch_data_samples) + mask_cls_results = all_cls_scores[-1] + mask_pred_results = all_mask_preds[-1] + if 'pad_shape' in batch_img_metas[0]: + size = batch_img_metas[0]['pad_shape'] + else: + size = batch_img_metas[0]['img_shape'] + # upsample mask + mask_pred_results = F.interpolate( + mask_pred_results, size=size, mode='bilinear', align_corners=False) + cls_score = F.softmax(mask_cls_results, dim=-1)[..., :-1] + mask_pred = mask_pred_results.sigmoid() + seg_logits = torch.einsum('bqc, bqhw->bchw', cls_score, mask_pred) + return seg_logits diff --git a/mmseg/models/decode_heads/maskformer_head.py b/mmseg/models/decode_heads/maskformer_head.py new file mode 100644 index 0000000000..6e61a7f63a --- /dev/null +++ b/mmseg/models/decode_heads/maskformer_head.py @@ -0,0 +1,174 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmengine.model import BaseModule + +try: + from mmdet.models.dense_heads import MaskFormerHead as MMDET_MaskFormerHead +except ModuleNotFoundError: + MMDET_MaskFormerHead = BaseModule + +from mmengine.structures import InstanceData +from torch import Tensor + +from mmseg.registry import MODELS +from mmseg.structures.seg_data_sample import SegDataSample +from mmseg.utils import ConfigType, SampleList + + +@MODELS.register_module() +class MaskFormerHead(MMDET_MaskFormerHead): + """Implements the MaskFormer head. + + See `Per-Pixel Classification is Not All You Need for Semantic Segmentation + `_ for details. + + Args: + num_classes (int): Number of classes. Default: 150. + align_corners (bool): align_corners argument of F.interpolate. + Default: False. + ignore_index (int): The label index to be ignored. Default: 255. + """ + + def __init__(self, + num_classes: int = 150, + align_corners: bool = False, + ignore_index: int = 255, + **kwargs) -> None: + super().__init__(**kwargs) + + self.out_channels = kwargs['out_channels'] + self.align_corners = True + self.num_classes = num_classes + self.align_corners = align_corners + self.out_channels = num_classes + self.ignore_index = ignore_index + + feat_channels = kwargs['feat_channels'] + self.cls_embed = nn.Linear(feat_channels, self.num_classes + 1) + + def _seg_data_to_instance_data(self, batch_data_samples: SampleList): + """Perform forward propagation to convert paradigm from MMSegmentation + to MMDetection to ensure ``MMDET_MaskFormerHead`` could be called + normally. Specifically, ``batch_gt_instances`` would be added. + + Args: + batch_data_samples (List[:obj:`SegDataSample`]): The Data + Samples. It usually includes information such as + `gt_sem_seg`. + + Returns: + tuple[Tensor]: A tuple contains two lists. + + - batch_gt_instances (list[:obj:`InstanceData`]): Batch of + gt_instance. It usually includes ``labels``, each is + unique ground truth label id of images, with + shape (num_gt, ) and ``masks``, each is ground truth + masks of each instances of a image, shape (num_gt, h, w). + - batch_img_metas (list[dict]): List of image meta information. + """ + batch_img_metas = [] + batch_gt_instances = [] + for data_sample in batch_data_samples: + # Add `batch_input_shape` in metainfo of data_sample, which would + # be used in MaskFormerHead of MMDetection. 
+            metainfo = data_sample.metainfo
+            metainfo['batch_input_shape'] = metainfo['img_shape']
+            data_sample.set_metainfo(metainfo)
+            batch_img_metas.append(data_sample.metainfo)
+            gt_sem_seg = data_sample.gt_sem_seg.data
+            classes = torch.unique(
+                gt_sem_seg,
+                sorted=False,
+                return_inverse=False,
+                return_counts=False)
+
+            # remove ignored region
+            gt_labels = classes[classes != self.ignore_index]
+
+            masks = []
+            for class_id in gt_labels:
+                masks.append(gt_sem_seg == class_id)
+
+            if len(masks) == 0:
+                gt_masks = torch.zeros((0, gt_sem_seg.shape[-2],
+                                        gt_sem_seg.shape[-1])).to(gt_sem_seg)
+            else:
+                gt_masks = torch.stack(masks).squeeze(1)
+
+            instance_data = InstanceData(
+                labels=gt_labels, masks=gt_masks.long())
+            batch_gt_instances.append(instance_data)
+        return batch_gt_instances, batch_img_metas
+
+    def loss(self, x: Tuple[Tensor], batch_data_samples: SampleList,
+             train_cfg: ConfigType) -> dict:
+        """Perform forward propagation and loss calculation of the decoder
+        head on the features of the upstream network.
+
+        Args:
+            x (tuple[Tensor]): Multi-level features from the upstream
+                network, each is a 4D-tensor.
+            batch_data_samples (List[:obj:`SegDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_sem_seg`.
+            train_cfg (ConfigType): Training config.
+
+        Returns:
+            dict[str, Tensor]: a dictionary of loss components.
+        """
+        # batch SegDataSample to InstanceDataSample
+        batch_gt_instances, batch_img_metas = self._seg_data_to_instance_data(
+            batch_data_samples)
+
+        # forward
+        all_cls_scores, all_mask_preds = self(x, batch_data_samples)
+
+        # loss
+        losses = self.loss_by_feat(all_cls_scores, all_mask_preds,
+                                   batch_gt_instances, batch_img_metas)
+
+        return losses
+
+    def predict(self, x: Tuple[Tensor], batch_img_metas: List[dict],
+                test_cfg: ConfigType) -> Tensor:
+        """Test without augmentation.
+
+        Args:
+            x (tuple[Tensor]): Multi-level features from the
+                upstream network, each is a 4D-tensor.
+            batch_img_metas (List[dict]): List of image meta information.
+            test_cfg (ConfigType): Test config.
+
+        Returns:
+            Tensor: A tensor of segmentation mask.
+        """
+
+        batch_data_samples = []
+        for metainfo in batch_img_metas:
+            metainfo['batch_input_shape'] = metainfo['img_shape']
+            batch_data_samples.append(SegDataSample(metainfo=metainfo))
+        # The forward function of MaskFormerHead in MMDetection requires
+        # `batch_data_samples`; only the image shapes in it are actually
+        # used here.
+ all_cls_scores, all_mask_preds = self(x, batch_data_samples) + mask_cls_results = all_cls_scores[-1] + mask_pred_results = all_mask_preds[-1] + + # upsample masks + img_shape = batch_img_metas[0]['batch_input_shape'] + mask_pred_results = F.interpolate( + mask_pred_results, + size=img_shape, + mode='bilinear', + align_corners=False) + + # semantic inference + cls_score = F.softmax(mask_cls_results, dim=-1)[..., :-1] + mask_pred = mask_pred_results.sigmoid() + seg_logits = torch.einsum('bqc,bqhw->bchw', cls_score, mask_pred) + return seg_logits diff --git a/mmseg/models/decode_heads/nl_head.py b/mmseg/models/decode_heads/nl_head.py index 7903f1ace8..0ffcc2a2f0 100644 --- a/mmseg/models/decode_heads/nl_head.py +++ b/mmseg/models/decode_heads/nl_head.py @@ -26,7 +26,7 @@ def __init__(self, use_scale=True, mode='embedded_gaussian', **kwargs): - super(NLHead, self).__init__(num_convs=2, **kwargs) + super().__init__(num_convs=2, **kwargs) self.reduction = reduction self.use_scale = use_scale self.mode = mode diff --git a/mmseg/models/decode_heads/ocr_head.py b/mmseg/models/decode_heads/ocr_head.py index ce3582413a..9afe37bebd 100644 --- a/mmseg/models/decode_heads/ocr_head.py +++ b/mmseg/models/decode_heads/ocr_head.py @@ -4,9 +4,9 @@ import torch.nn.functional as F from mmcv.cnn import ConvModule -from mmseg.ops import resize from mmseg.registry import MODELS from ..utils import SelfAttentionBlock as _SelfAttentionBlock +from ..utils import resize from .cascade_decode_head import BaseCascadeDecodeHead @@ -18,7 +18,7 @@ class SpatialGatherModule(nn.Module): """ def __init__(self, scale): - super(SpatialGatherModule, self).__init__() + super().__init__() self.scale = scale def forward(self, feats, probs): @@ -46,7 +46,7 @@ def __init__(self, in_channels, channels, scale, conv_cfg, norm_cfg, query_downsample = nn.MaxPool2d(kernel_size=scale) else: query_downsample = None - super(ObjectAttentionBlock, self).__init__( + super().__init__( key_in_channels=in_channels, query_in_channels=in_channels, channels=channels, @@ -73,8 +73,7 @@ def __init__(self, in_channels, channels, scale, conv_cfg, norm_cfg, def forward(self, query_feats, key_feats): """Forward function.""" - context = super(ObjectAttentionBlock, - self).forward(query_feats, key_feats) + context = super().forward(query_feats, key_feats) output = self.bottleneck(torch.cat([context, query_feats], dim=1)) if self.query_downsample is not None: output = resize(query_feats) @@ -96,7 +95,7 @@ class OCRHead(BaseCascadeDecodeHead): """ def __init__(self, ocr_channels, scale=1, **kwargs): - super(OCRHead, self).__init__(**kwargs) + super().__init__(**kwargs) self.ocr_channels = ocr_channels self.scale = scale self.object_context_block = ObjectAttentionBlock( diff --git a/mmseg/models/decode_heads/pid_head.py b/mmseg/models/decode_heads/pid_head.py new file mode 100644 index 0000000000..c092cb32d0 --- /dev/null +++ b/mmseg/models/decode_heads/pid_head.py @@ -0,0 +1,183 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple, Union + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, build_activation_layer, build_norm_layer +from mmengine.model import BaseModule +from torch import Tensor + +from mmseg.models.decode_heads.decode_head import BaseDecodeHead +from mmseg.models.losses import accuracy +from mmseg.models.utils import resize +from mmseg.registry import MODELS +from mmseg.utils import OptConfigType, SampleList + + +class BasePIDHead(BaseModule): + """Base class for PID head. 
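+
+    Applies a 3x3 ``ConvModule`` in ``norm -> act -> conv`` order, followed
+    by an extra norm/act pair and, optionally, a classification layer that
+    is passed in at call time.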
+ + Args: + in_channels (int): Number of input channels. + channels (int): Number of output channels. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU', inplace=True). + init_cfg (dict or list[dict], optional): Init config dict. + Default: None. + """ + + def __init__(self, + in_channels: int, + channels: int, + norm_cfg: OptConfigType = dict(type='BN'), + act_cfg: OptConfigType = dict(type='ReLU', inplace=True), + init_cfg: OptConfigType = None): + super().__init__(init_cfg) + self.conv = ConvModule( + in_channels, + channels, + kernel_size=3, + padding=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + order=('norm', 'act', 'conv')) + _, self.norm = build_norm_layer(norm_cfg, num_features=channels) + self.act = build_activation_layer(act_cfg) + + def forward(self, x: Tensor, cls_seg: Optional[nn.Module]) -> Tensor: + """Forward function. + Args: + x (Tensor): Input tensor. + cls_seg (nn.Module, optional): The classification head. + + Returns: + Tensor: Output tensor. + """ + x = self.conv(x) + x = self.norm(x) + x = self.act(x) + if cls_seg is not None: + x = cls_seg(x) + return x + + +@MODELS.register_module() +class PIDHead(BaseDecodeHead): + """Decode head for PIDNet. + + Args: + in_channels (int): Number of input channels. + channels (int): Number of output channels. + num_classes (int): Number of classes. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU', inplace=True). + """ + + def __init__(self, + in_channels: int, + channels: int, + num_classes: int, + norm_cfg: OptConfigType = dict(type='BN'), + act_cfg: OptConfigType = dict(type='ReLU', inplace=True), + **kwargs): + super().__init__( + in_channels, + channels, + num_classes=num_classes, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + **kwargs) + self.i_head = BasePIDHead(in_channels, channels, norm_cfg, act_cfg) + self.p_head = BasePIDHead(in_channels // 2, channels, norm_cfg, + act_cfg) + self.d_head = BasePIDHead( + in_channels // 2, + in_channels // 4, + norm_cfg, + ) + self.p_cls_seg = nn.Conv2d(channels, self.out_channels, kernel_size=1) + self.d_cls_seg = nn.Conv2d(in_channels // 4, 1, kernel_size=1) + + def init_weights(self): + for m in self.modules(): + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_( + m.weight, mode='fan_out', nonlinearity='relu') + elif isinstance(m, nn.BatchNorm2d): + nn.init.constant_(m.weight, 1) + nn.init.constant_(m.bias, 0) + + def forward( + self, + inputs: Union[Tensor, + Tuple[Tensor]]) -> Union[Tensor, Tuple[Tensor]]: + """Forward function. + Args: + inputs (Tensor | tuple[Tensor]): Input tensor or tuple of + Tensor. When training, the input is a tuple of three tensors, + (p_feat, i_feat, d_feat), and the output is a tuple of three + tensors, (p_seg_logit, i_seg_logit, d_seg_logit). + When inference, only the head of integral branch is used, and + input is a tensor of integral feature map, and the output is + the segmentation logit. + + Returns: + Tensor | tuple[Tensor]: Output tensor or tuple of tensors. 
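+
+        Example:
+            An illustrative training-time call with hypothetical channel
+            sizes (assumes mmseg is installed)::
+
+                >>> head = PIDHead(
+                ...     in_channels=128, channels=128, num_classes=19)
+                >>> p = torch.randn(1, 64, 16, 16)   # in_channels // 2
+                >>> i = torch.randn(1, 128, 16, 16)  # in_channels
+                >>> d = torch.randn(1, 64, 16, 16)   # in_channels // 2
+                >>> p_logit, i_logit, d_logit = head((p, i, d))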
+ """ + if self.training: + x_p, x_i, x_d = inputs + x_p = self.p_head(x_p, self.p_cls_seg) + x_i = self.i_head(x_i, self.cls_seg) + x_d = self.d_head(x_d, self.d_cls_seg) + return x_p, x_i, x_d + else: + return self.i_head(inputs, self.cls_seg) + + def _stack_batch_gt(self, batch_data_samples: SampleList) -> Tuple[Tensor]: + gt_semantic_segs = [ + data_sample.gt_sem_seg.data for data_sample in batch_data_samples + ] + gt_edge_segs = [ + data_sample.gt_edge_map.data for data_sample in batch_data_samples + ] + gt_sem_segs = torch.stack(gt_semantic_segs, dim=0) + gt_edge_segs = torch.stack(gt_edge_segs, dim=0) + return gt_sem_segs, gt_edge_segs + + def loss_by_feat(self, seg_logits: Tuple[Tensor], + batch_data_samples: SampleList) -> dict: + loss = dict() + p_logit, i_logit, d_logit = seg_logits + sem_label, bd_label = self._stack_batch_gt(batch_data_samples) + p_logit = resize( + input=p_logit, + size=sem_label.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + i_logit = resize( + input=i_logit, + size=sem_label.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + d_logit = resize( + input=d_logit, + size=bd_label.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + sem_label = sem_label.squeeze(1) + bd_label = bd_label.squeeze(1) + loss['loss_sem_p'] = self.loss_decode[0]( + p_logit, sem_label, ignore_index=self.ignore_index) + loss['loss_sem_i'] = self.loss_decode[1](i_logit, sem_label) + loss['loss_bd'] = self.loss_decode[2](d_logit, bd_label) + filler = torch.ones_like(sem_label) * self.ignore_index + sem_bd_label = torch.where( + torch.sigmoid(d_logit[:, 0, :, :]) > 0.8, sem_label, filler) + loss['loss_sem_bd'] = self.loss_decode[3](i_logit, sem_bd_label) + loss['acc_seg'] = accuracy( + i_logit, sem_label, ignore_index=self.ignore_index) + return loss diff --git a/mmseg/models/decode_heads/point_head.py b/mmseg/models/decode_heads/point_head.py index 781ed1ee8c..e8e433d662 100644 --- a/mmseg/models/decode_heads/point_head.py +++ b/mmseg/models/decode_heads/point_head.py @@ -12,10 +12,10 @@ from typing import List -from mmseg.ops import resize from mmseg.registry import MODELS from mmseg.utils import SampleList from ..losses import accuracy +from ..utils import resize from .cascade_decode_head import BaseCascadeDecodeHead @@ -74,7 +74,7 @@ def __init__(self, norm_cfg=None, act_cfg=dict(type='ReLU', inplace=False), **kwargs): - super(PointHead, self).__init__( + super().__init__( input_transform='multiple_select', conv_cfg=conv_cfg, norm_cfg=norm_cfg, diff --git a/mmseg/models/decode_heads/psa_head.py b/mmseg/models/decode_heads/psa_head.py index 4b292c600f..13ee5c58a5 100644 --- a/mmseg/models/decode_heads/psa_head.py +++ b/mmseg/models/decode_heads/psa_head.py @@ -4,8 +4,8 @@ import torch.nn.functional as F from mmcv.cnn import ConvModule -from mmseg.ops import resize from mmseg.registry import MODELS +from ..utils import resize from .decode_head import BaseDecodeHead try: @@ -43,7 +43,7 @@ def __init__(self, **kwargs): if PSAMask is None: raise RuntimeError('Please install mmcv-full for PSAMask ops') - super(PSAHead, self).__init__(**kwargs) + super().__init__(**kwargs) assert psa_type in ['collect', 'distribute', 'bi-direction'] self.psa_type = psa_type self.compact = compact diff --git a/mmseg/models/decode_heads/psp_head.py b/mmseg/models/decode_heads/psp_head.py index 734de8f1a0..a40ec41dec 100644 --- a/mmseg/models/decode_heads/psp_head.py +++ b/mmseg/models/decode_heads/psp_head.py @@ -3,8 +3,8 @@ import torch.nn as nn from mmcv.cnn 
import ConvModule -from mmseg.ops import resize from mmseg.registry import MODELS +from ..utils import resize from .decode_head import BaseDecodeHead @@ -24,7 +24,7 @@ class PPM(nn.ModuleList): def __init__(self, pool_scales, in_channels, channels, conv_cfg, norm_cfg, act_cfg, align_corners, **kwargs): - super(PPM, self).__init__() + super().__init__() self.pool_scales = pool_scales self.align_corners = align_corners self.in_channels = in_channels @@ -72,7 +72,7 @@ class PSPHead(BaseDecodeHead): """ def __init__(self, pool_scales=(1, 2, 3, 6), **kwargs): - super(PSPHead, self).__init__(**kwargs) + super().__init__(**kwargs) assert isinstance(pool_scales, (list, tuple)) self.pool_scales = pool_scales self.psp_modules = PPM( diff --git a/mmseg/models/decode_heads/san_head.py b/mmseg/models/decode_heads/san_head.py new file mode 100644 index 0000000000..d20da80192 --- /dev/null +++ b/mmseg/models/decode_heads/san_head.py @@ -0,0 +1,736 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from functools import partial +from typing import Dict, List, Tuple + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule, build_norm_layer +from mmcv.cnn.bricks.transformer import BaseTransformerLayer +from mmcv.ops import point_sample +from mmengine.dist import all_reduce +from mmengine.model.weight_init import (caffe2_xavier_init, normal_init, + trunc_normal_) +from mmengine.runner.checkpoint import CheckpointLoader, load_state_dict +from mmengine.structures import InstanceData +from torch import Tensor +from torch.nn import functional as F + +from mmseg.models.backbones.vit import TransformerEncoderLayer +from mmseg.registry import MODELS +from mmseg.utils import (ConfigType, MatchMasks, SampleList, + seg_data_to_instance_data) +from ..utils import (MLP, LayerNorm2d, PatchEmbed, cross_attn_layer, + get_uncertain_point_coords_with_randomness, resize) +from .decode_head import BaseDecodeHead + + +class MLPMaskDecoder(nn.Module): + """Module for decoding query and visual features with MLP layers to + generate the attention biases and the mask proposals.""" + + def __init__( + self, + *, + in_channels: int, + total_heads: int = 1, + total_layers: int = 1, + embed_channels: int = 256, + mlp_channels: int = 256, + mlp_num_layers: int = 3, + rescale_attn_bias: bool = False, + ): + super().__init__() + self.total_heads = total_heads + self.total_layers = total_layers + + dense_affine_func = partial(nn.Conv2d, kernel_size=1) + # Query Branch + self.query_mlp = MLP(in_channels, mlp_channels, embed_channels, + mlp_num_layers) + # Pixel Branch + self.pix_mlp = MLP( + in_channels, + mlp_channels, + embed_channels, + mlp_num_layers, + affine_func=dense_affine_func, + ) + # Attention Bias Branch + self.attn_mlp = MLP( + in_channels, + mlp_channels, + embed_channels * self.total_heads * self.total_layers, + mlp_num_layers, + affine_func=dense_affine_func, + ) + if rescale_attn_bias: + self.bias_scaling = nn.Linear(1, 1) + else: + self.bias_scaling = nn.Identity() + + def forward(self, query: torch.Tensor, + x: torch.Tensor) -> Tuple[torch.Tensor, List[torch.Tensor]]: + """Forward function. + Args: + query (Tensor): Query Tokens [B,N,C]. + x (Tensor): Visual features [B,C,H,W] + + Return: + mask_preds (Tensor): Mask proposals. + attn_bias (List[Tensor]): List of attention bias. 
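+
+        Example:
+            Shape sketch of the query-pixel contraction used below
+            (random tensors, illustrative only)::
+
+                >>> q = torch.randn(2, 100, 256)        # (B, N, C)
+                >>> pix = torch.randn(2, 256, 40, 40)   # (B, C, H, W)
+                >>> torch.einsum('bqc,bchw->bqhw', q, pix).shape
+                torch.Size([2, 100, 40, 40])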
+ """ + query = self.query_mlp(query) + pix = self.pix_mlp(x) + b, c, h, w = pix.shape + # preidict mask + mask_preds = torch.einsum('bqc,bchw->bqhw', query, pix) + # generate attn bias + attn = self.attn_mlp(x) + attn = attn.reshape(b, self.total_layers, self.total_heads, c, h, w) + attn_bias = torch.einsum('bqc,blnchw->blnqhw', query, attn) + attn_bias = self.bias_scaling(attn_bias[..., None]).squeeze(-1) + attn_bias = attn_bias.chunk(self.total_layers, dim=1) + attn_bias = [attn.squeeze(1) for attn in attn_bias] + return mask_preds, attn_bias + + +class SideAdapterNetwork(nn.Module): + """Side Adapter Network for predicting mask proposals and attention bias. + + Args: + in_channels (int): Number of input channels. Default: 3. + clip_channels (int): Number of channels of visual features. + Default: 768. + embed_dims (int): embedding dimension. Default: 240. + patch_size (int): The patch size. Default: 16. + patch_bias (bool): Whether use bias in patch embedding. + Default: True. + num_queries (int): Number of queries for mask proposals. + Default: 100. + fusion_index (List[int]): The layer number of the encode + transformer to fuse with the CLIP feature. + Default: [0, 1, 2, 3]. + cfg_encoder (ConfigType): Configs for the encode layers. + cfg_decoder (ConfigType): Configs for the decode layers. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + """ + + def __init__( + self, + in_channels: int = 3, + clip_channels: int = 768, + embed_dims: int = 240, + patch_size: int = 16, + patch_bias: bool = True, + num_queries: int = 100, + fusion_index: list = [0, 1, 2, 3], + cfg_encoder: ConfigType = ..., + cfg_decoder: ConfigType = ..., + norm_cfg: dict = dict(type='LN'), + ): + super().__init__() + + self.patch_embed = PatchEmbed( + in_channels=in_channels, + embed_dims=embed_dims, + conv_type='Conv2d', + kernel_size=patch_size, + stride=patch_size, + padding=0, + input_size=(640, 640), + bias=patch_bias, + norm_cfg=None, + init_cfg=None, + ) + ori_h, ori_w = self.patch_embed.init_out_size + num_patches = ori_h * ori_w + self.pos_embed = nn.Parameter( + torch.randn(1, num_patches, embed_dims) * .02) + self.query_pos_embed = nn.Parameter( + torch.zeros(1, num_queries, embed_dims)) + self.query_embed = nn.Parameter( + torch.zeros(1, num_queries, embed_dims)) + encode_layers = [] + for i in range(cfg_encoder.num_encode_layer): + encode_layers.append( + TransformerEncoderLayer( + embed_dims=embed_dims, + num_heads=cfg_encoder.num_heads, + feedforward_channels=cfg_encoder.mlp_ratio * embed_dims, + norm_cfg=norm_cfg)) + self.encode_layers = nn.ModuleList(encode_layers) + conv_clips = [] + for i in range(len(fusion_index)): + conv_clips.append( + nn.Sequential( + LayerNorm2d(clip_channels), + ConvModule( + clip_channels, + embed_dims, + kernel_size=1, + norm_cfg=None, + act_cfg=None))) + self.conv_clips = nn.ModuleList(conv_clips) + self.fusion_index = fusion_index + self.mask_decoder = MLPMaskDecoder( + in_channels=embed_dims, + total_heads=cfg_decoder.num_heads, + total_layers=cfg_decoder.num_layers, + embed_channels=cfg_decoder.embed_channels, + mlp_channels=cfg_decoder.mlp_channels, + mlp_num_layers=cfg_decoder.num_mlp, + rescale_attn_bias=cfg_decoder.rescale) + + def init_weights(self): + trunc_normal_(self.pos_embed, std=0.02) + nn.init.normal_(self.query_embed, std=0.02) + nn.init.normal_(self.query_pos_embed, std=0.02) + for i in range(len(self.conv_clips)): + caffe2_xavier_init(self.conv_clips[i][1].conv) + + def fuse_clip(self, fused_index: int, x: 
+                  clip_feature: torch.Tensor, hwshape: Tuple[int, int],
+                  L: int):
+        """Fuse CLIP feature and visual tokens."""
+        fused_clip = (resize(
+            self.conv_clips[fused_index](clip_feature.contiguous()),
+            size=hwshape,
+            mode='bilinear',
+            align_corners=False)).permute(0, 2, 3, 1).reshape(x[:, -L:,
+                                                                ...].shape)
+        x = torch.cat([x[:, :-L, ...], x[:, -L:, ...] + fused_clip], dim=1)
+        return x
+
+    def encode_feature(self, image: torch.Tensor,
+                       clip_features: List[torch.Tensor],
+                       deep_supervision_idxs: List[int]) -> List[List]:
+        """Encode images by a lightweight vision transformer."""
+        assert len(self.fusion_index) == len(clip_features)
+        x, hwshape = self.patch_embed(image)
+        ori_h, ori_w = self.patch_embed.init_out_size
+        pos_embed = self.pos_embed
+        if self.pos_embed.shape[1] != x.shape[1]:
+            # resize the position embedding
+            pos_embed = (
+                resize(
+                    self.pos_embed.reshape(1, ori_h, ori_w,
+                                           -1).permute(0, 3, 1, 2),
+                    size=hwshape,
+                    mode='bicubic',
+                    align_corners=False,
+                ).flatten(2).permute(0, 2, 1))
+        pos_embed = torch.cat([
+            self.query_pos_embed.expand(pos_embed.shape[0], -1, -1), pos_embed
+        ],
+                              dim=1)
+        x = torch.cat([self.query_embed.expand(x.shape[0], -1, -1), x], dim=1)
+        x = x + pos_embed
+        L = hwshape[0] * hwshape[1]
+        fused_index = 0
+        if self.fusion_index[fused_index] == 0:
+            x = self.fuse_clip(fused_index, x, clip_features[0][0], hwshape, L)
+            fused_index += 1
+        outs = []
+        for index, block in enumerate(self.encode_layers, start=1):
+            x = block(x)
+            if index < len(self.fusion_index
+                           ) and index == self.fusion_index[fused_index]:
+                x = self.fuse_clip(fused_index, x,
+                                   clip_features[fused_index][0], hwshape, L)
+                fused_index += 1
+            x_query = x[:, :-L, ...]
+            x_feat = x[:, -L:, ...].permute(0, 2, 1)\
+                .reshape(x.shape[0], x.shape[-1], hwshape[0], hwshape[1])
+
+            if index in deep_supervision_idxs or index == len(
+                    self.encode_layers):
+                outs.append({'query': x_query, 'x': x_feat})
+
+            if index < len(self.encode_layers):
+                x = x + pos_embed
+        return outs
+
+    def decode_feature(self, features):
+        mask_embeds = []
+        attn_biases = []
+        for feature in features:
+            mask_embed, attn_bias = self.mask_decoder(**feature)
+            mask_embeds.append(mask_embed)
+            attn_biases.append(attn_bias)
+        return mask_embeds, attn_biases
+
+    def forward(
+        self, image: torch.Tensor, clip_features: List[torch.Tensor],
+        deep_supervision_idxs: List[int]
+    ) -> Tuple[List[torch.Tensor], List[List[torch.Tensor]]]:
+        """Forward function."""
+        features = self.encode_feature(image, clip_features,
+                                       deep_supervision_idxs)
+        mask_embeds, attn_biases = self.decode_feature(features)
+        return mask_embeds, attn_biases
+
+
+class RecWithAttnbias(nn.Module):
+    """Mask recognition module that applies the attention biases to the
+    remaining deeper CLIP layers.
+
+    Args:
+        sos_token_format (str): The format of sos token. It should be
+            chosen from ["cls_token", "learnable_token", "pos_embedding"].
+            Default: 'cls_token'.
+        sos_token_num (int): Number of sos tokens. It should be equal to
+            the number of queries. Default: 100.
+        num_layers (int): Number of remaining CLIP layers used for mask
+            recognition. Default: 3.
+        cross_attn (bool): Whether to use cross attention to update the
+            sos tokens. Default: False.
+        embed_dims (int): The feature dimension of CLIP layers.
+            Default: 768.
+        num_heads (int): Parallel attention heads of CLIP layers.
+            Default: 12.
+        mlp_ratio (int): Ratio of mlp hidden dim to embedding dim.
+            Default: 4.
+        num_fcs (int): Number of fully-connected layers in the FFNs.
+            Default: 2.
+        qkv_bias (bool): Whether to use bias in multihead-attention.
+            Default: True.
+ out_dims (int): Number of channels of the output mask proposals. + It should be equal to the out_dims of text_encoder. + Default: 512. + final_norm (True): Whether use norm layer for sos token. + act_cfg (dict): The activation config for FFNs. + Default: dict(type='GELU'). + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN'). + frozen_exclude (List): List of parameters that are not to be frozen. + """ + + def __init__(self, + sos_token_format: str = 'cls_token', + sos_token_num: int = 100, + num_layers: int = 3, + cross_attn: bool = False, + embed_dims: int = 768, + num_heads: int = 12, + mlp_ratio: int = 4, + num_fcs: int = 2, + qkv_bias: bool = True, + out_dims: int = 512, + final_norm: bool = True, + act_cfg: dict = dict(type='GELU'), + norm_cfg: dict = dict(type='LN'), + frozen_exclude: List = []): + super().__init__() + + assert sos_token_format in [ + 'cls_token', 'learnable_token', 'pos_embedding' + ] + self.sos_token_format = sos_token_format + self.sos_token_num = sos_token_num + self.frozen_exclude = frozen_exclude + self.cross_attn = cross_attn + self.num_layers = num_layers + self.num_heads = num_heads + if sos_token_format in ['learnable_token', 'pos_embedding']: + self.sos_token = nn.Parameter( + torch.randn(sos_token_num, 1, self.proj.shape[0])) + self.frozen.append('sos_token') + + layers = [] + for i in range(num_layers): + layers.append( + BaseTransformerLayer( + attn_cfgs=dict( + type='MultiheadAttention', + embed_dims=embed_dims, + num_heads=num_heads, + batch_first=False, + bias=qkv_bias), + ffn_cfgs=dict( + type='FFN', + embed_dims=embed_dims, + feedforward_channels=mlp_ratio * embed_dims, + act_cfg=act_cfg), + operation_order=('norm', 'self_attn', 'norm', 'ffn'))) + self.layers = nn.ModuleList(layers) + + self.ln_post = build_norm_layer(norm_cfg, embed_dims)[1] + self.proj = nn.Linear(embed_dims, out_dims, bias=False) + + self.final_norm = final_norm + self._freeze() + + def init_weights(self, rec_state_dict): + if hasattr(self, 'sos_token'): + normal_init(self.sos_token, std=0.02) + if rec_state_dict is not None: + load_state_dict(self, rec_state_dict, strict=False, logger=None) + else: + super().init_weights() + + def _freeze(self): + if 'all' in self.frozen_exclude: + return + for name, param in self.named_parameters(): + if not any([exclude in name for exclude in self.frozen_exclude]): + param.requires_grad = False + + def _build_attn_biases(self, attn_biases, target_shape): + formatted_attn_biases = [] + for attn_bias in attn_biases: + # convert it to proper format: N*num_head,L,L + # attn_bias: [N, num_head/1, num_sos,H,W] + n, num_head, num_sos, h, w = attn_bias.shape + # reshape and downsample + attn_bias = F.adaptive_max_pool2d( + attn_bias.reshape(n, num_head * num_sos, h, w), + output_size=target_shape) + attn_bias = attn_bias.reshape(n, num_head, num_sos, *target_shape) + + true_num_head = self.num_heads + assert (num_head == 1 or num_head + == true_num_head), f'num_head={num_head} is not supported.' 
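+            # A bias predicted with a single head is shared across all
+            # attention heads before being flattened to
+            # (N * num_head, num_sos, L).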
+ if num_head == 1: + attn_bias = attn_bias.repeat(1, true_num_head, 1, 1, 1) + attn_bias = attn_bias.reshape(n * true_num_head, num_sos, -1) + L = attn_bias.shape[-1] + if self.cross_attn: + # [n*num_head, num_sos, L] + formatted_attn_biases.append(attn_bias) + else: + # [n*num_head, num_sos+1+L, num_sos+1+L] + new_attn_bias = attn_bias.new_zeros(num_sos + 1 + L, + num_sos + 1 + L) + new_attn_bias[:, :num_sos] = -100 + new_attn_bias[torch.arange(num_sos), torch.arange(num_sos)] = 0 + new_attn_bias[:num_sos, num_sos] = -100 + new_attn_bias = ( + new_attn_bias[None, ...].expand(n * true_num_head, -1, + -1).clone()) + new_attn_bias[..., :num_sos, -L:] = attn_bias + formatted_attn_biases.append(new_attn_bias) + + if len(formatted_attn_biases) == 1: + formatted_attn_biases = [ + formatted_attn_biases[0] for _ in range(self.num_layers) + ] + return formatted_attn_biases + + def forward(self, bias: List[Tensor], feature: List[Tensor]): + """Forward function to recognize the category of masks + Args: + bias (List[Tensor]): Attention bias for transformer layers + feature (List[Tensor]): Output of the image encoder, + including cls_token and img_feature. + """ + cls_token = feature[1].unsqueeze(0) + img_feature = feature[0] + b, c, h, w = img_feature.shape + # construct clip shadow features + x = torch.cat( + [cls_token, + img_feature.reshape(b, c, -1).permute(2, 0, 1)]) + + # construct sos token + if self.sos_token_format == 'cls_token': + sos_token = cls_token.repeat(self.sos_token_num, 1, 1) + elif self.sos_token_format == 'learnable_token': + sos_token = self.sos_token.expand(-1, b, -1) + elif self.sos_token_format == 'pos_embedding': + sos_token = self.sos_token.expand(-1, b, -1) + cls_token + + # construct attn bias + attn_biases = self._build_attn_biases(bias, target_shape=(h, w)) + + if self.cross_attn: + for i, block in enumerate(self.layers): + if self.cross_attn: + sos_token = cross_attn_layer( + block, + sos_token, + x[1:, ], + attn_biases[i], + ) + if i < len(self.layers) - 1: + x = block(x) + else: + x = torch.cat([sos_token, x], dim=0) + for i, block in enumerate(self.layers): + x = block(x, attn_masks=[attn_biases[i]]) + sos_token = x[:self.sos_token_num] + + sos_token = sos_token.permute(1, 0, 2) # LND -> NLD + sos_token = self.ln_post(sos_token) + sos_token = self.proj(sos_token) + if self.final_norm: + sos_token = F.normalize(sos_token, dim=-1) + return sos_token + + +@MODELS.register_module() +class SideAdapterCLIPHead(BaseDecodeHead): + """Side Adapter Network (SAN) for open-vocabulary semantic segmentation + with pre-trained vision-language model. + + This decode head is the implementation of `Side Adapter Network + for Open-Vocabulary Semantic Segmentation` + . + Modified from https://github.com/MendelXu/SAN/blob/main/san/model/side_adapter/side_adapter.py # noqa:E501 + Copyright (c) 2023 MendelXu. + Licensed under the MIT License + + Args: + num_classes (int): the number of classes. 
+ san_cfg (ConfigType): Configs for SideAdapterNetwork module + maskgen_cfg (ConfigType): Configs for RecWithAttnbias module + """ + + def __init__(self, num_classes: int, san_cfg: ConfigType, + maskgen_cfg: ConfigType, deep_supervision_idxs: List[int], + train_cfg: ConfigType, **kwargs): + super().__init__( + in_channels=san_cfg.in_channels, + channels=san_cfg.embed_dims, + num_classes=num_classes, + **kwargs) + assert san_cfg.num_queries == maskgen_cfg.sos_token_num, \ + 'num_queries in san_cfg should be equal to sos_token_num ' \ + 'in maskgen_cfg' + del self.conv_seg + self.side_adapter_network = SideAdapterNetwork(**san_cfg) + self.rec_with_attnbias = RecWithAttnbias(**maskgen_cfg) + self.deep_supervision_idxs = deep_supervision_idxs + self.train_cfg = train_cfg + if train_cfg: + self.match_masks = MatchMasks( + num_points=train_cfg.num_points, + num_queries=san_cfg.num_queries, + num_classes=num_classes, + assigner=train_cfg.assigner) + + def init_weights(self): + + rec_state_dict = None + if isinstance(self.init_cfg, dict) and \ + self.init_cfg.get('type') == 'Pretrained_Part': + checkpoint = CheckpointLoader.load_checkpoint( + self.init_cfg['checkpoint'], logger=None, map_location='cpu') + + rec_state_dict = checkpoint.copy() + para_prefix = 'decode_head.rec_with_attnbias' + prefix_len = len(para_prefix) + 1 + for k, v in checkpoint.items(): + rec_state_dict.pop(k) + if para_prefix in k: + rec_state_dict[k[prefix_len:]] = v + + self.side_adapter_network.init_weights() + self.rec_with_attnbias.init_weights(rec_state_dict) + + def forward(self, inputs: Tuple[Tensor], + deep_supervision_idxs) -> Tuple[List]: + """Forward function. + + Args: + inputs (Tuple[Tensor]): A triplet including images, + list of multi-level visual features from image encoder and + class embeddings from text_encoder. + + Returns: + mask_props (List[Tensor]): Mask proposals predicted by SAN. + mask_logits (List[Tensor]): Class logits of mask proposals. + """ + imgs, clip_feature, class_embeds = inputs + # predict mask proposals and attention bias + mask_props, attn_biases = self.side_adapter_network( + imgs, clip_feature, deep_supervision_idxs) + + # mask recognition with attention bias + mask_embeds = [ + self.rec_with_attnbias(att_bias, clip_feature[-1]) + for att_bias in attn_biases + ] + # Obtain class prediction of masks by comparing the similarity + # between the image token and the text embedding of class names. + mask_logits = [ + torch.einsum('bqc,nc->bqn', mask_embed, class_embeds) + for mask_embed in mask_embeds + ] + return mask_props, mask_logits + + def predict(self, inputs: Tuple[Tensor], batch_img_metas: List[dict], + test_cfg: ConfigType) -> Tensor: + """Forward function for prediction. + + Args: + inputs (Tuple[Tensor]): Images, visual features from image encoder + and class embedding from text encoder. + batch_img_metas (dict): List Image info where each dict may also + contain: 'img_shape', 'scale_factor', 'flip', 'img_path', + 'ori_shape', and 'pad_shape'. + For details on the values of these keys see + `mmseg/datasets/pipelines/formatting.py:PackSegInputs`. + test_cfg (dict): The testing config. + + Returns: + Tensor: Outputs segmentation logits map. + """ + mask_props, mask_logits = self.forward(inputs, []) + + return self.predict_by_feat([mask_props[-1], mask_logits[-1]], + batch_img_metas) + + def predict_by_feat(self, seg_logits: List[Tensor], + batch_img_metas: List[dict]) -> Tensor: + """1. Transform a batch of mask proposals to the input shape. + 2. 
Generate segmentation map with mask proposals and class logits. + """ + mask_pred = seg_logits[0] + cls_score = seg_logits[1] + if isinstance(batch_img_metas[0]['img_shape'], torch.Size): + # slide inference + size = batch_img_metas[0]['img_shape'] + elif 'pad_shape' in batch_img_metas[0]: + size = batch_img_metas[0]['pad_shape'][:2] + else: + size = batch_img_metas[0]['img_shape'] + # upsample mask + mask_pred = F.interpolate( + mask_pred, size=size, mode='bilinear', align_corners=False) + + mask_cls = F.softmax(cls_score, dim=-1)[..., :-1] + mask_pred = mask_pred.sigmoid() + seg_logits = torch.einsum('bqc,bqhw->bchw', mask_cls, mask_pred) + return seg_logits + + def loss(self, x: Tuple[Tensor], batch_data_samples: SampleList, + train_cfg: ConfigType) -> dict: + """Perform forward propagation and loss calculation of the decoder head + on the features of the upstream network. + + Args: + x (tuple[Tensor]): Multi-level features from the upstream + network, each is a 4D-tensor. + batch_data_samples (List[:obj:`SegDataSample`]): The Data + Samples. It usually includes information such as + `gt_sem_seg`. + train_cfg (ConfigType): Training config. + + Returns: + dict[str, Tensor]: a dictionary of loss components. + """ + # batch SegDataSample to InstanceDataSample + batch_gt_instances = seg_data_to_instance_data(self.ignore_index, + batch_data_samples) + + # forward + all_mask_props, all_mask_logits = self.forward( + x, self.deep_supervision_idxs) + + # loss + losses = self.loss_by_feat(all_mask_logits, all_mask_props, + batch_gt_instances) + + return losses + + def loss_by_feat( + self, all_cls_scores: Tensor, all_mask_preds: Tensor, + batch_gt_instances: List[InstanceData]) -> Dict[str, Tensor]: + """Loss function. + + Args: + all_cls_scores (Tensor): Classification scores for all decoder + layers with shape (num_decoder, batch_size, num_queries, + cls_out_channels). Note `cls_out_channels` should includes + background. + all_mask_preds (Tensor): Mask scores for all decoder layers with + shape (num_decoder, batch_size, num_queries, h, w). + batch_gt_instances (list[obj:`InstanceData`]): each contains + ``labels`` and ``masks``. + + Returns: + dict[str, Tensor]: A dictionary of loss components. 
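+
+        Example:
+            Shape sketch of the point sampling used below (random tensors,
+            illustrative only)::
+
+                >>> masks = torch.randn(5, 1, 32, 32)  # (num_gts, 1, h, w)
+                >>> coords = torch.rand(5, 1000, 2)    # normalized (x, y)
+                >>> point_sample(masks, coords).squeeze(1).shape
+                torch.Size([5, 1000])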
+ """ + num_dec_layers = len(all_cls_scores) + batch_gt_instances_list = [ + batch_gt_instances for _ in range(num_dec_layers) + ] + + losses = [] + for i in range(num_dec_layers): + cls_scores = all_cls_scores[i] + mask_preds = all_mask_preds[i] + # matching N mask predictions to K category labels + (labels, mask_targets, mask_weights, + avg_factor) = self.match_masks.get_targets( + cls_scores, mask_preds, batch_gt_instances_list[i]) + cls_scores = cls_scores.flatten(0, 1) + labels = labels.flatten(0, 1) + num_total_masks = cls_scores.new_tensor([avg_factor], + dtype=torch.float) + all_reduce(num_total_masks, op='mean') + num_total_masks = max(num_total_masks, 1) + + # extract positive ones + # shape (batch_size, num_queries, h, w) -> (num_total_gts, h, w) + mask_preds = mask_preds[mask_weights > 0] + + if mask_targets.shape[0] != 0: + with torch.no_grad(): + points_coords = get_uncertain_point_coords_with_randomness( + mask_preds.unsqueeze(1), None, + self.train_cfg.num_points, + self.train_cfg.oversample_ratio, + self.train_cfg.importance_sample_ratio) + # shape (num_total_gts, h, w) + # -> (num_total_gts, num_points) + mask_point_targets = point_sample( + mask_targets.unsqueeze(1).float(), + points_coords).squeeze(1) + # shape (num_queries, h, w) -> (num_queries, num_points) + mask_point_preds = point_sample( + mask_preds.unsqueeze(1), points_coords).squeeze(1) + + if not isinstance(self.loss_decode, nn.ModuleList): + losses_decode = [self.loss_decode] + else: + losses_decode = self.loss_decode + loss = dict() + for loss_decode in losses_decode: + if 'loss_cls' in loss_decode.loss_name: + if loss_decode.loss_name == 'loss_cls_ce': + loss[loss_decode.loss_name] = loss_decode( + cls_scores, labels) + else: + assert False, "Only support 'CrossEntropyLoss' in" \ + ' classification loss' + + elif 'loss_mask' in loss_decode.loss_name: + if mask_targets.shape[0] == 0: + loss[loss_decode.loss_name] = mask_preds.sum() + elif loss_decode.loss_name == 'loss_mask_ce': + loss[loss_decode.loss_name] = loss_decode( + mask_point_preds, + mask_point_targets, + avg_factor=num_total_masks * + self.train_cfg.num_points) + elif loss_decode.loss_name == 'loss_mask_dice': + loss[loss_decode.loss_name] = loss_decode( + mask_point_preds, + mask_point_targets, + avg_factor=num_total_masks) + else: + assert False, "Only support 'CrossEntropyLoss' and" \ + " 'DiceLoss' in mask loss" + else: + assert False, "Only support for 'loss_cls' and 'loss_mask'" + + losses.append(loss) + + loss_dict = dict() + # loss from the last decoder layer + loss_dict.update(losses[-1]) + # loss from other decoder layers + for i, loss in enumerate(losses[:-1]): + for k, v in loss.items(): + loss_dict[f'd{self.deep_supervision_idxs[i]}.{k}'] = v + return loss_dict diff --git a/mmseg/models/decode_heads/segformer_head.py b/mmseg/models/decode_heads/segformer_head.py index 8c14602a30..f9eb0b320b 100644 --- a/mmseg/models/decode_heads/segformer_head.py +++ b/mmseg/models/decode_heads/segformer_head.py @@ -4,8 +4,8 @@ from mmcv.cnn import ConvModule from mmseg.models.decode_heads.decode_head import BaseDecodeHead -from mmseg.ops import resize from mmseg.registry import MODELS +from ..utils import resize @MODELS.register_module() diff --git a/mmseg/models/decode_heads/segmenter_mask_head.py b/mmseg/models/decode_heads/segmenter_mask_head.py index 95a85a9e3c..85d27735ba 100644 --- a/mmseg/models/decode_heads/segmenter_mask_head.py +++ b/mmseg/models/decode_heads/segmenter_mask_head.py @@ -3,9 +3,9 @@ import torch.nn as nn import 
torch.nn.functional as F from mmcv.cnn import build_norm_layer -from mmcv.cnn.utils.weight_init import (constant_init, trunc_normal_, +from mmengine.model import ModuleList +from mmengine.model.weight_init import (constant_init, trunc_normal_, trunc_normal_init) -from mmcv.runner import ModuleList from mmseg.models.backbones.vit import TransformerEncoderLayer from mmseg.registry import MODELS @@ -17,7 +17,7 @@ class SegmenterMaskTransformerHead(BaseDecodeHead): """Segmenter: Transformer for Semantic Segmentation. This head is the implementation of - `Segmenter: `_. + `Segmenter: `_. Args: backbone_cfg:(dict): Config of backbone of @@ -61,8 +61,7 @@ def __init__( init_std=0.02, **kwargs, ): - super(SegmenterMaskTransformerHead, self).__init__( - in_channels=in_channels, **kwargs) + super().__init__(in_channels=in_channels, **kwargs) dpr = [x.item() for x in torch.linspace(0, drop_path_rate, num_layers)] self.layers = ModuleList() diff --git a/mmseg/models/decode_heads/sep_aspp_head.py b/mmseg/models/decode_heads/sep_aspp_head.py index c632179319..9dba68c9ec 100644 --- a/mmseg/models/decode_heads/sep_aspp_head.py +++ b/mmseg/models/decode_heads/sep_aspp_head.py @@ -3,8 +3,8 @@ import torch.nn as nn from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule -from mmseg.ops import resize from mmseg.registry import MODELS +from ..utils import resize from .aspp_head import ASPPHead, ASPPModule @@ -13,7 +13,7 @@ class DepthwiseSeparableASPPModule(ASPPModule): conv.""" def __init__(self, **kwargs): - super(DepthwiseSeparableASPPModule, self).__init__(**kwargs) + super().__init__(**kwargs) for i, dilation in enumerate(self.dilations): if dilation > 1: self[i] = DepthwiseSeparableConvModule( @@ -41,7 +41,7 @@ class DepthwiseSeparableASPPHead(ASPPHead): """ def __init__(self, c1_in_channels, c1_channels, **kwargs): - super(DepthwiseSeparableASPPHead, self).__init__(**kwargs) + super().__init__(**kwargs) assert c1_in_channels >= 0 self.aspp_modules = DepthwiseSeparableASPPModule( dilations=self.dilations, diff --git a/mmseg/models/decode_heads/sep_fcn_head.py b/mmseg/models/decode_heads/sep_fcn_head.py index 5c8b79bd0e..3b15983bce 100644 --- a/mmseg/models/decode_heads/sep_fcn_head.py +++ b/mmseg/models/decode_heads/sep_fcn_head.py @@ -32,7 +32,7 @@ class DepthwiseSeparableFCNHead(FCNHead): """ def __init__(self, dw_act_cfg=None, **kwargs): - super(DepthwiseSeparableFCNHead, self).__init__(**kwargs) + super().__init__(**kwargs) self.convs[0] = DepthwiseSeparableConvModule( self.in_channels, self.channels, diff --git a/mmseg/models/decode_heads/setr_mla_head.py b/mmseg/models/decode_heads/setr_mla_head.py index 228c311002..1975991a60 100644 --- a/mmseg/models/decode_heads/setr_mla_head.py +++ b/mmseg/models/decode_heads/setr_mla_head.py @@ -3,8 +3,8 @@ import torch.nn as nn from mmcv.cnn import ConvModule -from mmseg.ops import Upsample from mmseg.registry import MODELS +from ..utils import Upsample from .decode_head import BaseDecodeHead @@ -21,8 +21,7 @@ class SETRMLAHead(BaseDecodeHead): """ def __init__(self, mla_channels=128, up_scale=4, **kwargs): - super(SETRMLAHead, self).__init__( - input_transform='multiple_select', **kwargs) + super().__init__(input_transform='multiple_select', **kwargs) self.mla_channels = mla_channels num_inputs = len(self.in_channels) diff --git a/mmseg/models/decode_heads/setr_up_head.py b/mmseg/models/decode_heads/setr_up_head.py index 11ab9bb7d0..9c796d8161 100644 --- a/mmseg/models/decode_heads/setr_up_head.py +++ b/mmseg/models/decode_heads/setr_up_head.py @@ -2,8 
+2,8 @@ import torch.nn as nn from mmcv.cnn import ConvModule, build_norm_layer -from mmseg.ops import Upsample from mmseg.registry import MODELS +from ..utils import Upsample from .decode_head import BaseDecodeHead @@ -41,7 +41,7 @@ def __init__(self, assert kernel_size in [1, 3], 'kernel_size must be 1 or 3.' - super(SETRUPHead, self).__init__(init_cfg=init_cfg, **kwargs) + super().__init__(init_cfg=init_cfg, **kwargs) assert isinstance(self.in_channels, int) diff --git a/mmseg/models/decode_heads/stdc_head.py b/mmseg/models/decode_heads/stdc_head.py index 615b858186..1c1c21e308 100644 --- a/mmseg/models/decode_heads/stdc_head.py +++ b/mmseg/models/decode_heads/stdc_head.py @@ -1,11 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. import torch import torch.nn.functional as F -from mmengine.data import PixelData +from mmengine.structures import PixelData from torch import Tensor -from mmseg.data import SegDataSample from mmseg.registry import MODELS +from mmseg.structures import SegDataSample from mmseg.utils import SampleList from .fcn_head import FCNHead @@ -21,7 +21,7 @@ class STDCHead(FCNHead): """ def __init__(self, boundary_threshold=0.1, **kwargs): - super(STDCHead, self).__init__(**kwargs) + super().__init__(**kwargs) self.boundary_threshold = boundary_threshold # Using register buffer to make laplacian kernel on the same # device of `seg_label`. @@ -93,6 +93,5 @@ def loss_by_feat(self, seg_logits: Tensor, seg_data_sample.gt_sem_seg = PixelData(data=label) batch_sample_list.append(seg_data_sample) - loss = super(STDCHead, self).loss_by_feat(seg_logits, - batch_sample_list) + loss = super().loss_by_feat(seg_logits, batch_sample_list) return loss diff --git a/mmseg/models/decode_heads/uper_head.py b/mmseg/models/decode_heads/uper_head.py index 347ef32924..b1ccc3173c 100644 --- a/mmseg/models/decode_heads/uper_head.py +++ b/mmseg/models/decode_heads/uper_head.py @@ -3,8 +3,8 @@ import torch.nn as nn from mmcv.cnn import ConvModule -from mmseg.ops import resize from mmseg.registry import MODELS +from ..utils import resize from .decode_head import BaseDecodeHead from .psp_head import PPM @@ -22,8 +22,7 @@ class UPerHead(BaseDecodeHead): """ def __init__(self, pool_scales=(1, 2, 3, 6), **kwargs): - super(UPerHead, self).__init__( - input_transform='multiple_select', **kwargs) + super().__init__(input_transform='multiple_select', **kwargs) # PSP Module self.psp_modules = PPM( pool_scales, diff --git a/mmseg/models/decode_heads/vpd_depth_head.py b/mmseg/models/decode_heads/vpd_depth_head.py new file mode 100644 index 0000000000..0c54c2da1b --- /dev/null +++ b/mmseg/models/decode_heads/vpd_depth_head.py @@ -0,0 +1,254 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional, Sequence, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import build_conv_layer, build_norm_layer, build_upsample_layer +from mmengine.model import BaseModule +from torch import Tensor + +from mmseg.registry import MODELS +from mmseg.utils import SampleList +from ..builder import build_loss +from ..utils import resize +from .decode_head import BaseDecodeHead + + +class VPDDepthDecoder(BaseModule): + """VPD Depth Decoder class. + + Args: + in_channels (int): Number of input channels. + out_channels (int): Number of output channels. + num_deconv_layers (int): Number of deconvolution layers. + num_deconv_filters (List[int]): List of output channels for + deconvolution layers. 
+ init_cfg (Optional[Union[Dict, List[Dict]]], optional): Configuration + for weight initialization. Defaults to Normal for Conv2d and + ConvTranspose2d layers. + """ + + def __init__(self, + in_channels: int, + out_channels: int, + num_deconv_layers: int, + num_deconv_filters: List[int], + init_cfg: Optional[Union[Dict, List[Dict]]] = dict( + type='Normal', + std=0.001, + layer=['Conv2d', 'ConvTranspose2d'])): + super().__init__(init_cfg=init_cfg) + self.in_channels = in_channels + + self.deconv_layers = self._make_deconv_layer( + num_deconv_layers, + num_deconv_filters, + ) + + conv_layers = [] + conv_layers.append( + build_conv_layer( + dict(type='Conv2d'), + in_channels=num_deconv_filters[-1], + out_channels=out_channels, + kernel_size=3, + stride=1, + padding=1)) + conv_layers.append(build_norm_layer(dict(type='BN'), out_channels)[1]) + conv_layers.append(nn.ReLU(inplace=True)) + self.conv_layers = nn.Sequential(*conv_layers) + + self.up_sample = nn.Upsample( + scale_factor=2, mode='bilinear', align_corners=False) + + def forward(self, x): + """Forward pass through the decoder network.""" + out = self.deconv_layers(x) + out = self.conv_layers(out) + + out = self.up_sample(out) + out = self.up_sample(out) + + return out + + def _make_deconv_layer(self, num_layers, num_deconv_filters): + """Make deconv layers.""" + + layers = [] + in_channels = self.in_channels + for i in range(num_layers): + + num_channels = num_deconv_filters[i] + layers.append( + build_upsample_layer( + dict(type='deconv'), + in_channels=in_channels, + out_channels=num_channels, + kernel_size=2, + stride=2, + padding=0, + output_padding=0, + bias=False)) + layers.append(nn.BatchNorm2d(num_channels)) + layers.append(nn.ReLU(inplace=True)) + in_channels = num_channels + + return nn.Sequential(*layers) + + +@MODELS.register_module() +class VPDDepthHead(BaseDecodeHead): + """Depth Prediction Head for VPD. + + .. _`VPD`: https://arxiv.org/abs/2303.02153 + + Args: + max_depth (float): Maximum depth value. Defaults to 10.0. + in_channels (Sequence[int]): Number of input channels for each + convolutional layer. + embed_dim (int): Dimension of embedding. Defaults to 192. + feature_dim (int): Dimension of aggregated feature. Defaults to 1536. + num_deconv_layers (int): Number of deconvolution layers in the + decoder. Defaults to 3. + num_deconv_filters (Sequence[int]): Number of filters for each deconv + layer. Defaults to (32, 32, 32). + fmap_border (Union[int, Sequence[int]]): Feature map border for + cropping. Defaults to 0. + align_corners (bool): Flag for align_corners in interpolation. + Defaults to False. + loss_decode (dict): Configurations for the loss function. Defaults to + dict(type='SiLogLoss'). + init_cfg (dict): Initialization configurations. Defaults to + dict(type='TruncNormal', std=0.02, layer=['Conv2d', 'Linear']). 
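+
+    Example:
+        A hypothetical shape walk-through with the default channel layout
+        (feature sizes are illustrative; assumes mmseg is installed so the
+        SiLogLoss registry entry resolves)::
+
+            >>> head = VPDDepthHead()
+            >>> feats = [torch.randn(1, c, s, s) for c, s in
+            ...          zip([320, 640, 1280, 1280], [32, 16, 8, 4])]
+            >>> head(feats).shape   # depth map in (0, max_depth)
+            torch.Size([1, 1, 256, 256])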
+ """ + + num_classes = 1 + out_channels = 1 + input_transform = None + + def __init__( + self, + max_depth: float = 10.0, + in_channels: Sequence[int] = [320, 640, 1280, 1280], + embed_dim: int = 192, + feature_dim: int = 1536, + num_deconv_layers: int = 3, + num_deconv_filters: Sequence[int] = (32, 32, 32), + fmap_border: Union[int, Sequence[int]] = 0, + align_corners: bool = False, + loss_decode: dict = dict(type='SiLogLoss'), + init_cfg=dict( + type='TruncNormal', std=0.02, layer=['Conv2d', 'Linear']), + ): + + super(BaseDecodeHead, self).__init__(init_cfg=init_cfg) + + # initialize parameters + self.in_channels = in_channels + self.max_depth = max_depth + self.align_corners = align_corners + + # feature map border + if isinstance(fmap_border, int): + fmap_border = (fmap_border, fmap_border) + self.fmap_border = fmap_border + + # define network layers + self.conv1 = nn.Sequential( + nn.Conv2d(in_channels[0], in_channels[0], 3, stride=2, padding=1), + nn.GroupNorm(16, in_channels[0]), + nn.ReLU(), + nn.Conv2d(in_channels[0], in_channels[0], 3, stride=2, padding=1), + ) + self.conv2 = nn.Conv2d( + in_channels[1], in_channels[1], 3, stride=2, padding=1) + + self.conv_aggregation = nn.Sequential( + nn.Conv2d(sum(in_channels), feature_dim, 1), + nn.GroupNorm(16, feature_dim), + nn.ReLU(), + ) + + self.decoder = VPDDepthDecoder( + in_channels=embed_dim * 8, + out_channels=embed_dim, + num_deconv_layers=num_deconv_layers, + num_deconv_filters=num_deconv_filters) + + self.depth_pred_layer = nn.Sequential( + nn.Conv2d( + embed_dim, embed_dim, kernel_size=3, stride=1, padding=1), + nn.ReLU(inplace=False), + nn.Conv2d(embed_dim, 1, kernel_size=3, stride=1, padding=1)) + + # build loss + if isinstance(loss_decode, dict): + self.loss_decode = build_loss(loss_decode) + elif isinstance(loss_decode, (list, tuple)): + self.loss_decode = nn.ModuleList() + for loss in loss_decode: + self.loss_decode.append(build_loss(loss)) + else: + raise TypeError(f'loss_decode must be a dict or sequence of dict,\ + but got {type(loss_decode)}') + + def _stack_batch_gt(self, batch_data_samples: SampleList) -> Tensor: + gt_depth_maps = [ + data_sample.gt_depth_map.data for data_sample in batch_data_samples + ] + return torch.stack(gt_depth_maps, dim=0) + + def forward(self, x): + x = [ + x[0], x[1], + torch.cat([x[2], F.interpolate(x[3], scale_factor=2)], dim=1) + ] + x = torch.cat([self.conv1(x[0]), self.conv2(x[1]), x[2]], dim=1) + x = self.conv_aggregation(x) + + x = x[:, :, :x.size(2) - self.fmap_border[0], :x.size(3) - + self.fmap_border[1]].contiguous() + x = self.decoder(x) + out = self.depth_pred_layer(x) + + depth = torch.sigmoid(out) * self.max_depth + + return depth + + def loss_by_feat(self, pred_depth_map: Tensor, + batch_data_samples: SampleList) -> dict: + """Compute depth estimation loss. + + Args: + pred_depth_map (Tensor): The output from decode head forward + function. + batch_data_samples (List[:obj:`SegDataSample`]): The seg + data samples. It usually includes information such + as `metainfo` and `gt_dpeth_map`. 
+ + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + + gt_depth_map = self._stack_batch_gt(batch_data_samples) + loss = dict() + pred_depth_map = resize( + input=pred_depth_map, + size=gt_depth_map.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + + if not isinstance(self.loss_decode, nn.ModuleList): + losses_decode = [self.loss_decode] + else: + losses_decode = self.loss_decode + for loss_decode in losses_decode: + if loss_decode.loss_name not in loss: + loss[loss_decode.loss_name] = loss_decode( + pred_depth_map, gt_depth_map) + else: + loss[loss_decode.loss_name] += loss_decode( + pred_depth_map, gt_depth_map) + + return loss diff --git a/mmseg/models/losses/__init__.py b/mmseg/models/losses/__init__.py index fbc5b2d1b9..0467cb3ad8 100644 --- a/mmseg/models/losses/__init__.py +++ b/mmseg/models/losses/__init__.py @@ -1,15 +1,21 @@ # Copyright (c) OpenMMLab. All rights reserved. from .accuracy import Accuracy, accuracy +from .boundary_loss import BoundaryLoss from .cross_entropy_loss import (CrossEntropyLoss, binary_cross_entropy, cross_entropy, mask_cross_entropy) from .dice_loss import DiceLoss from .focal_loss import FocalLoss +from .huasdorff_distance_loss import HuasdorffDisstanceLoss from .lovasz_loss import LovaszLoss +from .ohem_cross_entropy_loss import OhemCrossEntropy +from .silog_loss import SiLogLoss +from .tversky_loss import TverskyLoss from .utils import reduce_loss, weight_reduce_loss, weighted_loss __all__ = [ 'accuracy', 'Accuracy', 'cross_entropy', 'binary_cross_entropy', 'mask_cross_entropy', 'CrossEntropyLoss', 'reduce_loss', 'weight_reduce_loss', 'weighted_loss', 'LovaszLoss', 'DiceLoss', - 'FocalLoss' + 'FocalLoss', 'TverskyLoss', 'OhemCrossEntropy', 'BoundaryLoss', + 'HuasdorffDisstanceLoss', 'SiLogLoss' ] diff --git a/mmseg/models/losses/boundary_loss.py b/mmseg/models/losses/boundary_loss.py new file mode 100644 index 0000000000..e86b850d87 --- /dev/null +++ b/mmseg/models/losses/boundary_loss.py @@ -0,0 +1,62 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch import Tensor + +from mmseg.registry import MODELS + + +@MODELS.register_module() +class BoundaryLoss(nn.Module): + """Boundary loss. + + This function is modified from + `PIDNet `_. # noqa + Licensed under the MIT License. + + + Args: + loss_weight (float): Weight of the loss. Defaults to 1.0. + loss_name (str): Name of the loss item. If you want this loss + item to be included into the backward graph, `loss_` must be the + prefix of the name. Defaults to 'loss_boundary'. + """ + + def __init__(self, + loss_weight: float = 1.0, + loss_name: str = 'loss_boundary'): + super().__init__() + self.loss_weight = loss_weight + self.loss_name_ = loss_name + + def forward(self, bd_pre: Tensor, bd_gt: Tensor) -> Tensor: + """Forward function. + Args: + bd_pre (Tensor): Predictions of the boundary head. + bd_gt (Tensor): Ground truth of the boundary. + + Returns: + Tensor: Loss tensor. 
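+
+        Example:
+            An illustrative call with random tensors (values will vary)::
+
+                >>> loss_fn = BoundaryLoss()
+                >>> bd_pre = torch.randn(2, 1, 16, 16)
+                >>> bd_gt = (torch.rand(2, 16, 16) > 0.9).float()
+                >>> loss_fn(bd_pre, bd_gt)   # 0-dim loss tensor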
+ """ + log_p = bd_pre.permute(0, 2, 3, 1).contiguous().view(1, -1) + target_t = bd_gt.view(1, -1).float() + + pos_index = (target_t == 1) + neg_index = (target_t == 0) + + weight = torch.zeros_like(log_p) + pos_num = pos_index.sum() + neg_num = neg_index.sum() + sum_num = pos_num + neg_num + weight[pos_index] = neg_num * 1.0 / sum_num + weight[neg_index] = pos_num * 1.0 / sum_num + + loss = F.binary_cross_entropy_with_logits( + log_p, target_t, weight, reduction='mean') + + return self.loss_weight * loss + + @property + def loss_name(self): + return self.loss_name_ diff --git a/mmseg/models/losses/cross_entropy_loss.py b/mmseg/models/losses/cross_entropy_loss.py index e607248fb8..988fb789c1 100644 --- a/mmseg/models/losses/cross_entropy_loss.py +++ b/mmseg/models/losses/cross_entropy_loss.py @@ -53,8 +53,23 @@ def cross_entropy(pred, # average loss over non-ignored elements # pytorch's official cross_entropy average loss over non-ignored elements # refer to https://github.com/pytorch/pytorch/blob/56b43f4fec1f76953f15a627694d4bba34588969/torch/nn/functional.py#L2660 # noqa - if (avg_factor is None) and avg_non_ignore and reduction == 'mean': - avg_factor = label.numel() - (label == ignore_index).sum().item() + if (avg_factor is None) and reduction == 'mean': + if class_weight is None: + if avg_non_ignore: + avg_factor = label.numel() - (label + == ignore_index).sum().item() + else: + avg_factor = label.numel() + + else: + # the average factor should take the class weights into account + label_weights = torch.stack([class_weight[cls] for cls in label + ]).to(device=class_weight.device) + + if avg_non_ignore: + label_weights[label == ignore_index] = 0 + avg_factor = label_weights.sum() + if weight is not None: weight = weight.float() loss = weight_reduce_loss( @@ -124,7 +139,7 @@ def binary_cross_entropy(pred, assert label[label != ignore_index].max() <= 1, \ 'For pred with shape [N, 1, H, W], its label must have at ' \ 'most 2 classes' - pred = pred.squeeze() + pred = pred.squeeze(1) if pred.dim() != label.dim(): assert (pred.dim() == 2 and label.dim() == 1) or ( pred.dim() == 4 and label.dim() == 3), \ @@ -223,7 +238,7 @@ def __init__(self, loss_weight=1.0, loss_name='loss_ce', avg_non_ignore=False): - super(CrossEntropyLoss, self).__init__() + super().__init__() assert (use_sigmoid is False) or (use_mask is False) self.use_sigmoid = use_sigmoid self.use_mask = use_mask diff --git a/mmseg/models/losses/dice_loss.py b/mmseg/models/losses/dice_loss.py index 4a98aaee9f..fb2ffdba8d 100644 --- a/mmseg/models/losses/dice_loss.py +++ b/mmseg/models/losses/dice_loss.py @@ -1,125 +1,190 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-"""Modified from https://github.com/LikeLy-Journey/SegmenTron/blob/master/ -segmentron/solver/loss.py (Apache-2.0 License)""" +from typing import Union + import torch import torch.nn as nn -import torch.nn.functional as F from mmseg.registry import MODELS -from .utils import get_class_weight, weighted_loss - - -@weighted_loss -def dice_loss(pred, - target, - valid_mask, - smooth=1, - exponent=2, - class_weight=None, - ignore_index=255): - assert pred.shape[0] == target.shape[0] - total_loss = 0 - num_classes = pred.shape[1] - for i in range(num_classes): - if i != ignore_index: - dice_loss = binary_dice_loss( - pred[:, i], - target[..., i], - valid_mask=valid_mask, - smooth=smooth, - exponent=exponent) - if class_weight is not None: - dice_loss *= class_weight[i] - total_loss += dice_loss - return total_loss / num_classes +from .utils import weight_reduce_loss -@weighted_loss -def binary_dice_loss(pred, target, valid_mask, smooth=1, exponent=2, **kwards): - assert pred.shape[0] == target.shape[0] - pred = pred.reshape(pred.shape[0], -1) - target = target.reshape(target.shape[0], -1) - valid_mask = valid_mask.reshape(valid_mask.shape[0], -1) +def _expand_onehot_labels_dice(pred: torch.Tensor, + target: torch.Tensor) -> torch.Tensor: + """Expand onehot labels to match the size of prediction. - num = torch.sum(torch.mul(pred, target) * valid_mask, dim=1) * 2 + smooth - den = torch.sum(pred.pow(exponent) + target.pow(exponent), dim=1) + smooth + Args: + pred (torch.Tensor): The prediction, has a shape (N, num_class, H, W). + target (torch.Tensor): The learning label of the prediction, + has a shape (N, H, W). - return 1 - num / den + Returns: + torch.Tensor: The target after one-hot encoding, + has a shape (N, num_class, H, W). + """ + num_classes = pred.shape[1] + one_hot_target = torch.clamp(target, min=0, max=num_classes) + one_hot_target = torch.nn.functional.one_hot(one_hot_target, + num_classes + 1) + one_hot_target = one_hot_target[..., :num_classes].permute(0, 3, 1, 2) + return one_hot_target + + +def dice_loss(pred: torch.Tensor, + target: torch.Tensor, + weight: Union[torch.Tensor, None], + eps: float = 1e-3, + reduction: Union[str, None] = 'mean', + naive_dice: Union[bool, None] = False, + avg_factor: Union[int, None] = None, + ignore_index: Union[int, None] = 255) -> float: + """Calculate dice loss, there are two forms of dice loss is supported: + + - the one proposed in `V-Net: Fully Convolutional Neural + Networks for Volumetric Medical Image Segmentation + `_. + - the dice loss in which the power of the number in the + denominator is the first power instead of the second + power. + + Args: + pred (torch.Tensor): The prediction, has a shape (n, *) + target (torch.Tensor): The learning label of the prediction, + shape (n, *), same shape of pred. + weight (torch.Tensor, optional): The weight of loss for each + prediction, has a shape (n,). Defaults to None. + eps (float): Avoid dividing by zero. Default: 1e-3. + reduction (str, optional): The method used to reduce the loss into + a scalar. Defaults to 'mean'. + Options are "none", "mean" and "sum". + naive_dice (bool, optional): If false, use the dice + loss defined in the V-Net paper, otherwise, use the + naive dice loss in which the power of the number in the + denominator is the first power instead of the second + power.Defaults to False. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + ignore_index (int, optional): The label index to be ignored. + Defaults to 255. 
+    """
+    if ignore_index is not None:
+        num_classes = pred.shape[1]
+        pred = pred[:, torch.arange(num_classes) != ignore_index, :, :]
+        target = target[:, torch.arange(num_classes) != ignore_index, :, :]
+        assert pred.shape[1] != 0  # if the ignored index is the only class
+    input = pred.flatten(1)
+    target = target.flatten(1).float()
+    a = torch.sum(input * target, 1)
+    if naive_dice:
+        b = torch.sum(input, 1)
+        c = torch.sum(target, 1)
+        d = (2 * a + eps) / (b + c + eps)
+    else:
+        b = torch.sum(input * input, 1) + eps
+        c = torch.sum(target * target, 1) + eps
+        d = (2 * a) / (b + c)
+
+    loss = 1 - d
+    if weight is not None:
+        assert weight.ndim == loss.ndim
+        assert len(weight) == len(pred)
+    loss = weight_reduce_loss(loss, weight, reduction, avg_factor)
+    return loss


 @MODELS.register_module()
 class DiceLoss(nn.Module):
-    """DiceLoss.
-
-    This loss is proposed in `V-Net: Fully Convolutional Neural Networks for
-    Volumetric Medical Image Segmentation <https://arxiv.org/abs/1606.04797>`_.
-
-    Args:
-        smooth (float): A float number to smooth loss, and avoid NaN error.
-            Default: 1
-        exponent (float): An float number to calculate denominator
-            value: \\sum{x^exponent} + \\sum{y^exponent}. Default: 2.
-        reduction (str, optional): The method used to reduce the loss. Options
-            are "none", "mean" and "sum". This parameter only works when
-            per_image is True. Default: 'mean'.
-        class_weight (list[float] | str, optional): Weight of each class. If in
-            str format, read them from a file. Defaults to None.
-        loss_weight (float, optional): Weight of the loss. Default to 1.0.
-        ignore_index (int | None): The label index to be ignored. Default: 255.
-        loss_name (str, optional): Name of the loss item. If you want this loss
-            item to be included into the backward graph, `loss_` must be the
-            prefix of the name. Defaults to 'loss_dice'.
-    """

     def __init__(self,
-                 smooth=1,
-                 exponent=2,
+                 use_sigmoid=True,
+                 activate=True,
                  reduction='mean',
-                 class_weight=None,
+                 naive_dice=False,
                  loss_weight=1.0,
                  ignore_index=255,
-                 loss_name='loss_dice',
-                 **kwards):
-        super(DiceLoss, self).__init__()
-        self.smooth = smooth
-        self.exponent = exponent
+                 eps=1e-3,
+                 loss_name='loss_dice'):
+        """Compute dice loss.
+
+        Args:
+            use_sigmoid (bool, optional): Whether the prediction is activated
+                with sigmoid (otherwise softmax is used). Defaults to True.
+            activate (bool): Whether to apply the activation (sigmoid or
+                softmax) to the predictions inside the loss; set this to
+                False if the inputs are already probabilities.
+                Defaults to True.
+            reduction (str, optional): The method used
+                to reduce the loss. Options are "none",
+                "mean" and "sum". Defaults to 'mean'.
+            naive_dice (bool, optional): If false, use the dice
+                loss defined in the V-Net paper, otherwise, use the
+                naive dice loss in which the power of the number in the
+                denominator is the first power instead of the second
+                power. Defaults to False.
+            loss_weight (float, optional): Weight of loss. Defaults to 1.0.
+            ignore_index (int, optional): The label index to be ignored.
+                Default: 255.
+            eps (float): Avoid dividing by zero. Defaults to 1e-3.
+            loss_name (str, optional): Name of the loss item. If you want this
+                loss item to be included into the backward graph, `loss_` must
+                be the prefix of the name. Defaults to 'loss_dice'.
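+
+        Example:
+            A minimal sketch with raw multi-class logits (shapes are
+            illustrative; softmax is applied inside the loss):
+
+            >>> import torch
+            >>> loss_fn = DiceLoss(use_sigmoid=False)
+            >>> pred = torch.randn(2, 3, 8, 8)
+            >>> target = torch.randint(0, 3, (2, 8, 8))
+            >>> loss = loss_fn(pred, target)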
+ """ + + super().__init__() + self.use_sigmoid = use_sigmoid self.reduction = reduction - self.class_weight = get_class_weight(class_weight) + self.naive_dice = naive_dice self.loss_weight = loss_weight + self.eps = eps + self.activate = activate self.ignore_index = ignore_index self._loss_name = loss_name def forward(self, pred, target, + weight=None, avg_factor=None, reduction_override=None, - **kwards): + ignore_index=255, + **kwargs): + """Forward function. + + Args: + pred (torch.Tensor): The prediction, has a shape (n, *). + target (torch.Tensor): The label of the prediction, + shape (n, *), same shape of pred. + weight (torch.Tensor, optional): The weight of loss for each + prediction, has a shape (n,). Defaults to None. + avg_factor (int, optional): Average factor that is used to average + the loss. Defaults to None. + reduction_override (str, optional): The reduction method used to + override the original reduction method of the loss. + Options are "none", "mean" and "sum". + + Returns: + torch.Tensor: The calculated loss + """ + one_hot_target = target + if (pred.shape != target.shape): + one_hot_target = _expand_onehot_labels_dice(pred, target) assert reduction_override in (None, 'none', 'mean', 'sum') reduction = ( reduction_override if reduction_override else self.reduction) - if self.class_weight is not None: - class_weight = pred.new_tensor(self.class_weight) - else: - class_weight = None - - pred = F.softmax(pred, dim=1) - num_classes = pred.shape[1] - one_hot_target = F.one_hot( - torch.clamp(target.long(), 0, num_classes - 1), - num_classes=num_classes) - valid_mask = (target != self.ignore_index).long() - + if self.activate: + if self.use_sigmoid: + pred = pred.sigmoid() + elif pred.shape[1] != 1: + # softmax does not work when there is only 1 class + pred = pred.softmax(dim=1) loss = self.loss_weight * dice_loss( pred, one_hot_target, - valid_mask=valid_mask, + weight, + eps=self.eps, reduction=reduction, + naive_dice=self.naive_dice, avg_factor=avg_factor, - smooth=self.smooth, - exponent=self.exponent, - class_weight=class_weight, ignore_index=self.ignore_index) + return loss @property diff --git a/mmseg/models/losses/focal_loss.py b/mmseg/models/losses/focal_loss.py index cd7eff4f62..6507ed7a91 100644 --- a/mmseg/models/losses/focal_loss.py +++ b/mmseg/models/losses/focal_loss.py @@ -78,7 +78,7 @@ def sigmoid_focal_loss(pred, valid_mask=None, reduction='mean', avg_factor=None): - r"""A warpper of cuda version `Focal Loss + r"""A wrapper of cuda version `Focal Loss `_. Args: pred (torch.Tensor): The prediction with shape (N, C), C is the number @@ -172,7 +172,7 @@ def __init__(self, loss item to be included into the backward graph, `loss_` must be the prefix of the name. Defaults to 'loss_focal'. """ - super(FocalLoss, self).__init__() + super().__init__() assert use_sigmoid is True, \ 'AssertionError: Only sigmoid focal loss supported now.' 
        assert reduction in ('none', 'mean', 'sum'), \
@@ -271,7 +271,13 @@ def forward(self,
             num_classes = pred.size(1)
             if torch.cuda.is_available() and pred.is_cuda:
                 if target.dim() == 1:
-                    one_hot_target = F.one_hot(target, num_classes=num_classes)
+                    one_hot_target = F.one_hot(
+                        target, num_classes=num_classes + 1)
+                    if num_classes == 1:
+                        one_hot_target = one_hot_target[:, 1]
+                        target = 1 - target
+                    else:
+                        one_hot_target = one_hot_target[:, :num_classes]
                 else:
                     one_hot_target = target
                     target = target.argmax(dim=1)
@@ -280,7 +286,11 @@ def forward(self,
             else:
                 one_hot_target = None
                 if target.dim() == 1:
-                    target = F.one_hot(target, num_classes=num_classes)
+                    target = F.one_hot(target, num_classes=num_classes + 1)
+                    if num_classes == 1:
+                        target = target[:, 1]
+                    else:
+                        target = target[:, :num_classes]
                 else:
                     valid_mask = (target.argmax(dim=1) != ignore_index).view(
                         -1, 1)
diff --git a/mmseg/models/losses/huasdorff_distance_loss.py b/mmseg/models/losses/huasdorff_distance_loss.py
new file mode 100644
index 0000000000..d950ba728f
--- /dev/null
+++ b/mmseg/models/losses/huasdorff_distance_loss.py
@@ -0,0 +1,160 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Modified from https://github.com/JunMa11/SegWithDistMap/blob/
+master/code/train_LA_HD.py (Apache-2.0 License)"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from scipy.ndimage import distance_transform_edt as distance
+from torch import Tensor
+
+from mmseg.registry import MODELS
+from .utils import get_class_weight, weighted_loss
+
+
+def compute_dtm(img_gt: Tensor, pred: Tensor) -> Tensor:
+    """Compute the distance transform map of foreground in the mask.
+
+    Args:
+        img_gt: Ground truth of the image, (b, h, w)
+        pred: Predictions of the segmentation head after softmax, (b, c, h, w)
+
+    Returns:
+        output: the foreground Distance Map (SDM)
+        dtm(x) = 0; x in segmentation boundary
+                 inf|x-y|; x in segmentation
+    """
+
+    fg_dtm = torch.zeros_like(pred)
+    out_shape = pred.shape
+    for b in range(out_shape[0]):  # batch size
+        for c in range(1, out_shape[1]):  # default 0 channel is background
+            posmask = img_gt[b].byte()
+            if posmask.any():
+                posdis = distance(posmask)
+                fg_dtm[b][c] = torch.from_numpy(posdis)
+
+    return fg_dtm
+
+
+@weighted_loss
+def hd_loss(seg_soft: Tensor,
+            gt: Tensor,
+            seg_dtm: Tensor,
+            gt_dtm: Tensor,
+            class_weight=None,
+            ignore_index=255) -> Tensor:
+    """Compute the Hausdorff distance loss for segmentation.
+
+    Args:
+        seg_soft: softmax results, shape=(b,c,x,y)
+        gt: ground truth, shape=(b,x,y)
+        seg_dtm: segmentation distance transform map, shape=(b,c,x,y)
+        gt_dtm: ground truth distance transform map, shape=(b,c,x,y)
+
+    Returns:
+        output: hd_loss
+    """
+    assert seg_soft.shape[0] == gt.shape[0]
+    total_loss = 0
+    num_class = seg_soft.shape[1]
+    if class_weight is not None:
+        assert len(class_weight) == num_class
+    for i in range(1, num_class):
+        if i != ignore_index:
+            delta_s = (seg_soft[:, i, ...] - gt.float())**2
+            s_dtm = seg_dtm[:, i, ...]**2
+            g_dtm = gt_dtm[:, i, ...]**2
+            dtm = s_dtm + g_dtm
+            multiplied = torch.einsum('bxy, bxy->bxy', delta_s, dtm)
+            hd_loss = multiplied.mean()
+            if class_weight is not None:
+                hd_loss *= class_weight[i]
+            total_loss += hd_loss
+
+    return total_loss / num_class
+
+
+@MODELS.register_module()
+class HuasdorffDisstanceLoss(nn.Module):
+    """HuasdorffDisstanceLoss. This loss is proposed in `How Distance Transform
+    Maps Boost Segmentation CNNs: An Empirical Study
+    <http://proceedings.mlr.press/v121/ma20b.html>`_.
+
+    Args:
+        reduction (str, optional): The method used to reduce the loss into
+            a scalar. Defaults to 'mean'.
+        class_weight (list[float] | str, optional): Weight of each class. If in
+            str format, read them from a file. Defaults to None.
+        loss_weight (float): Weight of the loss. Defaults to 1.0.
+        ignore_index (int | None): The label index to be ignored. Default: 255.
+        loss_name (str): Name of the loss item. If you want this loss
+            item to be included into the backward graph, `loss_` must be the
+            prefix of the name. Defaults to 'loss_huasdorff_disstance'.
+    """
+
+    def __init__(self,
+                 reduction='mean',
+                 class_weight=None,
+                 loss_weight=1.0,
+                 ignore_index=255,
+                 loss_name='loss_huasdorff_disstance',
+                 **kwargs):
+        super().__init__()
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+        self.class_weight = get_class_weight(class_weight)
+        self._loss_name = loss_name
+        self.ignore_index = ignore_index
+
+    def forward(self,
+                pred: Tensor,
+                target: Tensor,
+                avg_factor=None,
+                reduction_override=None,
+                **kwargs) -> Tensor:
+        """Forward function.
+
+        Args:
+            pred (Tensor): Predictions of the segmentation head. (B, C, H, W)
+            target (Tensor): Ground truth of the image. (B, H, W)
+            avg_factor (int, optional): Average factor that is used to
+                average the loss. Defaults to None.
+            reduction_override (str, optional): The reduction method used
+                to override the original reduction method of the loss.
+                Options are "none", "mean" and "sum".
+
+        Returns:
+            Tensor: Loss tensor.
+        """
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+        if self.class_weight is not None:
+            class_weight = pred.new_tensor(self.class_weight)
+        else:
+            class_weight = None
+
+        pred_soft = F.softmax(pred, dim=1)
+        valid_mask = (target != self.ignore_index).long()
+        target = target * valid_mask
+
+        with torch.no_grad():
+            gt_dtm = compute_dtm(target.cpu(), pred_soft)
+            gt_dtm = gt_dtm.float()
+            seg_dtm2 = compute_dtm(
+                pred_soft.argmax(dim=1, keepdim=False).cpu(), pred_soft)
+            seg_dtm2 = seg_dtm2.float()
+
+        loss_hd = self.loss_weight * hd_loss(
+            pred_soft,
+            target,
+            seg_dtm=seg_dtm2,
+            gt_dtm=gt_dtm,
+            reduction=reduction,
+            avg_factor=avg_factor,
+            class_weight=class_weight,
+            ignore_index=self.ignore_index)
+        return loss_hd
+
+    @property
+    def loss_name(self):
+        return self._loss_name
diff --git a/mmseg/models/losses/kldiv_loss.py b/mmseg/models/losses/kldiv_loss.py
new file mode 100644
index 0000000000..496ef9713f
--- /dev/null
+++ b/mmseg/models/losses/kldiv_loss.py
@@ -0,0 +1,99 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from mmseg.registry import MODELS
+
+
+@MODELS.register_module()
+class KLDivLoss(nn.Module):
+
+    def __init__(self,
+                 temperature: float = 1.0,
+                 reduction: str = 'mean',
+                 loss_name: str = 'loss_kld'):
+        """Kullback-Leibler divergence Loss.
+
+        Args:
+            temperature (float, optional): Temperature parameter used to
+                soften the input and target distributions. Defaults to 1.0.
+            reduction (str, optional): The method to reduce the loss into a
+                scalar. Default is "mean". Options are "none", "sum",
+                and "mean".
+        """
+
+        assert isinstance(temperature, (float, int)), \
+            'Expected temperature to be ' \
+            f'float or int, but got {temperature.__class__.__name__} instead'
+        assert temperature != 0., 'Temperature must not be zero'
+
+        assert reduction in ['mean', 'none', 'sum'], \
+            'Reduction must be one of the options ("mean", ' \
+            f'"sum", "none"), but got {reduction}'
+
+        super().__init__()
+        self.temperature = temperature
+        self.reduction = reduction
+        self._loss_name = loss_name
+
+    def forward(self, input: torch.Tensor, target: torch.Tensor):
+        """Forward function. Calculate KL divergence Loss.
+
+        Args:
+            input (Tensor): Logit tensor,
+                the data type is float32 or float64.
+                The shape is (N, C) where N is batchsize and C is number of
+                channels.
+                If there are more than two dimensions, the shape is
+                (N, C, D1, D2, ..., Dk), k >= 1.
+            target (Tensor): Logit tensor,
+                the data type is float32 or float64.
+                input and target must have the same shape.
+
+        Returns:
+            (Tensor): Reduced loss.
+        """
+        assert isinstance(input, torch.Tensor), 'Expected input to ' \
+            f'be Tensor, but got {input.__class__.__name__} instead'
+        assert isinstance(target, torch.Tensor), 'Expected target to ' \
+            f'be Tensor, but got {target.__class__.__name__} instead'
+
+        assert input.shape == target.shape, 'Input and target ' \
+            'must have same shape, ' \
+            f'but got shapes {input.shape} and {target.shape}'
+
+        input = F.softmax(input / self.temperature, dim=1)
+        target = F.softmax(target / self.temperature, dim=1)
+
+        loss = F.kl_div(input, target, reduction='none', log_target=False)
+        loss = loss * self.temperature**2
+
+        batch_size = input.shape[0]
+
+        if self.reduction == 'sum':
+            # Change view to calculate instance-wise sum
+            loss = loss.view(batch_size, -1)
+            return torch.sum(loss, dim=1)
+
+        elif self.reduction == 'mean':
+            # Change view to calculate instance-wise mean
+            loss = loss.view(batch_size, -1)
+            return torch.mean(loss, dim=1)
+
+        return loss
+
+    @property
+    def loss_name(self):
+        """Loss Name.
+
+        This function must be implemented and will return the name of this
+        loss function. This name will be used to combine different loss items
+        by simple sum operation. In addition, if you want this loss item to be
+        included into the backward graph, `loss_` must be the prefix of the
+        name.
+
+        Returns:
+            str: The name of this loss item.
+        """
+        return self._loss_name
diff --git a/mmseg/models/losses/lovasz_loss.py b/mmseg/models/losses/lovasz_loss.py
index 457a233164..b47f9d8a15 100644
--- a/mmseg/models/losses/lovasz_loss.py
+++ b/mmseg/models/losses/lovasz_loss.py
@@ -3,10 +3,10 @@
 ch/lovasz_losses.py Lovasz-Softmax and Jaccard hinge loss in PyTorch Maxim
 Berman 2018 ESAT-PSI KU Leuven (MIT License)"""
-import mmcv
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from mmengine.utils import is_list_of

 from mmseg.registry import MODELS
 from .utils import get_class_weight, weight_reduce_loss
@@ -257,7 +257,7 @@ def __init__(self,
                  class_weight=None,
                  loss_weight=1.0,
                  loss_name='loss_lovasz'):
-        super(LovaszLoss, self).__init__()
+        super().__init__()
         assert loss_type in ('binary', 'multi_class'), "loss_type should be \
             'binary' or 'multi_class'."
@@ -265,7 +265,7 @@ def __init__(self,
             self.cls_criterion = lovasz_hinge
         else:
             self.cls_criterion = lovasz_softmax
-        assert classes in ('all', 'present') or mmcv.is_list_of(classes, int)
+        assert classes in ('all', 'present') or is_list_of(classes, int)
         if not per_image:
             assert reduction == 'none', "reduction should be 'none' when \
                 per_image is False."
diff --git a/mmseg/models/losses/ohem_cross_entropy_loss.py b/mmseg/models/losses/ohem_cross_entropy_loss.py
new file mode 100644
index 0000000000..a519b4d84e
--- /dev/null
+++ b/mmseg/models/losses/ohem_cross_entropy_loss.py
@@ -0,0 +1,94 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional, Union
+
+import torch.nn as nn
+import torch.nn.functional as F
+from torch import Tensor
+
+from mmseg.registry import MODELS
+
+
+@MODELS.register_module()
+class OhemCrossEntropy(nn.Module):
+    """OhemCrossEntropy loss.
+
+    This func is modified from
+    `PIDNet <https://github.com/XuJiacong/PIDNet>`_.  # noqa
+
+    Licensed under the MIT License.
+
+    Args:
+        ignore_label (int): Labels to ignore when computing the loss.
+            Default: 255
+        thresh (float, optional): The threshold for hard example selection.
+            Predictions below this confidence are treated as hard examples.
+            If not specified, the hard examples will be the pixels with top
+            ``min_kept`` losses. Default: 0.7.
+        min_kept (int, optional): The minimum number of predictions to keep.
+            Default: 100000.
+        loss_weight (float): Weight of the loss. Defaults to 1.0.
+        class_weight (list[float] | str, optional): Weight of each class. If in
+            str format, read them from a file. Defaults to None.
+        loss_name (str): Name of the loss item. If you want this loss
+            item to be included into the backward graph, `loss_` must be the
+            prefix of the name. Defaults to 'loss_ohem'.
+    """
+
+    def __init__(self,
+                 ignore_label: int = 255,
+                 thres: float = 0.7,
+                 min_kept: int = 100000,
+                 loss_weight: float = 1.0,
+                 class_weight: Optional[Union[List[float], str]] = None,
+                 loss_name: str = 'loss_ohem'):
+        super().__init__()
+        self.thresh = thres
+        self.min_kept = max(1, min_kept)
+        self.ignore_label = ignore_label
+        self.loss_weight = loss_weight
+        self.loss_name_ = loss_name
+        self.class_weight = class_weight
+
+    def forward(self, score: Tensor, target: Tensor) -> Tensor:
+        """Forward function.
+
+        Args:
+            score (Tensor): Predictions of the segmentation head.
+            target (Tensor): Ground truth of the image.
+
+        Returns:
+            Tensor: Loss tensor.
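+
+        Example:
+            A minimal sketch on random inputs (shapes are illustrative):
+
+            >>> import torch
+            >>> criterion = OhemCrossEntropy(thres=0.7, min_kept=200)
+            >>> score = torch.randn(2, 19, 32, 32)  # raw logits
+            >>> target = torch.randint(0, 19, (2, 32, 32))
+            >>> loss = criterion(score, target)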
+ """ + # score: (N, C, H, W) + pred = F.softmax(score, dim=1) + if self.class_weight is not None: + class_weight = score.new_tensor(self.class_weight) + else: + class_weight = None + + pixel_losses = F.cross_entropy( + score, + target, + weight=class_weight, + ignore_index=self.ignore_label, + reduction='none').contiguous().view(-1) # (N*H*W) + mask = target.contiguous().view(-1) != self.ignore_label # (N*H*W) + + tmp_target = target.clone() # (N, H, W) + tmp_target[tmp_target == self.ignore_label] = 0 + # pred: (N, C, H, W) -> (N*H*W, C) + pred = pred.gather(1, tmp_target.unsqueeze(1)) + # pred: (N*H*W, C) -> (N*H*W), ind: (N*H*W) + pred, ind = pred.contiguous().view(-1, )[mask].contiguous().sort() + if pred.numel() > 0: + min_value = pred[min(self.min_kept, pred.numel() - 1)] + else: + return score.new_tensor(0.0) + threshold = max(min_value, self.thresh) + + pixel_losses = pixel_losses[mask][ind] + pixel_losses = pixel_losses[pred < threshold] + return self.loss_weight * pixel_losses.mean() + + @property + def loss_name(self): + return self.loss_name_ diff --git a/mmseg/models/losses/silog_loss.py b/mmseg/models/losses/silog_loss.py new file mode 100644 index 0000000000..ecc07aac42 --- /dev/null +++ b/mmseg/models/losses/silog_loss.py @@ -0,0 +1,122 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Union + +import torch +import torch.nn as nn +from torch import Tensor + +from mmseg.registry import MODELS +from .utils import weight_reduce_loss + + +def silog_loss(pred: Tensor, + target: Tensor, + weight: Optional[Tensor] = None, + eps: float = 1e-4, + reduction: Union[str, None] = 'mean', + avg_factor: Optional[int] = None) -> Tensor: + """Computes the Scale-Invariant Logarithmic (SI-Log) loss between + prediction and target. + + Args: + pred (Tensor): Predicted output. + target (Tensor): Ground truth. + weight (Optional[Tensor]): Optional weight to apply on the loss. + eps (float): Epsilon value to avoid division and log(0). + reduction (Union[str, None]): Specifies the reduction to apply to the + output: 'mean', 'sum' or None. + avg_factor (Optional[int]): Optional average factor for the loss. + + Returns: + Tensor: The calculated SI-Log loss. + """ + pred, target = pred.flatten(1), target.flatten(1) + valid_mask = (target > eps).detach().float() + + diff_log = torch.log(target.clamp(min=eps)) - torch.log( + pred.clamp(min=eps)) + + valid_mask = (target > eps).detach() & (~torch.isnan(diff_log)) + diff_log[~valid_mask] = 0.0 + valid_mask = valid_mask.float() + + diff_log_sq_mean = (diff_log.pow(2) * valid_mask).sum( + dim=1) / valid_mask.sum(dim=1).clamp(min=eps) + diff_log_mean = (diff_log * valid_mask).sum(dim=1) / valid_mask.sum( + dim=1).clamp(min=eps) + + loss = torch.sqrt(diff_log_sq_mean - 0.5 * diff_log_mean.pow(2)) + + if weight is not None: + weight = weight.float() + + loss = weight_reduce_loss(loss, weight, reduction, avg_factor) + return loss + + +@MODELS.register_module() +class SiLogLoss(nn.Module): + """Compute SiLog loss. + + Args: + reduction (str, optional): The method used + to reduce the loss. Options are "none", + "mean" and "sum". Defaults to 'mean'. + loss_weight (float, optional): Weight of loss. Defaults to 1.0. + eps (float): Avoid dividing by zero. Defaults to 1e-3. + loss_name (str, optional): Name of the loss item. If you want this + loss item to be included into the backward graph, `loss_` must + be the prefix of the name. Defaults to 'loss_silog'. 
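+
+    Example:
+        An illustrative call on synthetic depth maps (values are arbitrary
+        positive depths):
+
+        >>> import torch
+        >>> loss_fn = SiLogLoss()
+        >>> pred = torch.rand(2, 1, 24, 24) * 10 + 0.1
+        >>> target = torch.rand(2, 1, 24, 24) * 10 + 0.1
+        >>> loss = loss_fn(pred, target)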
+    """
+
+    def __init__(self,
+                 reduction='mean',
+                 loss_weight=1.0,
+                 eps=1e-6,
+                 loss_name='loss_silog'):
+        super().__init__()
+        self.reduction = reduction
+        self.loss_weight = loss_weight
+        self.eps = eps
+        self._loss_name = loss_name
+
+    def forward(
+        self,
+        pred,
+        target,
+        weight=None,
+        avg_factor=None,
+        reduction_override=None,
+    ):
+
+        assert pred.shape == target.shape, 'the shapes of pred ' \
+            f'({pred.shape}) and target ({target.shape}) do not match'
+
+        assert reduction_override in (None, 'none', 'mean', 'sum')
+        reduction = (
+            reduction_override if reduction_override else self.reduction)
+
+        loss = self.loss_weight * silog_loss(
+            pred,
+            target,
+            weight,
+            eps=self.eps,
+            reduction=reduction,
+            avg_factor=avg_factor,
+        )
+
+        return loss
+
+    @property
+    def loss_name(self):
+        """Loss Name.
+
+        This function must be implemented and will return the name of this
+        loss function. This name will be used to combine different loss items
+        by simple sum operation. In addition, if you want this loss item to be
+        included into the backward graph, `loss_` must be the prefix of the
+        name.
+
+        Returns:
+            str: The name of this loss item.
+        """
+        return self._loss_name
diff --git a/mmseg/models/losses/tversky_loss.py b/mmseg/models/losses/tversky_loss.py
new file mode 100644
index 0000000000..bfca1af666
--- /dev/null
+++ b/mmseg/models/losses/tversky_loss.py
@@ -0,0 +1,137 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+"""Modified from
+https://github.com/JunMa11/SegLoss/blob/master/losses_pytorch/dice_loss.py#L333
+(Apache-2.0 License)"""
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ..builder import LOSSES
+from .utils import get_class_weight, weighted_loss
+
+
+@weighted_loss
+def tversky_loss(pred,
+                 target,
+                 valid_mask,
+                 alpha=0.3,
+                 beta=0.7,
+                 smooth=1,
+                 class_weight=None,
+                 ignore_index=255):
+    assert pred.shape[0] == target.shape[0]
+    total_loss = 0
+    num_classes = pred.shape[1]
+    for i in range(num_classes):
+        if i != ignore_index:
+            tversky_loss = binary_tversky_loss(
+                pred[:, i],
+                target[..., i],
+                valid_mask=valid_mask,
+                alpha=alpha,
+                beta=beta,
+                smooth=smooth)
+            if class_weight is not None:
+                tversky_loss *= class_weight[i]
+            total_loss += tversky_loss
+    return total_loss / num_classes
+
+
+@weighted_loss
+def binary_tversky_loss(pred,
+                        target,
+                        valid_mask,
+                        alpha=0.3,
+                        beta=0.7,
+                        smooth=1):
+    assert pred.shape[0] == target.shape[0]
+    pred = pred.reshape(pred.shape[0], -1)
+    target = target.reshape(target.shape[0], -1)
+    valid_mask = valid_mask.reshape(valid_mask.shape[0], -1)
+
+    TP = torch.sum(torch.mul(pred, target) * valid_mask, dim=1)
+    FP = torch.sum(torch.mul(pred, 1 - target) * valid_mask, dim=1)
+    FN = torch.sum(torch.mul(1 - pred, target) * valid_mask, dim=1)
+    tversky = (TP + smooth) / (TP + alpha * FP + beta * FN + smooth)
+
+    return 1 - tversky
+
+
+@LOSSES.register_module()
+class TverskyLoss(nn.Module):
+    """TverskyLoss. This loss is proposed in `Tversky loss function for image
+    segmentation using 3D fully convolutional deep networks
+    <https://arxiv.org/abs/1706.05721>`_.
+
+    Args:
+        smooth (float): A float number to smooth loss, and avoid NaN error.
+            Default: 1.
+        class_weight (list[float] | str, optional): Weight of each class. If in
+            str format, read them from a file. Defaults to None.
+        loss_weight (float, optional): Weight of the loss. Defaults to 1.0.
+        ignore_index (int | None): The label index to be ignored. Default: 255.
+        alpha (float, in [0, 1]):
+            The coefficient of false positives. Default: 0.3.
+        beta (float, in [0, 1]):
+            The coefficient of false negatives. Default: 0.7.
+            Note: alpha + beta = 1.
+        loss_name (str, optional): Name of the loss item. If you want this loss
+            item to be included into the backward graph, `loss_` must be the
+            prefix of the name. Defaults to 'loss_tversky'.
+    """

+    def __init__(self,
+                 smooth=1,
+                 class_weight=None,
+                 loss_weight=1.0,
+                 ignore_index=255,
+                 alpha=0.3,
+                 beta=0.7,
+                 loss_name='loss_tversky'):
+        super().__init__()
+        self.smooth = smooth
+        self.class_weight = get_class_weight(class_weight)
+        self.loss_weight = loss_weight
+        self.ignore_index = ignore_index
+        assert (alpha + beta == 1.0), 'Sum of alpha and beta must be 1.0!'
+        self.alpha = alpha
+        self.beta = beta
+        self._loss_name = loss_name
+
+    def forward(self, pred, target, **kwargs):
+        if self.class_weight is not None:
+            class_weight = pred.new_tensor(self.class_weight)
+        else:
+            class_weight = None
+
+        pred = F.softmax(pred, dim=1)
+        num_classes = pred.shape[1]
+        one_hot_target = F.one_hot(
+            torch.clamp(target.long(), 0, num_classes - 1),
+            num_classes=num_classes)
+        valid_mask = (target != self.ignore_index).long()
+
+        loss = self.loss_weight * tversky_loss(
+            pred,
+            one_hot_target,
+            valid_mask=valid_mask,
+            alpha=self.alpha,
+            beta=self.beta,
+            smooth=self.smooth,
+            class_weight=class_weight,
+            ignore_index=self.ignore_index)
+        return loss
+
+    @property
+    def loss_name(self):
+        """Loss Name.
+
+        This function must be implemented and will return the name of this
+        loss function. This name will be used to combine different loss items
+        by simple sum operation. In addition, if you want this loss item to be
+        included into the backward graph, `loss_` must be the prefix of the
+        name.
+
+        Returns:
+            str: The name of this loss item.
+        """
+        return self._loss_name
diff --git a/mmseg/models/losses/utils.py b/mmseg/models/losses/utils.py
index 621f57c746..0478034733 100644
--- a/mmseg/models/losses/utils.py
+++ b/mmseg/models/losses/utils.py
@@ -1,10 +1,10 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import functools

-import mmcv
 import numpy as np
 import torch
 import torch.nn.functional as F
+from mmengine.fileio import load


 def get_class_weight(class_weight):
@@ -20,12 +20,12 @@ def get_class_weight(class_weight):
         class_weight = np.load(class_weight)
     else:
         # pkl, json or yaml
-        class_weight = mmcv.load(class_weight)
+        class_weight = load(class_weight)

     return class_weight


-def reduce_loss(loss, reduction):
+def reduce_loss(loss, reduction) -> torch.Tensor:
     """Reduce loss as specified.

     Args:
@@ -45,7 +45,10 @@ def reduce_loss(loss, reduction):
         return loss.sum()


-def weight_reduce_loss(loss, weight=None, reduction='mean', avg_factor=None):
+def weight_reduce_loss(loss,
+                       weight=None,
+                       reduction='mean',
+                       avg_factor=None) -> torch.Tensor:
     """Apply element-wise weight and reduce loss.
Args: diff --git a/mmseg/models/necks/featurepyramid.py b/mmseg/models/necks/featurepyramid.py index 40453653dc..dc1250d39d 100644 --- a/mmseg/models/necks/featurepyramid.py +++ b/mmseg/models/necks/featurepyramid.py @@ -23,7 +23,7 @@ def __init__(self, embed_dim, rescales=[4, 2, 1, 0.5], norm_cfg=dict(type='SyncBN', requires_grad=True)): - super(Feature2Pyramid, self).__init__() + super().__init__() self.rescales = rescales self.upsample_4x = None for k in self.rescales: diff --git a/mmseg/models/necks/fpn.py b/mmseg/models/necks/fpn.py index ee0e232403..ddab74c00a 100644 --- a/mmseg/models/necks/fpn.py +++ b/mmseg/models/necks/fpn.py @@ -2,10 +2,10 @@ import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import ConvModule -from mmcv.runner import BaseModule, auto_fp16 +from mmengine.model import BaseModule -from mmseg.ops import resize from mmseg.registry import MODELS +from ..utils import resize @MODELS.register_module() @@ -80,7 +80,7 @@ def __init__(self, upsample_cfg=dict(mode='nearest'), init_cfg=dict( type='Xavier', layer='Conv2d', distribution='uniform')): - super(FPN, self).__init__(init_cfg) + super().__init__(init_cfg) assert isinstance(in_channels, list) self.in_channels = in_channels self.out_channels = out_channels @@ -159,7 +159,6 @@ def __init__(self, inplace=False) self.fpn_convs.append(extra_fpn_conv) - @auto_fp16() def forward(self, inputs): assert len(inputs) == len(self.in_channels) diff --git a/mmseg/models/necks/ic_neck.py b/mmseg/models/necks/ic_neck.py index 973683c2be..9763541e09 100644 --- a/mmseg/models/necks/ic_neck.py +++ b/mmseg/models/necks/ic_neck.py @@ -1,10 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. import torch.nn.functional as F from mmcv.cnn import ConvModule -from mmcv.runner import BaseModule +from mmengine.model import BaseModule -from mmseg.ops import resize from mmseg.registry import MODELS +from ..utils import resize class CascadeFeatureFusion(BaseModule): @@ -42,7 +42,7 @@ def __init__(self, act_cfg=dict(type='ReLU'), align_corners=False, init_cfg=None): - super(CascadeFeatureFusion, self).__init__(init_cfg=init_cfg) + super().__init__(init_cfg=init_cfg) self.align_corners = align_corners self.conv_low = ConvModule( low_channels, @@ -108,7 +108,7 @@ def __init__(self, act_cfg=dict(type='ReLU'), align_corners=False, init_cfg=None): - super(ICNeck, self).__init__(init_cfg=init_cfg) + super().__init__(init_cfg=init_cfg) assert len(in_channels) == 3, 'Length of input channels \ must be 3!' 
diff --git a/mmseg/models/necks/jpu.py b/mmseg/models/necks/jpu.py index 9de2435f9b..3ea0fe2183 100644 --- a/mmseg/models/necks/jpu.py +++ b/mmseg/models/necks/jpu.py @@ -2,10 +2,10 @@ import torch import torch.nn as nn from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule -from mmcv.runner import BaseModule +from mmengine.model import BaseModule -from mmseg.ops import resize from mmseg.registry import MODELS +from ..utils import resize @MODELS.register_module() @@ -51,7 +51,7 @@ def __init__(self, norm_cfg=dict(type='BN'), act_cfg=dict(type='ReLU'), init_cfg=None): - super(JPU, self).__init__(init_cfg=init_cfg) + super().__init__(init_cfg=init_cfg) assert isinstance(in_channels, tuple) assert isinstance(dilations, tuple) self.in_channels = in_channels diff --git a/mmseg/models/necks/mla_neck.py b/mmseg/models/necks/mla_neck.py index 64a4092392..db250aefbf 100644 --- a/mmseg/models/necks/mla_neck.py +++ b/mmseg/models/necks/mla_neck.py @@ -12,7 +12,7 @@ def __init__(self, out_channels=256, norm_cfg=None, act_cfg=None): - super(MLAModule, self).__init__() + super().__init__() self.channel_proj = nn.ModuleList() for i in range(len(in_channels)): self.channel_proj.append( @@ -83,7 +83,7 @@ def __init__(self, norm_layer=dict(type='LN', eps=1e-6, requires_grad=True), norm_cfg=None, act_cfg=None): - super(MLANeck, self).__init__() + super().__init__() assert isinstance(in_channels, list) self.in_channels = in_channels self.out_channels = out_channels diff --git a/mmseg/models/necks/multilevel_neck.py b/mmseg/models/necks/multilevel_neck.py index 14942bb63a..c997125f24 100644 --- a/mmseg/models/necks/multilevel_neck.py +++ b/mmseg/models/necks/multilevel_neck.py @@ -1,9 +1,10 @@ # Copyright (c) OpenMMLab. All rights reserved. import torch.nn as nn -from mmcv.cnn import ConvModule, xavier_init +from mmcv.cnn import ConvModule +from mmengine.model.weight_init import xavier_init -from mmseg.ops import resize from mmseg.registry import MODELS +from ..utils import resize @MODELS.register_module() @@ -28,7 +29,7 @@ def __init__(self, scales=[0.5, 1, 2, 4], norm_cfg=None, act_cfg=None): - super(MultiLevelNeck, self).__init__() + super().__init__() assert isinstance(in_channels, list) self.in_channels = in_channels self.out_channels = out_channels diff --git a/mmseg/models/segmentors/__init__.py b/mmseg/models/segmentors/__init__.py index 387c858bd7..59b012f417 100644 --- a/mmseg/models/segmentors/__init__.py +++ b/mmseg/models/segmentors/__init__.py @@ -1,6 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. 
from .base import BaseSegmentor from .cascade_encoder_decoder import CascadeEncoderDecoder +from .depth_estimator import DepthEstimator from .encoder_decoder import EncoderDecoder +from .multimodal_encoder_decoder import MultimodalEncoderDecoder +from .seg_tta import SegTTAModel -__all__ = ['BaseSegmentor', 'EncoderDecoder', 'CascadeEncoderDecoder'] +__all__ = [ + 'BaseSegmentor', 'EncoderDecoder', 'CascadeEncoderDecoder', 'SegTTAModel', + 'MultimodalEncoderDecoder', 'DepthEstimator' +] diff --git a/mmseg/models/segmentors/base.py b/mmseg/models/segmentors/base.py index 1798c9386b..17a0bb2b33 100644 --- a/mmseg/models/segmentors/base.py +++ b/mmseg/models/segmentors/base.py @@ -2,14 +2,14 @@ from abc import ABCMeta, abstractmethod from typing import List, Tuple -from mmengine.data import PixelData from mmengine.model import BaseModel +from mmengine.structures import PixelData from torch import Tensor -from mmseg.data import SegDataSample -from mmseg.ops import resize +from mmseg.structures import SegDataSample from mmseg.utils import (ForwardResults, OptConfigType, OptMultiConfig, OptSampleList, SampleList) +from ..utils import resize class BaseSegmentor(BaseModel, metaclass=ABCMeta): @@ -27,7 +27,7 @@ class BaseSegmentor(BaseModel, metaclass=ABCMeta): def __init__(self, data_preprocessor: OptConfigType = None, init_cfg: OptMultiConfig = None): - super(BaseSegmentor, self).__init__( + super().__init__( data_preprocessor=data_preprocessor, init_cfg=init_cfg) @property @@ -47,20 +47,19 @@ def with_decode_head(self) -> bool: return hasattr(self, 'decode_head') and self.decode_head is not None @abstractmethod - def extract_feat(self, batch_inputs: Tensor) -> bool: + def extract_feat(self, inputs: Tensor) -> bool: """Placeholder for extract features from images.""" pass @abstractmethod - def encode_decode(self, batch_inputs: Tensor, - batch_data_samples: SampleList): + def encode_decode(self, inputs: Tensor, batch_data_samples: SampleList): """Placeholder for encode images with backbone and decode into a semantic segmentation map of the same size as input.""" pass def forward(self, - batch_inputs: Tensor, - batch_data_samples: OptSampleList = None, + inputs: Tensor, + data_samples: OptSampleList = None, mode: str = 'tensor') -> ForwardResults: """The unified entry for a forward process in both training and test. @@ -77,10 +76,11 @@ def forward(self, optimizer updating, which are done in the :meth:`train_step`. Args: - batch_inputs (torch.Tensor): The input tensor with shape - (N, C, ...) in general. - batch_data_samples (list[:obj:`SegDataSample`], optional): The - annotation data of every samples. Defaults to None. + inputs (torch.Tensor): The input tensor with shape (N, C, ...) in + general. + data_samples (list[:obj:`SegDataSample`]): The seg data samples. + It usually includes information such as `metainfo` and + `gt_sem_seg`. Default to None. mode (str): Return what kind of value. Defaults to 'tensor'. Returns: @@ -91,33 +91,32 @@ def forward(self, - If ``mode="loss"``, return a dict of tensor. """ if mode == 'loss': - return self.loss(batch_inputs, batch_data_samples) + return self.loss(inputs, data_samples) elif mode == 'predict': - return self.predict(batch_inputs, batch_data_samples) + return self.predict(inputs, data_samples) elif mode == 'tensor': - return self._forward(batch_inputs, batch_data_samples) + return self._forward(inputs, data_samples) else: raise RuntimeError(f'Invalid mode "{mode}". 
' 'Only supports loss, predict and tensor mode') @abstractmethod - def loss(self, batch_inputs: Tensor, - batch_data_samples: SampleList) -> dict: + def loss(self, inputs: Tensor, data_samples: SampleList) -> dict: """Calculate losses from a batch of inputs and data samples.""" pass @abstractmethod - def predict(self, batch_inputs: Tensor, - batch_data_samples: SampleList) -> SampleList: + def predict(self, + inputs: Tensor, + data_samples: OptSampleList = None) -> SampleList: """Predict results from a batch of inputs and data samples with post- processing.""" pass @abstractmethod - def _forward( - self, - batch_inputs: Tensor, - batch_data_samples: OptSampleList = None) -> Tuple[List[Tensor]]: + def _forward(self, + inputs: Tensor, + data_samples: OptSampleList = None) -> Tuple[List[Tensor]]: """Network forward process. Usually includes backbone, neck and head forward without any post- @@ -125,18 +124,16 @@ def _forward( """ pass - @abstractmethod - def aug_test(self, batch_inputs, batch_img_metas): - """Placeholder for augmentation test.""" - pass - - def postprocess_result(self, seg_logits_list: List[dict], - batch_img_metas: List[dict]) -> list: + def postprocess_result(self, + seg_logits: Tensor, + data_samples: OptSampleList = None) -> SampleList: """ Convert results list to `SegDataSample`. Args: - seg_logits_list (List[dict]): List of segmentation results, - seg_logits from model of each input image. - + seg_logits (Tensor): The segmentation results, seg_logits from + model of each input image. + data_samples (list[:obj:`SegDataSample`]): The seg data samples. + It usually includes information such as `metainfo` and + `gt_sem_seg`. Default to None. Returns: list[:obj:`SegDataSample`]: Segmentation results of the input images. Each SegDataSample usually contain: @@ -145,22 +142,59 @@ def postprocess_result(self, seg_logits_list: List[dict], - ``seg_logits``(PixelData): Predicted logits of semantic segmentation before normalization. 
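+
+        Example:
+            A sketch of reading the post-processed outputs, assuming
+            ``model`` is a built segmentor and ``seg_logits`` is a
+            (N, C, H, W) tensor:
+
+            >>> results = model.postprocess_result(seg_logits)
+            >>> results[0].pred_sem_seg.data  # (1, H, W) label map
+            >>> results[0].seg_logits.data    # (C, H, W) resized logits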
""" - predictions = [] - - for i in range(len(seg_logits_list)): - img_meta = batch_img_metas[i] - seg_logits = resize( - seg_logits_list[i][None], - size=img_meta['ori_shape'], - mode='bilinear', - align_corners=self.align_corners, - warning=False).squeeze(0) - # seg_logits shape is CHW - seg_pred = seg_logits.argmax(dim=0, keepdim=True) - prediction = SegDataSample(**{'metainfo': img_meta}) - prediction.set_data({ - 'seg_logits': PixelData(**{'data': seg_logits}), - 'pred_sem_seg': PixelData(**{'data': seg_pred}) + batch_size, C, H, W = seg_logits.shape + + if data_samples is None: + data_samples = [SegDataSample() for _ in range(batch_size)] + only_prediction = True + else: + only_prediction = False + + for i in range(batch_size): + if not only_prediction: + img_meta = data_samples[i].metainfo + # remove padding area + if 'img_padding_size' not in img_meta: + padding_size = img_meta.get('padding_size', [0] * 4) + else: + padding_size = img_meta['img_padding_size'] + padding_left, padding_right, padding_top, padding_bottom =\ + padding_size + # i_seg_logits shape is 1, C, H, W after remove padding + i_seg_logits = seg_logits[i:i + 1, :, + padding_top:H - padding_bottom, + padding_left:W - padding_right] + + flip = img_meta.get('flip', None) + if flip: + flip_direction = img_meta.get('flip_direction', None) + assert flip_direction in ['horizontal', 'vertical'] + if flip_direction == 'horizontal': + i_seg_logits = i_seg_logits.flip(dims=(3, )) + else: + i_seg_logits = i_seg_logits.flip(dims=(2, )) + + # resize as original shape + i_seg_logits = resize( + i_seg_logits, + size=img_meta['ori_shape'], + mode='bilinear', + align_corners=self.align_corners, + warning=False).squeeze(0) + else: + i_seg_logits = seg_logits[i] + + if C > 1: + i_seg_pred = i_seg_logits.argmax(dim=0, keepdim=True) + else: + i_seg_logits = i_seg_logits.sigmoid() + i_seg_pred = (i_seg_logits > + self.decode_head.threshold).to(i_seg_logits) + data_samples[i].set_data({ + 'seg_logits': + PixelData(**{'data': i_seg_logits}), + 'pred_sem_seg': + PixelData(**{'data': i_seg_pred}) }) - predictions.append(prediction) - return predictions + + return data_samples diff --git a/mmseg/models/segmentors/cascade_encoder_decoder.py b/mmseg/models/segmentors/cascade_encoder_decoder.py index 2d85b6ad16..0184a3533a 100644 --- a/mmseg/models/segmentors/cascade_encoder_decoder.py +++ b/mmseg/models/segmentors/cascade_encoder_decoder.py @@ -48,7 +48,7 @@ def __init__(self, pretrained: Optional[str] = None, init_cfg: OptMultiConfig = None): self.num_stages = num_stages - super(CascadeEncoderDecoder, self).__init__( + super().__init__( backbone=backbone, decode_head=decode_head, neck=neck, @@ -68,12 +68,13 @@ def _init_decode_head(self, decode_head: ConfigType) -> None: self.decode_head.append(MODELS.build(decode_head[i])) self.align_corners = self.decode_head[-1].align_corners self.num_classes = self.decode_head[-1].num_classes + self.out_channels = self.decode_head[-1].out_channels - def encode_decode(self, batch_inputs: Tensor, - batch_img_metas: List[dict]) -> List[Tensor]: + def encode_decode(self, inputs: Tensor, + batch_img_metas: List[dict]) -> Tensor: """Encode images with backbone and decode into a semantic segmentation map of the same size as input.""" - x = self.extract_feat(batch_inputs) + x = self.extract_feat(inputs) out = self.decode_head[0].forward(x) for i in range(1, self.num_stages - 1): out = self.decode_head[i].forward(x, out) @@ -82,53 +83,52 @@ def encode_decode(self, batch_inputs: Tensor, return seg_logits_list - def 
_decode_head_forward_train(self, batch_inputs: Tensor,
-                                   batch_data_samples: SampleList) -> dict:
+    def _decode_head_forward_train(self, inputs: Tensor,
+                                   data_samples: SampleList) -> dict:
         """Run forward function and calculate loss for decode head in
         training."""
         losses = dict()
-        loss_decode = self.decode_head[0].loss(batch_inputs,
-                                               batch_data_samples,
+        loss_decode = self.decode_head[0].loss(inputs, data_samples,
                                                self.train_cfg)
         losses.update(add_prefix(loss_decode, 'decode_0'))
         # get batch_img_metas
-        batch_size = len(batch_data_samples)
+        batch_size = len(data_samples)
         batch_img_metas = []
         for batch_index in range(batch_size):
-            metainfo = batch_data_samples[batch_index].metainfo
+            metainfo = data_samples[batch_index].metainfo
             batch_img_metas.append(metainfo)

         for i in range(1, self.num_stages):
             # forward test again, maybe unnecessary for most methods.
             if i == 1:
-                prev_outputs = self.decode_head[0].forward(batch_inputs)
+                prev_outputs = self.decode_head[0].forward(inputs)
             else:
                 prev_outputs = self.decode_head[i - 1].forward(
-                    batch_inputs, prev_outputs)
-            loss_decode = self.decode_head[i].loss(batch_inputs, prev_outputs,
-                                                   batch_data_samples,
+                    inputs, prev_outputs)
+            loss_decode = self.decode_head[i].loss(inputs, prev_outputs,
+                                                   data_samples,
                                                    self.train_cfg)
             losses.update(add_prefix(loss_decode, f'decode_{i}'))

         return losses

     def _forward(self,
-                 batch_inputs: Tensor,
+                 inputs: Tensor,
                  data_samples: OptSampleList = None) -> Tensor:
         """Network forward process.

         Args:
-            batch_inputs (Tensor): Inputs with shape (N, C, H, W).
-            batch_data_samples (List[:obj:`SegDataSample`]): The seg
-                data samples. It usually includes information such
-                as `img_metas` and `gt_semantic_seg`.
+            inputs (Tensor): Inputs with shape (N, C, H, W).
+            data_samples (List[:obj:`SegDataSample`]): The seg data samples.
+                It usually includes information such as `metainfo` and
+                `gt_semantic_seg`.

         Returns:
             Tensor: Forward output of model without any post-processes.
         """
-        x = self.extract_feat(batch_inputs)
+        x = self.extract_feat(inputs)
         out = self.decode_head[0].forward(x)
         for i in range(1, self.num_stages):
diff --git a/mmseg/models/segmentors/depth_estimator.py b/mmseg/models/segmentors/depth_estimator.py
new file mode 100644
index 0000000000..1020637e73
--- /dev/null
+++ b/mmseg/models/segmentors/depth_estimator.py
@@ -0,0 +1,392 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import logging
+from typing import List, Optional
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmengine.logging import print_log
+from mmengine.structures import PixelData
+from torch import Tensor
+
+from mmseg.registry import MODELS
+from mmseg.structures import SegDataSample
+from mmseg.utils import (ConfigType, OptConfigType, OptMultiConfig,
+                         OptSampleList, SampleList, add_prefix)
+from ..utils import resize
+from .encoder_decoder import EncoderDecoder
+
+
+@MODELS.register_module()
+class DepthEstimator(EncoderDecoder):
+    """Encoder Decoder depth estimator.
+
+    EncoderDecoder typically consists of backbone, decode_head, auxiliary_head.
+    Note that auxiliary_head is only used for deep supervision during training,
+    which could be dumped during inference.
+
+    1. The ``loss`` method is used to calculate the loss of model,
+    which includes two steps: (1) Extracts features to obtain the feature maps
+    (2) Call the decode head loss function to forward decode head model and
+    calculate losses.
+
+    .. code:: text
+
+     loss(): extract_feat() -> _decode_head_forward_train() -> _auxiliary_head_forward_train (optional)
+     _decode_head_forward_train(): decode_head.loss()
+     _auxiliary_head_forward_train(): auxiliary_head.loss (optional)
+
+    2. The ``predict`` method is used to predict depth estimation results,
+    which includes two steps: (1) Run inference function to obtain the list of
+    depth (2) Call post-processing function to obtain list of
+    ``SegDataSample`` including ``pred_depth_map``.
+
+    .. code:: text
+
+     predict(): inference() -> postprocess_result()
+     inference(): whole_inference()/slide_inference()
+     whole_inference()/slide_inference(): encode_decode()
+     encode_decode(): extract_feat() -> decode_head.predict()
+
+    3. The ``_forward`` method is used to output the tensor by running the model,
+    which includes two steps: (1) Extracts features to obtain the feature maps
+    (2) Call the decode head forward function to forward decode head model.
+
+    .. code:: text
+
+     _forward(): extract_feat() -> _decode_head.forward()
+
+    Args:
+
+        backbone (ConfigType): The config for the backbone of depth estimator.
+        decode_head (ConfigType): The config for the decode head of depth estimator.
+        neck (OptConfigType): The config for the neck of depth estimator.
+            Defaults to None.
+        auxiliary_head (OptConfigType): The config for the auxiliary head of
+            depth estimator. Defaults to None.
+        train_cfg (OptConfigType): The config for training. Defaults to None.
+        test_cfg (OptConfigType): The config for testing. Defaults to None.
+        data_preprocessor (dict, optional): The pre-process config of
+            :class:`BaseDataPreprocessor`.
+        pretrained (str, optional): The path for pretrained model.
+            Defaults to None.
+        init_cfg (dict, optional): The weight initialized config for
+            :class:`BaseModule`.
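+
+    Example:
+        A minimal inference sketch, assuming ``model`` is an already-built
+        ``DepthEstimator`` (construction from a config is omitted):
+
+        >>> import torch
+        >>> inputs = torch.rand(1, 3, 480, 640)
+        >>> results = model.predict(inputs)          # list[SegDataSample]
+        >>> depth = results[0].pred_depth_map.data   # (1, H, W) depth map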
+ """ # noqa: E501 + + def __init__(self, + backbone: ConfigType, + decode_head: ConfigType, + neck: OptConfigType = None, + auxiliary_head: OptConfigType = None, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + pretrained: Optional[str] = None, + init_cfg: OptMultiConfig = None): + super().__init__( + backbone=backbone, + decode_head=decode_head, + neck=neck, + auxiliary_head=auxiliary_head, + train_cfg=train_cfg, + test_cfg=test_cfg, + data_preprocessor=data_preprocessor, + pretrained=pretrained, + init_cfg=init_cfg) + + def extract_feat(self, + inputs: Tensor, + batch_img_metas: Optional[List[dict]] = None) -> Tensor: + """Extract features from images.""" + + if getattr(self.backbone, 'class_embed_select', False) and \ + isinstance(batch_img_metas, list) and \ + 'category_id' in batch_img_metas[0]: + cat_ids = [meta['category_id'] for meta in batch_img_metas] + cat_ids = torch.tensor(cat_ids).to(inputs.device) + inputs = (inputs, cat_ids) + + x = self.backbone(inputs) + if self.with_neck: + x = self.neck(x) + return x + + def encode_decode(self, inputs: Tensor, + batch_img_metas: List[dict]) -> Tensor: + """Encode images with backbone and decode into a depth map of the same + size as input.""" + x = self.extract_feat(inputs, batch_img_metas) + depth = self.decode_head.predict(x, batch_img_metas, self.test_cfg) + + return depth + + def _decode_head_forward_train(self, inputs: List[Tensor], + data_samples: SampleList) -> dict: + """Run forward function and calculate loss for decode head in + training.""" + losses = dict() + loss_decode = self.decode_head.loss(inputs, data_samples, + self.train_cfg) + + losses.update(add_prefix(loss_decode, 'decode')) + return losses + + def _auxiliary_head_forward_train(self, inputs: List[Tensor], + data_samples: SampleList) -> dict: + """Run forward function and calculate loss for auxiliary head in + training.""" + losses = dict() + if isinstance(self.auxiliary_head, nn.ModuleList): + for idx, aux_head in enumerate(self.auxiliary_head): + loss_aux = aux_head.loss(inputs, data_samples, self.train_cfg) + losses.update(add_prefix(loss_aux, f'aux_{idx}')) + else: + loss_aux = self.auxiliary_head.loss(inputs, data_samples, + self.train_cfg) + losses.update(add_prefix(loss_aux, 'aux')) + + return losses + + def loss(self, inputs: Tensor, data_samples: SampleList) -> dict: + """Calculate losses from a batch of inputs and data samples. + + Args: + inputs (Tensor): Input images. + data_samples (list[:obj:`SegDataSample`]): The seg data samples. + It usually includes information such as `metainfo` and + `gt_depth_map`. + + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + if data_samples is not None: + batch_img_metas = [ + data_sample.metainfo for data_sample in data_samples + ] + else: + batch_img_metas = [ + dict( + ori_shape=inputs.shape[2:], + img_shape=inputs.shape[2:], + pad_shape=inputs.shape[2:], + padding_size=[0, 0, 0, 0]) + ] * inputs.shape[0] + + x = self.extract_feat(inputs, batch_img_metas) + + losses = dict() + + loss_decode = self._decode_head_forward_train(x, data_samples) + losses.update(loss_decode) + + if self.with_auxiliary_head: + loss_aux = self._auxiliary_head_forward_train(x, data_samples) + losses.update(loss_aux) + + return losses + + def predict(self, + inputs: Tensor, + data_samples: OptSampleList = None) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. 
+
+        Args:
+            inputs (Tensor): Inputs with shape (N, C, H, W).
+            data_samples (List[:obj:`SegDataSample`], optional): The seg data
+                samples. It usually includes information such as `metainfo`
+                and `gt_depth_map`.
+
+        Returns:
+            list[:obj:`SegDataSample`]: Depth estimation results of the
+            input images. Each SegDataSample usually contains:
+
+            - ``pred_depth_map`` (PixelData): Prediction of depth estimation.
+        """
+        if data_samples is not None:
+            batch_img_metas = [
+                data_sample.metainfo for data_sample in data_samples
+            ]
+        else:
+            batch_img_metas = [
+                dict(
+                    ori_shape=inputs.shape[2:],
+                    img_shape=inputs.shape[2:],
+                    pad_shape=inputs.shape[2:],
+                    padding_size=[0, 0, 0, 0])
+            ] * inputs.shape[0]
+
+        depth = self.inference(inputs, batch_img_metas)
+
+        return self.postprocess_result(depth, data_samples)
+
+    def _forward(self,
+                 inputs: Tensor,
+                 data_samples: OptSampleList = None) -> Tensor:
+        """Network forward process.
+
+        Args:
+            inputs (Tensor): Inputs with shape (N, C, H, W).
+            data_samples (List[:obj:`SegDataSample`]): The seg
+                data samples. It usually includes information such
+                as `metainfo` and `gt_depth_map`.
+
+        Returns:
+            Tensor: Forward output of model without any post-processes.
+        """
+        x = self.extract_feat(inputs)
+        return self.decode_head.forward(x)
+
+    def slide_flip_inference(self, inputs: Tensor,
+                             batch_img_metas: List[dict]) -> Tensor:
+        """Inference by sliding-window with overlap and flip.
+
+        If h_crop > h_img or w_crop > w_img, the small patch will be used to
+        decode without padding.
+
+        Args:
+            inputs (tensor): the tensor should have a shape NxCxHxW,
+                which contains all images in the batch.
+            batch_img_metas (List[dict]): List of image metainfo where each may
+                also contain: 'img_shape', 'scale_factor', 'flip', 'img_path',
+                'ori_shape', and 'pad_shape'.
+                For details on the values of these keys see
+                `mmseg/datasets/pipelines/formatting.py:PackSegInputs`.
+
+        Returns:
+            Tensor: The depth estimation results.
+        """
+
+        h_stride, w_stride = self.test_cfg.stride
+        h_crop, w_crop = self.test_cfg.crop_size
+        batch_size, _, h_img, w_img = inputs.size()
+        out_channels = self.out_channels
+        h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1
+        w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1
+        preds = inputs.new_zeros((batch_size, out_channels, h_img, w_img))
+        count_mat = inputs.new_zeros((batch_size, 1, h_img, w_img))
+        for h_idx in range(h_grids):
+            for w_idx in range(w_grids):
+                y1 = h_idx * h_stride
+                x1 = w_idx * w_stride
+                y2 = min(y1 + h_crop, h_img)
+                x2 = min(x1 + w_crop, w_img)
+                y1 = max(y2 - h_crop, 0)
+                x1 = max(x2 - w_crop, 0)
+                crop_img = inputs[:, :, y1:y2, x1:x2]
+                # change the image shape to patch shape
+                batch_img_metas[0]['img_shape'] = crop_img.shape[2:]
+                # the output of encode_decode is depth tensor map
+                # with shape [N, C, H, W]
+                crop_depth_map = self.encode_decode(crop_img, batch_img_metas)
+
+                # average out the original and flipped prediction
+                crop_depth_map_flip = self.encode_decode(
+                    crop_img.flip(dims=(3, )), batch_img_metas)
+                crop_depth_map_flip = crop_depth_map_flip.flip(dims=(3, ))
+                crop_depth_map = (crop_depth_map + crop_depth_map_flip) / 2.0
+
+                preds += F.pad(crop_depth_map,
+                               (int(x1), int(preds.shape[3] - x2), int(y1),
+                                int(preds.shape[2] - y2)))
+
+                count_mat[:, :, y1:y2, x1:x2] += 1
+        assert (count_mat == 0).sum() == 0
+        depth = preds / count_mat
+
+        return depth
+
+    def inference(self, inputs: Tensor, batch_img_metas: List[dict]) -> Tensor:
+        """Inference with slide/slide_flip/whole style.
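+
+        The mode is read from ``test_cfg``; an illustrative configuration
+        (crop and stride values are placeholders):
+
+        .. code:: python
+
+            test_cfg = dict(mode='slide_flip', crop_size=(480, 480),
+                            stride=(320, 320))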
+
+        Args:
+            inputs (Tensor): The input image of shape (N, 3, H, W).
+            batch_img_metas (List[dict]): List of image metainfo where each may
+                also contain: 'img_shape', 'scale_factor', 'flip', 'img_path',
+                'ori_shape', 'pad_shape', and 'padding_size'.
+                For details on the values of these keys see
+                `mmseg/datasets/pipelines/formatting.py:PackSegInputs`.
+
+        Returns:
+            Tensor: The depth estimation results.
+        """
+        assert self.test_cfg.get('mode', 'whole') in ['slide', 'whole',
+                                                      'slide_flip'], \
+            f'Only "slide", "slide_flip" or "whole" test modes are ' \
+            f'supported, but got {self.test_cfg["mode"]}.'
+        ori_shape = batch_img_metas[0]['ori_shape']
+        if not all(_['ori_shape'] == ori_shape for _ in batch_img_metas):
+            print_log(
+                'Image shapes are different in the batch.',
+                logger='current',
+                level=logging.WARN)
+        if self.test_cfg.mode == 'slide':
+            depth_map = self.slide_inference(inputs, batch_img_metas)
+        elif self.test_cfg.mode == 'slide_flip':
+            depth_map = self.slide_flip_inference(inputs, batch_img_metas)
+        else:
+            depth_map = self.whole_inference(inputs, batch_img_metas)
+
+        return depth_map
+
+    def postprocess_result(self,
+                           depth: Tensor,
+                           data_samples: OptSampleList = None) -> SampleList:
+        """Convert results list to `SegDataSample`.
+
+        Args:
+            depth (Tensor): The depth estimation results.
+            data_samples (list[:obj:`SegDataSample`]): The seg data samples.
+                It usually includes information such as `metainfo` and
+                `gt_depth_map`. Defaults to None.
+
+        Returns:
+            list[:obj:`SegDataSample`]: Depth estimation results of the
+            input images. Each SegDataSample usually contains:
+
+            - ``pred_depth_map`` (PixelData): Prediction of depth estimation.
+        """
+        batch_size, C, H, W = depth.shape
+
+        if data_samples is None:
+            data_samples = [SegDataSample() for _ in range(batch_size)]
+            only_prediction = True
+        else:
+            only_prediction = False
+
+        for i in range(batch_size):
+            if not only_prediction:
+                img_meta = data_samples[i].metainfo
+                # remove padding area
+                if 'img_padding_size' not in img_meta:
+                    padding_size = img_meta.get('padding_size', [0] * 4)
+                else:
+                    padding_size = img_meta['img_padding_size']
+                padding_left, padding_right, padding_top, padding_bottom =\
+                    padding_size
+                # i_depth shape is 1, C, H, W after remove padding
+                i_depth = depth[i:i + 1, :, padding_top:H - padding_bottom,
+                                padding_left:W - padding_right]
+
+                flip = img_meta.get('flip', None)
+                if flip:
+                    flip_direction = img_meta.get('flip_direction', None)
+                    assert flip_direction in ['horizontal', 'vertical']
+                    if flip_direction == 'horizontal':
+                        i_depth = i_depth.flip(dims=(3, ))
+                    else:
+                        i_depth = i_depth.flip(dims=(2, ))
+
+                # resize as original shape
+                i_depth = resize(
+                    i_depth,
+                    size=img_meta['ori_shape'],
+                    mode='bilinear',
+                    align_corners=self.align_corners,
+                    warning=False).squeeze(0)
+            else:
+                i_depth = depth[i]
+
+            data_samples[i].set_data(
+                {'pred_depth_map': PixelData(**{'data': i_depth})})
+
+        return data_samples
diff --git a/mmseg/models/segmentors/encoder_decoder.py b/mmseg/models/segmentors/encoder_decoder.py
index f6024fc192..fa4050e0b7 100644
--- a/mmseg/models/segmentors/encoder_decoder.py
+++ b/mmseg/models/segmentors/encoder_decoder.py
@@ -1,9 +1,10 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+import logging
 from typing import List, Optional
 
-import torch
 import torch.nn as nn
 import torch.nn.functional as F
+from mmengine.logging import print_log
 from torch import Tensor
 
 from mmseg.registry import MODELS
@@ -34,7 +35,7 @@ class EncoderDecoder(BaseSegmentor):
     2.
The ``predict`` method is used to predict segmentation results, which includes two steps: (1) Run inference function to obtain the list of seg_logits (2) Call post-processing function to obtain list of - ``SegDataSampel`` including ``pred_sem_seg`` and ``seg_logits``. + ``SegDataSample`` including ``pred_sem_seg`` and ``seg_logits``. .. code:: text @@ -79,7 +80,7 @@ def __init__(self, data_preprocessor: OptConfigType = None, pretrained: Optional[str] = None, init_cfg: OptMultiConfig = None): - super(EncoderDecoder, self).__init__( + super().__init__( data_preprocessor=data_preprocessor, init_cfg=init_cfg) if pretrained is not None: assert backbone.get('pretrained') is None, \ @@ -101,6 +102,7 @@ def _init_decode_head(self, decode_head: ConfigType) -> None: self.decode_head = MODELS.build(decode_head) self.align_corners = self.decode_head.align_corners self.num_classes = self.decode_head.num_classes + self.out_channels = self.decode_head.out_channels def _init_auxiliary_head(self, auxiliary_head: ConfigType) -> None: """Initialize ``auxiliary_head``""" @@ -112,93 +114,87 @@ def _init_auxiliary_head(self, auxiliary_head: ConfigType) -> None: else: self.auxiliary_head = MODELS.build(auxiliary_head) - def extract_feat(self, batch_inputs: Tensor) -> List[Tensor]: + def extract_feat(self, inputs: Tensor) -> List[Tensor]: """Extract features from images.""" - x = self.backbone(batch_inputs) + x = self.backbone(inputs) if self.with_neck: x = self.neck(x) return x - def encode_decode(self, batch_inputs: Tensor, - batch_img_metas: List[dict]) -> List[Tensor]: + def encode_decode(self, inputs: Tensor, + batch_img_metas: List[dict]) -> Tensor: """Encode images with backbone and decode into a semantic segmentation map of the same size as input.""" - x = self.extract_feat(batch_inputs) + x = self.extract_feat(inputs) seg_logits = self.decode_head.predict(x, batch_img_metas, self.test_cfg) - return list(seg_logits) + return seg_logits - def _decode_head_forward_train(self, batch_inputs: List[Tensor], - batch_data_samples: SampleList) -> dict: + def _decode_head_forward_train(self, inputs: List[Tensor], + data_samples: SampleList) -> dict: """Run forward function and calculate loss for decode head in training.""" losses = dict() - loss_decode = self.decode_head.loss(batch_inputs, batch_data_samples, + loss_decode = self.decode_head.loss(inputs, data_samples, self.train_cfg) losses.update(add_prefix(loss_decode, 'decode')) return losses - def _auxiliary_head_forward_train( - self, - batch_inputs: List[Tensor], - batch_data_samples: SampleList, - ) -> dict: + def _auxiliary_head_forward_train(self, inputs: List[Tensor], + data_samples: SampleList) -> dict: """Run forward function and calculate loss for auxiliary head in training.""" losses = dict() if isinstance(self.auxiliary_head, nn.ModuleList): for idx, aux_head in enumerate(self.auxiliary_head): - loss_aux = aux_head.loss(batch_inputs, batch_data_samples, - self.train_cfg) + loss_aux = aux_head.loss(inputs, data_samples, self.train_cfg) losses.update(add_prefix(loss_aux, f'aux_{idx}')) else: - loss_aux = self.auxiliary_head.loss(batch_inputs, - batch_data_samples, + loss_aux = self.auxiliary_head.loss(inputs, data_samples, self.train_cfg) losses.update(add_prefix(loss_aux, 'aux')) return losses - def loss(self, batch_inputs: Tensor, - batch_data_samples: SampleList) -> dict: + def loss(self, inputs: Tensor, data_samples: SampleList) -> dict: """Calculate losses from a batch of inputs and data samples. Args: - img (Tensor): Input images. 
- batch_data_samples (list[:obj:`SegDataSample`]): The seg - data samples. It usually includes information such - as `metainfo` and `gt_sem_seg`. + inputs (Tensor): Input images. + data_samples (list[:obj:`SegDataSample`]): The seg data samples. + It usually includes information such as `metainfo` and + `gt_sem_seg`. Returns: dict[str, Tensor]: a dictionary of loss components """ - x = self.extract_feat(batch_inputs) + x = self.extract_feat(inputs) losses = dict() - loss_decode = self._decode_head_forward_train(x, batch_data_samples) + loss_decode = self._decode_head_forward_train(x, data_samples) losses.update(loss_decode) if self.with_auxiliary_head: - loss_aux = self._auxiliary_head_forward_train( - x, batch_data_samples) + loss_aux = self._auxiliary_head_forward_train(x, data_samples) losses.update(loss_aux) return losses - def predict(self, batch_inputs: Tensor, - batch_data_samples: SampleList) -> SampleList: + def predict(self, + inputs: Tensor, + data_samples: OptSampleList = None) -> SampleList: """Predict results from a batch of inputs and data samples with post- processing. Args: - batch_inputs (Tensor): Inputs with shape (N, C, H, W). - batch_data_samples (List[:obj:`SegDataSample`]): The seg - data samples. It usually includes information such - as `metainfo` and `gt_sem_seg`. + inputs (Tensor): Inputs with shape (N, C, H, W). + data_samples (List[:obj:`SegDataSample`], optional): The seg data + samples. It usually includes information such as `metainfo` + and `gt_sem_seg`. Returns: list[:obj:`SegDataSample`]: Segmentation results of the @@ -208,40 +204,49 @@ def predict(self, batch_inputs: Tensor, - ``seg_logits``(PixelData): Predicted logits of semantic segmentation before normalization. """ - batch_img_metas = [] - for data_sample in batch_data_samples: - batch_img_metas.append(data_sample.metainfo) + if data_samples is not None: + batch_img_metas = [ + data_sample.metainfo for data_sample in data_samples + ] + else: + batch_img_metas = [ + dict( + ori_shape=inputs.shape[2:], + img_shape=inputs.shape[2:], + pad_shape=inputs.shape[2:], + padding_size=[0, 0, 0, 0]) + ] * inputs.shape[0] - seg_logit_list = self.inference(batch_inputs, batch_img_metas) + seg_logits = self.inference(inputs, batch_img_metas) - return self.postprocess_result(seg_logit_list, batch_img_metas) + return self.postprocess_result(seg_logits, data_samples) def _forward(self, - batch_inputs: Tensor, + inputs: Tensor, data_samples: OptSampleList = None) -> Tensor: """Network forward process. Args: - batch_inputs (Tensor): Inputs with shape (N, C, H, W). - batch_data_samples (List[:obj:`SegDataSample`]): The seg + inputs (Tensor): Inputs with shape (N, C, H, W). + data_samples (List[:obj:`SegDataSample`]): The seg data samples. It usually includes information such as `metainfo` and `gt_sem_seg`. Returns: Tensor: Forward output of model without any post-processes. """ - x = self.extract_feat(batch_inputs) + x = self.extract_feat(inputs) return self.decode_head.forward(x) - def slide_inference(self, batch_inputs: Tensor, - batch_img_metas: List[dict]) -> List[Tensor]: + def slide_inference(self, inputs: Tensor, + batch_img_metas: List[dict]) -> Tensor: """Inference by sliding-window with overlap. If h_crop > h_img or w_crop > w_img, the small patch will be used to decode without padding. Args: - batch_inputs (tensor): the tensor should have a shape NxCxHxW, + inputs (tensor): the tensor should have a shape NxCxHxW, which contains all images in the batch. 
batch_img_metas (List[dict]): List of image metainfo where each may also contain: 'img_shape', 'scale_factor', 'flip', 'img_path', @@ -250,18 +255,18 @@ def slide_inference(self, batch_inputs: Tensor, `mmseg/datasets/pipelines/formatting.py:PackSegInputs`. Returns: - List[:obj:`Tensor`]: List of segmentation results, seg_logits from - model of each input image. + Tensor: The segmentation results, seg_logits from model of each + input image. """ h_stride, w_stride = self.test_cfg.stride h_crop, w_crop = self.test_cfg.crop_size - batch_size, _, h_img, w_img = batch_inputs.size() - num_classes = self.num_classes + batch_size, _, h_img, w_img = inputs.size() + out_channels = self.out_channels h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1 w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1 - preds = batch_inputs.new_zeros((batch_size, num_classes, h_img, w_img)) - count_mat = batch_inputs.new_zeros((batch_size, 1, h_img, w_img)) + preds = inputs.new_zeros((batch_size, out_channels, h_img, w_img)) + count_mat = inputs.new_zeros((batch_size, 1, h_img, w_img)) for h_idx in range(h_grids): for w_idx in range(w_grids): y1 = h_idx * h_stride @@ -270,30 +275,29 @@ def slide_inference(self, batch_inputs: Tensor, x2 = min(x1 + w_crop, w_img) y1 = max(y2 - h_crop, 0) x1 = max(x2 - w_crop, 0) - crop_img = batch_inputs[:, :, y1:y2, x1:x2] - # change the img shape to patch shape + crop_img = inputs[:, :, y1:y2, x1:x2] + # change the image shape to patch shape batch_img_metas[0]['img_shape'] = crop_img.shape[2:] - # the output of encode_decode is list of seg logits map - # with shape [C, H, W] - crop_seg_logit = torch.stack( - self.encode_decode(crop_img, batch_img_metas), dim=0) + # the output of encode_decode is seg logits tensor map + # with shape [N, C, H, W] + crop_seg_logit = self.encode_decode(crop_img, batch_img_metas) preds += F.pad(crop_seg_logit, (int(x1), int(preds.shape[3] - x2), int(y1), int(preds.shape[2] - y2))) count_mat[:, :, y1:y2, x1:x2] += 1 assert (count_mat == 0).sum() == 0 - seg_logits_list = list(preds / count_mat) + seg_logits = preds / count_mat - return seg_logits_list + return seg_logits - def whole_inference(self, batch_inputs: Tensor, - batch_img_metas: List[dict]) -> List[Tensor]: + def whole_inference(self, inputs: Tensor, + batch_img_metas: List[dict]) -> Tensor: """Inference with full image. Args: - batch_inputs (Tensor): The tensor should have a shape NxCxHxW, - which contains all images in the batch. + inputs (Tensor): The tensor should have a shape NxCxHxW, which + contains all images in the batch. batch_img_metas (List[dict]): List of image metainfo where each may also contain: 'img_shape', 'scale_factor', 'flip', 'img_path', 'ori_shape', and 'pad_shape'. @@ -301,44 +305,46 @@ def whole_inference(self, batch_inputs: Tensor, `mmseg/datasets/pipelines/formatting.py:PackSegInputs`. Returns: - List[:obj:`Tensor`]: List of segmentation results, seg_logits from - model of each input image. + Tensor: The segmentation results, seg_logits from model of each + input image. """ - seg_logits_list = self.encode_decode(batch_inputs, batch_img_metas) + seg_logits = self.encode_decode(inputs, batch_img_metas) - return seg_logits_list + return seg_logits - def inference(self, batch_inputs: Tensor, - batch_img_metas: List[dict]) -> List[Tensor]: + def inference(self, inputs: Tensor, batch_img_metas: List[dict]) -> Tensor: """Inference with slide/whole style. Args: - batch_inputs (Tensor): The input image of shape (N, 3, H, W). 
+            inputs (Tensor): The input image of shape (N, 3, H, W).
             batch_img_metas (List[dict]): List of image metainfo where each may
                 also contain: 'img_shape', 'scale_factor', 'flip', 'img_path',
-                'ori_shape', and 'pad_shape'.
+                'ori_shape', 'pad_shape', and 'padding_size'.
                 For details on the values of these keys see
                 `mmseg/datasets/pipelines/formatting.py:PackSegInputs`.
 
         Returns:
-            List[:obj:`Tensor`]: List of segmentation results, seg_logits from
-            model of each input image.
+            Tensor: The segmentation results, seg_logits from model of each
+            input image.
         """
-
-        assert self.test_cfg.mode in ['slide', 'whole']
+        assert self.test_cfg.get('mode', 'whole') in ['slide', 'whole'], \
+            f'Only "slide" or "whole" test modes are supported, but got ' \
+            f'{self.test_cfg["mode"]}.'
         ori_shape = batch_img_metas[0]['ori_shape']
-        assert all(_['ori_shape'] == ori_shape for _ in batch_img_metas)
+        if not all(_['ori_shape'] == ori_shape for _ in batch_img_metas):
+            print_log(
+                'Image shapes are different in the batch.',
+                logger='current',
+                level=logging.WARN)
         if self.test_cfg.mode == 'slide':
-            seg_logit_list = self.slide_inference(batch_inputs,
-                                                  batch_img_metas)
+            seg_logit = self.slide_inference(inputs, batch_img_metas)
         else:
-            seg_logit_list = self.whole_inference(batch_inputs,
-                                                  batch_img_metas)
+            seg_logit = self.whole_inference(inputs, batch_img_metas)
 
-        return seg_logit_list
+        return seg_logit
 
-    def aug_test(self, batch_inputs, batch_img_metas, rescale=True):
+    def aug_test(self, inputs, batch_img_metas, rescale=True):
         """Test with augmentations.
 
         Only rescale=True is supported.
@@ -346,13 +352,12 @@ def aug_test(self, batch_inputs, batch_img_metas, rescale=True):
         # aug_test rescale all imgs back to ori_shape for now
         assert rescale
         # to save memory, we get augmented seg logit inplace
-        seg_logit = self.inference(batch_inputs[0], batch_img_metas[0],
-                                   rescale)
-        for i in range(1, len(batch_inputs)):
-            cur_seg_logit = self.inference(batch_inputs[i], batch_img_metas[i],
+        seg_logit = self.inference(inputs[0], batch_img_metas[0], rescale)
+        for i in range(1, len(inputs)):
+            cur_seg_logit = self.inference(inputs[i], batch_img_metas[i],
                                            rescale)
             seg_logit += cur_seg_logit
-        seg_logit /= len(batch_inputs)
+        seg_logit /= len(inputs)
         seg_pred = seg_logit.argmax(dim=1)
         # unravel batch dim
         seg_pred = list(seg_pred)
diff --git a/mmseg/models/segmentors/multimodal_encoder_decoder.py b/mmseg/models/segmentors/multimodal_encoder_decoder.py
new file mode 100644
index 0000000000..75aa8b9b17
--- /dev/null
+++ b/mmseg/models/segmentors/multimodal_encoder_decoder.py
@@ -0,0 +1,350 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Optional
+
+import torch.nn.functional as F
+from torch import Tensor
+
+from mmseg.registry import MODELS
+from mmseg.utils import (ConfigType, OptConfigType, OptMultiConfig,
+                         OptSampleList, SampleList, add_prefix)
+from .base import BaseSegmentor
+
+
+@MODELS.register_module()
+class MultimodalEncoderDecoder(BaseSegmentor):
+    """Multimodal Encoder-Decoder segmentors.
+
+    Multimodal segmentation architecture is used for open-vocabulary
+    semantic segmentation by combining pretrained vision and language
+    models. It consists of an image_encoder (backbone) to extract visual
+    features, a text encoder to extract text features, and a decode head
+    to generate semantic maps.
+    Note that the deep supervision during training is implemented in decode head.
+
+    1. The ``loss`` method is used to calculate the loss of the model,
+    which includes two steps: (1) Extracts features to obtain the feature maps
+    (2) Call the decode head loss function to forward decode head model and
+    calculate losses.
+
+    .. code:: text
+
+        loss(): extract_feat() -> _decode_head_forward_train()
+        _decode_head_forward_train(): decode_head.loss()
+
+    2. The ``predict`` method is used to predict segmentation results,
+    which includes two steps: (1) Run inference function to obtain the list of
+    seg_logits (2) Call post-processing function to obtain list of
+    ``SegDataSample`` including ``pred_sem_seg`` and ``seg_logits``.
+
+    .. code:: text
+
+        predict(): inference() -> postprocess_result()
+        inference(): whole_inference()/slide_inference()
+        whole_inference()/slide_inference(): encode_decode()
+        encode_decode(): extract_feat() -> decode_head.predict()
+
+    3. The ``_forward`` method is used to output the tensor by running the model,
+    which includes two steps: (1) Extracts features to obtain the feature maps
+    (2) Call the decode head forward function to forward the decode head model.
+
+    .. code:: text
+
+        _forward(): extract_feat() -> _decode_head.forward()
+
+    Args:
+
+        image_encoder (ConfigType): The config for the visual encoder of segmentor.
+        text_encoder (ConfigType): The config for the text encoder of segmentor.
+        decode_head (ConfigType): The config for the decode head of segmentor.
+        train_cfg (OptConfigType): The config for training. Defaults to None.
+        test_cfg (OptConfigType): The config for testing. Defaults to None.
+        data_preprocessor (dict, optional): The pre-process config of
+            :class:`BaseDataPreprocessor`.
+        pretrained (str, optional): The path for pretrained model.
+            Defaults to None.
+        asymetric_input (bool): Whether to use inputs of different sizes for
+            the image encoder and the decode head. Defaults to True.
+        encoder_resolution (float): Resize scale of input images for the
+            image encoder. Defaults to None.
+        init_cfg (dict, optional): The weight initialized config for
+            :class:`BaseModule`.
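+
+    An illustrative config sketch (the bracketed ``type`` values are
+    placeholders; ``CLIPTextEncoder`` is the text encoder added in this
+    change):
+
+    .. code:: python
+
+        model = dict(
+            type='MultimodalEncoderDecoder',
+            image_encoder=dict(type='<visual encoder config>'),
+            text_encoder=dict(type='CLIPTextEncoder'),
+            decode_head=dict(type='<decode head config>'),
+            asymetric_input=True,
+            encoder_resolution=0.5)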
+ """ # noqa: E501 + + def __init__(self, + image_encoder: ConfigType, + text_encoder: ConfigType, + decode_head: ConfigType, + train_cfg: OptConfigType = None, + test_cfg: OptConfigType = None, + data_preprocessor: OptConfigType = None, + pretrained: Optional[str] = None, + asymetric_input: bool = True, + encoder_resolution: float = None, + init_cfg: OptMultiConfig = None): + super().__init__( + data_preprocessor=data_preprocessor, init_cfg=init_cfg) + if pretrained is not None: + image_encoder.init_cfg = dict( + type='Pretrained_Part', checkpoint=pretrained) + text_encoder.init_cfg = dict( + type='Pretrained_Part', checkpoint=pretrained) + decode_head.init_cfg = dict( + type='Pretrained_Part', checkpoint=pretrained) + + if asymetric_input: + assert encoder_resolution is not None, \ + 'if asymetric_input set True, ' \ + 'clip_resolution must be a certain value' + self.asymetric_input = asymetric_input + self.encoder_resolution = encoder_resolution + self.image_encoder = MODELS.build(image_encoder) + self.text_encoder = MODELS.build(text_encoder) + self._init_decode_head(decode_head) + + self.train_cfg = train_cfg + self.test_cfg = test_cfg + + assert self.with_decode_head + + def _init_decode_head(self, decode_head: ConfigType) -> None: + """Initialize ``decode_head``""" + self.decode_head = MODELS.build(decode_head) + self.align_corners = self.decode_head.align_corners + self.num_classes = self.decode_head.num_classes + self.out_channels = self.decode_head.out_channels + + def extract_feat(self, inputs: Tensor) -> List[Tensor]: + """Extract visual features from images.""" + x = self.image_encoder(inputs) + return x + + def encode_decode(self, inputs: Tensor, + batch_img_metas: List[dict]) -> Tensor: + """Encode the name of classes with text_encoder and encode images with + image_encoder. + + Then decode the class embedding and visual feature into a semantic + segmentation map of the same size as input. + """ + classifier_embeds = self.text_encoder() + clip_inputs = inputs + if self.asymetric_input: + clip_inputs = F.interpolate( + inputs, scale_factor=self.encoder_resolution, mode='bilinear') + x = self.image_encoder(clip_inputs) + seg_logits = self.decode_head.predict([inputs, x, classifier_embeds], + batch_img_metas, self.test_cfg) + + return seg_logits + + def _decode_head_forward_train(self, inputs: List[Tensor], + data_samples: SampleList) -> dict: + """Run forward function and calculate loss for decode head in + training.""" + losses = dict() + loss_decode = self.decode_head.loss(inputs, data_samples, + self.train_cfg) + + losses.update(add_prefix(loss_decode, 'decode')) + return losses + + def loss(self, inputs: Tensor, data_samples: SampleList) -> dict: + """Calculate losses from a batch of inputs and data samples. + + Args: + inputs (Tensor): Input images. + data_samples (list[:obj:`SegDataSample`]): The seg data samples. + It usually includes information such as `metainfo` and + `gt_sem_seg`. 
+ + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + classifier_embeds = self.text_encoder() + clip_inputs = inputs + if self.asymetric_input: + clip_inputs = F.interpolate( + inputs, scale_factor=self.encoder_resolution, mode='bilinear') + x = self.image_encoder(clip_inputs) + + losses = dict() + + loss_decode = self._decode_head_forward_train( + [inputs, x, classifier_embeds], data_samples) + losses.update(loss_decode) + + return losses + + def predict(self, + inputs: Tensor, + data_samples: OptSampleList = None) -> SampleList: + """Predict results from a batch of inputs and data samples with post- + processing. + + Args: + inputs (Tensor): Inputs with shape (N, C, H, W). + data_samples (List[:obj:`SegDataSample`], optional): The seg data + samples. It usually includes information such as `metainfo` + and `gt_sem_seg`. + + Returns: + list[:obj:`SegDataSample`]: Segmentation results of the + input images. Each SegDataSample usually contain: + + - ``pred_sem_seg``(PixelData): Prediction of semantic segmentation. + - ``seg_logits``(PixelData): Predicted logits of semantic + segmentation before normalization. + """ + if data_samples is not None: + batch_img_metas = [ + data_sample.metainfo for data_sample in data_samples + ] + else: + batch_img_metas = [ + dict( + ori_shape=inputs.shape[2:], + img_shape=inputs.shape[2:], + pad_shape=inputs.shape[2:], + padding_size=[0, 0, 0, 0]) + ] * inputs.shape[0] + + seg_logits = self.inference(inputs, batch_img_metas) + + return self.postprocess_result(seg_logits, data_samples) + + def _forward(self, + inputs: Tensor, + data_samples: OptSampleList = None) -> Tensor: + """Network forward process. + + Args: + inputs (Tensor): Inputs with shape (N, C, H, W). + data_samples (List[:obj:`SegDataSample`]): The seg + data samples. It usually includes information such + as `metainfo` and `gt_sem_seg`. + + Returns: + Tensor: Forward output of model without any post-processes. + """ + x = self.extract_feat(inputs) + return self.decode_head.forward(x) + + def slide_inference(self, inputs: Tensor, + batch_img_metas: List[dict]) -> Tensor: + """Inference by sliding-window with overlap. + + If h_crop > h_img or w_crop > w_img, the small patch will be used to + decode without padding. + + Args: + inputs (tensor): the tensor should have a shape NxCxHxW, + which contains all images in the batch. + batch_img_metas (List[dict]): List of image metainfo where each may + also contain: 'img_shape', 'scale_factor', 'flip', 'img_path', + 'ori_shape', and 'pad_shape'. + For details on the values of these keys see + `mmseg/datasets/pipelines/formatting.py:PackSegInputs`. + + Returns: + Tensor: The segmentation results, seg_logits from model of each + input image. 
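+
+        A worked example of the grid arithmetic (illustrative numbers): with
+        ``h_img = w_img = 1024``, ``crop_size = (512, 512)`` and
+        ``stride = (341, 341)``, each axis yields
+        ``max(1024 - 512 + 341 - 1, 0) // 341 + 1 = 3`` grids, i.e. nine
+        overlapping crops whose summed logits are normalized by
+        ``count_mat``.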
+ """ + + h_stride, w_stride = self.test_cfg.stride + h_crop, w_crop = self.test_cfg.crop_size + batch_size, _, h_img, w_img = inputs.size() + out_channels = self.out_channels + h_grids = max(h_img - h_crop + h_stride - 1, 0) // h_stride + 1 + w_grids = max(w_img - w_crop + w_stride - 1, 0) // w_stride + 1 + preds = inputs.new_zeros((batch_size, out_channels, h_img, w_img)) + count_mat = inputs.new_zeros((batch_size, 1, h_img, w_img)) + for h_idx in range(h_grids): + for w_idx in range(w_grids): + y1 = h_idx * h_stride + x1 = w_idx * w_stride + y2 = min(y1 + h_crop, h_img) + x2 = min(x1 + w_crop, w_img) + y1 = max(y2 - h_crop, 0) + x1 = max(x2 - w_crop, 0) + crop_img = inputs[:, :, y1:y2, x1:x2] + # change the image shape to patch shape + batch_img_metas[0]['img_shape'] = crop_img.shape[2:] + # the output of encode_decode is seg logits tensor map + # with shape [N, C, H, W] + crop_seg_logit = self.encode_decode(crop_img, batch_img_metas) + preds += F.pad(crop_seg_logit, + (int(x1), int(preds.shape[3] - x2), int(y1), + int(preds.shape[2] - y2))) + + count_mat[:, :, y1:y2, x1:x2] += 1 + assert (count_mat == 0).sum() == 0 + seg_logits = preds / count_mat + + return seg_logits + + def whole_inference(self, inputs: Tensor, + batch_img_metas: List[dict]) -> Tensor: + """Inference with full image. + + Args: + inputs (Tensor): The tensor should have a shape NxCxHxW, which + contains all images in the batch. + batch_img_metas (List[dict]): List of image metainfo where each may + also contain: 'img_shape', 'scale_factor', 'flip', 'img_path', + 'ori_shape', and 'pad_shape'. + For details on the values of these keys see + `mmseg/datasets/pipelines/formatting.py:PackSegInputs`. + + Returns: + Tensor: The segmentation results, seg_logits from model of each + input image. + """ + + seg_logits = self.encode_decode(inputs, batch_img_metas) + + return seg_logits + + def inference(self, inputs: Tensor, batch_img_metas: List[dict]) -> Tensor: + """Inference with slide/whole style. + + Args: + inputs (Tensor): The input image of shape (N, 3, H, W). + batch_img_metas (List[dict]): List of image metainfo where each may + also contain: 'img_shape', 'scale_factor', 'flip', 'img_path', + 'ori_shape', 'pad_shape', and 'padding_size'. + For details on the values of these keys see + `mmseg/datasets/pipelines/formatting.py:PackSegInputs`. + + Returns: + Tensor: The segmentation results, seg_logits from model of each + input image. + """ + + assert self.test_cfg.mode in ['slide', 'whole'] + ori_shape = batch_img_metas[0]['ori_shape'] + assert all(_['ori_shape'] == ori_shape for _ in batch_img_metas) + if self.test_cfg.mode == 'slide': + seg_logit = self.slide_inference(inputs, batch_img_metas) + else: + seg_logit = self.whole_inference(inputs, batch_img_metas) + + return seg_logit + + def aug_test(self, inputs, batch_img_metas, rescale=True): + """Test with augmentations. + + Only rescale=True is supported. 
+ """ + # aug_test rescale all imgs back to ori_shape for now + assert rescale + # to save memory, we get augmented seg logit inplace + seg_logit = self.inference(inputs[0], batch_img_metas[0], rescale) + for i in range(1, len(inputs)): + cur_seg_logit = self.inference(inputs[i], batch_img_metas[i], + rescale) + seg_logit += cur_seg_logit + seg_logit /= len(inputs) + seg_pred = seg_logit.argmax(dim=1) + # unravel batch dim + seg_pred = list(seg_pred) + return seg_pred diff --git a/mmseg/models/segmentors/seg_tta.py b/mmseg/models/segmentors/seg_tta.py new file mode 100644 index 0000000000..63ef61d223 --- /dev/null +++ b/mmseg/models/segmentors/seg_tta.py @@ -0,0 +1,47 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch +from mmengine.model import BaseTTAModel +from mmengine.structures import PixelData + +from mmseg.registry import MODELS +from mmseg.utils import SampleList + + +@MODELS.register_module() +class SegTTAModel(BaseTTAModel): + + def merge_preds(self, data_samples_list: List[SampleList]) -> SampleList: + """Merge predictions of enhanced data to one prediction. + + Args: + data_samples_list (List[SampleList]): List of predictions + of all enhanced data. + + Returns: + SampleList: Merged prediction. + """ + predictions = [] + for data_samples in data_samples_list: + seg_logits = data_samples[0].seg_logits.data + logits = torch.zeros(seg_logits.shape).to(seg_logits) + for data_sample in data_samples: + seg_logit = data_sample.seg_logits.data + if self.module.out_channels > 1: + logits += seg_logit.softmax(dim=0) + else: + logits += seg_logit.sigmoid() + logits /= len(data_samples) + if self.module.out_channels == 1: + seg_pred = (logits > self.module.decode_head.threshold + ).to(logits).squeeze(1) + else: + seg_pred = logits.argmax(dim=0) + data_sample.set_data({'pred_sem_seg': PixelData(data=seg_pred)}) + if hasattr(data_samples[0], 'gt_sem_seg'): + data_sample.set_data( + {'gt_sem_seg': data_samples[0].gt_sem_seg}) + data_sample.set_metainfo({'img_path': data_samples[0].img_path}) + predictions.append(data_sample) + return predictions diff --git a/mmseg/models/text_encoder/__init__.py b/mmseg/models/text_encoder/__init__.py new file mode 100644 index 0000000000..199856d9d7 --- /dev/null +++ b/mmseg/models/text_encoder/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .clip_text_encoder import CLIPTextEncoder + +__all__ = ['CLIPTextEncoder'] diff --git a/mmseg/models/text_encoder/clip_text_encoder.py b/mmseg/models/text_encoder/clip_text_encoder.py new file mode 100644 index 0000000000..1a18b86395 --- /dev/null +++ b/mmseg/models/text_encoder/clip_text_encoder.py @@ -0,0 +1,229 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import numpy as np +import torch +import torch.nn as nn +from mmcv.cnn import build_norm_layer +from mmcv.cnn.bricks.transformer import BaseTransformerLayer +from mmengine.model import BaseModule, ModuleList +from mmengine.runner.checkpoint import CheckpointLoader, load_state_dict +from torch.nn import functional as F + +from mmseg.registry import MODELS +from mmseg.utils import get_classes, get_predefined_templates, tokenizer + + +@MODELS.register_module() +class CLIPTextEncoder(BaseModule): + """A text encoder with transformer architecture to encode the label text. + + Modified from https://github.com/MendelXu/SAN/blob/main/san/model/clip_utils/classifier.py # noqa:E501 + Copyright (c) 2023 MendelXu. 
+    Licensed under the MIT License
+
+    Args:
+        dataset_name (str|None): The name of the dataset to which
+            the data belongs.
+        vocabulary (List[str]|None): The list of class names. Default: None.
+        templates (List[str]|str): The prompt template used for labels.
+            Default: 'vild'.
+        total_vocab_size (int): Number of all words used by the pre-trained
+            model. Default: 49408 (CLIP).
+        context_length (int): The max length of prompt text.
+            Default: 77 (CLIP).
+        embed_dims (int): Width of transformer model. Default: 512.
+        num_layers (int): Depth of transformer. Default: 12.
+        num_heads (int): Number of attention heads in transformer.
+            Default: 8.
+        mlp_ratio (int): Ratio of mlp hidden dim to embedding dim in
+            transformer. Default: 4.
+        output_dims (int): Dim of output text embeddings. Default: 512.
+        cache_feature (bool): Whether to save class embeddings in cache.
+            Default: True.
+        cat_bg (bool): Whether to add background embedding. Default: True.
+        norm_cfg (dict|None): Config for norm layer. Default: dict(type='LN').
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None.
+    """
+
+    def __init__(self,
+                 dataset_name: str = None,
+                 vocabulary: List[str] = None,
+                 templates: str = 'vild',
+                 total_vocab_size: int = 49408,
+                 context_length: int = 77,
+                 embed_dims: int = 512,
+                 num_layers: int = 12,
+                 num_heads: int = 8,
+                 mlp_ratio: int = 4,
+                 output_dims: int = 512,
+                 cache_feature: bool = True,
+                 cat_bg: bool = True,
+                 norm_cfg: dict = dict(type='LN'),
+                 init_cfg: dict = None):
+        super().__init__(init_cfg)
+        if isinstance(templates, List):
+            self.templates = templates
+        else:
+            self.templates = get_predefined_templates(templates)
+
+        assert dataset_name is not None or vocabulary is not None, \
+            "text_encoder requires either 'dataset_name' or 'vocabulary'"
+        assert dataset_name is None or vocabulary is None, \
+            "there is a conflict between 'dataset_name' and 'vocabulary'"
+        self.dataset_name = dataset_name
+        self.vocabulary = vocabulary
+        self.num_pos = context_length
+        self.token_embedding = nn.Embedding(total_vocab_size, embed_dims)
+        self.positional_embedding = nn.Parameter(
+            torch.empty(context_length, embed_dims))
+        self.text_projection = nn.Parameter(
+            torch.empty(embed_dims, output_dims))
+        self.logit_scale = nn.Parameter(torch.ones([]) * np.log(1 / 0.07))
+        self.transformer = ModuleList()
+        self.register_buffer(
+            'attn_mask', self.build_attention_mask(), persistent=False)
+        for i in range(num_layers):
+            self.transformer.append(
+                BaseTransformerLayer(
+                    attn_cfgs=dict(
+                        type='MultiheadAttention',
+                        embed_dims=embed_dims,
+                        num_heads=num_heads,
+                        batch_first=False,
+                        bias=True),
+                    ffn_cfgs=dict(
+                        type='FFN',
+                        embed_dims=embed_dims,
+                        feedforward_channels=mlp_ratio * embed_dims,
+                        act_cfg=dict(type='QuickGELU')),
+                    operation_order=('norm', 'self_attn', 'norm', 'ffn')))
+        # the final norm layer is registered directly as ``ln_final``
+        self.ln_final = build_norm_layer(
+            norm_cfg, embed_dims, postfix='_final')[1]
+
+        self.cache_feature = cache_feature
+        if self.cache_feature:
+            self.cache = {}
+
+        self._freeze()
+
+        self.cat_bg = cat_bg
+        if self.cat_bg:
+            self.bg_embed = nn.Parameter(
+                torch.randn(1, self.text_projection.shape[1]))
+
+    def build_attention_mask(self):
+        """Lazily create causal attention mask, with full attention between
+        the tokens.
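+
+        For example (illustrative), with ``num_pos = 4`` the resulting
+        additive mask is:
+
+        .. code:: text
+
+            [[0., -inf, -inf, -inf],
+             [0.,   0., -inf, -inf],
+             [0.,   0.,   0., -inf],
+             [0.,   0.,   0.,   0.]]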
+ + pytorch uses additive attention mask; fill with -inf + """ + mask = torch.empty(self.num_pos, self.num_pos) + mask.fill_(float('-inf')) + mask.triu_(1) # zero out the lower diagonal + return mask + + def _freeze(self): + for param in self.parameters(): + param.requires_grad = False + + def init_weights(self): + if self.cat_bg: + nn.init.normal_( + self.bg_embed, + std=self.bg_embed.shape[1]**-0.5, + ) + if isinstance(self.init_cfg, dict) and \ + self.init_cfg.get('type') == 'Pretrained_Part': + checkpoint = CheckpointLoader.load_checkpoint( + self.init_cfg['checkpoint'], logger=None, map_location='cpu') + + state_dict = checkpoint.copy() + para_prefix = 'text_encoder' + prefix_len = len(para_prefix) + 1 + for k, v in checkpoint.items(): + state_dict.pop(k) + if para_prefix in k: + state_dict[k[prefix_len:]] = v + + load_state_dict(self, state_dict, strict=False, logger=None) + + else: + super().init_weights() + + @torch.no_grad() + def encode_text(self, text, normalize=False): + """encode class token.""" + + embed_device = self.token_embedding.weight.device + x = self.token_embedding( + text.to(embed_device)) # [batch_size, n_ctx, d_model] + x = x + self.positional_embedding + x = x.permute(1, 0, 2) # NLD -> LND + for block in self.transformer: + x = block(query=x, attn_masks=self.attn_mask) + x = x.permute(1, 0, 2) # LND -> NLD + x = self.ln_final(x) # [batch_size, n_ctx, transformer.width] + # take features from the eot embedding + # (eot_token is the highest number in each sequence) + x = x[torch.arange(x.shape[0]), + text.argmax(dim=-1)] @ self.text_projection + return F.normalize(x, dim=-1) if normalize else x + + def template_encode(self, vocabulary): + """Prompt engineering.""" + text_embed_bucket = [] + for template in self.templates: + text_inputs = tokenizer.tokenize( + [template.format(noun) for noun in vocabulary]) + text_embed = self.encode_text(text_inputs, normalize=True) + text_embed_bucket.append(text_embed) + text_embed = torch.stack(text_embed_bucket).mean(dim=0) + text_embed = text_embed / text_embed.norm(dim=-1, keepdim=True) + return text_embed + + def forward(self): + """Forward function.""" + if self.dataset_name is None: # encoding vocabulary directly + class_names = self.vocabulary + if self.cache_feature: + new_classes = [ + word for word in class_names if word not in self.cache + ] + if len(new_classes) > 0: + class_embeds = self.template_encode(new_classes) + self.cache.update(dict(zip(new_classes, class_embeds))) + class_embeds = torch.stack( + [self.cache[word] for word in class_names]) + else: + class_embeds = self.template_encode(class_names) + + else: # encoding the classes of the dataset + class_names = get_classes(self.dataset_name) + if class_names[0] == 'background': + class_names = class_names[1:] + if self.cache_feature: + if self.dataset_name not in self.cache: + class_embeds = self.template_encode(class_names) + self.cache[self.dataset_name] = class_embeds + else: + class_embeds = self.cache[self.dataset_name] + else: + class_embeds = self.template_encode(class_names) + + if self.cat_bg: + class_embeds = torch.cat([class_embeds, self.bg_embed]) + class_embeds = F.normalize(class_embeds, p=2, dim=-1) + return self.logit_scale.exp() * class_embeds + + +@MODELS.register_module() +class QuickGELU(nn.Module): + # From https://github.com/openai/CLIP/blob/main/clip/model.py + def forward(self, x: torch.Tensor): + return x * torch.sigmoid(1.702 * x) diff --git a/mmseg/models/utils/__init__.py b/mmseg/models/utils/__init__.py index 
6d8329021b..c0751b17c0 100644 --- a/mmseg/models/utils/__init__.py +++ b/mmseg/models/utils/__init__.py @@ -1,7 +1,11 @@ # Copyright (c) OpenMMLab. All rights reserved. +from .basic_block import BasicBlock, Bottleneck from .embed import PatchEmbed +from .encoding import Encoding from .inverted_residual import InvertedResidual, InvertedResidualV3 from .make_divisible import make_divisible +from .point_sample import get_uncertain_point_coords_with_randomness +from .ppm import DAPPM, PAPPM from .res_layer import ResLayer from .se_layer import SELayer from .self_attention_block import SelfAttentionBlock @@ -9,8 +13,15 @@ nlc_to_nchw) from .up_conv_block import UpConvBlock +# isort: off +from .wrappers import Upsample, resize +from .san_layers import MLP, LayerNorm2d, cross_attn_layer + __all__ = [ 'ResLayer', 'SelfAttentionBlock', 'make_divisible', 'InvertedResidual', 'UpConvBlock', 'InvertedResidualV3', 'SELayer', 'PatchEmbed', - 'nchw_to_nlc', 'nlc_to_nchw', 'nchw2nlc2nchw', 'nlc2nchw2nlc' + 'nchw_to_nlc', 'nlc_to_nchw', 'nchw2nlc2nchw', 'nlc2nchw2nlc', 'Encoding', + 'Upsample', 'resize', 'DAPPM', 'PAPPM', 'BasicBlock', 'Bottleneck', + 'cross_attn_layer', 'LayerNorm2d', 'MLP', + 'get_uncertain_point_coords_with_randomness' ] diff --git a/mmseg/models/utils/basic_block.py b/mmseg/models/utils/basic_block.py new file mode 100644 index 0000000000..4e1ad8146d --- /dev/null +++ b/mmseg/models/utils/basic_block.py @@ -0,0 +1,143 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional + +import torch.nn as nn +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule +from torch import Tensor + +from mmseg.registry import MODELS +from mmseg.utils import OptConfigType + + +class BasicBlock(BaseModule): + """Basic block from `ResNet `_. + + Args: + in_channels (int): Input channels. + channels (int): Output channels. + stride (int): Stride of the first block. Default: 1. + downsample (nn.Module, optional): Downsample operation on identity. + Default: None. + norm_cfg (dict, optional): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict, optional): Config dict for activation layer in + ConvModule. Default: dict(type='ReLU', inplace=True). + act_cfg_out (dict, optional): Config dict for activation layer at the + last of the block. Default: None. + init_cfg (dict, optional): Initialization config dict. Default: None. + """ + + expansion = 1 + + def __init__(self, + in_channels: int, + channels: int, + stride: int = 1, + downsample: nn.Module = None, + norm_cfg: OptConfigType = dict(type='BN'), + act_cfg: OptConfigType = dict(type='ReLU', inplace=True), + act_cfg_out: OptConfigType = dict(type='ReLU', inplace=True), + init_cfg: OptConfigType = None): + super().__init__(init_cfg) + self.conv1 = ConvModule( + in_channels, + channels, + kernel_size=3, + stride=stride, + padding=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv2 = ConvModule( + channels, + channels, + kernel_size=3, + padding=1, + norm_cfg=norm_cfg, + act_cfg=None) + self.downsample = downsample + if act_cfg_out: + self.act = MODELS.build(act_cfg_out) + + def forward(self, x: Tensor) -> Tensor: + residual = x + out = self.conv1(x) + out = self.conv2(out) + + if self.downsample: + residual = self.downsample(x) + + out += residual + + if hasattr(self, 'act'): + out = self.act(out) + + return out + + +class Bottleneck(BaseModule): + """Bottleneck block from `ResNet `_. + + Args: + in_channels (int): Input channels. + channels (int): Output channels. 
+ stride (int): Stride of the first block. Default: 1. + downsample (nn.Module, optional): Downsample operation on identity. + Default: None. + norm_cfg (dict, optional): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict, optional): Config dict for activation layer in + ConvModule. Default: dict(type='ReLU', inplace=True). + act_cfg_out (dict, optional): Config dict for activation layer at + the last of the block. Default: None. + init_cfg (dict, optional): Initialization config dict. Default: None. + """ + + expansion = 2 + + def __init__(self, + in_channels: int, + channels: int, + stride: int = 1, + downsample: Optional[nn.Module] = None, + norm_cfg: OptConfigType = dict(type='BN'), + act_cfg: OptConfigType = dict(type='ReLU', inplace=True), + act_cfg_out: OptConfigType = None, + init_cfg: OptConfigType = None): + super().__init__(init_cfg) + self.conv1 = ConvModule( + in_channels, channels, 1, norm_cfg=norm_cfg, act_cfg=act_cfg) + self.conv2 = ConvModule( + channels, + channels, + 3, + stride, + 1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.conv3 = ConvModule( + channels, + channels * self.expansion, + 1, + norm_cfg=norm_cfg, + act_cfg=None) + if act_cfg_out: + self.act = MODELS.build(act_cfg_out) + self.downsample = downsample + + def forward(self, x: Tensor) -> Tensor: + residual = x + + out = self.conv1(x) + out = self.conv2(out) + out = self.conv3(out) + + if self.downsample: + residual = self.downsample(x) + + out += residual + + if hasattr(self, 'act'): + out = self.act(out) + + return out diff --git a/mmseg/models/utils/embed.py b/mmseg/models/utils/embed.py index 1515675e1e..aef0a40b0a 100644 --- a/mmseg/models/utils/embed.py +++ b/mmseg/models/utils/embed.py @@ -5,8 +5,8 @@ import torch.nn as nn import torch.nn.functional as F from mmcv.cnn import build_conv_layer, build_norm_layer -from mmcv.runner.base_module import BaseModule -from mmcv.utils import to_2tuple +from mmengine.model import BaseModule +from mmengine.utils import to_2tuple class AdaptivePadding(nn.Module): @@ -42,7 +42,7 @@ class AdaptivePadding(nn.Module): def __init__(self, kernel_size=1, stride=1, dilation=1, padding='corner'): - super(AdaptivePadding, self).__init__() + super().__init__() assert padding in ('same', 'corner') @@ -104,8 +104,8 @@ class PatchEmbed(BaseModule): input_size (int | tuple | None): The size of input, which will be used to calculate the out size. Only work when `dynamic_size` is False. Default: None. - init_cfg (`mmcv.ConfigDict`, optional): The Config for initialization. - Default: None. + init_cfg (`mmengine.ConfigDict`, optional): The Config for + initialization. Default: None. """ def __init__(self, @@ -120,7 +120,7 @@ def __init__(self, norm_cfg=None, input_size=None, init_cfg=None): - super(PatchEmbed, self).__init__(init_cfg=init_cfg) + super().__init__(init_cfg=init_cfg) self.embed_dims = embed_dims if stride is None: diff --git a/mmseg/ops/encoding.py b/mmseg/models/utils/encoding.py similarity index 98% rename from mmseg/ops/encoding.py rename to mmseg/models/utils/encoding.py index f397cc54e8..ee4f0574fb 100644 --- a/mmseg/ops/encoding.py +++ b/mmseg/models/utils/encoding.py @@ -16,7 +16,7 @@ class Encoding(nn.Module): """ def __init__(self, channels, num_codes): - super(Encoding, self).__init__() + super().__init__() # init codewords and smoothing factor self.channels, self.num_codes = channels, num_codes std = 1. 
/ ((num_codes * channels)**0.5)
diff --git a/mmseg/models/utils/inverted_residual.py b/mmseg/models/utils/inverted_residual.py
index c9cda76822..56190b3bfe 100644
--- a/mmseg/models/utils/inverted_residual.py
+++ b/mmseg/models/utils/inverted_residual.py
@@ -40,7 +40,7 @@ def __init__(self,
                  act_cfg=dict(type='ReLU6'),
                  with_cp=False,
                  **kwargs):
-        super(InvertedResidual, self).__init__()
+        super().__init__()
         self.stride = stride
         assert stride in [1, 2], f'stride must in [1, 2]. ' \
             f'But received {stride}.'
@@ -138,7 +138,7 @@ def __init__(self,
                  norm_cfg=dict(type='BN'),
                  act_cfg=dict(type='ReLU'),
                  with_cp=False):
-        super(InvertedResidualV3, self).__init__()
+        super().__init__()
         self.with_res_shortcut = (stride == 1 and in_channels == out_channels)
         assert stride in [1, 2]
         self.with_cp = with_cp
diff --git a/mmseg/models/utils/point_sample.py b/mmseg/models/utils/point_sample.py
new file mode 100644
index 0000000000..1afc957f3d
--- /dev/null
+++ b/mmseg/models/utils/point_sample.py
@@ -0,0 +1,88 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+from mmcv.ops import point_sample
+from torch import Tensor
+
+
+def get_uncertainty(mask_preds: Tensor, labels: Tensor) -> Tensor:
+    """Estimate uncertainty based on pred logits.
+
+    We estimate uncertainty as L1 distance between 0.0 and the logits
+    prediction in 'mask_preds' for the class specified by ``labels``.
+
+    Args:
+        mask_preds (Tensor): mask prediction logits, shape (num_rois,
+            num_classes, mask_height, mask_width).
+
+        labels (Tensor): Either predicted or ground truth label for
+            each predicted mask, of length num_rois.
+
+    Returns:
+        scores (Tensor): Uncertainty scores with the most uncertain
+            locations having the highest uncertainty score,
+            shape (num_rois, 1, mask_height, mask_width)
+    """
+    if mask_preds.shape[1] == 1:
+        gt_class_logits = mask_preds.clone()
+    else:
+        inds = torch.arange(mask_preds.shape[0], device=mask_preds.device)
+        gt_class_logits = mask_preds[inds, labels].unsqueeze(1)
+    return -torch.abs(gt_class_logits)
+
+
+def get_uncertain_point_coords_with_randomness(
+        mask_preds: Tensor, labels: Tensor, num_points: int,
+        oversample_ratio: float, importance_sample_ratio: float) -> Tensor:
+    """Get ``num_points`` most uncertain points with random points during
+    train.
+
+    Sample points in [0, 1] x [0, 1] coordinate space based on their
+    uncertainty. The uncertainties are calculated for each point using
+    'get_uncertainty()' function that takes point's logit prediction as
+    input.
+
+    Args:
+        mask_preds (Tensor): A tensor of shape (num_rois, num_classes,
+            mask_height, mask_width) for class-specific or class-agnostic
+            prediction.
+        labels (Tensor): The ground truth class for each instance.
+        num_points (int): The number of points to sample.
+        oversample_ratio (float): Oversampling parameter.
+        importance_sample_ratio (float): Ratio of points that are sampled
+            via importance sampling.
+
+    Returns:
+        point_coords (Tensor): A tensor of shape (num_rois, num_points, 2)
+            that contains the coordinates of sampled points.
+    """
+    assert oversample_ratio >= 1
+    assert 0 <= importance_sample_ratio <= 1
+    batch_size = mask_preds.shape[0]
+    num_sampled = int(num_points * oversample_ratio)
+    point_coords = torch.rand(
+        batch_size, num_sampled, 2, device=mask_preds.device)
+    point_logits = point_sample(mask_preds, point_coords)
+    # It is crucial to calculate uncertainty based on the sampled
+    # prediction value for the points.
Calculating uncertainties of the + # coarse predictions first and sampling them for points leads to + # incorrect results. To illustrate this: assume uncertainty func( + # logits)=-abs(logits), a sampled point between two coarse + # predictions with -1 and 1 logits has 0 logits, and therefore 0 + # uncertainty value. However, if we calculate uncertainties for the + # coarse predictions first, both will have -1 uncertainty, + # and sampled point will get -1 uncertainty. + point_uncertainties = get_uncertainty(point_logits, labels) + num_uncertain_points = int(importance_sample_ratio * num_points) + num_random_points = num_points - num_uncertain_points + idx = torch.topk( + point_uncertainties[:, 0, :], k=num_uncertain_points, dim=1)[1] + shift = num_sampled * torch.arange( + batch_size, dtype=torch.long, device=mask_preds.device) + idx += shift[:, None] + point_coords = point_coords.view(-1, 2)[idx.view(-1), :].view( + batch_size, num_uncertain_points, 2) + if num_random_points > 0: + rand_roi_coords = torch.rand( + batch_size, num_random_points, 2, device=mask_preds.device) + point_coords = torch.cat((point_coords, rand_roi_coords), dim=1) + return point_coords diff --git a/mmseg/models/utils/ppm.py b/mmseg/models/utils/ppm.py new file mode 100644 index 0000000000..5fe6ff26fa --- /dev/null +++ b/mmseg/models/utils/ppm.py @@ -0,0 +1,193 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule +from mmengine.model import BaseModule, ModuleList, Sequential +from torch import Tensor + + +class DAPPM(BaseModule): + """DAPPM module in `DDRNet `_. + + Args: + in_channels (int): Input channels. + branch_channels (int): Branch channels. + out_channels (int): Output channels. + num_scales (int): Number of scales. + kernel_sizes (list[int]): Kernel sizes of each scale. + strides (list[int]): Strides of each scale. + paddings (list[int]): Paddings of each scale. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + act_cfg (dict): Config dict for activation layer in ConvModule. + Default: dict(type='ReLU', inplace=True). + conv_cfg (dict): Config dict for convolution layer in ConvModule. + Default: dict(order=('norm', 'act', 'conv'), bias=False). + upsample_mode (str): Upsample mode. Default: 'bilinear'. 
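+
+    An illustrative shape check (sizes are arbitrary; ``torch`` is imported
+    at the top of this module):
+
+    .. code:: python
+
+        dappm = DAPPM(in_channels=1024, branch_channels=96,
+                      out_channels=256, num_scales=5)
+        out = dappm(torch.rand(1, 1024, 8, 8))  # -> (1, 256, 8, 8)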
+ """ + + def __init__(self, + in_channels: int, + branch_channels: int, + out_channels: int, + num_scales: int, + kernel_sizes: List[int] = [5, 9, 17], + strides: List[int] = [2, 4, 8], + paddings: List[int] = [2, 4, 8], + norm_cfg: Dict = dict(type='BN', momentum=0.1), + act_cfg: Dict = dict(type='ReLU', inplace=True), + conv_cfg: Dict = dict( + order=('norm', 'act', 'conv'), bias=False), + upsample_mode: str = 'bilinear'): + super().__init__() + + self.num_scales = num_scales + self.unsample_mode = upsample_mode + self.in_channels = in_channels + self.branch_channels = branch_channels + self.out_channels = out_channels + self.norm_cfg = norm_cfg + self.act_cfg = act_cfg + self.conv_cfg = conv_cfg + + self.scales = ModuleList([ + ConvModule( + in_channels, + branch_channels, + kernel_size=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + **conv_cfg) + ]) + for i in range(1, num_scales - 1): + self.scales.append( + Sequential(*[ + nn.AvgPool2d( + kernel_size=kernel_sizes[i - 1], + stride=strides[i - 1], + padding=paddings[i - 1]), + ConvModule( + in_channels, + branch_channels, + kernel_size=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + **conv_cfg) + ])) + self.scales.append( + Sequential(*[ + nn.AdaptiveAvgPool2d((1, 1)), + ConvModule( + in_channels, + branch_channels, + kernel_size=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + **conv_cfg) + ])) + self.processes = ModuleList() + for i in range(num_scales - 1): + self.processes.append( + ConvModule( + branch_channels, + branch_channels, + kernel_size=3, + padding=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + **conv_cfg)) + + self.compression = ConvModule( + branch_channels * num_scales, + out_channels, + kernel_size=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + **conv_cfg) + + self.shortcut = ConvModule( + in_channels, + out_channels, + kernel_size=1, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + **conv_cfg) + + def forward(self, inputs: Tensor): + feats = [] + feats.append(self.scales[0](inputs)) + + for i in range(1, self.num_scales): + feat_up = F.interpolate( + self.scales[i](inputs), + size=inputs.shape[2:], + mode=self.unsample_mode) + feats.append(self.processes[i - 1](feat_up + feats[i - 1])) + + return self.compression(torch.cat(feats, + dim=1)) + self.shortcut(inputs) + + +class PAPPM(DAPPM): + """PAPPM module in `PIDNet `_. + + Args: + in_channels (int): Input channels. + branch_channels (int): Branch channels. + out_channels (int): Output channels. + num_scales (int): Number of scales. + kernel_sizes (list[int]): Kernel sizes of each scale. + strides (list[int]): Strides of each scale. + paddings (list[int]): Paddings of each scale. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN', momentum=0.1). + act_cfg (dict): Config dict for activation layer in ConvModule. + Default: dict(type='ReLU', inplace=True). + conv_cfg (dict): Config dict for convolution layer in ConvModule. + Default: dict(order=('norm', 'act', 'conv'), bias=False). + upsample_mode (str): Upsample mode. Default: 'bilinear'. 
+ """ + + def __init__(self, + in_channels: int, + branch_channels: int, + out_channels: int, + num_scales: int, + kernel_sizes: List[int] = [5, 9, 17], + strides: List[int] = [2, 4, 8], + paddings: List[int] = [2, 4, 8], + norm_cfg: Dict = dict(type='BN', momentum=0.1), + act_cfg: Dict = dict(type='ReLU', inplace=True), + conv_cfg: Dict = dict( + order=('norm', 'act', 'conv'), bias=False), + upsample_mode: str = 'bilinear'): + super().__init__(in_channels, branch_channels, out_channels, + num_scales, kernel_sizes, strides, paddings, norm_cfg, + act_cfg, conv_cfg, upsample_mode) + + self.processes = ConvModule( + self.branch_channels * (self.num_scales - 1), + self.branch_channels * (self.num_scales - 1), + kernel_size=3, + padding=1, + groups=self.num_scales - 1, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + **self.conv_cfg) + + def forward(self, inputs: Tensor): + x_ = self.scales[0](inputs) + feats = [] + for i in range(1, self.num_scales): + feat_up = F.interpolate( + self.scales[i](inputs), + size=inputs.shape[2:], + mode=self.unsample_mode, + align_corners=False) + feats.append(feat_up + x_) + scale_out = self.processes(torch.cat(feats, dim=1)) + return self.compression(torch.cat([x_, scale_out], + dim=1)) + self.shortcut(inputs) diff --git a/mmseg/models/utils/res_layer.py b/mmseg/models/utils/res_layer.py index 190a0c5d5a..3dd7a6f75a 100644 --- a/mmseg/models/utils/res_layer.py +++ b/mmseg/models/utils/res_layer.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from mmcv.cnn import build_conv_layer, build_norm_layer -from mmcv.runner import Sequential +from mmengine.model import Sequential from torch import nn as nn @@ -93,4 +93,4 @@ def __init__(self, conv_cfg=conv_cfg, norm_cfg=norm_cfg, **kwargs)) - super(ResLayer, self).__init__(*layers) + super().__init__(*layers) diff --git a/mmseg/models/utils/san_layers.py b/mmseg/models/utils/san_layers.py new file mode 100644 index 0000000000..2267686daf --- /dev/null +++ b/mmseg/models/utils/san_layers.py @@ -0,0 +1,418 @@ +# Copyright (c) OpenMMLab. All rights reserved. +# Modified from https://github.com/MendelXu/SAN/blob/main/san/model/attn_helper.py # noqa: E501 +# Copyright (c) 2023 MendelXu. +# Licensed under the MIT License + +import warnings +from typing import Optional + +import torch +from mmcv.cnn.bricks.transformer import BaseTransformerLayer +from torch import Tensor, nn +from torch.nn import functional as F + + +def cross_attn_with_self_bias( + query: Tensor, + key: Tensor, + value: Tensor, + embed_dim_to_check: int, + num_heads: int, + in_proj_weight: Tensor, + in_proj_bias: Tensor, + bias_k: Optional[Tensor], + bias_v: Optional[Tensor], + add_zero_attn: bool, + dropout_p: float, + out_proj_weight: Tensor, + out_proj_bias: Tensor, + training: bool = True, + key_padding_mask: Optional[Tensor] = None, + need_weights: bool = True, + attn_mask: Optional[Tensor] = None, + use_separate_proj_weight: bool = False, + q_proj_weight: Optional[Tensor] = None, + k_proj_weight: Optional[Tensor] = None, + v_proj_weight: Optional[Tensor] = None, + static_k: Optional[Tensor] = None, + static_v: Optional[Tensor] = None, +): + """Forward function of multi-head attention. Modified from + multi_head_attention_forward in + https://github.com/pytorch/pytorch/blob/main/torch/nn/functional.py. + + Args: + query, key, value: map a query and a set of key-value pairs to an output. + See "Attention Is All You Need" for more details. + embed_dim_to_check: total dimension of the model. + num_heads: parallel attention heads. 
+ in_proj_weight, in_proj_bias: input projection weight and bias. + bias_k, bias_v: bias of the key and value sequences to be added at dim=0. + add_zero_attn: add a new batch of zeros to the key and + value sequences at dim=1. + dropout_p: probability of an element to be zeroed. + out_proj_weight, out_proj_bias: the output projection weight and bias. + training: apply dropout if is ``True``. + key_padding_mask: if provided, specified padding elements in the key will + be ignored by the attention. This is an binary mask. When the value is True, + the corresponding value on the attention layer will be filled with -inf. + need_weights: output attn_output_weights. + Default: `True` + Note: `needs_weight` defaults to `True`, but should be set to `False` + For best performance when attention weights are not needed. + *Setting needs_weights to `True` + leads to a significant performance degradation.* + attn_mask: 2D mask that prevents attention to certain positions. A 2D mask will be broadcasted for all + the batches while a 3D mask allows to specify a different mask for the entries of each batch. + use_separate_proj_weight: the function accept the proj. weights for query, key, + and value in different forms. If false, in_proj_weight will be used, which is + a combination of q_proj_weight, k_proj_weight, v_proj_weight. + q_proj_weight, k_proj_weight, v_proj_weight, in_proj_bias: input projection weight and bias. + static_k, static_v: static key and value used for attention operators. + """ # noqa: E501 + tgt_len, bsz, embed_dim = query.size() + assert embed_dim == embed_dim_to_check + # allow MHA to have different sizes for the feature dimension + assert key.size(0) == value.size(0) and key.size(1) == value.size(1) + + head_dim = embed_dim // num_heads + assert head_dim * num_heads == embed_dim, \ + 'embed_dim must be divisible by num_heads' + scaling = float(head_dim)**-0.5 + + if not use_separate_proj_weight: + if (query is key or torch.equal( + query, key)) and (key is value or torch.equal(key, value)): + # self-attention + raise NotImplementedError('self-attention is not implemented') + + elif key is value or torch.equal(key, value): + # encoder-decoder attention + # This is inline in_proj function + # with in_proj_weight and in_proj_bias + _b = in_proj_bias + _start = 0 + _end = embed_dim + _w = in_proj_weight[_start:_end, :] + if _b is not None: + _b = _b[_start:_end] + q = F.linear(query, _w, _b) + + if key is None: + assert value is None + k = None + v = None + q_k = None + q_v = None + else: + # This is inline in_proj function with + # in_proj_weight and in_proj_bias + _b = in_proj_bias + _start = embed_dim + _end = None + _w = in_proj_weight[_start:, :] + if _b is not None: + _b = _b[_start:] + k, v = F.linear(key, _w, _b).chunk(2, dim=-1) + q_k, q_v = F.linear(query, _w, _b).chunk(2, dim=-1) + else: + # This is inline in_proj function with + # in_proj_weight and in_proj_bias + _b = in_proj_bias + _start = 0 + _end = embed_dim + _w = in_proj_weight[_start:_end, :] + if _b is not None: + _b = _b[_start:_end] + q = F.linear(query, _w, _b) + + # This is inline in_proj function with + # in_proj_weight and in_proj_bias + _b = in_proj_bias + _start = embed_dim + _end = embed_dim * 2 + _w = in_proj_weight[_start:_end, :] + if _b is not None: + _b = _b[_start:_end] + k = F.linear(key, _w, _b) + q_k = F.linear(query, _w, _b) + # This is inline in_proj function with + # in_proj_weight and in_proj_bias + _b = in_proj_bias + _start = embed_dim * 2 + _end = None + _w = in_proj_weight[_start:, :] + 
if _b is not None: + _b = _b[_start:] + v = F.linear(value, _w, _b) + q_v = F.linear(query, _w, _b) + else: + q_proj_weight_non_opt = \ + torch.jit._unwrap_optional(q_proj_weight) + len1, len2 = q_proj_weight_non_opt.size() + assert len1 == embed_dim and len2 == query.size(-1) + + k_proj_weight_non_opt = \ + torch.jit._unwrap_optional(k_proj_weight) + len1, len2 = k_proj_weight_non_opt.size() + assert len1 == embed_dim and len2 == key.size(-1) + + v_proj_weight_non_opt = \ + torch.jit._unwrap_optional(v_proj_weight) + len1, len2 = v_proj_weight_non_opt.size() + assert len1 == embed_dim and len2 == value.size(-1) + + if in_proj_bias is not None: + q = F.linear(query, q_proj_weight_non_opt, + in_proj_bias[0:embed_dim]) + k = F.linear(key, k_proj_weight_non_opt, + in_proj_bias[embed_dim:(embed_dim * 2)]) + v = F.linear(value, v_proj_weight_non_opt, + in_proj_bias[(embed_dim * 2):]) + else: + q = F.linear(query, q_proj_weight_non_opt, in_proj_bias) + k = F.linear(key, k_proj_weight_non_opt, in_proj_bias) + v = F.linear(value, v_proj_weight_non_opt, in_proj_bias) + q = q * scaling + + if attn_mask is not None: + assert ( + attn_mask.dtype == torch.float32 + or attn_mask.dtype == torch.float64 + or attn_mask.dtype == torch.float16 + or attn_mask.dtype == torch.uint8 or attn_mask.dtype == torch.bool + ), 'Only float, byte, and bool types are supported for ' \ + 'attn_mask, not {}'.format(attn_mask.dtype) + if attn_mask.dtype == torch.uint8: + warnings.warn('Byte tensor for attn_mask in nn.MultiheadAttention ' + 'is deprecated. Use bool tensor instead.') + attn_mask = attn_mask.to(torch.bool) + + if attn_mask.dim() == 2: + attn_mask = attn_mask.unsqueeze(0) + if list(attn_mask.size()) != [1, query.size(0), key.size(0)]: + raise RuntimeError( + 'The size of the 2D attn_mask is not correct.') + elif attn_mask.dim() == 3: + if list(attn_mask.size()) != [ + bsz * num_heads, + query.size(0), key.size(0) + ]: + raise RuntimeError( + 'The size of the 3D attn_mask is not correct.') + else: + raise RuntimeError( + "attn_mask's dimension {} is not supported".format( + attn_mask.dim())) + # attn_mask's dim is 3 now. + + # convert ByteTensor key_padding_mask to bool + if key_padding_mask is not None and key_padding_mask.dtype == torch.uint8: + warnings.warn( + 'Byte tensor for key_padding_mask in nn.MultiheadAttention ' + 'is deprecated. Use bool tensor instead.') + key_padding_mask = key_padding_mask.to(torch.bool) + + if bias_k is not None and bias_v is not None: + if static_k is None and static_v is None: + k = torch.cat([k, bias_k.repeat(1, bsz, 1)]) + v = torch.cat([v, bias_v.repeat(1, bsz, 1)]) + if attn_mask is not None: + attn_mask = F.pad(attn_mask, (0, 1)) + if key_padding_mask is not None: + key_padding_mask = F.pad(key_padding_mask, (0, 1)) + else: + assert static_k is None, 'bias cannot be added to static key.' + assert static_v is None, 'bias cannot be added to static value.' 
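All the inline `F.linear` calls above slice a single packed projection of shape (3 * embed_dim, embed_dim) into its q, k and v parts; a minimal sketch of that layout (hypothetical sizes):

    import torch
    import torch.nn.functional as F

    embed_dim = 8
    in_proj_weight = torch.randn(3 * embed_dim, embed_dim)  # packed [q; k; v] rows
    x = torch.randn(4, 2, embed_dim)  # (seq_len, batch, embed_dim)
    q = F.linear(x, in_proj_weight[:embed_dim])
    k = F.linear(x, in_proj_weight[embed_dim:2 * embed_dim])
    v = F.linear(x, in_proj_weight[2 * embed_dim:])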
+ else: + assert bias_k is None + assert bias_v is None + + q = q.contiguous().view(tgt_len, bsz * num_heads, head_dim).transpose(0, 1) + if k is not None: + k = k.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1) + q_k = q_k.contiguous().view(tgt_len, bsz * num_heads, + head_dim).transpose(0, 1) + if v is not None: + v = v.contiguous().view(-1, bsz * num_heads, head_dim).transpose(0, 1) + q_v = q_v.contiguous().view(tgt_len, bsz * num_heads, + head_dim).transpose(0, 1) + + if static_k is not None: + assert static_k.size(0) == bsz * num_heads + assert static_k.size(2) == head_dim + k = static_k + + if static_v is not None: + assert static_v.size(0) == bsz * num_heads + assert static_v.size(2) == head_dim + v = static_v + + src_len = k.size(1) + + if key_padding_mask is not None: + assert key_padding_mask.size(0) == bsz + assert key_padding_mask.size(1) == src_len + + if add_zero_attn: + src_len += 1 + k = torch.cat( + [ + k, + torch.zeros( + (k.size(0), 1) + k.size()[2:], + dtype=k.dtype, + device=k.device), + ], + dim=1, + ) + v = torch.cat( + [ + v, + torch.zeros( + (v.size(0), 1) + v.size()[2:], + dtype=v.dtype, + device=v.device), + ], + dim=1, + ) + if attn_mask is not None: + attn_mask = F.pad(attn_mask, (0, 1)) + if key_padding_mask is not None: + key_padding_mask = F.pad(key_padding_mask, (0, 1)) + + attn_output_weights = torch.bmm(q, k.transpose(1, 2)) + assert list( + attn_output_weights.size()) == [bsz * num_heads, tgt_len, src_len] + + if attn_mask is not None: + if attn_mask.dtype == torch.bool: + attn_output_weights.masked_fill_(attn_mask, float('-inf')) + else: + attn_output_weights += attn_mask + + if key_padding_mask is not None: + attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, + src_len) + attn_output_weights = attn_output_weights.masked_fill( + key_padding_mask.unsqueeze(1).unsqueeze(2), + float('-inf'), + ) + attn_output_weights = attn_output_weights.view(bsz * num_heads, + tgt_len, src_len) + # attn_out_weights: [bsz * num_heads, tgt_len, src_len] + # ->[bsz * num_heads, tgt_len, src_len+1] + self_weight = (q * q_k).sum( + dim=-1, keepdim=True) # [bsz * num_heads, tgt_len, 1] + total_attn_output_weights = torch.cat([attn_output_weights, self_weight], + dim=-1) + total_attn_output_weights = F.softmax(total_attn_output_weights, dim=-1) + total_attn_output_weights = F.dropout( + total_attn_output_weights, p=dropout_p, training=training) + attn_output_weights = \ + total_attn_output_weights[:, :, : -1] + # [bsz * num_heads, tgt_len, src_len] + self_weight = \ + total_attn_output_weights[:, :, -1:] # [bsz * num_heads, tgt_len, 1] + + attn_output = torch.bmm(attn_output_weights, + v) # [bsz * num_heads, tgt_len, head_dim] + attn_output = (attn_output + self_weight * q_v + ) # [bsz * num_heads, tgt_len, head_dim] + assert list(attn_output.size()) == [bsz * num_heads, tgt_len, head_dim] + attn_output = attn_output.transpose(0, 1).contiguous().view( + tgt_len, bsz, embed_dim) + attn_output = F.linear(attn_output, out_proj_weight, out_proj_bias) + + if need_weights: + # average attention weights over heads + attn_output_weights = attn_output_weights.view(bsz, num_heads, tgt_len, + src_len) + return attn_output, attn_output_weights # .sum(dim=1) / num_heads + else: + return attn_output, None + + +def cross_attn_layer(tf_layer: BaseTransformerLayer, x, mem, attn_bias): + """Implementation of transformer layer with cross attention. The cross + attention shares the embedding weights with self-attention of tf_layer. 
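The distinctive step in this function is the extra "self" column: each query's logit against its own key is appended to the cross-attention logits, the softmax runs over src_len + 1 entries, and the resulting self weight scales the query's own value projection. A compact sketch (hypothetical sizes, heads already folded into the batch dimension):

    import torch
    import torch.nn.functional as F

    B, T, S, D = 2, 3, 5, 4  # batch*heads, tgt_len, src_len, head_dim
    q, q_k, q_v = torch.randn(B, T, D), torch.randn(B, T, D), torch.randn(B, T, D)
    k, v = torch.randn(B, S, D), torch.randn(B, S, D)

    cross_logits = torch.bmm(q, k.transpose(1, 2))    # (B, T, S)
    self_logit = (q * q_k).sum(dim=-1, keepdim=True)  # (B, T, 1)
    weights = F.softmax(torch.cat([cross_logits, self_logit], dim=-1), dim=-1)
    out = torch.bmm(weights[..., :-1], v) + weights[..., -1:] * q_v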
+ Args: + tf_layer: (TransformerEncoderLayer): The Module of transformer layer. + x (Tensor): query [K,N,C] + mem (Tensor): key and value [L,N,C] + attn_bias (Tensor): attention bias [N*num_head,K,L] + + Return: + x (Tensor): cross attention output [K,N,C] + """ + self_attn_layer = tf_layer.attentions[0].attn + attn_layer_paras = { + 'embed_dim_to_check': self_attn_layer.embed_dim, + 'num_heads': self_attn_layer.num_heads, + 'in_proj_weight': self_attn_layer.in_proj_weight, + 'in_proj_bias': self_attn_layer.in_proj_bias, + 'bias_k': self_attn_layer.bias_k, + 'bias_v': self_attn_layer.bias_v, + 'add_zero_attn': self_attn_layer.add_zero_attn, + 'dropout_p': self_attn_layer.dropout, + 'out_proj_weight': self_attn_layer.out_proj.weight, + 'out_proj_bias': self_attn_layer.out_proj.bias, + 'training': self_attn_layer.training + } + + q_x = tf_layer.norms[0](x) + k_x = v_x = tf_layer.norms[0](mem) + x = x + cross_attn_with_self_bias( + q_x, + k_x, + v_x, + attn_mask=attn_bias, + need_weights=False, + **attn_layer_paras)[0] + x = tf_layer.ffns[0](tf_layer.norms[1](x), identity=x) + return x + + +class LayerNorm2d(nn.Module): + """A LayerNorm variant, popularized by Transformers, that performs point- + wise mean and variance normalization over the channel dimension for inputs + that have shape (batch_size, channels, height, width). + + https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa B950 + """ + + def __init__(self, normalized_shape, eps=1e-6): + super().__init__() + self.weight = nn.Parameter(torch.ones(normalized_shape)) + self.bias = nn.Parameter(torch.zeros(normalized_shape)) + self.eps = eps + self.normalized_shape = (normalized_shape, ) + + def forward(self, x: torch.Tensor): + u = x.mean(1, keepdim=True) + s = (x - u).pow(2).mean(1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.eps) + x = self.weight[:, None, None] * x + self.bias[:, None, None] + return x + + +class MLP(nn.Module): + """Very simple multi-layer perceptron (also called FFN)""" + + def __init__(self, + input_dim, + hidden_dim, + output_dim, + num_layers, + affine_func=nn.Linear): + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList( + affine_func(n, k) + for n, k in zip([input_dim] + h, h + [output_dim])) + + def forward(self, x: torch.Tensor): + for i, layer in enumerate(self.layers): + x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + return x diff --git a/mmseg/models/utils/se_layer.py b/mmseg/models/utils/se_layer.py index 16f52aa5c0..0ff632cfea 100644 --- a/mmseg/models/utils/se_layer.py +++ b/mmseg/models/utils/se_layer.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
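LayerNorm2d normalizes over the channel dimension of channels-first inputs, so it should agree with nn.LayerNorm applied on a channels-last view; a sanity sketch (assumes the module path introduced above, and that both modules keep their default ones/zeros affine parameters):

    import torch
    import torch.nn as nn
    from mmseg.models.utils.san_layers import LayerNorm2d

    x = torch.randn(2, 16, 4, 4)
    ln2d = LayerNorm2d(16)
    ref = nn.LayerNorm(16, eps=1e-6)
    out_ref = ref(x.permute(0, 2, 3, 1)).permute(0, 3, 1, 2)
    assert torch.allclose(ln2d(x), out_ref, atol=1e-5)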
-import mmcv import torch.nn as nn from mmcv.cnn import ConvModule +from mmengine.utils import is_tuple_of from .make_divisible import make_divisible @@ -30,11 +30,11 @@ def __init__(self, conv_cfg=None, act_cfg=(dict(type='ReLU'), dict(type='HSigmoid', bias=3.0, divisor=6.0))): - super(SELayer, self).__init__() + super().__init__() if isinstance(act_cfg, dict): act_cfg = (act_cfg, act_cfg) assert len(act_cfg) == 2 - assert mmcv.is_tuple_of(act_cfg, dict) + assert is_tuple_of(act_cfg, dict) self.global_avgpool = nn.AdaptiveAvgPool2d(1) self.conv1 = ConvModule( in_channels=channels, diff --git a/mmseg/models/utils/self_attention_block.py b/mmseg/models/utils/self_attention_block.py index c945fa7168..5bb6e8284e 100644 --- a/mmseg/models/utils/self_attention_block.py +++ b/mmseg/models/utils/self_attention_block.py @@ -1,6 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. import torch -from mmcv.cnn import ConvModule, constant_init +from mmcv.cnn import ConvModule +from mmengine.model.weight_init import constant_init from torch import nn as nn from torch.nn import functional as F @@ -35,7 +36,7 @@ def __init__(self, key_in_channels, query_in_channels, channels, key_downsample, key_query_num_convs, value_out_num_convs, key_query_norm, value_out_norm, matmul_norm, with_out, conv_cfg, norm_cfg, act_cfg): - super(SelfAttentionBlock, self).__init__() + super().__init__() if share_key_query: assert key_in_channels == query_in_channels self.key_in_channels = key_in_channels diff --git a/mmseg/models/utils/up_conv_block.py b/mmseg/models/utils/up_conv_block.py index d8396d9c2c..4fa3b598de 100644 --- a/mmseg/models/utils/up_conv_block.py +++ b/mmseg/models/utils/up_conv_block.py @@ -57,7 +57,7 @@ def __init__(self, upsample_cfg=dict(type='InterpConv'), dcn=None, plugins=None): - super(UpConvBlock, self).__init__() + super().__init__() assert dcn is None, 'Not implemented yet.' assert plugins is None, 'Not implemented yet.' diff --git a/mmseg/ops/wrappers.py b/mmseg/models/utils/wrappers.py similarity index 97% rename from mmseg/ops/wrappers.py rename to mmseg/models/utils/wrappers.py index ce67e4bebe..abbd0c0296 100644 --- a/mmseg/ops/wrappers.py +++ b/mmseg/models/utils/wrappers.py @@ -34,7 +34,7 @@ def __init__(self, scale_factor=None, mode='nearest', align_corners=None): - super(Upsample, self).__init__() + super().__init__() self.size = size if isinstance(scale_factor, tuple): self.scale_factor = tuple(float(factor) for factor in scale_factor) diff --git a/mmseg/ops/__init__.py b/mmseg/ops/__init__.py deleted file mode 100644 index bc075cd4eb..0000000000 --- a/mmseg/ops/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .encoding import Encoding -from .wrappers import Upsample, resize - -__all__ = ['Upsample', 'resize', 'Encoding'] diff --git a/mmseg/registry/__init__.py b/mmseg/registry/__init__.py index c646b7e5ac..ee514d1a2a 100644 --- a/mmseg/registry/__init__.py +++ b/mmseg/registry/__init__.py @@ -1,13 +1,15 @@ # Copyright (c) OpenMMLab. All rights reserved. 
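These hunks are the mechanical half of the mmcv-to-mmengine migration: the helpers keep their behaviour and only move import paths. For example, the check SELayer relies on still validates every element's type:

    from mmengine.utils import is_tuple_of

    act_cfg = (dict(type='ReLU'), dict(type='HSigmoid', bias=3.0, divisor=6.0))
    assert is_tuple_of(act_cfg, dict)
    assert not is_tuple_of((dict(type='ReLU'), 'HSigmoid'), dict)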
-from .registry import (DATA_SAMPLERS, DATASETS, HOOKS, LOOPS, METRICS, - MODEL_WRAPPERS, MODELS, OPTIM_WRAPPER_CONSTRUCTORS, - OPTIMIZERS, PARAM_SCHEDULERS, RUNNER_CONSTRUCTORS, - RUNNERS, TASK_UTILS, TRANSFORMS, VISBACKENDS, - VISUALIZERS, WEIGHT_INITIALIZERS) +from .registry import (DATA_SAMPLERS, DATASETS, EVALUATOR, HOOKS, INFERENCERS, + LOG_PROCESSORS, LOOPS, METRICS, MODEL_WRAPPERS, MODELS, + OPTIM_WRAPPER_CONSTRUCTORS, OPTIM_WRAPPERS, OPTIMIZERS, + PARAM_SCHEDULERS, RUNNER_CONSTRUCTORS, RUNNERS, + TASK_UTILS, TRANSFORMS, VISBACKENDS, VISUALIZERS, + WEIGHT_INITIALIZERS) __all__ = [ - 'RUNNERS', 'RUNNER_CONSTRUCTORS', 'HOOKS', 'DATASETS', 'DATA_SAMPLERS', - 'TRANSFORMS', 'MODELS', 'WEIGHT_INITIALIZERS', 'OPTIMIZERS', - 'OPTIM_WRAPPER_CONSTRUCTORS', 'TASK_UTILS', 'PARAM_SCHEDULERS', 'METRICS', - 'MODEL_WRAPPERS', 'LOOPS', 'VISBACKENDS', 'VISUALIZERS' + 'HOOKS', 'DATASETS', 'DATA_SAMPLERS', 'TRANSFORMS', 'MODELS', + 'WEIGHT_INITIALIZERS', 'OPTIMIZERS', 'OPTIM_WRAPPER_CONSTRUCTORS', + 'TASK_UTILS', 'PARAM_SCHEDULERS', 'METRICS', 'MODEL_WRAPPERS', + 'VISBACKENDS', 'VISUALIZERS', 'RUNNERS', 'RUNNER_CONSTRUCTORS', 'LOOPS', + 'EVALUATOR', 'LOG_PROCESSORS', 'OPTIM_WRAPPERS', 'INFERENCERS' ] diff --git a/mmseg/registry/registry.py b/mmseg/registry/registry.py index a3fcf40c55..37b6a77609 100644 --- a/mmseg/registry/registry.py +++ b/mmseg/registry/registry.py @@ -1,20 +1,24 @@ # Copyright (c) OpenMMLab. All rights reserved. -"""MMSegmentation provides 17 registry nodes to support using modules across +"""MMSegmentation provides 21 registry nodes to support using modules across projects. Each node is a child of the root registry in MMEngine. More details can be found at -https://mmengine.readthedocs.io/en/latest/tutorials/registry.html. +https://mmengine.readthedocs.io/en/latest/advanced_tutorials/registry.html. 
""" from mmengine.registry import DATA_SAMPLERS as MMENGINE_DATA_SAMPLERS from mmengine.registry import DATASETS as MMENGINE_DATASETS +from mmengine.registry import EVALUATOR as MMENGINE_EVALUATOR from mmengine.registry import HOOKS as MMENGINE_HOOKS +from mmengine.registry import INFERENCERS as MMENGINE_INFERENCERS +from mmengine.registry import LOG_PROCESSORS as MMENGINE_LOG_PROCESSORS from mmengine.registry import LOOPS as MMENGINE_LOOPS from mmengine.registry import METRICS as MMENGINE_METRICS from mmengine.registry import MODEL_WRAPPERS as MMENGINE_MODEL_WRAPPERS from mmengine.registry import MODELS as MMENGINE_MODELS from mmengine.registry import \ OPTIM_WRAPPER_CONSTRUCTORS as MMENGINE_OPTIM_WRAPPER_CONSTRUCTORS +from mmengine.registry import OPTIM_WRAPPERS as MMENGINE_OPTIM_WRAPPERS from mmengine.registry import OPTIMIZERS as MMENGINE_OPTIMIZERS from mmengine.registry import PARAM_SCHEDULERS as MMENGINE_PARAM_SCHEDULERS from mmengine.registry import \ @@ -36,36 +40,79 @@ # manage all kinds of loops like `EpochBasedTrainLoop` LOOPS = Registry('loop', parent=MMENGINE_LOOPS) # manage all kinds of hooks like `CheckpointHook` -HOOKS = Registry('hook', parent=MMENGINE_HOOKS) +HOOKS = Registry( + 'hook', parent=MMENGINE_HOOKS, locations=['mmseg.engine.hooks']) # manage data-related modules -DATASETS = Registry('dataset', parent=MMENGINE_DATASETS) +DATASETS = Registry( + 'dataset', parent=MMENGINE_DATASETS, locations=['mmseg.datasets']) DATA_SAMPLERS = Registry('data sampler', parent=MMENGINE_DATA_SAMPLERS) -TRANSFORMS = Registry('transform', parent=MMENGINE_TRANSFORMS) +TRANSFORMS = Registry( + 'transform', + parent=MMENGINE_TRANSFORMS, + locations=['mmseg.datasets.transforms']) # mangage all kinds of modules inheriting `nn.Module` -MODELS = Registry('model', parent=MMENGINE_MODELS) +MODELS = Registry('model', parent=MMENGINE_MODELS, locations=['mmseg.models']) # mangage all kinds of model wrappers like 'MMDistributedDataParallel' -MODEL_WRAPPERS = Registry('model_wrapper', parent=MMENGINE_MODEL_WRAPPERS) +MODEL_WRAPPERS = Registry( + 'model_wrapper', + parent=MMENGINE_MODEL_WRAPPERS, + locations=['mmseg.models']) # mangage all kinds of weight initialization modules like `Uniform` WEIGHT_INITIALIZERS = Registry( - 'weight initializer', parent=MMENGINE_WEIGHT_INITIALIZERS) + 'weight initializer', + parent=MMENGINE_WEIGHT_INITIALIZERS, + locations=['mmseg.models']) # mangage all kinds of optimizers like `SGD` and `Adam` -OPTIMIZERS = Registry('optimizer', parent=MMENGINE_OPTIMIZERS) +OPTIMIZERS = Registry( + 'optimizer', + parent=MMENGINE_OPTIMIZERS, + locations=['mmseg.engine.optimizers']) +# manage optimizer wrapper +OPTIM_WRAPPERS = Registry( + 'optim_wrapper', + parent=MMENGINE_OPTIM_WRAPPERS, + locations=['mmseg.engine.optimizers']) # manage constructors that customize the optimization hyperparameters. 
OPTIM_WRAPPER_CONSTRUCTORS = Registry( - 'optimizer constructor', parent=MMENGINE_OPTIM_WRAPPER_CONSTRUCTORS) + 'optimizer wrapper constructor', + parent=MMENGINE_OPTIM_WRAPPER_CONSTRUCTORS, + locations=['mmseg.engine.optimizers']) # mangage all kinds of parameter schedulers like `MultiStepLR` PARAM_SCHEDULERS = Registry( - 'parameter scheduler', parent=MMENGINE_PARAM_SCHEDULERS) + 'parameter scheduler', + parent=MMENGINE_PARAM_SCHEDULERS, + locations=['mmseg.engine.schedulers']) + # manage all kinds of metrics -METRICS = Registry('metric', parent=MMENGINE_METRICS) +METRICS = Registry( + 'metric', parent=MMENGINE_METRICS, locations=['mmseg.evaluation']) +# manage evaluator +EVALUATOR = Registry( + 'evaluator', parent=MMENGINE_EVALUATOR, locations=['mmseg.evaluation']) # manage task-specific modules like ohem pixel sampler -TASK_UTILS = Registry('task util', parent=MMENGINE_TASK_UTILS) +TASK_UTILS = Registry( + 'task util', parent=MMENGINE_TASK_UTILS, locations=['mmseg.models']) # manage visualizer -VISUALIZERS = Registry('visualizer', parent=MMENGINE_VISUALIZERS) +VISUALIZERS = Registry( + 'visualizer', + parent=MMENGINE_VISUALIZERS, + locations=['mmseg.visualization']) # manage visualizer backend -VISBACKENDS = Registry('vis_backend', parent=MMENGINE_VISBACKENDS) +VISBACKENDS = Registry( + 'vis_backend', + parent=MMENGINE_VISBACKENDS, + locations=['mmseg.visualization']) + +# manage logprocessor +LOG_PROCESSORS = Registry( + 'log_processor', + parent=MMENGINE_LOG_PROCESSORS, + locations=['mmseg.visualization']) + +# manage inferencer +INFERENCERS = Registry('inferencer', parent=MMENGINE_INFERENCERS) diff --git a/mmseg/data/__init__.py b/mmseg/structures/__init__.py similarity index 100% rename from mmseg/data/__init__.py rename to mmseg/structures/__init__.py diff --git a/mmseg/data/sampler/__init__.py b/mmseg/structures/sampler/__init__.py similarity index 100% rename from mmseg/data/sampler/__init__.py rename to mmseg/structures/sampler/__init__.py diff --git a/mmseg/data/sampler/base_pixel_sampler.py b/mmseg/structures/sampler/base_pixel_sampler.py similarity index 100% rename from mmseg/data/sampler/base_pixel_sampler.py rename to mmseg/structures/sampler/base_pixel_sampler.py diff --git a/mmseg/data/sampler/builder.py b/mmseg/structures/sampler/builder.py similarity index 100% rename from mmseg/data/sampler/builder.py rename to mmseg/structures/sampler/builder.py diff --git a/mmseg/data/sampler/ohem_pixel_sampler.py b/mmseg/structures/sampler/ohem_pixel_sampler.py similarity index 98% rename from mmseg/data/sampler/ohem_pixel_sampler.py rename to mmseg/structures/sampler/ohem_pixel_sampler.py index e5016ffb63..a974273cab 100644 --- a/mmseg/data/sampler/ohem_pixel_sampler.py +++ b/mmseg/structures/sampler/ohem_pixel_sampler.py @@ -23,7 +23,7 @@ class OHEMPixelSampler(BasePixelSampler): """ def __init__(self, context, thresh=None, min_kept=100000): - super(OHEMPixelSampler, self).__init__() + super().__init__() self.context = context assert min_kept > 1 self.thresh = thresh diff --git a/mmseg/data/seg_data_sample.py b/mmseg/structures/seg_data_sample.py similarity index 94% rename from mmseg/data/seg_data_sample.py rename to mmseg/structures/seg_data_sample.py index ad7677e9db..ce68b54743 100644 --- a/mmseg/data/seg_data_sample.py +++ b/mmseg/structures/seg_data_sample.py @@ -1,5 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. 
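The new `locations` argument makes each child registry import the listed modules on first lookup, so config-driven building works without a manual `import mmseg.models`. A hedged sketch (`SegDataPreProcessor` is assumed here as one component registered under mmseg.models):

    from mmseg.registry import MODELS

    # MODELS lazily imports mmseg.models via locations=['mmseg.models'],
    # so the string type resolves without an explicit import.
    preprocessor = MODELS.build(dict(type='SegDataPreProcessor'))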
-from mmengine.data import BaseDataElement, PixelData +from mmengine.structures import BaseDataElement, PixelData class SegDataSample(BaseDataElement): @@ -15,8 +15,8 @@ class SegDataSample(BaseDataElement): Examples: >>> import torch >>> import numpy as np - >>> from mmengine.data import PixelData - >>> from mmseg.core import SegDataSample + >>> from mmengine.structures import PixelData + >>> from mmseg.structures import SegDataSample >>> data_sample = SegDataSample() >>> img_meta = dict(img_shape=(4, 4, 3), diff --git a/mmseg/utils/__init__.py b/mmseg/utils/__init__.py index 3bb1ede520..0a2af58c6e 100644 --- a/mmseg/utils/__init__.py +++ b/mmseg/utils/__init__.py @@ -1,29 +1,70 @@ # Copyright (c) OpenMMLab. All rights reserved. # yapf: disable -from .class_names import (ade_classes, ade_palette, cityscapes_classes, +from .class_names import (ade_classes, ade_palette, bdd100k_classes, + bdd100k_palette, cityscapes_classes, cityscapes_palette, cocostuff_classes, cocostuff_palette, dataset_aliases, get_classes, get_palette, isaid_classes, isaid_palette, loveda_classes, loveda_palette, potsdam_classes, potsdam_palette, stare_classes, stare_palette, - vaihingen_classes, vaihingen_palette, voc_classes, - voc_palette) + synapse_classes, synapse_palette, vaihingen_classes, + vaihingen_palette, voc_classes, voc_palette) # yapf: enable from .collect_env import collect_env +from .get_templates import get_predefined_templates +from .io import datafrombytes from .misc import add_prefix, stack_batch from .set_env import register_all_modules -from .typing import (ConfigType, ForwardResults, MultiConfig, OptConfigType, - OptMultiConfig, OptSampleList, SampleList, TensorDict, - TensorList) +from .tokenizer import tokenize +from .typing_utils import (ConfigType, ForwardResults, MultiConfig, + OptConfigType, OptMultiConfig, OptSampleList, + SampleList, TensorDict, TensorList) + +# isort: off +from .mask_classification import MatchMasks, seg_data_to_instance_data __all__ = [ - 'collect_env', 'register_all_modules', 'stack_batch', 'add_prefix', - 'ConfigType', 'OptConfigType', 'MultiConfig', 'OptMultiConfig', - 'SampleList', 'OptSampleList', 'TensorDict', 'TensorList', - 'ForwardResults', 'cityscapes_classes', 'ade_classes', 'voc_classes', - 'cocostuff_classes', 'loveda_classes', 'potsdam_classes', - 'vaihingen_classes', 'isaid_classes', 'stare_classes', - 'cityscapes_palette', 'ade_palette', 'voc_palette', 'cocostuff_palette', - 'loveda_palette', 'potsdam_palette', 'vaihingen_palette', 'isaid_palette', - 'stare_palette', 'dataset_aliases', 'get_classes', 'get_palette' + 'collect_env', + 'register_all_modules', + 'stack_batch', + 'add_prefix', + 'ConfigType', + 'OptConfigType', + 'MultiConfig', + 'OptMultiConfig', + 'SampleList', + 'OptSampleList', + 'TensorDict', + 'TensorList', + 'ForwardResults', + 'cityscapes_classes', + 'ade_classes', + 'voc_classes', + 'cocostuff_classes', + 'loveda_classes', + 'potsdam_classes', + 'vaihingen_classes', + 'isaid_classes', + 'stare_classes', + 'cityscapes_palette', + 'ade_palette', + 'voc_palette', + 'cocostuff_palette', + 'loveda_palette', + 'potsdam_palette', + 'vaihingen_palette', + 'isaid_palette', + 'stare_palette', + 'dataset_aliases', + 'get_classes', + 'get_palette', + 'datafrombytes', + 'synapse_palette', + 'synapse_classes', + 'get_predefined_templates', + 'tokenize', + 'seg_data_to_instance_data', + 'MatchMasks', + 'bdd100k_classes', + 'bdd100k_palette', ] diff --git a/mmseg/utils/bpe_simple_vocab_16e6.txt.gz b/mmseg/utils/bpe_simple_vocab_16e6.txt.gz new file 
mode 100644 index 0000000000..7b5088a527 Binary files /dev/null and b/mmseg/utils/bpe_simple_vocab_16e6.txt.gz differ diff --git a/mmseg/utils/class_names.py b/mmseg/utils/class_names.py index e3bff62314..161da93a0e 100644 --- a/mmseg/utils/class_names.py +++ b/mmseg/utils/class_names.py @@ -1,5 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. -import mmcv +from mmengine.utils import is_str def cityscapes_classes(): @@ -52,6 +52,21 @@ def voc_classes(): ] +def pcontext_classes(): + """Pascal Context class names for external use.""" + return [ + 'aeroplane', 'bag', 'bed', 'bedclothes', 'bench', 'bicycle', 'bird', + 'boat', 'book', 'bottle', 'building', 'bus', 'cabinet', 'car', 'cat', + 'ceiling', 'chair', 'cloth', 'computer', 'cow', 'cup', 'curtain', + 'dog', 'door', 'fence', 'floor', 'flower', 'food', 'grass', 'ground', + 'horse', 'keyboard', 'light', 'motorbike', 'mountain', 'mouse', + 'person', 'plate', 'platform', 'pottedplant', 'road', 'rock', 'sheep', + 'shelves', 'sidewalk', 'sign', 'sky', 'snow', 'sofa', 'table', 'track', + 'train', 'tree', 'truck', 'tvmonitor', 'wall', 'water', 'window', + 'wood' + ] + + def cocostuff_classes(): """CocoStuff class names for external use.""" return [ @@ -126,6 +141,126 @@ def stare_classes(): return ['background', 'vessel'] +def mapillary_v1_classes(): + """mapillary_v1 class names for external use.""" + return [ + 'Bird', 'Ground Animal', 'Curb', 'Fence', 'Guard Rail', 'Barrier', + 'Wall', 'Bike Lane', 'Crosswalk - Plain', 'Curb Cut', 'Parking', + 'Pedestrian Area', 'Rail Track', 'Road', 'Service Lane', 'Sidewalk', + 'Bridge', 'Building', 'Tunnel', 'Person', 'Bicyclist', 'Motorcyclist', + 'Other Rider', 'Lane Marking - Crosswalk', 'Lane Marking - General', + 'Mountain', 'Sand', 'Sky', 'Snow', 'Terrain', 'Vegetation', 'Water', + 'Banner', 'Bench', 'Bike Rack', 'Billboard', 'Catch Basin', + 'CCTV Camera', 'Fire Hydrant', 'Junction Box', 'Mailbox', 'Manhole', + 'Phone Booth', 'Pothole', 'Street Light', 'Pole', 'Traffic Sign Frame', + 'Utility Pole', 'Traffic Light', 'Traffic Sign (Back)', + 'Traffic Sign (Front)', 'Trash Can', 'Bicycle', 'Boat', 'Bus', 'Car', + 'Caravan', 'Motorcycle', 'On Rails', 'Other Vehicle', 'Trailer', + 'Truck', 'Wheeled Slow', 'Car Mount', 'Ego Vehicle', 'Unlabeled' + ] + + +def mapillary_v1_palette(): + """mapillary_v1_ palette for external use.""" + return [[165, 42, 42], [0, 192, 0], [196, 196, 196], [190, 153, 153], + [180, 165, 180], [90, 120, 150], [102, 102, 156], [128, 64, 255], + [140, 140, 200], [170, 170, 170], [250, 170, 160], [96, 96, 96], + [230, 150, 140], [128, 64, 128], [110, 110, 110], [244, 35, 232], + [150, 100, 100], [70, 70, 70], [150, 120, 90], [220, 20, 60], + [255, 0, 0], [255, 0, 100], [255, 0, 200], [200, 128, 128], + [255, 255, 255], [64, 170, 64], [230, 160, 50], [70, 130, 180], + [190, 255, 255], [152, 251, 152], [107, 142, 35], [0, 170, 30], + [255, 255, 128], [250, 0, 30], [100, 140, 180], [220, 220, 220], + [220, 128, 128], [222, 40, 40], [100, 170, 30], [40, 40, 40], + [33, 33, 33], [100, 128, 160], [142, 0, 0], [70, 100, 150], + [210, 170, 100], [153, 153, 153], [128, 128, 128], [0, 0, 80], + [250, 170, 30], [192, 192, 192], [220, 220, 0], [140, 140, 20], + [119, 11, 32], [150, 0, 255], [0, 60, 100], [0, 0, 142], + [0, 0, 90], [0, 0, 230], [0, 80, 100], [128, 64, 64], [0, 0, 110], + [0, 0, 70], [0, 0, 192], [32, 32, 32], [120, 10, 10], [0, 0, 0]] + + +def mapillary_v2_classes(): + """mapillary_v2 class names for external use.""" + return [ + 'Bird', 'Ground Animal', 'Ambiguous 
Barrier', 'Concrete Block', 'Curb', + 'Fence', 'Guard Rail', 'Barrier', 'Road Median', 'Road Side', + 'Lane Separator', 'Temporary Barrier', 'Wall', 'Bike Lane', + 'Crosswalk - Plain', 'Curb Cut', 'Driveway', 'Parking', + 'Parking Aisle', 'Pedestrian Area', 'Rail Track', 'Road', + 'Road Shoulder', 'Service Lane', 'Sidewalk', 'Traffic Island', + 'Bridge', 'Building', 'Garage', 'Tunnel', 'Person', 'Person Group', + 'Bicyclist', 'Motorcyclist', 'Other Rider', + 'Lane Marking - Dashed Line', 'Lane Marking - Straight Line', + 'Lane Marking - Zigzag Line', 'Lane Marking - Ambiguous', + 'Lane Marking - Arrow (Left)', 'Lane Marking - Arrow (Other)', + 'Lane Marking - Arrow (Right)', + 'Lane Marking - Arrow (Split Left or Straight)', + 'Lane Marking - Arrow (Split Right or Straight)', + 'Lane Marking - Arrow (Straight)', 'Lane Marking - Crosswalk', + 'Lane Marking - Give Way (Row)', 'Lane Marking - Give Way (Single)', + 'Lane Marking - Hatched (Chevron)', + 'Lane Marking - Hatched (Diagonal)', 'Lane Marking - Other', + 'Lane Marking - Stop Line', 'Lane Marking - Symbol (Bicycle)', + 'Lane Marking - Symbol (Other)', 'Lane Marking - Text', + 'Lane Marking (only) - Dashed Line', 'Lane Marking (only) - Crosswalk', + 'Lane Marking (only) - Other', 'Lane Marking (only) - Test', + 'Mountain', 'Sand', 'Sky', 'Snow', 'Terrain', 'Vegetation', 'Water', + 'Banner', 'Bench', 'Bike Rack', 'Catch Basin', 'CCTV Camera', + 'Fire Hydrant', 'Junction Box', 'Mailbox', 'Manhole', 'Parking Meter', + 'Phone Booth', 'Pothole', 'Signage - Advertisement', + 'Signage - Ambiguous', 'Signage - Back', 'Signage - Information', + 'Signage - Other', 'Signage - Store', 'Street Light', 'Pole', + 'Pole Group', 'Traffic Sign Frame', 'Utility Pole', 'Traffic Cone', + 'Traffic Light - General (Single)', 'Traffic Light - Pedestrians', + 'Traffic Light - General (Upright)', + 'Traffic Light - General (Horizontal)', 'Traffic Light - Cyclists', + 'Traffic Light - Other', 'Traffic Sign - Ambiguous', + 'Traffic Sign (Back)', 'Traffic Sign - Direction (Back)', + 'Traffic Sign - Direction (Front)', 'Traffic Sign (Front)', + 'Traffic Sign - Parking', 'Traffic Sign - Temporary (Back)', + 'Traffic Sign - Temporary (Front)', 'Trash Can', 'Bicycle', 'Boat', + 'Bus', 'Car', 'Caravan', 'Motorcycle', 'On Rails', 'Other Vehicle', + 'Trailer', 'Truck', 'Vehicle Group', 'Wheeled Slow', 'Water Valve', + 'Car Mount', 'Dynamic', 'Ego Vehicle', 'Ground', 'Static', 'Unlabeled' + ] + + +def mapillary_v2_palette(): + """mapillary_v2_ palette for external use.""" + return [[165, 42, 42], [0, 192, 0], [250, 170, 31], [250, 170, 32], + [196, 196, 196], [190, 153, 153], [180, 165, 180], [90, 120, 150], + [250, 170, 33], [250, 170, 34], [128, 128, 128], [250, 170, 35], + [102, 102, 156], [128, 64, 255], [140, 140, 200], [170, 170, 170], + [250, 170, 36], [250, 170, 160], [250, 170, 37], [96, 96, 96], + [230, 150, 140], [128, 64, 128], [110, 110, 110], [110, 110, 110], + [244, 35, 232], [128, 196, 128], [150, 100, 100], [70, 70, 70], + [150, 150, 150], [150, 120, 90], [220, 20, 60], [220, 20, 60], + [255, 0, 0], [255, 0, 100], [255, 0, 200], [255, 255, 255], + [255, 255, 255], [250, 170, 29], [250, 170, 28], [250, 170, 26], + [250, 170, 25], [250, 170, 24], [250, 170, 22], [250, 170, 21], + [250, 170, 20], [255, 255, 255], [250, 170, 19], [250, 170, 18], + [250, 170, 12], [250, 170, 11], [255, 255, 255], [255, 255, 255], + [250, 170, 16], [250, 170, 15], [250, 170, 15], [255, 255, 255], + [255, 255, 255], [255, 255, 255], [255, 255, 255], [64, 170, 64], + [230, 160, 
50], [70, 130, 180], [190, 255, 255], [152, 251, 152], + [107, 142, 35], [0, 170, 30], [255, 255, 128], [250, 0, 30], + [100, 140, 180], [220, 128, 128], [222, 40, 40], [100, 170, 30], + [40, 40, 40], [33, 33, 33], [100, 128, 160], [20, 20, 255], + [142, 0, 0], [70, 100, 150], [250, 171, 30], [250, 172, 30], + [250, 173, 30], [250, 174, 30], [250, 175, 30], [250, 176, 30], + [210, 170, 100], [153, 153, 153], [153, 153, 153], [128, 128, 128], + [0, 0, 80], [210, 60, 60], [250, 170, 30], [250, 170, 30], + [250, 170, 30], [250, 170, 30], [250, 170, 30], [250, 170, 30], + [192, 192, 192], [192, 192, 192], [192, 192, 192], [220, 220, 0], + [220, 220, 0], [0, 0, 196], [192, 192, 192], [220, 220, 0], + [140, 140, 20], [119, 11, 32], [150, 0, 255], [0, 60, 100], + [0, 0, 142], [0, 0, 90], [0, 0, 230], [0, 80, 100], [128, 64, 64], + [0, 0, 110], [0, 0, 70], [0, 0, 142], [0, 0, 192], [170, 170, 170], + [32, 32, 32], [111, 74, 0], [120, 10, 10], [81, 0, 81], + [111, 111, 0], [0, 0, 0]] + + def cityscapes_palette(): """Cityscapes palette for external use.""" return [[128, 64, 128], [244, 35, 232], [70, 70, 70], [102, 102, 156], @@ -186,6 +321,25 @@ def voc_palette(): [128, 64, 0], [0, 192, 0], [128, 192, 0], [0, 64, 128]] +def pcontext_palette(): + """Pascal Context palette for external use.""" + return [[180, 120, 120], [6, 230, 230], [80, 50, 50], [4, 200, 3], + [120, 120, 80], [140, 140, 140], [204, 5, 255], [230, 230, 230], + [4, 250, 7], [224, 5, 255], [235, 255, 7], [150, 5, 61], + [120, 120, 70], [8, 255, 51], [255, 6, 82], [143, 255, 140], + [204, 255, 4], [255, 51, 7], [204, 70, 3], [0, 102, 200], + [61, 230, 250], [255, 6, 51], [11, 102, 255], [255, 7, 71], + [255, 9, 224], [9, 7, 230], [220, 220, 220], [255, 9, 92], + [112, 9, 255], [8, 255, 214], [7, 255, 224], [255, 184, 6], + [10, 255, 71], [255, 41, 10], [7, 255, 255], [224, 255, 8], + [102, 8, 255], [255, 61, 6], [255, 194, 7], [255, 122, 8], + [0, 255, 20], [255, 8, 41], [255, 5, 153], [6, 51, 255], + [235, 12, 255], [160, 150, 20], [0, 163, 255], [140, 140, 140], + [250, 10, 15], [20, 255, 0], [31, 255, 0], [255, 31, 0], + [255, 224, 0], [153, 255, 0], [0, 0, 255], [255, 71, 0], + [0, 235, 255], [0, 173, 255], [31, 0, 255]] + + def cocostuff_palette(): """CocoStuff palette for external use.""" return [[0, 192, 64], [0, 192, 64], [0, 64, 96], [128, 192, 192], @@ -265,10 +419,65 @@ def stare_palette(): return [[120, 120, 120], [6, 230, 230]] +def synapse_palette(): + """Synapse palette for external use.""" + return [[0, 0, 0], [0, 0, 255], [0, 255, 0], [255, 0, 0], [0, 255, 255], + [255, 0, 255], [255, 255, 0], [60, 255, 255], [240, 240, 240]] + + +def synapse_classes(): + """Synapse class names for external use.""" + return [ + 'background', 'aorta', 'gallbladder', 'left_kidney', 'right_kidney', + 'liver', 'pancreas', 'spleen', 'stomach' + ] + + +def lip_classes(): + """LIP class names for external use.""" + return [ + 'background', 'hat', 'hair', 'glove', 'sunglasses', 'upperclothes', + 'dress', 'coat', 'socks', 'pants', 'jumpsuits', 'scarf', 'skirt', + 'face', 'leftArm', 'rightArm', 'leftLeg', 'rightLeg', 'leftShoe', + 'rightShoe' + ] + + +def lip_palette(): + """LIP palette for external use.""" + return [ + 'Background', 'Hat', 'Hair', 'Glove', 'Sunglasses', 'UpperClothes', + 'Dress', 'Coat', 'Socks', 'Pants', 'Jumpsuits', 'Scarf', 'Skirt', + 'Face', 'Left-arm', 'Right-arm', 'Left-leg', 'Right-leg', 'Left-shoe', + 'Right-shoe' + ] + + +def bdd100k_classes(): + """BDD100K class names for external use(the class name is compatible 
with + Cityscapes).""" + return [ + 'road', 'sidewalk', 'building', 'wall', 'fence', 'pole', + 'traffic light', 'traffic sign', 'vegetation', 'terrain', 'sky', + 'person', 'rider', 'car', 'truck', 'bus', 'train', 'motorcycle', + 'bicycle' + ] + + +def bdd100k_palette(): + """bdd100k palette for external use (same as Cityscapes).""" + return [[128, 64, 128], [244, 35, 232], [70, 70, 70], [102, 102, 156], + [190, 153, 153], [153, 153, 153], [250, 170, 30], [220, 220, 0], + [107, 142, 35], [152, 251, 152], [70, 130, 180], [220, 20, 60], + [255, 0, 0], [0, 0, 142], [0, 0, 70], [0, 60, 100], [0, 80, 100], + [0, 0, 230], [119, 11, 32]] + + dataset_aliases = { 'cityscapes': ['cityscapes'], 'ade': ['ade', 'ade20k'], 'voc': ['voc', 'pascal_voc', 'voc12', 'voc12aug'], + 'pcontext': ['pcontext', 'pascal_context', 'voc2010'], 'loveda': ['loveda'], 'potsdam': ['potsdam'], 'vaihingen': ['vaihingen'], @@ -278,7 +487,13 @@ 'coco_stuff164k' ], 'isaid': ['isaid', 'iSAID'], - 'stare': ['stare', 'STARE'] + 'stare': ['stare', 'STARE'], + 'lip': ['LIP', 'lip'], + 'mapillary_v1': ['mapillary_v1'], + 'mapillary_v2': ['mapillary_v2'], + 'bdd100k': ['bdd100k'], + 'zero_mould_v1': ['zero_mould_v1'], + 'zero_mould_v2': ['zero_mould_v2'] } @@ -289,7 +504,7 @@ def get_classes(dataset): for alias in aliases: alias2name[alias] = name - if mmcv.is_str(dataset): + if is_str(dataset): if dataset in alias2name: labels = eval(alias2name[dataset] + '_classes()') else: @@ -306,7 +521,7 @@ def get_palette(dataset): for alias in aliases: alias2name[alias] = name - if mmcv.is_str(dataset): + if is_str(dataset): if dataset in alias2name: labels = eval(alias2name[dataset] + '_palette()') else: @@ -314,3 +529,27 @@ else: raise TypeError(f'dataset must be a str, but got {type(dataset)}') return labels + +def zero_mould_v1_classes(): + """Zero-mould v1 class names for external use.""" + return [ + 'background', 'CorrectColoured', 'CorrectUncoloured', 'WrongUncoloured' + ] + +def zero_mould_v1_palette(): + """Zero-mould v1 palette for external use.""" + return [ + [0, 0, 0], + [255, 0, 0], + [0, 200, 100], + [255, 225, 0] + ] + +def zero_mould_v2_classes(): + """Zero-mould v2 class names for external use.""" + return [ + 'background', 'correct-coloured' + ] + +def zero_mould_v2_palette(): + """Zero-mould v2 palette for external use.""" + return [ + [0, 0, 0], + [128, 128, 128] + ] diff --git a/mmseg/utils/collect_env.py b/mmseg/utils/collect_env.py index 3379ecb06b..d5d6ea2902 100644 --- a/mmseg/utils/collect_env.py +++ b/mmseg/utils/collect_env.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. -from mmcv.utils import collect_env as collect_base_env -from mmcv.utils import get_git_hash +from mmengine.utils import get_git_hash +from mmengine.utils.dl_utils import collect_env as collect_base_env import mmseg @@ -15,4 +15,4 @@ def collect_env(): if __name__ == '__main__': for name, val in collect_env().items(): - print('{}: {}'.format(name, val)) + print(f'{name}: {val}') diff --git a/mmseg/utils/get_templates.py b/mmseg/utils/get_templates.py new file mode 100644 index 0000000000..7e9032ba96 --- /dev/null +++ b/mmseg/utils/get_templates.py @@ -0,0 +1,109 @@ +# Copyright (c) OpenMMLab. All rights reserved.
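A quick lookup sketch for the alias table above; `get_classes` and `get_palette` resolve the alias and then `eval` the matching `*_classes()` / `*_palette()` function:

    from mmseg.utils import get_classes, get_palette

    classes = get_classes('bdd100k')  # resolved via dataset_aliases
    palette = get_palette('bdd100k')
    assert len(classes) == len(palette) == 19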
+from typing import List + +PREDEFINED_TEMPLATES = { + 'imagenet': [ + 'a bad photo of a {}.', + 'a photo of many {}.', + 'a sculpture of a {}.', + 'a photo of the hard to see {}.', + 'a low resolution photo of the {}.', + 'a rendering of a {}.', + 'graffiti of a {}.', + 'a bad photo of the {}.', + 'a cropped photo of the {}.', + 'a tattoo of a {}.', + 'the embroidered {}.', + 'a photo of a hard to see {}.', + 'a bright photo of a {}.', + 'a photo of a clean {}.', + 'a photo of a dirty {}.', + 'a dark photo of the {}.', + 'a drawing of a {}.', + 'a photo of my {}.', + 'the plastic {}.', + 'a photo of the cool {}.', + 'a close-up photo of a {}.', + 'a black and white photo of the {}.', + 'a painting of the {}.', + 'a painting of a {}.', + 'a pixelated photo of the {}.', + 'a sculpture of the {}.', + 'a bright photo of the {}.', + 'a cropped photo of a {}.', + 'a plastic {}.', + 'a photo of the dirty {}.', + 'a jpeg corrupted photo of a {}.', + 'a blurry photo of the {}.', + 'a photo of the {}.', + 'a good photo of the {}.', + 'a rendering of the {}.', + 'a {} in a video game.', + 'a photo of one {}.', + 'a doodle of a {}.', + 'a close-up photo of the {}.', + 'a photo of a {}.', + 'the origami {}.', + 'the {} in a video game.', + 'a sketch of a {}.', + 'a doodle of the {}.', + 'a origami {}.', + 'a low resolution photo of a {}.', + 'the toy {}.', + 'a rendition of the {}.', + 'a photo of the clean {}.', + 'a photo of a large {}.', + 'a rendition of a {}.', + 'a photo of a nice {}.', + 'a photo of a weird {}.', + 'a blurry photo of a {}.', + 'a cartoon {}.', + 'art of a {}.', + 'a sketch of the {}.', + 'a embroidered {}.', + 'a pixelated photo of a {}.', + 'itap of the {}.', + 'a jpeg corrupted photo of the {}.', + 'a good photo of a {}.', + 'a plushie {}.', + 'a photo of the nice {}.', + 'a photo of the small {}.', + 'a photo of the weird {}.', + 'the cartoon {}.', + 'art of the {}.', + 'a drawing of the {}.', + 'a photo of the large {}.', + 'a black and white photo of a {}.', + 'the plushie {}.', + 'a dark photo of a {}.', + 'itap of a {}.', + 'graffiti of the {}.', + 'a toy {}.', + 'itap of my {}.', + 'a photo of a cool {}.', + 'a photo of a small {}.', + 'a tattoo of the {}.', + ], + 'vild': [ + 'a photo of a {}.', + 'This is a photo of a {}', + 'There is a {} in the scene', + 'There is the {} in the scene', + 'a photo of a {} in the scene', + 'a photo of a small {}.', + 'a photo of a medium {}.', + 'a photo of a large {}.', + 'This is a photo of a small {}.', + 'This is a photo of a medium {}.', + 'This is a photo of a large {}.', + 'There is a small {} in the scene.', + 'There is a medium {} in the scene.', + 'There is a large {} in the scene.', + ], +} + + +def get_predefined_templates(template_set_name: str) -> List[str]: + if template_set_name not in PREDEFINED_TEMPLATES: + raise ValueError(f'Template set {template_set_name} not found') + return PREDEFINED_TEMPLATES[template_set_name] diff --git a/mmseg/utils/io.py b/mmseg/utils/io.py new file mode 100644 index 0000000000..7029c3cddd --- /dev/null +++ b/mmseg/utils/io.py @@ -0,0 +1,42 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import gzip +import io +import pickle + +import cv2 +import numpy as np + + +def datafrombytes(content: bytes, backend: str = 'numpy') -> np.ndarray: + """Data decoding from bytes. + + Args: + content (bytes): The data bytes got from files or other streams. + backend (str): The data decoding backend type. Options are 'numpy', + 'nifti', 'cv2' and 'pickle'. Defaults to 'numpy'. 
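Each template is a format string with a single `{}` slot for the class name, so prompt expansion is just:

    from mmseg.utils import get_predefined_templates

    templates = get_predefined_templates('vild')
    prompts = [t.format('car') for t in templates]
    assert prompts[0] == 'a photo of a car.'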
+ + Returns: + numpy.ndarray: Loaded data array. + """ + if backend == 'pickle': + data = pickle.loads(content) + else: + with io.BytesIO(content) as f: + if backend == 'nifti': + f = gzip.open(f) + try: + from nibabel import FileHolder, Nifti1Image + except ImportError: + print('nifti files io depends on nibabel, please run' + '`pip install nibabel` to install it') + fh = FileHolder(fileobj=f) + data = Nifti1Image.from_file_map({'header': fh, 'image': fh}) + data = Nifti1Image.from_bytes(data.to_bytes()).get_fdata() + elif backend == 'numpy': + data = np.load(f) + elif backend == 'cv2': + data = np.frombuffer(f.read(), dtype=np.uint8) + data = cv2.imdecode(data, cv2.IMREAD_UNCHANGED) + else: + raise ValueError + return data diff --git a/mmseg/utils/mask_classification.py b/mmseg/utils/mask_classification.py new file mode 100644 index 0000000000..205d525975 --- /dev/null +++ b/mmseg/utils/mask_classification.py @@ -0,0 +1,205 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List, Tuple + +import torch +from mmcv.ops import point_sample +from mmengine.structures import InstanceData +from torch import Tensor + +from mmseg.registry import TASK_UTILS +from mmseg.utils import ConfigType, SampleList + + +def seg_data_to_instance_data(ignore_index: int, + batch_data_samples: SampleList): + """Convert the paradigm of ground truth from semantic segmentation to + instance segmentation. + + Args: + ignore_index (int): The label index to be ignored. + batch_data_samples (List[SegDataSample]): The Data + Samples. It usually includes information such as + `gt_sem_seg`. + + Returns: + tuple[Tensor]: A tuple contains two lists. + - batch_gt_instances (List[InstanceData]): Batch of + gt_instance. It usually includes ``labels``, each is + unique ground truth label id of images, with + shape (num_gt, ) and ``masks``, each is ground truth + masks of each instances of a image, shape (num_gt, h, w). + - batch_img_metas (List[Dict]): List of image meta information. + """ + batch_gt_instances = [] + + for data_sample in batch_data_samples: + gt_sem_seg = data_sample.gt_sem_seg.data + classes = torch.unique( + gt_sem_seg, + sorted=False, + return_inverse=False, + return_counts=False) + + # remove ignored region + gt_labels = classes[classes != ignore_index] + + masks = [] + for class_id in gt_labels: + masks.append(gt_sem_seg == class_id) + + if len(masks) == 0: + gt_masks = torch.zeros( + (0, gt_sem_seg.shape[-2], + gt_sem_seg.shape[-1])).to(gt_sem_seg).long() + else: + gt_masks = torch.stack(masks).squeeze(1).long() + + instance_data = InstanceData(labels=gt_labels, masks=gt_masks) + batch_gt_instances.append(instance_data) + return batch_gt_instances + + +class MatchMasks: + """Match the predictions to category labels. + + Args: + num_points (int): the number of sampled points to compute cost. + num_queries (int): the number of prediction masks. + num_classes (int): the number of classes. + assigner (BaseAssigner): the assigner to compute matching. + """ + + def __init__(self, + num_points: int, + num_queries: int, + num_classes: int, + assigner: ConfigType = None): + assert assigner is not None, "\'assigner\' in decode_head.train_cfg" \ + 'cannot be None' + assert num_points > 0, 'num_points should be a positive integer.' 
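A round-trip sketch for the 'numpy' backend of `datafrombytes` above:

    import io

    import numpy as np
    from mmseg.utils import datafrombytes

    buf = io.BytesIO()
    np.save(buf, np.arange(6).reshape(2, 3))
    data = datafrombytes(buf.getvalue(), backend='numpy')
    assert data.shape == (2, 3)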
+ self.num_points = num_points + self.num_queries = num_queries + self.num_classes = num_classes + self.assigner = TASK_UTILS.build(assigner) + + def get_targets(self, cls_scores: List[Tensor], mask_preds: List[Tensor], + batch_gt_instances: List[InstanceData]) -> Tuple: + """Compute best mask matches for all images for a decoder layer. + + Args: + cls_scores (List[Tensor]): Mask score logits from a single + decoder layer for all images. Each with shape (num_queries, + cls_out_channels). + mask_preds (List[Tensor]): Mask logits from a single decoder + layer for all images. Each with shape (num_queries, h, w). + batch_gt_instances (List[InstanceData]): each contains + ``labels`` and ``masks``. + + Returns: + tuple: a tuple containing the following targets. + + - labels (List[Tensor]): Labels of all images.\ + Each with shape (num_queries, ). + - mask_targets (List[Tensor]): Mask targets of\ + all images. Each with shape (num_queries, h, w). + - mask_weights (List[Tensor]): Mask weights of\ + all images. Each with shape (num_queries, ). + - avg_factor (int): Average factor that is used to + average the loss. `avg_factor` is usually equal + to the number of positive priors. + """ + batch_size = cls_scores.shape[0] + results = dict({ + 'labels': [], + 'mask_targets': [], + 'mask_weights': [], + }) + for i in range(batch_size): + labels, mask_targets, mask_weights\ + = self._get_targets_single(cls_scores[i], + mask_preds[i], + batch_gt_instances[i]) + results['labels'].append(labels) + results['mask_targets'].append(mask_targets) + results['mask_weights'].append(mask_weights) + + # shape (batch_size, num_queries) + labels = torch.stack(results['labels'], dim=0) + # shape (batch_size, num_gts, h, w) + mask_targets = torch.cat(results['mask_targets'], dim=0) + # shape (batch_size, num_queries) + mask_weights = torch.stack(results['mask_weights'], dim=0) + + avg_factor = sum( + [len(gt_instances.labels) for gt_instances in batch_gt_instances]) + + res = (labels, mask_targets, mask_weights, avg_factor) + + return res + + def _get_targets_single(self, cls_score: Tensor, mask_pred: Tensor, + gt_instances: InstanceData) \ + -> Tuple[Tensor, Tensor, Tensor]: + """Compute a set of best mask matches for one image. + + Args: + cls_score (Tensor): Mask score logits from a single decoder layer + for one image. Shape (num_queries, cls_out_channels). + mask_pred (Tensor): Mask logits for a single decoder layer for one + image. Shape (num_queries, h, w). + gt_instances (:obj:`InstanceData`): It contains ``labels`` and + ``masks``. + + Returns: + tuple[Tensor]: A tuple containing the following for one image. + + - labels (Tensor): Labels of each image. \ + shape (num_queries, ). + - mask_targets (Tensor): Mask targets of each image. \ + shape (num_queries, h, w). + - mask_weights (Tensor): Mask weights of each image. \ + shape (num_queries, ). 
+ """ + gt_labels = gt_instances.labels + gt_masks = gt_instances.masks + # when "gt_labels" is empty, classify all queries to background + if len(gt_labels) == 0: + labels = gt_labels.new_full((self.num_queries, ), + self.num_classes, + dtype=torch.long) + mask_targets = gt_labels + mask_weights = gt_labels.new_zeros((self.num_queries, )) + return labels, mask_targets, mask_weights + # sample points + num_queries = cls_score.shape[0] + num_gts = gt_labels.shape[0] + + point_coords = torch.rand((1, self.num_points, 2), + device=cls_score.device) + # shape (num_queries, num_points) + mask_points_pred = point_sample( + mask_pred.unsqueeze(1), point_coords.repeat(num_queries, 1, + 1)).squeeze(1) + # shape (num_gts, num_points) + gt_points_masks = point_sample( + gt_masks.unsqueeze(1).float(), point_coords.repeat(num_gts, 1, + 1)).squeeze(1) + + sampled_gt_instances = InstanceData( + labels=gt_labels, masks=gt_points_masks) + sampled_pred_instances = InstanceData( + scores=cls_score, masks=mask_points_pred) + # assign and sample + matched_quiery_inds, matched_label_inds = self.assigner.assign( + pred_instances=sampled_pred_instances, + gt_instances=sampled_gt_instances) + labels = gt_labels.new_full((self.num_queries, ), + self.num_classes, + dtype=torch.long) + labels[matched_quiery_inds] = gt_labels[matched_label_inds] + + mask_weights = gt_labels.new_zeros((self.num_queries, )) + mask_weights[matched_quiery_inds] = 1 + mask_targets = gt_masks[matched_label_inds] + + return labels, mask_targets, mask_weights diff --git a/mmseg/utils/misc.py b/mmseg/utils/misc.py index e15b1e0f80..dfc469e832 100644 --- a/mmseg/utils/misc.py +++ b/mmseg/utils/misc.py @@ -5,7 +5,7 @@ import torch import torch.nn.functional as F -from .typing import SampleList +from .typing_utils import SampleList def add_prefix(inputs, prefix): @@ -28,7 +28,7 @@ def add_prefix(inputs, prefix): def stack_batch(inputs: List[torch.Tensor], - batch_data_samples: Optional[SampleList] = None, + data_samples: Optional[SampleList] = None, size: Optional[tuple] = None, size_divisor: Optional[int] = None, pad_val: Union[int, float] = 0, @@ -39,8 +39,8 @@ def stack_batch(inputs: List[torch.Tensor], Args: inputs (List[Tensor]): The input multiple tensors. each is a CHW 3D-tensor. - batch_data_samples (list[:obj:`SegDataSample`]): The Data - Samples. It usually includes information such as `gt_sem_seg`. + data_samples (list[:obj:`SegDataSample`]): The list of data samples. + It usually includes information such as `gt_sem_seg`. size (tuple, optional): Fixed padding size. size_divisor (int, optional): The divisor of padded size. pad_val (int, float): The padding value. Defaults to 0 @@ -48,17 +48,16 @@ def stack_batch(inputs: List[torch.Tensor], Returns: Tensor: The 4D-tensor. - batch_data_samples (list[:obj:`SegDataSample`]): After the padding of - the gt_seg_map. + List[:obj:`SegDataSample`]: After the padding of the gt_seg_map. 
""" assert isinstance(inputs, list), \ f'Expected input type to be list, but got {type(inputs)}' - assert len(set([tensor.ndim for tensor in inputs])) == 1, \ + assert len({tensor.ndim for tensor in inputs}) == 1, \ f'Expected the dimensions of all inputs must be the same, ' \ f'but got {[tensor.ndim for tensor in inputs]}' assert inputs[0].ndim == 3, f'Expected tensor dimension to be 3, ' \ f'but got {inputs[0].ndim}' - assert len(set([tensor.shape[0] for tensor in inputs])) == 1, \ + assert len({tensor.shape[0] for tensor in inputs}) == 1, \ f'Expected the channels of all inputs must be the same, ' \ f'but got {[tensor.shape[0] for tensor in inputs]}' @@ -93,16 +92,37 @@ def stack_batch(inputs: List[torch.Tensor], pad_img = F.pad(tensor, padding_size, value=pad_val) padded_inputs.append(pad_img) # pad gt_sem_seg - if batch_data_samples is not None: - data_sample = batch_data_samples[i] - gt_sem_seg = data_sample.gt_sem_seg.data - del data_sample.gt_sem_seg.data - data_sample.gt_sem_seg.data = F.pad( - gt_sem_seg, padding_size, value=seg_pad_val) - data_sample.set_metainfo( - {'pad_shape': data_sample.gt_sem_seg.shape}) + if data_samples is not None: + data_sample = data_samples[i] + pad_shape = None + if 'gt_sem_seg' in data_sample: + gt_sem_seg = data_sample.gt_sem_seg.data + del data_sample.gt_sem_seg.data + data_sample.gt_sem_seg.data = F.pad( + gt_sem_seg, padding_size, value=seg_pad_val) + pad_shape = data_sample.gt_sem_seg.shape + if 'gt_edge_map' in data_sample: + gt_edge_map = data_sample.gt_edge_map.data + del data_sample.gt_edge_map.data + data_sample.gt_edge_map.data = F.pad( + gt_edge_map, padding_size, value=seg_pad_val) + pad_shape = data_sample.gt_edge_map.shape + if 'gt_depth_map' in data_sample: + gt_depth_map = data_sample.gt_depth_map.data + del data_sample.gt_depth_map.data + data_sample.gt_depth_map.data = F.pad( + gt_depth_map, padding_size, value=seg_pad_val) + pad_shape = data_sample.gt_depth_map.shape + data_sample.set_metainfo({ + 'img_shape': tensor.shape[-2:], + 'pad_shape': pad_shape, + 'padding_size': padding_size + }) padded_samples.append(data_sample) else: - padded_samples = None + padded_samples.append( + dict( + img_padding_size=padding_size, + pad_shape=pad_img.shape[-2:])) return torch.stack(padded_inputs, dim=0), padded_samples diff --git a/mmseg/utils/set_env.py b/mmseg/utils/set_env.py index 1063a8a73d..c948950d62 100644 --- a/mmseg/utils/set_env.py +++ b/mmseg/utils/set_env.py @@ -16,11 +16,11 @@ def register_all_modules(init_default_scope: bool = True) -> None: to https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/registry.md Defaults to True. """ # noqa - import mmseg.data # noqa: F401,F403 import mmseg.datasets # noqa: F401,F403 import mmseg.engine # noqa: F401,F403 - import mmseg.metrics # noqa: F401,F403 + import mmseg.evaluation # noqa: F401,F403 import mmseg.models # noqa: F401,F403 + import mmseg.structures # noqa: F401,F403 if init_default_scope: never_created = DefaultScope.get_current_instance() is None \ diff --git a/mmseg/utils/tokenizer.py b/mmseg/utils/tokenizer.py new file mode 100644 index 0000000000..d56f5fae60 --- /dev/null +++ b/mmseg/utils/tokenizer.py @@ -0,0 +1,240 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""CLIP tokenizer. + +Copied from https://github.com/openai/CLIP. Originally MIT License, Copyright +(c) 2021 OpenAI. 
+""" +import gzip +import html +import os +from functools import lru_cache +from typing import List, Union + +import ftfy +import regex as re +import torch + +os.environ['TOKENIZERS_PARALLELISM'] = 'false' + + +@lru_cache() +def default_bpe(): + return os.path.join( + os.path.dirname(os.path.abspath(__file__)), + 'bpe_simple_vocab_16e6.txt.gz') + + +@lru_cache() +def bytes_to_unicode(): + """Returns list of utf-8 byte and a corresponding list of unicode strings. + + The reversible bpe codes work on unicode strings. This means you need a + large # of unicode characters in your vocab if you want to avoid UNKs. When + you're at something like a 10B token dataset you end up needing around 5K + for decent coverage. This is a significant percentage of your normal, say, + 32K bpe vocab. To avoid that, we want lookup tables between utf-8 bytes and + unicode strings. And avoids mapping to whitespace/control characters the + bpe code barfs on. + """ + bs = list(range(ord('!'), + ord('~') + 1)) + list(range( + ord('¡'), + ord('¬') + 1)) + list(range(ord('®'), + ord('ÿ') + 1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length + strings). + """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +def basic_clean(text): + text = ftfy.fix_text(text) + text = html.unescape(html.unescape(text)) + return text.strip() + + +def whitespace_clean(text): + text = re.sub(r'\s+', ' ', text) + text = text.strip() + return text + + +class SimpleTokenizer: + + def __init__(self, bpe_path: str = default_bpe(), special_tokens=None): + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + merges = gzip.open(bpe_path).read().decode('utf-8').split('\n') + merges = merges[1:49152 - 256 - 2 + 1] + merges = [tuple(merge.split()) for merge in merges] + vocab = list(bytes_to_unicode().values()) + vocab = vocab + [v + '' for v in vocab] + for merge in merges: + vocab.append(''.join(merge)) + if not special_tokens: + special_tokens = ['', ''] + else: + special_tokens = ['', '' + ] + special_tokens + vocab.extend(special_tokens) + self.encoder = dict(zip(vocab, range(len(vocab)))) + self.decoder = {v: k for k, v in self.encoder.items()} + self.bpe_ranks = dict(zip(merges, range(len(merges)))) + self.cache = {t: t for t in special_tokens} + special = '|'.join(special_tokens) + self.pat = re.compile( + special + + r"""|'s|'t|'re|'ve|'m|'ll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", + re.IGNORECASE) + + self.vocab_size = len(self.encoder) + self.all_special_ids = [self.encoder[t] for t in special_tokens] + + def bpe(self, token): + if token in self.cache: + return self.cache[token] + word = tuple(token[:-1]) + (token[-1] + '', ) + pairs = get_pairs(word) + + if not pairs: + return token + '' + + while True: + bigram = min( + pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except: # noqa: E722, E261 + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[ + i + 1] == second: + new_word.append(first + second) + i += 2 
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = ' '.join(word)
+        self.cache[token] = word
+        return word
+
+    def encode(self, text):
+        bpe_tokens = []
+        text = whitespace_clean(basic_clean(text)).lower()
+        for token in re.findall(self.pat, text):
+            token = ''.join(self.byte_encoder[b]
+                            for b in token.encode('utf-8'))
+            bpe_tokens.extend(self.encoder[bpe_token]
+                              for bpe_token in self.bpe(token).split(' '))
+        return bpe_tokens
+
+    def decode(self, tokens):
+        text = ''.join([self.decoder[token] for token in tokens])
+        text = bytearray([self.byte_decoder[c] for c in text]).decode(
+            'utf-8', errors='replace').replace('</w>', ' ')
+        return text
+
+
+_tokenizer = SimpleTokenizer()
+
+
+def decode(output_ids: torch.Tensor):
+    output_ids = output_ids.cpu().numpy()
+    return _tokenizer.decode(output_ids)
+
+
+def tokenize(texts: Union[str, List[str]],
+             context_length: int = 77) -> torch.LongTensor:
+    """Returns the tokenized representation of given input string(s).
+
+    Parameters
+    ----------
+    texts : Union[str, List[str]]
+        An input string or a list of input strings to tokenize
+    context_length : int
+        The context length to use; all CLIP models use 77 as the context
+        length
+
+    Returns
+    -------
+    A two-dimensional tensor containing the resulting tokens,
+    shape = [number of input strings, context_length]
+    """
+    if isinstance(texts, str):
+        texts = [texts]
+
+    sot_token = _tokenizer.encoder['<start_of_text>']
+    eot_token = _tokenizer.encoder['<end_of_text>']
+    all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token]
+                  for text in texts]
+    result = torch.zeros(len(all_tokens), context_length, dtype=torch.long)
+
+    for i, tokens in enumerate(all_tokens):
+        if len(tokens) > context_length:
+            tokens = tokens[:context_length]  # Truncate
+            tokens[-1] = eot_token
+        result[i, :len(tokens)] = torch.tensor(tokens)
+
+    return result
+
+
+class HFTokenizer:
+    """HuggingFace tokenizer wrapper."""
+
+    def __init__(self, tokenizer_name: str):
+        from transformers import AutoTokenizer
+        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
+
+    def save_pretrained(self, dest):
+        self.tokenizer.save_pretrained(dest)
+
+    def __call__(self,
+                 texts: Union[str, List[str]],
+                 context_length: int = 77) -> torch.Tensor:
+        # Same cleaning as for the default tokenizer, except lowercasing:
+        # adding lower() (for case-sensitive tokenizers) would make it
+        # more robust but less sensitive to nuance.
+        if isinstance(texts, str):
+            texts = [texts]
+        texts = [whitespace_clean(basic_clean(text)) for text in texts]
+        input_ids = self.tokenizer(
+            texts,
+            return_tensors='pt',
+            max_length=context_length,
+            padding='max_length',
+            truncation=True,
+        ).input_ids
+        return input_ids
diff --git a/mmseg/utils/typing.py b/mmseg/utils/typing.py
deleted file mode 100644
index 4f148dc71f..0000000000
--- a/mmseg/utils/typing.py
+++ /dev/null
@@ -1,25 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-"""Collecting some commonly used type hint in mmflow.""" -from typing import Dict, List, Optional, Sequence, Tuple, Union - -import torch -from mmengine.config import ConfigDict - -from mmseg.data import SegDataSample - -# Type hint of config data -ConfigType = Union[ConfigDict, dict] -OptConfigType = Optional[ConfigType] -# Type hint of one or more config data -MultiConfig = Union[ConfigType, Sequence[ConfigType]] -OptMultiConfig = Optional[MultiConfig] - -SampleList = Sequence[SegDataSample] -OptSampleList = Optional[SampleList] - -# Type hint of Tensor -TensorDict = Dict[str, torch.Tensor] -TensorList = Sequence[torch.Tensor] - -ForwardResults = Union[Dict[str, torch.Tensor], List[SegDataSample], - Tuple[torch.Tensor], torch.Tensor] diff --git a/mmseg/utils/typing_utils.py b/mmseg/utils/typing_utils.py new file mode 100644 index 0000000000..fba7d3b92b --- /dev/null +++ b/mmseg/utils/typing_utils.py @@ -0,0 +1,25 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Collecting some commonly used type hint in mmflow.""" +from typing import Dict, List, Optional, Sequence, Tuple, Union + +import torch +from mmengine.config import ConfigDict + +from mmseg.structures import SegDataSample + +# Type hint of config data +ConfigType = Union[ConfigDict, dict] +OptConfigType = Optional[ConfigType] +# Type hint of one or more config data +MultiConfig = Union[ConfigType, Sequence[ConfigType]] +OptMultiConfig = Optional[MultiConfig] + +SampleList = Sequence[SegDataSample] +OptSampleList = Optional[SampleList] + +# Type hint of Tensor +TensorDict = Dict[str, torch.Tensor] +TensorList = Sequence[torch.Tensor] + +ForwardResults = Union[Dict[str, torch.Tensor], List[SegDataSample], + Tuple[torch.Tensor], torch.Tensor] diff --git a/mmseg/version.py b/mmseg/version.py index e05146f0a0..b76bb4580d 100644 --- a/mmseg/version.py +++ b/mmseg/version.py @@ -1,6 +1,6 @@ # Copyright (c) Open-MMLab. All rights reserved. -__version__ = '0.24.1' +__version__ = '1.2.2' def parse_version_info(version_str): diff --git a/mmseg/visualization/__init__.py b/mmseg/visualization/__init__.py new file mode 100644 index 0000000000..8cbb211e52 --- /dev/null +++ b/mmseg/visualization/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .local_visualizer import SegLocalVisualizer + +__all__ = ['SegLocalVisualizer'] diff --git a/mmseg/visualization/local_visualizer.py b/mmseg/visualization/local_visualizer.py new file mode 100644 index 0000000000..ee3d652c7b --- /dev/null +++ b/mmseg/visualization/local_visualizer.py @@ -0,0 +1,349 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Dict, List, Optional + +import cv2 +import mmcv +import numpy as np +import torch +from mmengine.dist import master_only +from mmengine.structures import PixelData +from mmengine.visualization import Visualizer + +from mmseg.registry import VISUALIZERS +from mmseg.structures import SegDataSample +from mmseg.utils import get_classes, get_palette + + +@VISUALIZERS.register_module() +class SegLocalVisualizer(Visualizer): + """Local Visualizer. + + Args: + name (str): Name of the instance. Defaults to 'visualizer'. + image (np.ndarray, optional): the origin image to draw. The format + should be RGB. Defaults to None. + vis_backends (list, optional): Visual backend config list. + Defaults to None. + save_dir (str, optional): Save file dir for all storage backends. + If it is None, the backend storage will not save any data. 
+        classes (list, optional): Input classes for result rendering, as the
+            prediction of segmentation model is a segment map with label
+            indices, `classes` is a list which includes items corresponding
+            to the label indices. If classes is not defined, visualizer will
+            take `cityscapes` classes by default. Defaults to None.
+        palette (list, optional): Input palette for result rendering, which
+            is a list of colors corresponding to the classes. Defaults to
+            None.
+        dataset_name (str, optional): Dataset name or alias. The visualizer
+            will use the meta information of the dataset, i.e. classes and
+            palette, but the `classes` and `palette` have higher priority.
+            Defaults to None.
+        alpha (int, float): The transparency of segmentation mask.
+            Defaults to 0.8.
+
+    Examples:
+        >>> import numpy as np
+        >>> import torch
+        >>> from mmengine.structures import PixelData
+        >>> from mmseg.structures import SegDataSample
+        >>> from mmseg.visualization import SegLocalVisualizer
+
+        >>> seg_local_visualizer = SegLocalVisualizer()
+        >>> image = np.random.randint(0, 256,
+        ...                           size=(10, 12, 3)).astype('uint8')
+        >>> gt_sem_seg_data = dict(data=torch.randint(0, 2, (1, 10, 12)))
+        >>> gt_sem_seg = PixelData(**gt_sem_seg_data)
+        >>> gt_seg_data_sample = SegDataSample()
+        >>> gt_seg_data_sample.gt_sem_seg = gt_sem_seg
+        >>> seg_local_visualizer.dataset_meta = dict(
+        >>>     classes=('background', 'foreground'),
+        >>>     palette=[[120, 120, 120], [6, 230, 230]])
+        >>> seg_local_visualizer.add_datasample('visualizer_example',
+        ...                                     image, gt_seg_data_sample)
+        >>> seg_local_visualizer.add_datasample(
+        ...                        'visualizer_example', image,
+        ...                        gt_seg_data_sample, show=True)
+    """  # noqa
+
+    def __init__(self,
+                 name: str = 'visualizer',
+                 image: Optional[np.ndarray] = None,
+                 vis_backends: Optional[Dict] = None,
+                 save_dir: Optional[str] = None,
+                 classes: Optional[List] = None,
+                 palette: Optional[List] = None,
+                 dataset_name: Optional[str] = None,
+                 alpha: float = 0.8,
+                 **kwargs):
+        super().__init__(name, image, vis_backends, save_dir, **kwargs)
+        self.alpha: float = alpha
+        # keep the argument order consistent with set_dataset_meta's
+        # signature: (classes, palette, dataset_name)
+        self.set_dataset_meta(classes, palette, dataset_name)
+
+    def _get_center_loc(self, mask: np.ndarray) -> np.ndarray:
+        """Get the center coordinate of a semantic segmentation mask.
+
+        Args:
+            mask (np.ndarray): A binary mask taken from ``sem_seg``.
+        """
+        loc = np.argwhere(mask == 1)
+
+        loc_sort = np.array(
+            sorted(loc.tolist(), key=lambda row: (row[0], row[1])))
+        y_list = loc_sort[:, 0]
+        unique, indices, counts = np.unique(
+            y_list, return_index=True, return_counts=True)
+        y_loc = unique[counts.argmax()]
+        y_most_freq_loc = loc[loc_sort[:, 0] == y_loc]
+        center_num = len(y_most_freq_loc) // 2
+        x = y_most_freq_loc[center_num][1]
+        y = y_most_freq_loc[center_num][0]
+        return np.array([x, y])
+
+    def _draw_sem_seg(self,
+                      image: np.ndarray,
+                      sem_seg: PixelData,
+                      classes: Optional[List],
+                      palette: Optional[List],
+                      with_labels: Optional[bool] = True) -> np.ndarray:
+        """Draw semantic seg of GT or prediction.
+
+        Args:
+            image (np.ndarray): The image to draw.
+            sem_seg (:obj:`PixelData`): Data structure for pixel-level
+                annotations or predictions.
+            classes (list, optional): Input classes for result rendering, as
+                the prediction of segmentation model is a segment map with
+                label indices, `classes` is a list which includes items
+                corresponding to the label indices. If classes is not
+                defined, visualizer will take `cityscapes` classes by
+                default. Defaults to None.
+            palette (list, optional): Input palette for result rendering,
+                which is a list of colors corresponding to the classes.
+                Defaults to None.
+            with_labels (bool, optional): Whether to add semantic labels to
+                the visualization result. Defaults to True.
+
+        Returns:
+            np.ndarray: The drawn image whose channel order is RGB.
+        """
+        num_classes = len(classes)
+
+        sem_seg = sem_seg.cpu().data
+        ids = np.unique(sem_seg)[::-1]
+        legal_indices = ids < num_classes
+        ids = ids[legal_indices]
+        labels = np.array(ids, dtype=np.int64)
+
+        colors = [palette[label] for label in labels]
+
+        mask = np.zeros_like(image, dtype=np.uint8)
+        for label, color in zip(labels, colors):
+            mask[sem_seg[0] == label, :] = color
+
+        if with_labels:
+            font = cv2.FONT_HERSHEY_SIMPLEX
+            # (0,1] to change the size of the text relative to the image
+            scale = 0.05
+            fontScale = min(image.shape[0], image.shape[1]) / (25 / scale)
+            fontColor = (255, 255, 255)
+            if image.shape[0] < 300 or image.shape[1] < 300:
+                thickness = 1
+                rectangleThickness = 1
+            else:
+                thickness = 2
+                rectangleThickness = 2
+            lineType = 2
+
+            if isinstance(sem_seg[0], torch.Tensor):
+                masks = sem_seg[0].numpy() == labels[:, None, None]
+            else:
+                masks = sem_seg[0] == labels[:, None, None]
+            masks = masks.astype(np.uint8)
+            for mask_num in range(len(labels)):
+                classes_id = labels[mask_num]
+                classes_color = colors[mask_num]
+                loc = self._get_center_loc(masks[mask_num])
+                text = classes[classes_id]
+                (label_width, label_height), baseline = cv2.getTextSize(
+                    text, font, fontScale, thickness)
+                mask = cv2.rectangle(mask, loc,
+                                     (loc[0] + label_width + baseline,
+                                      loc[1] + label_height + baseline),
+                                     classes_color, -1)
+                mask = cv2.rectangle(mask, loc,
+                                     (loc[0] + label_width + baseline,
+                                      loc[1] + label_height + baseline),
+                                     (0, 0, 0), rectangleThickness)
+                mask = cv2.putText(mask, text, (loc[0], loc[1] + label_height),
+                                   font, fontScale, fontColor, thickness,
+                                   lineType)
+        color_seg = (image * (1 - self.alpha) + mask * self.alpha).astype(
+            np.uint8)
+        self.set_image(color_seg)
+        return color_seg
+
+    def _draw_depth_map(self, image: np.ndarray,
+                        depth_map: PixelData) -> np.ndarray:
+        """Draws a depth map on a given image.
+
+        This function takes an image and a depth map as input,
+        renders the depth map, and concatenates it with the original image.
+        Finally, it updates the internal image state of the visualizer with
+        the concatenated result.
+
+        Args:
+            image (np.ndarray): The original image where the depth map will
+                be drawn. The array should be in the format HxWx3 where H is
+                the height, W is the width.
+            depth_map (PixelData): Depth map to be drawn. The depth map
+                should be in the form of a PixelData object. It will be
+                converted to a torch tensor if it is a numpy array.
+
+        Returns:
+            np.ndarray: The concatenated image with the depth map drawn.
+
+        Example:
+            >>> depth_map_data = PixelData(data=torch.rand(1, 10, 10))
+            >>> image = np.random.randint(0, 256,
+            >>>                           size=(10, 10, 3)).astype('uint8')
+            >>> visualizer = SegLocalVisualizer()
+            >>> visualizer._draw_depth_map(image, depth_map_data)
+        """
+        depth_map = depth_map.cpu().data
+        if isinstance(depth_map, np.ndarray):
+            depth_map = torch.from_numpy(depth_map)
+        if depth_map.ndim == 2:
+            depth_map = depth_map[None]
+
+        depth_map = self.draw_featmap(depth_map, resize_shape=image.shape[:2])
+        out_image = np.concatenate((image, depth_map), axis=0)
+        self.set_image(out_image)
+        return out_image
+
+    def set_dataset_meta(self,
+                         classes: Optional[List] = None,
+                         palette: Optional[List] = None,
+                         dataset_name: Optional[str] = None) -> None:
+        """Set meta information for the visualizer.
+
+        Args:
+            classes (list, optional): Input classes for result rendering, as
+                the prediction of segmentation model is a segment map with
+                label indices, `classes` is a list which includes items
+                corresponding to the label indices. If classes is not
+                defined, visualizer will take `cityscapes` classes by
+                default. Defaults to None.
+            palette (list, optional): Input palette for result rendering,
+                which is a list of colors corresponding to the classes.
+                Defaults to None.
+            dataset_name (str, optional): Dataset name or alias. The
+                visualizer will use the meta information of the dataset,
+                i.e. classes and palette, but the `classes` and `palette`
+                have higher priority. Defaults to None.
+        """
+        # Set default value. When calling
+        # `SegLocalVisualizer().dataset_meta=xxx`,
+        # it will override the default value.
+        if dataset_name is None:
+            dataset_name = 'cityscapes'
+        classes = classes if classes else get_classes(dataset_name)
+        palette = palette if palette else get_palette(dataset_name)
+        assert len(classes) == len(palette), \
+            'The length of classes should be equal to the length of palette'
+        self.dataset_meta: dict = {'classes': classes, 'palette': palette}
+
+    @master_only
+    def add_datasample(
+            self,
+            name: str,
+            image: np.ndarray,
+            data_sample: Optional[SegDataSample] = None,
+            draw_gt: bool = True,
+            draw_pred: bool = True,
+            show: bool = False,
+            wait_time: float = 0,
+            # TODO: Supported in mmengine's Visualizer.
+            out_file: Optional[str] = None,
+            step: int = 0,
+            with_labels: Optional[bool] = True) -> None:
+        """Draw datasample and save to all backends.
+
+        - If GT and prediction are plotted at the same time, they are
+          displayed in a stitched image where the left image is the
+          ground truth and the right image is the prediction.
+        - If ``show`` is True, all storage backends are ignored, and
+          the images will be displayed in a local window.
+        - If ``out_file`` is specified, the drawn image will be
+          saved to ``out_file``. It is usually used when the display
+          is not available.
+
+        Args:
+            name (str): The image identifier.
+            image (np.ndarray): The image to draw.
+            data_sample (:obj:`SegDataSample`, optional): The SegDataSample
+                holding the GT and/or prediction results to draw.
+                Defaults to None.
+            draw_gt (bool): Whether to draw GT SegDataSample.
+                Defaults to True.
+            draw_pred (bool): Whether to draw Prediction SegDataSample.
+                Defaults to True.
+            show (bool): Whether to display the drawn image.
+                Defaults to False.
+            wait_time (float): The interval of show (s). Defaults to 0.
+            out_file (str): Path to output file. Defaults to None.
+            step (int): Global step value to record. Defaults to 0.
+            with_labels (bool, optional): Whether to add semantic labels to
+                the visualization result. Defaults to True.
+        """
+        classes = self.dataset_meta.get('classes', None)
+        palette = self.dataset_meta.get('palette', None)
+
+        gt_img_data = None
+        pred_img_data = None
+
+        if draw_gt and data_sample is not None:
+            if 'gt_sem_seg' in data_sample:
+                assert classes is not None, 'class information is ' \
+                                            'not provided when ' \
+                                            'visualizing semantic ' \
+                                            'segmentation results.'
+                gt_img_data = self._draw_sem_seg(image, data_sample.gt_sem_seg,
+                                                 classes, palette, with_labels)
+
+            if 'gt_depth_map' in data_sample:
+                gt_img_data = gt_img_data if gt_img_data is not None else image
+                gt_img_data = self._draw_depth_map(gt_img_data,
+                                                   data_sample.gt_depth_map)
+
+        if draw_pred and data_sample is not None:
+
+            if 'pred_sem_seg' in data_sample:
+
+                assert classes is not None, 'class information is ' \
+                                            'not provided when ' \
+                                            'visualizing semantic ' \
+                                            'segmentation results.'
+                pred_img_data = self._draw_sem_seg(image,
+                                                   data_sample.pred_sem_seg,
+                                                   classes, palette,
+                                                   with_labels)
+
+            if 'pred_depth_map' in data_sample:
+                pred_img_data = pred_img_data if pred_img_data is not None \
+                    else image
+                pred_img_data = self._draw_depth_map(
+                    pred_img_data, data_sample.pred_depth_map)
+
+        if gt_img_data is not None and pred_img_data is not None:
+            drawn_img = np.concatenate((gt_img_data, pred_img_data), axis=1)
+        elif gt_img_data is not None:
+            drawn_img = gt_img_data
+        else:
+            drawn_img = pred_img_data
+
+        if show:
+            self.show(drawn_img, win_name=name, wait_time=wait_time)
+
+        if out_file is not None:
+            mmcv.imwrite(mmcv.rgb2bgr(drawn_img), out_file)
+        else:
+            self.add_image(name, drawn_img, step)
diff --git a/model-index.yml b/model-index.yml
index 2053fd0496..4026bb9e6e 100644
--- a/model-index.yml
+++ b/model-index.yml
@@ -1,45 +1,53 @@
 Import:
-- configs/ann/ann.yml
-- configs/apcnet/apcnet.yml
-- configs/beit/beit.yml
-- configs/bisenetv1/bisenetv1.yml
-- configs/bisenetv2/bisenetv2.yml
-- configs/ccnet/ccnet.yml
-- configs/cgnet/cgnet.yml
-- configs/convnext/convnext.yml
-- configs/danet/danet.yml
-- configs/deeplabv3/deeplabv3.yml
-- configs/deeplabv3plus/deeplabv3plus.yml
-- configs/dmnet/dmnet.yml
-- configs/dnlnet/dnlnet.yml
-- configs/dpt/dpt.yml
-- configs/emanet/emanet.yml
-- configs/encnet/encnet.yml
-- configs/erfnet/erfnet.yml
-- configs/fastfcn/fastfcn.yml
-- configs/fastscnn/fastscnn.yml
-- configs/fcn/fcn.yml
-- configs/gcnet/gcnet.yml
-- configs/hrnet/hrnet.yml
-- configs/icnet/icnet.yml
-- configs/isanet/isanet.yml
-- configs/knet/knet.yml
-- configs/mae/mae.yml
-- configs/mobilenet_v2/mobilenet_v2.yml
-- configs/mobilenet_v3/mobilenet_v3.yml
-- configs/nonlocal_net/nonlocal_net.yml
-- configs/ocrnet/ocrnet.yml
-- configs/point_rend/point_rend.yml
-- configs/psanet/psanet.yml
-- configs/pspnet/pspnet.yml
-- configs/resnest/resnest.yml
-- configs/segformer/segformer.yml
-- configs/segmenter/segmenter.yml
-- configs/sem_fpn/sem_fpn.yml
-- configs/setr/setr.yml
-- configs/stdc/stdc.yml
-- configs/swin/swin.yml
-- configs/twins/twins.yml
-- configs/unet/unet.yml
-- configs/upernet/upernet.yml
-- configs/vit/vit.yml
+- configs/ann/metafile.yaml
+- configs/apcnet/metafile.yaml
+- configs/beit/metafile.yaml
+- configs/bisenetv1/metafile.yaml
+- configs/bisenetv2/metafile.yaml
+- configs/ccnet/metafile.yaml
+- configs/cgnet/metafile.yaml
+- configs/convnext/metafile.yaml
+- configs/danet/metafile.yaml
+- configs/ddrnet/metafile.yaml
+- configs/deeplabv3/metafile.yaml
+- configs/deeplabv3plus/metafile.yaml
+- configs/dmnet/metafile.yaml
+- configs/dnlnet/metafile.yaml
+- configs/dpt/metafile.yaml
+- configs/emanet/metafile.yaml
+- configs/encnet/metafile.yaml
+- configs/erfnet/metafile.yaml
+- configs/fastfcn/metafile.yaml
+- configs/fastscnn/metafile.yaml
+- configs/fcn/metafile.yaml
+- configs/gcnet/metafile.yaml
+- configs/hrnet/metafile.yaml
+- configs/icnet/metafile.yaml
+- configs/isanet/metafile.yaml
+- configs/knet/metafile.yaml
+- configs/mae/metafile.yaml
+- configs/mask2former/metafile.yaml
+- configs/maskformer/metafile.yaml
+- configs/mobilenet_v2/metafile.yaml
+- configs/mobilenet_v3/metafile.yaml
+- configs/nonlocal_net/metafile.yaml
+- configs/ocrnet/metafile.yaml
+- configs/pidnet/metafile.yaml
+- configs/point_rend/metafile.yaml
+- configs/poolformer/metafile.yaml
+- configs/psanet/metafile.yaml
+- configs/pspnet/metafile.yaml
+- configs/resnest/metafile.yaml
+- configs/san/metafile.yaml
+- configs/segformer/metafile.yaml
+- configs/segmenter/metafile.yaml
+- configs/segnext/metafile.yaml
+- configs/sem_fpn/metafile.yaml
+- configs/setr/metafile.yaml
+- configs/stdc/metafile.yaml
+- configs/swin/metafile.yaml
+- configs/twins/metafile.yaml
+- configs/unet/metafile.yaml
+- configs/upernet/metafile.yaml
+- configs/vit/metafile.yaml
+- configs/vpd/metafile.yaml
diff --git a/projects/Adabins/README.md b/projects/Adabins/README.md
new file mode 100644
index 0000000000..8a23e92d74
--- /dev/null
+++ b/projects/Adabins/README.md
@@ -0,0 +1,46 @@
+# AdaBins: Depth Estimation Using Adaptive Bins
+
+## Reference
+
+> [AdaBins: Depth Estimation Using Adaptive Bins](https://arxiv.org/abs/2011.14141)
+
+## Introduction
+
+Official Repo
+
+Code Snippet
+
+## Abstract
+
+We address the problem of estimating a high quality dense depth map from a single RGB input image. We start out with a baseline encoder-decoder convolutional neural network architecture and pose the question of how the global processing of information can help improve overall depth estimation. To this end, we propose a transformer-based architecture block that divides the depth range into bins whose center value is estimated adaptively per image. The final depth values are estimated as linear combinations of the bin centers. We call our new building block AdaBins. Our results show a decisive improvement over the state-of-the-art on several popular depth datasets across all metrics. We also validate the effectiveness of the proposed block with an ablation study and provide the code and corresponding pre-trained weights of the new state-of-the-art model.
+
+Our main contributions are the following:
+
+- We propose an architecture building block that performs global processing of the scene’s information. We propose to divide the predicted depth range into bins where the bin widths change per image. The final depth estimation is a linear combination of the bin center values.
+- We show a decisive improvement for supervised single image depth estimation across all metrics for the two most popular datasets, NYU and KITTI.
+- We analyze our findings and investigate different modifications on the proposed AdaBins block and study their effect on the accuracy of the depth estimation.
+
+ +
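+The bin-center idea above is easy to state in code: the head predicts, per image, normalized bin widths over the depth range and, per pixel, probabilities over those bins; the depth map is then the per-pixel expectation over the bin centers. A minimal sketch with toy tensors (the NYU depth range of 1e-3 to 10 m is assumed; shapes and names are illustrative, not this project's API):
+
+```python
+import torch
+import torch.nn.functional as F
+
+batch, n_bins, h, w = 2, 256, 4, 4
+min_val, max_val = 1e-3, 10.0  # assumed NYU depth range
+
+# Per-image normalized bin widths and per-pixel bin probabilities.
+bin_widths_normed = torch.softmax(torch.randn(batch, n_bins), dim=1)
+probs = torch.softmax(torch.randn(batch, n_bins, h, w), dim=1)
+
+# Cumulative-sum the widths into bin edges, then take bin centers.
+bin_widths = (max_val - min_val) * bin_widths_normed
+bin_widths = F.pad(bin_widths, (1, 0), mode='constant', value=min_val)
+bin_edges = torch.cumsum(bin_widths, dim=1)
+centers = 0.5 * (bin_edges[:, :-1] + bin_edges[:, 1:])  # (batch, n_bins)
+
+# Depth is a linear combination (expectation) of the bin centers.
+depth = torch.sum(probs * centers.view(batch, n_bins, 1, 1),
+                  dim=1, keepdim=True)  # (batch, 1, h, w)
+```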
+
+## Performance
+
+### NYU and KITTI
+
+| Model         | Encoder         | Training epoch | Batchsize | Train Resolution | δ1    | δ2    | δ3    | REL   | RMS   | RMS log | params(M) | Links                                                                                                                   |
+| ------------- | --------------- | -------------- | --------- | ---------------- | ----- | ----- | ----- | ----- | ----- | ------- | --------- | ----------------------------------------------------------------------------------------------------------------------- |
+| AdaBins_nyu   | EfficientNet-B5 | 25             | 16        | 416x544          | 0.903 | 0.984 | 0.997 | 0.103 | 0.364 | 0.044   | 78        | [model](https://download.openmmlab.com/mmsegmentation/v0.5/adabins/adabins_efficient_b5_nyu_third-party-f68d6bd3.pth)    |
+| AdaBins_kitti | EfficientNet-B5 | 25             | 16        | 352x764          | 0.964 | 0.995 | 0.999 | 0.058 | 2.360 | 0.088   | 78        | [model](https://download.openmmlab.com/mmsegmentation/v0.5/adabins/adabins_efficient-b5_kitty_third-party-a1aa6f36.pth)  |
+
+## Citation
+
+```bibtex
+@article{10.1109/cvpr46437.2021.00400,
+  author = {Bhat, S. A. and Alhashim, I. and Wonka, P.},
+  title = {Adabins: depth estimation using adaptive bins},
+  journal = {2021 IEEE/CVF Conference on Computer Vision and Pattern Recognition (CVPR)},
+  year = {2021},
+  doi = {10.1109/cvpr46437.2021.00400}
+}
+```
diff --git a/projects/Adabins/backbones/__init__.py b/projects/Adabins/backbones/__init__.py
new file mode 100644
index 0000000000..04ae180be5
--- /dev/null
+++ b/projects/Adabins/backbones/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .adabins_backbone import AdabinsBackbone
+
+__all__ = ['AdabinsBackbone']
diff --git a/projects/Adabins/backbones/adabins_backbone.py b/projects/Adabins/backbones/adabins_backbone.py
new file mode 100644
index 0000000000..07d73809e3
--- /dev/null
+++ b/projects/Adabins/backbones/adabins_backbone.py
@@ -0,0 +1,141 @@
+import timm
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule, build_conv_layer
+from mmengine.model import BaseModule
+
+from mmseg.registry import MODELS
+
+
+class UpSampleBN(nn.Module):
+    """Upsample module built from two ConvModule layers.
+
+    Args:
+        skip_input (int): Number of input channels, i.e. the upsampled
+            feature channels plus the skip-connection feature channels.
+        output_features (int): Number of output channels.
+        norm_cfg (dict, optional): Config dict for normalization layer.
+            Default: dict(type='BN', requires_grad=True).
+        act_cfg (dict, optional): Config dict for activation layer.
+            Default: dict(type='LeakyReLU').
+    """
+
+    def __init__(self,
+                 skip_input,
+                 output_features,
+                 norm_cfg=dict(type='BN'),
+                 act_cfg=dict(type='LeakyReLU')):
+        super().__init__()
+
+        self._net = nn.Sequential(
+            ConvModule(
+                in_channels=skip_input,
+                out_channels=output_features,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                bias=True,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg,
+            ),
+            ConvModule(
+                in_channels=output_features,
+                out_channels=output_features,
+                kernel_size=3,
+                stride=1,
+                padding=1,
+                bias=True,
+                norm_cfg=norm_cfg,
+                act_cfg=act_cfg,
+            ))
+
+    def forward(self, x, concat_with):
+        up_x = F.interpolate(
+            x,
+            size=[concat_with.size(2),
+                  concat_with.size(3)],
+            mode='bilinear',
+            align_corners=True)
+        f = torch.cat([up_x, concat_with], dim=1)
+        return self._net(f)
+
+
+class Encoder(nn.Module):
+    """A timm-based encoder (EfficientNet-B5 by default).
+
+    Args:
+        basemodel_name (str): The name of the timm base model.
+    """
+
+    def __init__(self, basemodel_name):
+        super().__init__()
+        self.original_model = timm.create_model(
+            basemodel_name, pretrained=True)
+        # Remove last layer
+        self.original_model.global_pool = nn.Identity()
+        self.original_model.classifier = nn.Identity()
+
+    def forward(self, x):
+        features = [x]
+        for k, v in self.original_model._modules.items():
+            if k == 'blocks':
+                for ki, vi in v._modules.items():
+                    features.append(vi(features[-1]))
+            else:
+                features.append(v(features[-1]))
+        return features
+
+
+@MODELS.register_module()
+class AdabinsBackbone(BaseModule):
+    """The AdaBins backbone: a timm encoder with a U-Net style decoder.
+
+    Args:
+        basemodel_name (str): The name of the timm base model.
+        num_features (int): Number of decoder feature channels.
+        num_classes (int): Number of output channels.
+        bottleneck_features (int): Number of bottleneck feature channels.
+        conv_cfg (dict): Config dict for convolution layer.
+    """
+
+    def __init__(self,
+                 basemodel_name,
+                 num_features=2048,
+                 num_classes=128,
+                 bottleneck_features=2048,
+                 conv_cfg=dict(type='Conv')):
+        super().__init__()
+        self.encoder = Encoder(basemodel_name)
+        features = int(num_features)
+        self.conv2 = build_conv_layer(
+            conv_cfg,
+            bottleneck_features,
+            features,
+            kernel_size=1,
+            stride=1,
+            padding=1)
+        self.up1 = UpSampleBN(
+            skip_input=features // 1 + 112 + 64, output_features=features // 2)
+        self.up2 = UpSampleBN(
+            skip_input=features // 2 + 40 + 24, output_features=features // 4)
+        self.up3 = UpSampleBN(
+            skip_input=features // 4 + 24 + 16, output_features=features // 8)
+        self.up4 = UpSampleBN(
+            skip_input=features // 8 + 16 + 8, output_features=features // 16)
+
+        self.conv3 = build_conv_layer(
+            conv_cfg,
+            features // 16,
+            num_classes,
+            kernel_size=3,
+            stride=1,
+            padding=1)
+
+    def forward(self, x):
+        features = self.encoder(x)
+        x_block0, x_block1, x_block2, x_block3, x_block4 = features[
+            3], features[4], features[5], features[7], features[10]
+        x_d0 = self.conv2(x_block4)
+        x_d1 = self.up1(x_d0, x_block3)
+        x_d2 = self.up2(x_d1, x_block2)
+        x_d3 = self.up3(x_d2, x_block1)
+        x_d4 = self.up4(x_d3, x_block0)
+        out = self.conv3(x_d4)
+        return out
diff --git a/projects/Adabins/configs/_base_/datasets/nyu.py b/projects/Adabins/configs/_base_/datasets/nyu.py
new file mode 100644
index 0000000000..1b49ec7e8d
--- /dev/null
+++ b/projects/Adabins/configs/_base_/datasets/nyu.py
@@ -0,0 +1,32 @@
+dataset_type = 'NYUDataset'
+data_root = 'data/nyu'
+
+test_pipeline = [
+    dict(type='LoadImageFromFile', to_float32=True),
+    dict(type='LoadDepthAnnotation', depth_rescale_factor=1e-3),
+    dict(
+        type='PackSegInputs',
+        meta_keys=('img_path', 'depth_map_path', 'ori_shape', 'img_shape',
+                   'pad_shape', 'scale_factor', 'flip',
'flip_direction', + 'category_id')) +] + +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + test_mode=True, + data_prefix=dict( + img_path='images/test', depth_map_path='annotations/test'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='DepthMetric', max_depth_eval=10.0, crop_type='nyu_crop') +test_evaluator = val_evaluator +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') diff --git a/projects/Adabins/configs/_base_/default_runtime.py b/projects/Adabins/configs/_base_/default_runtime.py new file mode 100644 index 0000000000..272b4d2467 --- /dev/null +++ b/projects/Adabins/configs/_base_/default_runtime.py @@ -0,0 +1,15 @@ +default_scope = 'mmseg' +env_cfg = dict( + cudnn_benchmark=True, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl'), +) +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='SegLocalVisualizer', vis_backends=vis_backends, name='visualizer') +log_processor = dict(by_epoch=False) +log_level = 'INFO' +load_from = None +resume = False + +tta_model = dict(type='SegTTAModel') diff --git a/projects/Adabins/configs/_base_/models/Adabins.py b/projects/Adabins/configs/_base_/models/Adabins.py new file mode 100644 index 0000000000..35cbd8c577 --- /dev/null +++ b/projects/Adabins/configs/_base_/models/Adabins.py @@ -0,0 +1,35 @@ +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +data_preprocessor = dict( + type='SegDataPreProcessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255) +model = dict( + type='DepthEstimator', + data_preprocessor=data_preprocessor, + # pretrained='open-mmlab://resnet50_v1c', + backbone=dict( + type='AdabinsBackbone', + basemodel_name='tf_efficientnet_b5_ap', + num_features=2048, + num_classes=128, + bottleneck_features=2048, + ), + decode_head=dict( + type='AdabinsHead', + in_channels=128, + n_query_channels=128, + patch_size=16, + embedding_dim=128, + num_heads=4, + n_bins=256, + min_val=0.001, + max_val=10, + norm='linear'), + + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/projects/Adabins/configs/adabins/adabins_efficient_b5_4x16_25e_NYU_416x544.py b/projects/Adabins/configs/adabins/adabins_efficient_b5_4x16_25e_NYU_416x544.py new file mode 100644 index 0000000000..5c00ea152b --- /dev/null +++ b/projects/Adabins/configs/adabins/adabins_efficient_b5_4x16_25e_NYU_416x544.py @@ -0,0 +1,15 @@ +_base_ = [ + '../_base_/models/Adabins.py', '../_base_/datasets/nyu.py', + '../_base_/default_runtime.py' +] +custom_imports = dict( + imports=['projects.Adabins.backbones', 'projects.Adabins.decode_head'], + allow_failed_imports=False) +crop_size = (416, 544) +data_preprocessor = dict(size=crop_size) +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + data_preprocessor=data_preprocessor, + backbone=dict(), + decode_head=dict(), +) diff --git a/projects/Adabins/configs/adabins/adabins_efficient_b5_4x16_25e_kitti_352x704.py b/projects/Adabins/configs/adabins/adabins_efficient_b5_4x16_25e_kitti_352x704.py new file mode 100644 index 0000000000..330cdf41a5 --- /dev/null +++ b/projects/Adabins/configs/adabins/adabins_efficient_b5_4x16_25e_kitti_352x704.py @@ -0,0 +1,12 @@ +_base_ = ['../_base_/models/Adabins.py'] +custom_imports = dict( + 
    imports=['projects.Adabins.backbones', 'projects.Adabins.decode_head'],
+    allow_failed_imports=False)
+crop_size = (352, 704)
+data_preprocessor = dict(size=crop_size)
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+model = dict(
+    data_preprocessor=data_preprocessor,
+    backbone=dict(),
+    decode_head=dict(min_val=0.001, max_val=80),
+)
diff --git a/projects/Adabins/decode_head/__init__.py b/projects/Adabins/decode_head/__init__.py
new file mode 100644
index 0000000000..c7d62df12b
--- /dev/null
+++ b/projects/Adabins/decode_head/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .adabins_head import AdabinsHead
+
+__all__ = ['AdabinsHead']
diff --git a/projects/Adabins/decode_head/adabins_head.py b/projects/Adabins/decode_head/adabins_head.py
new file mode 100644
index 0000000000..ee043172ab
--- /dev/null
+++ b/projects/Adabins/decode_head/adabins_head.py
@@ -0,0 +1,179 @@
+from typing import List, Tuple
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import build_conv_layer
+from torch import Tensor
+
+from mmseg.registry import MODELS
+
+
+class PatchTransformerEncoder(nn.Module):
+    """The patch transformer encoder.
+
+    Args:
+        in_channels (int): The number of input channels.
+        patch_size (int): The patch size.
+        embedding_dim (int): The feature dimension.
+        num_heads (int): The number of encoder attention heads.
+        conv_cfg (dict): Config dict for convolution layer.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 patch_size=10,
+                 embedding_dim=128,
+                 num_heads=4,
+                 conv_cfg=dict(type='Conv')):
+        super().__init__()
+        encoder_layers = nn.TransformerEncoderLayer(
+            embedding_dim, num_heads, dim_feedforward=1024)
+        self.transformer_encoder = nn.TransformerEncoder(
+            encoder_layers, num_layers=4)  # takes shape S,N,E
+
+        self.embedding_convPxP = build_conv_layer(
+            conv_cfg,
+            in_channels,
+            embedding_dim,
+            kernel_size=patch_size,
+            stride=patch_size)
+        self.positional_encodings = nn.Parameter(
+            torch.rand(500, embedding_dim), requires_grad=True)
+
+    def forward(self, x):
+        embeddings = self.embedding_convPxP(x).flatten(
+            2)  # .shape = n,c,s = n, embedding_dim, s
+        embeddings = embeddings + self.positional_encodings[:embeddings.shape[
+            2], :].T.unsqueeze(0)
+
+        # change to S,N,E format required by transformer
+        embeddings = embeddings.permute(2, 0, 1)
+        x = self.transformer_encoder(embeddings)  # .shape = S, N, E
+        return x
+
+
+class PixelWiseDotProduct(nn.Module):
+    """Pixel-wise dot product between a feature map and query embeddings."""
+
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, x, K):
+        n, c, h, w = x.size()
+        _, cout, ck = K.size()
+        assert c == ck, 'Number of channels in x and Embedding dimension ' \
+                        '(at dim 2) of K matrix must match'
+        y = torch.matmul(
+            x.view(n, c, h * w).permute(0, 2, 1),
+            K.permute(0, 2, 1))  # .shape = n, hw, cout
+        return y.permute(0, 2, 1).view(n, cout, h, w)
+
+
+@MODELS.register_module()
+class AdabinsHead(nn.Module):
+    """The AdaBins head, including the mini-ViT (mViT) module.
+
+    Args:
+        in_channels (int): The number of input channels.
+        n_query_channels (int): The number of query channels.
+        patch_size (int): The patch size.
+        embedding_dim (int): The feature dimension.
+        num_heads (int): The number of attention heads.
+        n_bins (int): The number of depth bins.
+        min_val (float): The minimum depth value.
+        max_val (float): The maximum depth value.
+        conv_cfg (dict): Config dict for convolution layer.
+        norm (str): The bin-normalization method, one of 'linear',
+            'softmax' or 'sigmoid'.
+        align_corners (bool, optional): Geometrically, we consider the pixels
+            of the input and output as squares rather than points.
+    """
+
+    def __init__(self,
+                 in_channels,
+                 n_query_channels=128,
+                 patch_size=16,
+                 embedding_dim=128,
+                 num_heads=4,
+                 n_bins=100,
+                 min_val=0.1,
+                 max_val=10,
+                 conv_cfg=dict(type='Conv'),
+                 norm='linear',
+                 align_corners=False,
+                 threshold=0):
+        super().__init__()
+        self.out_channels = n_bins
+        self.align_corners = align_corners
+        self.norm = norm
+        self.num_classes = n_bins
+        self.min_val = min_val
+        self.max_val = max_val
+        self.n_query_channels = n_query_channels
+        self.patch_transformer = PatchTransformerEncoder(
+            in_channels, patch_size, embedding_dim, num_heads)
+        self.dot_product_layer = PixelWiseDotProduct()
+        self.threshold = threshold
+        self.conv3x3 = build_conv_layer(
+            conv_cfg,
+            in_channels,
+            embedding_dim,
+            kernel_size=3,
+            stride=1,
+            padding=1)
+        self.regressor = nn.Sequential(
+            nn.Linear(embedding_dim, 256), nn.LeakyReLU(), nn.Linear(256, 256),
+            nn.LeakyReLU(), nn.Linear(256, n_bins))
+        self.conv_out = nn.Sequential(
+            build_conv_layer(conv_cfg, in_channels, n_bins, kernel_size=1),
+            nn.Softmax(dim=1))
+
+    def forward(self, x):
+        tgt = self.patch_transformer(x.clone())  # .shape = S, N, E
+
+        x = self.conv3x3(x)
+
+        regression_head, queries = tgt[0,
+                                       ...], tgt[1:self.n_query_channels + 1,
+                                                 ...]
+
+        # Change from S, N, E to N, S, E
+        queries = queries.permute(1, 0, 2)
+        range_attention_maps = self.dot_product_layer(
+            x, queries)  # .shape = n, n_query_channels, h, w
+
+        y = self.regressor(regression_head)  # .shape = N, dim_out
+        if self.norm == 'linear':
+            y = torch.relu(y)
+            eps = 0.1
+            y = y + eps
+        elif self.norm == 'softmax':
+            return torch.softmax(y, dim=1), range_attention_maps
+        else:
+            y = torch.sigmoid(y)
+        bin_widths_normed = y / y.sum(dim=1, keepdim=True)
+        out = self.conv_out(range_attention_maps)
+
+        bin_widths = (self.max_val -
+                      self.min_val) * bin_widths_normed  # .shape = N, dim_out
+        bin_widths = F.pad(
+            bin_widths, (1, 0), mode='constant', value=self.min_val)
+        bin_edges = torch.cumsum(bin_widths, dim=1)
+
+        centers = 0.5 * (bin_edges[:, :-1] + bin_edges[:, 1:])
+        n, dim_out = centers.size()
+        centers = centers.view(n, dim_out, 1, 1)
+
+        pred = torch.sum(out * centers, dim=1, keepdim=True)
+        return bin_edges, pred
+
+    def predict(self, inputs: Tuple[Tensor], batch_img_metas: List[dict],
+                test_cfg, **kwargs) -> Tensor:
+        """Forward function for testing; the predicted depth map is clamped
+        to ``[min_val, max_val]``."""
+        pred = self.forward(inputs)[-1]
+        final = torch.clamp(pred, self.min_val, self.max_val)
+
+        final[torch.isinf(final)] = self.max_val
+        final[torch.isnan(final)] = self.min_val
+        return final
diff --git a/projects/CAT-Seg/README.md b/projects/CAT-Seg/README.md
new file mode 100644
index 0000000000..890e461ce4
--- /dev/null
+++ b/projects/CAT-Seg/README.md
@@ -0,0 +1,92 @@
+# CAT-Seg
+
+> [CAT-Seg: Cost Aggregation for Open-Vocabulary Semantic Segmentation](https://arxiv.org/abs/2303.11797)
+
+## Introduction
+
+Official Repo
+
+Code Snippet
+
+## Abstract
+
+Existing works on open-vocabulary semantic segmentation have utilized large-scale vision-language models, such as CLIP, to leverage their exceptional open-vocabulary recognition capabilities. However, the problem of transferring these capabilities learned from image-level supervision to the pixel-level task of segmentation and addressing arbitrary unseen categories at inference makes this task challenging.
To address these issues, we aim to attentively relate objects within an image to given categories by leveraging relational information among class categories and visual semantics through aggregation, while also adapting the CLIP representations to the pixel-level task. However, we observe that direct optimization of the CLIP embeddings can harm its open-vocabulary capabilities. In this regard, we propose an alternative approach to optimize the image-text similarity map, i.e. the cost map, using a novel cost aggregation-based method. Our framework, namely CAT-Seg, achieves state-of-the-art performance across all benchmarks. We provide extensive ablation studies to validate our choices. [Project page](https://ku-cvlab.github.io/CAT-Seg).
+
+
+
+CAT-Seg model structure
+
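+The "cost map" mentioned in the abstract is the dense cosine similarity between CLIP image features and the class-text embeddings, computed per pixel and per class; CAT-Seg's aggregator then refines this volume instead of fine-tuning the raw CLIP embeddings. A minimal sketch with toy shapes (names and dimensions are illustrative, not this project's actual API):
+
+```python
+import torch
+import torch.nn.functional as F
+
+B, C, H, W, T = 2, 512, 24, 24, 150  # batch, CLIP dim, feature map, classes
+img_feats = F.normalize(torch.randn(B, C, H, W), dim=1)  # dense image feats
+txt_feats = F.normalize(torch.randn(T, C), dim=1)        # class-text embeds
+
+# Per-pixel, per-class cosine similarity: a (B, T, H, W) cost volume,
+# which the spatial and class aggregation layers then refine.
+cost = torch.einsum('bchw,tc->bthw', img_feats, txt_feats)
+```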
+
+## Usage
+
+CAT-Seg model training needs a pretrained `CLIP` model. We have implemented `ViT-B` and `ViT-L` based `CLIP` models. To further use `ViT-bigG` or `ViT-H` ones, you need additional dependencies. Please install [open_clip](https://github.com/mlfoundations/open_clip) first. The pretrained `CLIP` model state dicts are loaded from [Huggingface-OpenCLIP](https://huggingface.co/models?library=open_clip). **If you come up with `ConnectionError` when downloading CLIP weights**, you can manually download them from the given repo and set `custom_clip_weights=/path/to/your/folder` in the backbone of the config file. The related tools are listed in [requirements/optional.txt](requirements/optional.txt):
+
+```shell
+pip install ftfy==6.0.1
+pip install huggingface-hub
+pip install regex
+```
+
+In addition to the necessary [data preparation](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/en/user_guides/2_dataset_prepare.md), you also need the class texts for the CLIP text encoder. Please first download the class text json files ([cls_texts](https://github.com/open-mmlab/mmsegmentation/files/11714914/cls_texts.zip)) and arrange the folder as follows:
+
+```none
+mmsegmentation
+├── mmseg
+├── tools
+├── configs
+├── data
+│   ├── VOCdevkit
+│   │   ├── VOC2012
+│   │   ├── VOC2010
+│   │   ├── VOCaug
+│   ├── ade
+│   ├── coco_stuff164k
+│   ├── coco.json
+│   ├── pc59.json
+│   ├── pc459.json
+│   ├── ade150.json
+│   ├── ade847.json
+│   ├── voc20b.json
+│   ├── voc20.json
+```
+
+```shell
+# setup PYTHONPATH
+export PYTHONPATH=`pwd`:$PYTHONPATH
+# run evaluation
+mim test mmsegmentation ${CONFIG} --checkpoint ${CHECKPOINT} --launcher pytorch --gpus=8
+```
+
+## Results and models
+
+### ADE20K-150-ZeroShot
+
+| Method  | Backbone      | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | Device  | mIoU | mIoU(ms+flip) | config                                                                                      | download                                                                                                                                      |
+| ------- | ------------- | --------- | ------- | -------: | -------------- | ------- | ---- | ------------: | ------------------------------------------------------------------------------------------: | --------------------------------------------------------------------------------------------------------------------------------------------- |
+| CAT-Seg | R-101 & ViT-B | 384x384   | 80000   |        - | -              | RTX3090 | 27.2 |             - | [config](./configs/cat_seg/catseg_vitb-r101_4xb1-warmcoslr2e-4-adamw-80k_ade20k-384x384.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/cat_seg/catseg_vitb-r101_4xb1-warmcoslr2e-4-adamw-80k_ade20k-384x384-54194d72.pth) |
+
+Note:
+
+- All experiments of CAT-Seg are implemented with 4 RTX3090 GPUs, except the last one with the pretrained ViT-bigG CLIP model (GPU memory is insufficient; you may need an A100).
+- Due to the feature size bottleneck of the CLIP image encoder, inference and testing can only be done under `slide` mode, and the inference time is longer since the test size is much bigger than the training size of `(384, 384)`.
+- The ResNet backbones utilized in CAT-Seg models are standard `ResNet` rather than `ResNetV1c`.
+- The zero-shot segmentation results on PASCAL VOC and ADE20K are from the original paper. Our results are coming soon. We appreciate your contribution!
+- In addition to the zero-shot segmentation performance results, we also provide the evaluation results on the `val2017` set of **COCO-stuff164k** for reference, which is the training dataset of CAT-Seg. The testing was done **without TTA**.
+- The number behind the dataset name is the category number for segmentation evaluation (except training data **COCO-stuff 164k**).
**PASCAL VOC-20b** defines the "background" as classes present in **PASCAL-Context-59** but not in **PASCAL VOC-20**.
+
+## Citation
+
+```bibtex
+@inproceedings{cho2023catseg,
+  title={CAT-Seg: Cost Aggregation for Open-Vocabulary Semantic Segmentation},
+  author={Seokju Cho and Heeseong Shin and Sunghwan Hong and Seungjun An and Seungjun Lee and Anurag Arnab and Paul Hongsuck Seo and Seungryong Kim},
+  journal={CVPR},
+  year={2023}
+}
+```
diff --git a/projects/CAT-Seg/cat_seg/__init__.py b/projects/CAT-Seg/cat_seg/__init__.py
new file mode 100644
index 0000000000..2c51fbaa2e
--- /dev/null
+++ b/projects/CAT-Seg/cat_seg/__init__.py
@@ -0,0 +1,2 @@
+from .models import *  # noqa: F401,F403
+from .utils import *  # noqa: F401,F403
diff --git a/projects/CAT-Seg/cat_seg/models/__init__.py b/projects/CAT-Seg/cat_seg/models/__init__.py
new file mode 100644
index 0000000000..cd0e15d3ec
--- /dev/null
+++ b/projects/CAT-Seg/cat_seg/models/__init__.py
@@ -0,0 +1,10 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .cat_aggregator import (AggregatorLayer, CATSegAggregator,
+                             ClassAggregateLayer, SpatialAggregateLayer)
+from .cat_head import CATSegHead
+from .clip_ovseg import CLIPOVCATSeg
+
+__all__ = [
+    'AggregatorLayer', 'CATSegAggregator', 'ClassAggregateLayer',
+    'SpatialAggregateLayer', 'CATSegHead', 'CLIPOVCATSeg'
+]
diff --git a/projects/CAT-Seg/cat_seg/models/cat_aggregator.py b/projects/CAT-Seg/cat_seg/models/cat_aggregator.py
new file mode 100644
index 0000000000..a0483fe505
--- /dev/null
+++ b/projects/CAT-Seg/cat_seg/models/cat_aggregator.py
@@ -0,0 +1,763 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import build_norm_layer
+from mmcv.cnn.bricks.transformer import FFN, build_dropout
+from mmengine.model import BaseModule
+from mmengine.utils import to_2tuple
+
+from mmseg.registry import MODELS
+from ..utils import FullAttention, LinearAttention
+
+
+class AGWindowMSA(BaseModule):
+    """Appearance Guidance Window based multi-head self-attention (W-MSA)
+    module with relative position bias.
+
+    Args:
+        embed_dims (int): Number of input channels.
+        appearance_dims (int): Number of appearance guidance feature channels.
+        num_heads (int): Number of attention heads.
+        window_size (tuple[int]): The height and width of the window.
+        qkv_bias (bool, optional): If True, add a learnable bias to q, k, v.
+            Default: True.
+        qk_scale (float | None, optional): Override default qk scale of
+            head_dim ** -0.5 if set. Default: None.
+        attn_drop_rate (float, optional): Dropout ratio of attention weight.
+            Default: 0.0
+        proj_drop_rate (float, optional): Dropout ratio of output. Default: 0.
+        init_cfg (dict | None, optional): The Config for initialization.
+            Default: None.
+ """ + + def __init__(self, + embed_dims, + appearance_dims, + num_heads, + window_size, + qkv_bias=True, + qk_scale=None, + attn_drop_rate=0., + proj_drop_rate=0., + init_cfg=None): + + super().__init__(init_cfg=init_cfg) + self.embed_dims = embed_dims + self.appearance_dims = appearance_dims + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_embed_dims = embed_dims // num_heads + self.scale = qk_scale or head_embed_dims**-0.5 + + # About 2x faster than original impl + Wh, Ww = self.window_size + rel_index_coords = self.double_step_seq(2 * Ww - 1, Wh, 1, Ww) + rel_position_index = rel_index_coords + rel_index_coords.T + rel_position_index = rel_position_index.flip(1).contiguous() + self.register_buffer('relative_position_index', rel_position_index) + + self.qk = nn.Linear( + embed_dims + appearance_dims, embed_dims * 2, bias=qkv_bias) + self.v = nn.Linear(embed_dims, embed_dims, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop_rate) + self.proj = nn.Linear(embed_dims, embed_dims) + self.proj_drop = nn.Dropout(proj_drop_rate) + + self.softmax = nn.Softmax(dim=-1) + + def forward(self, x, mask=None): + """ + Args: + x (tensor): input features with shape of (num_windows*B, N, C), + C = embed_dims + appearance_dims. + mask (tensor | None, Optional): mask with shape of (num_windows, + Wh*Ww, Wh*Ww), value should be between (-inf, 0]. + """ + B, N, _ = x.shape + qk = self.qk(x).reshape(B, N, 2, self.num_heads, + self.embed_dims // self.num_heads).permute( + 2, 0, 3, 1, + 4) # 2 B NUM_HEADS N embed_dims//NUM_HEADS + v = self.v(x[:, :, :self.embed_dims]).reshape( + B, N, self.num_heads, self.embed_dims // self.num_heads).permute( + 0, 2, 1, 3) # B NUM_HEADS N embed_dims//NUM_HEADS + # make torchscript happy (cannot use tensor as tuple) + q, k = qk[0], qk[1] + + q = q * self.scale + attn = (q @ k.transpose(-2, -1)) + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B // nW, nW, self.num_heads, N, + N) + mask.unsqueeze(1).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + attn = self.softmax(attn) + + attn = self.attn_drop(attn) + + x = (attn @ v).transpose(1, 2).reshape(B, N, self.embed_dims) + x = self.proj(x) + x = self.proj_drop(x) + return x + + @staticmethod + def double_step_seq(step1, len1, step2, len2): + """Double step sequence.""" + seq1 = torch.arange(0, step1 * len1, step1) + seq2 = torch.arange(0, step2 * len2, step2) + return (seq1[:, None] + seq2[None, :]).reshape(1, -1) + + +class AGShiftWindowMSA(BaseModule): + """Appearance Guidance Shifted Window Multihead Self-Attention Module. + + Args: + embed_dims (int): Number of input channels. + appearance_dims (int): Number of appearance guidance channels + num_heads (int): Number of attention heads. + window_size (int): The height and width of the window. + shift_size (int, optional): The shift step of each window towards + right-bottom. If zero, act as regular window-msa. Defaults to 0. + qkv_bias (bool, optional): If True, add a learnable bias to q, k, v. + Default: True + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Defaults: None. + attn_drop_rate (float, optional): Dropout ratio of attention weight. + Defaults: 0. + proj_drop_rate (float, optional): Dropout ratio of output. + Defaults: 0. + dropout_layer (dict, optional): The dropout_layer used before output. + Defaults: dict(type='DropPath', drop_prob=0.). + init_cfg (dict, optional): The extra config for initialization. + Default: None. 
+ """ + + def __init__(self, + embed_dims, + appearance_dims, + num_heads, + window_size, + shift_size=0, + qkv_bias=True, + qk_scale=None, + attn_drop_rate=0, + proj_drop_rate=0, + dropout_layer=dict(type='DropPath', drop_prob=0.), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + + self.window_size = window_size + self.shift_size = shift_size + assert 0 <= self.shift_size < self.window_size + + self.w_msa = AGWindowMSA( + embed_dims=embed_dims, + appearance_dims=appearance_dims, + num_heads=num_heads, + window_size=to_2tuple(window_size), + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop_rate=attn_drop_rate, + proj_drop_rate=proj_drop_rate, + init_cfg=None) + + self.drop = build_dropout(dropout_layer) + + def forward(self, query, hw_shape): + """ + Args: + query: The input query. + hw_shape: The shape of the feature height and width. + """ + B, L, C = query.shape + H, W = hw_shape + assert L == H * W, 'input feature has wrong size' + query = query.view(B, H, W, C) + + # pad feature maps to multiples of window size + pad_r = (self.window_size - W % self.window_size) % self.window_size + pad_b = (self.window_size - H % self.window_size) % self.window_size + query = F.pad(query, (0, 0, 0, pad_r, 0, pad_b)) + H_pad, W_pad = query.shape[1], query.shape[2] + + # cyclic shift + if self.shift_size > 0: + shifted_query = torch.roll( + query, + shifts=(-self.shift_size, -self.shift_size), + dims=(1, 2)) + + # calculate attention mask for SW-MSA + img_mask = torch.zeros((1, H_pad, W_pad, 1), device=query.device) + h_slices = (slice(0, -self.window_size), + slice(-self.window_size, + -self.shift_size), slice(-self.shift_size, None)) + w_slices = (slice(0, -self.window_size), + slice(-self.window_size, + -self.shift_size), slice(-self.shift_size, None)) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + # nW, window_size, window_size, 1 + mask_windows = self.window_partition(img_mask) + mask_windows = mask_windows.view( + -1, self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, + float(-100.0)).masked_fill( + attn_mask == 0, float(0.0)) + else: + shifted_query = query + attn_mask = None + + # nW*B, window_size, window_size, C + query_windows = self.window_partition(shifted_query) + # nW*B, window_size*window_size, C + query_windows = query_windows.view(-1, self.window_size**2, C) + + # W-MSA/SW-MSA (nW*B, window_size*window_size, C) + attn_windows = self.w_msa(query_windows, mask=attn_mask) + + # merge windows + attn_windows = attn_windows.view(-1, self.window_size, + self.window_size, + self.w_msa.embed_dims) + + # B H' W' self.w_msa.embed_dims + shifted_x = self.window_reverse(attn_windows, H_pad, W_pad) + # reverse cyclic shift + if self.shift_size > 0: + x = torch.roll( + shifted_x, + shifts=(self.shift_size, self.shift_size), + dims=(1, 2)) + else: + x = shifted_x + + if pad_r > 0 or pad_b: + x = x[:, :H, :W, :].contiguous() + + x = x.view(B, H * W, self.w_msa.embed_dims) + + x = self.drop(x) + return x + + def window_reverse(self, windows, H, W): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + H (int): Height of image + W (int): Width of image + Returns: + x: (B, H, W, C) + """ + window_size = self.window_size + B = int(windows.shape[0] / (H * W / window_size / window_size)) + x = windows.view(B, H // window_size, W // window_size, window_size, + window_size, -1) + x = x.permute(0, 1, 3, 2, 4, 
5).contiguous().view(B, H, W, -1) + return x + + def window_partition(self, x): + """ + Args: + x: (B, H, W, C) + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + window_size = self.window_size + x = x.view(B, H // window_size, window_size, W // window_size, + window_size, C) + windows = x.permute(0, 1, 3, 2, 4, 5).contiguous() + windows = windows.view(-1, window_size, window_size, C) + return windows + + +class AGSwinBlock(BaseModule): + """Appearance Guidance Swin Transformer Block. + + Args: + embed_dims (int): The feature dimension. + appearance_dims (int): The appearance guidance dimension. + num_heads (int): Parallel attention heads. + mlp_ratios (int): The hidden dimension ratio w.r.t. embed_dims + for FFNs. + window_size (int, optional): The local window scale. + Default: 7. + shift (bool, optional): whether to shift window or not. + Default False. + qkv_bias (bool, optional): enable bias for qkv if True. + Default: True. + qk_scale (float | None, optional): Override default qk scale of + head_dim ** -0.5 if set. Default: None. + drop_rate (float, optional): Dropout rate. Default: 0. + attn_drop_rate (float, optional): Attention dropout rate. + Default: 0. + drop_path_rate (float, optional): Stochastic depth rate. + Default: 0. + act_cfg (dict, optional): The config dict of activation function. + Default: dict(type='GELU'). + norm_cfg (dict, optional): The config dict of normalization. + Default: dict(type='LN'). + init_cfg (dict | list | None, optional): The init config. + Default: None. + """ + + def __init__(self, + embed_dims, + appearance_dims, + num_heads, + mlp_ratios=4, + window_size=7, + shift=False, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + attn_drop_rate=0., + drop_path_rate=0., + act_cfg=dict(type='GELU'), + norm_cfg=dict(type='LN'), + init_cfg=None): + super().__init__(init_cfg=init_cfg) + self.norm1 = build_norm_layer(norm_cfg, embed_dims)[1] + self.attn = AGShiftWindowMSA( + embed_dims=embed_dims, + appearance_dims=appearance_dims, + num_heads=num_heads, + window_size=window_size, + shift_size=window_size // 2 if shift else 0, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop_rate=attn_drop_rate, + proj_drop_rate=drop_rate, + dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate), + init_cfg=None) + + self.norm2 = build_norm_layer(norm_cfg, embed_dims)[1] + self.ffn = FFN( + embed_dims=embed_dims, + feedforward_channels=embed_dims * mlp_ratios, + num_fcs=2, + ffn_drop=drop_rate, + dropout_layer=dict(type='DropPath', drop_prob=drop_path_rate), + act_cfg=act_cfg, + add_identity=True, + init_cfg=None) + + def forward(self, inputs, hw_shape): + """ + Args: + inputs (list[Tensor]): appearance_guidance (B, H, W, C); + x (B, L, C) + hw_shape (tuple[int]): shape of feature. + """ + x, appearance_guidance = inputs + B, L, C = x.shape + H, W = hw_shape + assert L == H * W, 'input feature has wrong size' + + identity = x + x = self.norm1(x) + + # appearance guidance + x = x.view(B, H, W, C) + if appearance_guidance is not None: + x = torch.cat([x, appearance_guidance], dim=-1).flatten(1, 2) + + x = self.attn(x, hw_shape) + + x = x + identity + + identity = x + x = self.norm2(x) + x = self.ffn(x, identity=identity) + + return x + + +@MODELS.register_module() +class SpatialAggregateLayer(BaseModule): + """Spatial aggregation layer of CAT-Seg. + + Args: + embed_dims (int): The feature dimension. + appearance_dims (int): The appearance guidance dimension. + num_heads (int): Parallel attention heads. 
+        mlp_ratios (int): The hidden dimension ratio w.r.t. embed_dims
+            for FFNs.
+        window_size (int, optional): The local window scale. Default: 7.
+        qk_scale (float | None, optional): Override default qk scale of
+            head_dim ** -0.5 if set. Default: None.
+        init_cfg (dict | list | None, optional): The init config.
+            Default: None.
+    """
+
+    def __init__(self,
+                 embed_dims,
+                 appearance_dims,
+                 num_heads,
+                 mlp_ratios,
+                 window_size=7,
+                 qk_scale=None,
+                 init_cfg=None):
+        super().__init__(init_cfg=init_cfg)
+        self.appearance_dims = appearance_dims
+        self.block_1 = AGSwinBlock(
+            embed_dims,
+            appearance_dims,
+            num_heads,
+            mlp_ratios,
+            window_size=window_size,
+            shift=False,
+            qk_scale=qk_scale)
+        self.block_2 = AGSwinBlock(
+            embed_dims,
+            appearance_dims,
+            num_heads,
+            mlp_ratios,
+            window_size=window_size,
+            shift=True,
+            qk_scale=qk_scale)
+        self.guidance_norm = nn.LayerNorm(
+            appearance_dims) if appearance_dims > 0 else None
+
+    def forward(self, x, appearance_guidance):
+        """
+        Args:
+            x (torch.Tensor): B C T H W.
+            appearance_guidance (torch.Tensor): B C H W.
+        """
+        B, C, T, H, W = x.shape
+        x = x.permute(0, 2, 3, 4, 1).flatten(0, 1).flatten(1, 2)  # BT, HW, C
+        if appearance_guidance is not None:
+            appearance_guidance = appearance_guidance.repeat(
+                T, 1, 1, 1).permute(0, 2, 3, 1)  # BT, HW, C
+            appearance_guidance = self.guidance_norm(appearance_guidance)
+        else:
+            assert self.appearance_dims == 0
+        x = self.block_1((x, appearance_guidance), (H, W))
+        x = self.block_2((x, appearance_guidance), (H, W))
+        x = x.transpose(1, 2).reshape(B, T, C, -1)
+        x = x.transpose(1, 2).reshape(B, C, T, H, W)
+        return x
+
+
+class AttentionLayer(nn.Module):
+    """Attention layer for ClassAggregation of CAT-Seg.
+
+    Source: https://github.com/KU-CVLAB/CAT-Seg/blob/main/cat_seg/modeling/transformer/model.py#L310 # noqa
+    """
+
+    def __init__(self,
+                 hidden_dim,
+                 guidance_dim,
+                 nheads=8,
+                 attention_type='linear'):
+        super().__init__()
+        self.nheads = nheads
+        self.q = nn.Linear(hidden_dim + guidance_dim, hidden_dim)
+        self.k = nn.Linear(hidden_dim + guidance_dim, hidden_dim)
+        self.v = nn.Linear(hidden_dim, hidden_dim)
+
+        if attention_type == 'linear':
+            self.attention = LinearAttention()
+        elif attention_type == 'full':
+            self.attention = FullAttention()
+        else:
+            raise NotImplementedError
+
+    def forward(self, x, guidance=None):
+        """
+        Args:
+            x: B*H_p*W_p, T, C
+            guidance: B*H_p*W_p, T, C
+        """
+        B, L, _ = x.shape
+        q = self.q(torch.cat([x, guidance],
+                             dim=-1)) if guidance is not None else self.q(x)
+        k = self.k(torch.cat([x, guidance],
+                             dim=-1)) if guidance is not None else self.k(x)
+        v = self.v(x)
+
+        q = q.reshape(B, L, self.nheads, -1)
+        k = k.reshape(B, L, self.nheads, -1)
+        v = v.reshape(B, L, self.nheads, -1)
+
+        out = self.attention(q, k, v)
+        out = out.reshape(B, L, -1)
+        return out
+
+
+@MODELS.register_module()
+class ClassAggregateLayer(BaseModule):
+    """Class aggregation layer of CAT-Seg.
+
+    Args:
+        hidden_dims (int): The feature dimension.
+        guidance_dims (int): The text guidance dimension.
+        num_heads (int): Parallel attention heads.
+        attention_type (str): Type of attention layer. Default: 'linear'.
+        pooling_size (tuple[int] | list[int]): Pooling size.
+        init_cfg (dict | list | None, optional): The init config.
+            Default: None.
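+
+    Example (a minimal shape-check sketch; B=1 keeps the guidance
+    broadcast trivial, and the sizes are illustrative only):
+
+        >>> import torch
+        >>> layer = ClassAggregateLayer(
+        ...     hidden_dims=64, guidance_dims=64, num_heads=8,
+        ...     pooling_size=(4, 4))
+        >>> x = torch.rand(1, 64, 20, 24, 24)  # (B, C, T, H, W), T classes
+        >>> guidance = torch.rand(1, 20, 64)   # (B, T, C) text guidance
+        >>> layer(x, guidance).shape
+        torch.Size([1, 64, 20, 24, 24])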
+ """ + + def __init__( + self, + hidden_dims=64, + guidance_dims=64, + num_heads=8, + attention_type='linear', + pooling_size=(4, 4), + init_cfg=None, + ): + super().__init__(init_cfg=init_cfg) + self.pool = nn.AvgPool2d(pooling_size) + self.attention = AttentionLayer( + hidden_dims, + guidance_dims, + nheads=num_heads, + attention_type=attention_type) + self.MLP = FFN( + embed_dims=hidden_dims, + feedforward_channels=hidden_dims * 4, + num_fcs=2) + self.norm1 = nn.LayerNorm(hidden_dims) + self.norm2 = nn.LayerNorm(hidden_dims) + + def pool_features(self, x): + """Intermediate pooling layer for computational efficiency. + + Args: + x: B, C, T, H, W + """ + B, C, T, H, W = x.shape + x = x.transpose(1, 2).reshape(-1, C, H, W) + x = self.pool(x) + *_, H_, W_ = x.shape + x = x.reshape(B, T, C, H_, W_).transpose(1, 2) + return x + + def forward(self, x, guidance): + """ + Args: + x: B, C, T, H, W + guidance: B, T, C + """ + B, C, T, H, W = x.size() + x_pool = self.pool_features(x) + *_, H_pool, W_pool = x_pool.size() + + x_pool = x_pool.permute(0, 3, 4, 2, 1).reshape(-1, T, C) + # B*H_p*W_p T C + if guidance is not None: + guidance = guidance.repeat(H_pool * W_pool, 1, 1) + + x_pool = x_pool + self.attention(self.norm1(x_pool), + guidance) # Attention + x_pool = x_pool + self.MLP(self.norm2(x_pool)) # MLP + + x_pool = x_pool.reshape(B, H_pool * W_pool, T, + C).permute(0, 2, 3, 1).reshape( + B, T, C, H_pool, + W_pool).flatten(0, 1) # BT C H_p W_p + x_pool = F.interpolate( + x_pool, size=(H, W), mode='bilinear', align_corners=True) + x_pool = x_pool.reshape(B, T, C, H, W).transpose(1, 2) # B C T H W + x = x + x_pool # Residual + + return x + + +@MODELS.register_module() +class AggregatorLayer(BaseModule): + """Single Aggregator Layer of CAT-Seg.""" + + def __init__(self, + embed_dims=64, + text_guidance_dims=512, + appearance_guidance_dims=512, + num_heads=4, + mlp_ratios=4, + window_size=7, + attention_type='linear', + pooling_size=(2, 2), + init_cfg=None) -> None: + super().__init__(init_cfg=init_cfg) + self.spatial_agg = SpatialAggregateLayer( + embed_dims, + appearance_guidance_dims, + num_heads=num_heads, + mlp_ratios=mlp_ratios, + window_size=window_size) + self.class_agg = ClassAggregateLayer( + embed_dims, + text_guidance_dims, + num_heads=num_heads, + attention_type=attention_type, + pooling_size=pooling_size) + + def forward(self, x, appearance_guidance, text_guidance): + """ + Args: + x: B C T H W + """ + x = self.spatial_agg(x, appearance_guidance) + x = self.class_agg(x, text_guidance) + return x + + +@MODELS.register_module() +class CATSegAggregator(BaseModule): + """CATSeg Aggregator. + + This Aggregator is the mmseg implementation of + `CAT-Seg `_. + + Args: + text_guidance_dim (int): Text guidance dimensions. Default: 512. + text_guidance_proj_dim (int): Text guidance projection dimensions. + Default: 128. + appearance_guidance_dim (int): Appearance guidance dimensions. + Default: 512. + appearance_guidance_proj_dim (int): Appearance guidance projection + dimensions. Default: 128. + num_layers (int): Aggregator layer number. Default: 4. + num_heads (int): Attention layer head number. Default: 4. + embed_dims (int): Input feature dimensions. Default: 128. + pooling_size (tuple | list): Pooling size of the class aggregator + layer. Default: (6, 6). + mlp_ratios (int): The hidden dimension ratio w.r.t. input dimension. + Default: 4. + window_size (int): Swin block window size. Default:12. + attention_type (str): Attention type of class aggregator layer. + Default:'linear'. 
+ prompt_channel (int): Prompt channels. Default: 80. + """ + + def __init__(self, + text_guidance_dim=512, + text_guidance_proj_dim=128, + appearance_guidance_dim=512, + appearance_guidance_proj_dim=128, + num_layers=4, + num_heads=4, + embed_dims=128, + pooling_size=(6, 6), + mlp_ratios=4, + window_size=12, + attention_type='linear', + prompt_channel=80, + **kwargs): + super().__init__(**kwargs) + self.num_layers = num_layers + self.embed_dims = embed_dims + + self.layers = nn.ModuleList([ + AggregatorLayer( + embed_dims=embed_dims, + text_guidance_dims=text_guidance_proj_dim, + appearance_guidance_dims=appearance_guidance_proj_dim, + num_heads=num_heads, + mlp_ratios=mlp_ratios, + window_size=window_size, + attention_type=attention_type, + pooling_size=pooling_size) for _ in range(num_layers) + ]) + + self.conv1 = nn.Conv2d( + prompt_channel, embed_dims, kernel_size=7, stride=1, padding=3) + + self.guidance_projection = nn.Sequential( + nn.Conv2d( + appearance_guidance_dim, + appearance_guidance_proj_dim, + kernel_size=3, + stride=1, + padding=1), + nn.ReLU(), + ) if appearance_guidance_dim > 0 else None + + self.text_guidance_projection = nn.Sequential( + nn.Linear(text_guidance_dim, text_guidance_proj_dim), + nn.ReLU(), + ) if text_guidance_dim > 0 else None + + def feature_map(self, img_feats, text_feats): + """Concatenation type cost volume. + + For ablation study of cost volume type. + """ + img_feats = F.normalize(img_feats, dim=1) # B C H W + img_feats = img_feats.unsqueeze(2).repeat(1, 1, text_feats.shape[1], 1, + 1) + text_feats = F.normalize(text_feats, dim=-1) # B T P C + text_feats = text_feats.mean(dim=-2) + text_feats = F.normalize(text_feats, dim=-1) # B T C + text_feats = text_feats.unsqueeze(-1).unsqueeze(-1).repeat( + 1, 1, 1, img_feats.shape[-2], img_feats.shape[-1]).transpose(1, 2) + return torch.cat((img_feats, text_feats), dim=1) # B 2C T H W + + def correlation(self, img_feats, text_feats): + """Correlation of image features and text features.""" + img_feats = F.normalize(img_feats, dim=1) # B C H W + text_feats = F.normalize(text_feats, dim=-1) # B T P C + corr = torch.einsum('bchw, btpc -> bpthw', img_feats, text_feats) + return corr + + def corr_embed(self, x): + """Correlation embeddings encoding.""" + B = x.shape[0] + corr_embed = x.permute(0, 2, 1, 3, 4).flatten(0, 1) + corr_embed = self.conv1(corr_embed) + corr_embed = corr_embed.reshape(B, -1, self.embed_dims, x.shape[-2], + x.shape[-1]).transpose(1, 2) + return corr_embed + + def forward(self, inputs): + """ + Args: + inputs (dict): including the following keys, + 'appearance_feat': list[torch.Tensor], w.r.t. out_indices of + `self.feature_extractor`. + 'clip_text_feat': the text feature extracted by clip text + encoder. + 'clip_text_feat_test': the text feature extracted by clip text + encoder for testing. + 'clip_img_feat': the image feature extracted clip image + encoder. 
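+
+        Returns:
+            dict: 'corr_embed', the aggregated cost volume embedding of
+                shape (B, embed_dims, T, H, W), and 'appearance_feats',
+                the remaining appearance guidance features forwarded to
+                the decode head.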
+ """ + img_feats = inputs['clip_img_feat'] + B = img_feats.size(0) + appearance_guidance = inputs[ + 'appearance_feat'][::-1] # order (out_indices) 2, 1, 0 + text_feats = inputs['clip_text_feat'] if self.training else inputs[ + 'clip_text_feat_test'] + text_feats = text_feats.repeat(B, 1, 1, 1) + + corr = self.correlation(img_feats, text_feats) + # corr = self.feature_map(img_feats, text_feats) + corr_embed = self.corr_embed(corr) + + projected_guidance, projected_text_guidance = None, None + + if self.guidance_projection is not None: + projected_guidance = self.guidance_projection( + appearance_guidance[0]) + + if self.text_guidance_projection is not None: + text_feats = text_feats.mean(dim=-2) + text_feats = text_feats / text_feats.norm(dim=-1, keepdim=True) + projected_text_guidance = self.text_guidance_projection(text_feats) + + for layer in self.layers: + corr_embed = layer(corr_embed, projected_guidance, + projected_text_guidance) + + return dict( + corr_embed=corr_embed, appearance_feats=appearance_guidance[1:]) diff --git a/projects/CAT-Seg/cat_seg/models/cat_head.py b/projects/CAT-Seg/cat_seg/models/cat_head.py new file mode 100644 index 0000000000..36bb1c5617 --- /dev/null +++ b/projects/CAT-Seg/cat_seg/models/cat_head.py @@ -0,0 +1,116 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule + +from mmseg.models.decode_heads.decode_head import BaseDecodeHead +from mmseg.registry import MODELS + + +class UpBlock(nn.Module): + """Upsample Block with two consecutive convolution layers.""" + + def __init__(self, in_channels, out_channels, guidance_channels): + super().__init__() + self.up = nn.ConvTranspose2d( + in_channels, + in_channels - guidance_channels, + kernel_size=2, + stride=2) + self.conv1 = ConvModule( + in_channels, + out_channels, + 3, + padding=1, + bias=False, + norm_cfg=dict(type='GN', num_groups=out_channels // 16)) + self.conv2 = ConvModule( + out_channels, + out_channels, + 3, + padding=1, + bias=False, + norm_cfg=dict(type='GN', num_groups=out_channels // 16)) + + def forward(self, x, guidance=None): + """Forward function with visual guidance.""" + x = self.up(x) + if guidance is not None: + T = x.size(0) // guidance.size(0) + # guidance = repeat(guidance, "B C H W -> (B T) C H W", T=T) + guidance = guidance.repeat(T, 1, 1, 1) + x = torch.cat([x, guidance], dim=1) + x = self.conv1(x) + + return self.conv2(x) + + +@MODELS.register_module() +class CATSegHead(BaseDecodeHead): + """CATSeg Head. + + This segmentation head is the mmseg implementation of + `CAT-Seg `_. + + Args: + embed_dims (int): The number of input dimensions. + decoder_dims (list): The number of decoder dimensions. + decoder_guidance_proj_dims (list): The number of appearance + guidance dimensions. 
+        init_cfg (dict | list | None, optional): The init config.
+            Default: None.
+    """
+
+    def __init__(self,
+                 embed_dims=128,
+                 decoder_dims=(64, 32),
+                 decoder_guidance_dims=(256, 128),
+                 decoder_guidance_proj_dims=(32, 16),
+                 **kwargs):
+        super().__init__(**kwargs)
+        self.decoder_guidance_projection = nn.ModuleList([
+            nn.Sequential(
+                nn.Conv2d(
+                    dec_dims,
+                    dec_dims_proj,
+                    kernel_size=3,
+                    stride=1,
+                    padding=1),
+                nn.ReLU(),
+            ) for dec_dims, dec_dims_proj in zip(decoder_guidance_dims,
+                                                 decoder_guidance_proj_dims)
+        ]) if decoder_guidance_dims[0] > 0 else None
+
+        self.decoder1 = UpBlock(embed_dims, decoder_dims[0],
+                                decoder_guidance_proj_dims[0])
+        self.decoder2 = UpBlock(decoder_dims[0], decoder_dims[1],
+                                decoder_guidance_proj_dims[1])
+        self.conv_seg = nn.Conv2d(
+            decoder_dims[1], 1, kernel_size=3, stride=1, padding=1)
+
+    def forward(self, inputs):
+        """Forward function.
+
+        Args:
+            inputs (dict): Input features including the following features,
+                corr_embed: aggregated correlation embeddings.
+                appearance_feats: decoder appearance feature guidance.
+        """
+        # decoder guidance projection
+        if self.decoder_guidance_projection is not None:
+            projected_decoder_guidance = [
+                proj(g) for proj, g in zip(self.decoder_guidance_projection,
+                                           inputs['appearance_feats'])
+            ]
+
+        # decoder layers
+        B = inputs['corr_embed'].size(0)
+        corr_embed = inputs['corr_embed'].transpose(1, 2).flatten(0, 1)
+        corr_embed = self.decoder1(corr_embed, projected_decoder_guidance[0])
+        corr_embed = self.decoder2(corr_embed, projected_decoder_guidance[1])
+
+        output = self.cls_seg(corr_embed)
+
+        # rearrange the output to (B, T, H, W)
+        H_ori, W_ori = output.shape[-2:]
+        output = output.reshape(B, -1, H_ori, W_ori)
+        return output
diff --git a/projects/CAT-Seg/cat_seg/models/clip_ovseg.py b/projects/CAT-Seg/cat_seg/models/clip_ovseg.py
new file mode 100644
index 0000000000..cb67744e34
--- /dev/null
+++ b/projects/CAT-Seg/cat_seg/models/clip_ovseg.py
@@ -0,0 +1,293 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import json
+import os
+from typing import List
+
+import torch
+import torch.nn.functional as F
+from huggingface_hub.utils._errors import LocalEntryNotFoundError
+from mmengine.model import BaseModule
+
+from mmseg.registry import MODELS
+from mmseg.utils import ConfigType
+from ..utils import clip_wrapper
+from ..utils.clip_templates import (IMAGENET_TEMPLATES,
+                                    IMAGENET_TEMPLATES_SELECT)
+
+
+@MODELS.register_module()
+class CLIPOVCATSeg(BaseModule):
+    """CLIP based Open Vocabulary CAT-Seg model backbone.
+
+    This backbone is the modified implementation of `CAT-Seg Backbone
+    <https://github.com/KU-CVLAB/CAT-Seg>`_. It combines the CLIP model and
+    another feature extractor, a.k.a. the appearance guidance extractor
+    in the original `CAT-Seg`.
+
+    Args:
+        feature_extractor (ConfigType): Appearance guidance extractor
+            config dict.
+        train_class_json (str): The path to the training class json file.
+        test_class_json (str): The path to the test class json file.
+        clip_pretrained (str): The pre-trained clip type.
+        clip_finetune (str): The finetuning settings of clip model.
+        custom_clip_weights (str): The customized clip weights directory. When
+            encountering huggingface model download errors, you can manually
+            download the pretrained weights.
+        backbone_multiplier (float): The learning rate multiplier.
+            Default: 0.01.
+        prompt_depth (int): The prompt depth. Default: 0.
+        prompt_length (int): The prompt length. Default: 0.
+        prompt_ensemble_type (str): The prompt ensemble type.
+            Default: "imagenet".
+        pixel_mean (List[float]): The pixel mean for feature extractor.
+        pixel_std (List[float]): The pixel std for feature extractor.
+        clip_pixel_mean (List[float]): The pixel mean for clip model.
+        clip_pixel_std (List[float]): The pixel std for clip model.
+        clip_img_feat_size (List[int]): Clip image embedding size from
+            image encoder.
+        init_cfg (dict or list[dict], optional): Initialization config dict.
+            Default: None.
+    """
+
+    def __init__(
+        self,
+        feature_extractor: ConfigType,
+        train_class_json: str,
+        test_class_json: str,
+        clip_pretrained: str,
+        clip_finetune: str,
+        custom_clip_weights: str = None,
+        backbone_multiplier=0.01,
+        prompt_depth: int = 0,
+        prompt_length: int = 0,
+        prompt_ensemble_type: str = 'imagenet',
+        pixel_mean: List[float] = [123.675, 116.280, 103.530],
+        pixel_std: List[float] = [58.395, 57.120, 57.375],
+        clip_pixel_mean: List[float] = [
+            122.7709383, 116.7460125, 104.09373615
+        ],
+        clip_pixel_std: List[float] = [68.5005327, 66.6321579, 70.3231630],
+        clip_img_feat_size: List[int] = [24, 24],
+        init_cfg=None):
+        super().__init__(init_cfg=init_cfg)
+        # normalization parameters
+        self.register_buffer('pixel_mean',
+                             torch.Tensor(pixel_mean).view(1, -1, 1, 1), False)
+        self.register_buffer('pixel_std',
+                             torch.Tensor(pixel_std).view(1, -1, 1, 1), False)
+        self.register_buffer('clip_pixel_mean',
+                             torch.Tensor(clip_pixel_mean).view(1, -1, 1, 1),
+                             False)
+        self.register_buffer('clip_pixel_std',
+                             torch.Tensor(clip_pixel_std).view(1, -1, 1, 1),
+                             False)
+        self.clip_resolution = (
+            384, 384) if clip_pretrained == 'ViT-B/16' else (336, 336)
+        # modified clip image encoder with fixed size dense output
+        self.clip_img_feat_size = clip_img_feat_size
+
+        # prepare clip templates
+        self.prompt_ensemble_type = prompt_ensemble_type
+        if self.prompt_ensemble_type == 'imagenet_select':
+            prompt_templates = IMAGENET_TEMPLATES_SELECT
+        elif self.prompt_ensemble_type == 'imagenet':
+            prompt_templates = IMAGENET_TEMPLATES
+        elif self.prompt_ensemble_type == 'single':
+            prompt_templates = [
+                'A photo of a {} in the scene',
+            ]
+        else:
+            raise NotImplementedError
+        self.prompt_templates = prompt_templates
+
+        # build the feature extractor
+        self.feature_extractor = MODELS.build(feature_extractor)
+
+        # build CLIP model
+        with open(train_class_json) as f_in:
+            self.class_texts = json.load(f_in)
+        with open(test_class_json) as f_in:
+            self.test_class_texts = json.load(f_in)
+        assert self.class_texts is not None
+        if self.test_class_texts is None:
+            self.test_class_texts = self.class_texts
+        device = 'cuda' if torch.cuda.is_available() else 'cpu'
+        self.tokenizer = None
+        if clip_pretrained == 'ViT-G' or clip_pretrained == 'ViT-H':
+            # for OpenCLIP models
+            import open_clip
+            name, pretrain = (
+                'ViT-H-14',
+                'laion2b_s32b_b79k') if clip_pretrained == 'ViT-H' else (
+                    'ViT-bigG-14', 'laion2b_s39b_b160k')
+            try:
+                open_clip_model = open_clip.create_model_and_transforms(
+                    name,
+                    pretrained=pretrain,
+                    device=device,
+                    force_image_size=336,
+                )
+                clip_model, _, clip_preprocess = open_clip_model
+            except (ConnectionError, LocalEntryNotFoundError) as e:
+                print(f'Encountered {e} when loading weights from '
+                      'huggingface!')
+                print(
+                    f'Will load {pretrain} weights from {custom_clip_weights}.'
+                )
+                assert custom_clip_weights is not None, 'Please specify custom weights directory.'  # noqa
+                assert os.path.exists(
+                    os.path.join(custom_clip_weights,
+                                 'open_clip_pytorch_model.bin')
+                ), 'Please provide a valid directory for manually downloaded model.'
# noqa + open_clip_model = open_clip.create_model_and_transforms( + name, + pretrained=None, + device='cpu', + force_image_size=336, + ) + clip_model, _, clip_preprocess = open_clip_model + + open_clip.load_checkpoint( + clip_model, + os.path.expanduser( + os.path.join(custom_clip_weights, + 'open_clip_pytorch_model.bin'))) + clip_model.to(torch.device(device)) + + self.tokenizer = open_clip.get_tokenizer(name) + else: + # for OpenAI models + clip_model, clip_preprocess = clip_wrapper.load( + clip_pretrained, + device=device, + jit=False, + prompt_depth=prompt_depth, + prompt_length=prompt_length) + + # pre-encode classes text prompts + text_features = self.class_embeddings(self.class_texts, + prompt_templates, clip_model, + device).permute(1, 0, 2).float() + text_features_test = self.class_embeddings(self.test_class_texts, + prompt_templates, + clip_model, + device).permute(1, 0, + 2).float() + self.register_buffer('text_features', text_features, False) + self.register_buffer('text_features_test', text_features_test, False) + + # prepare CLIP model finetune + self.clip_finetune = clip_finetune + self.clip_model = clip_model.float() + self.clip_preprocess = clip_preprocess + + for name, params in self.clip_model.named_parameters(): + if 'visual' in name: + if clip_finetune == 'prompt': + params.requires_grad = True if 'prompt' in name else False + elif clip_finetune == 'attention': + if 'attn' in name or 'position' in name: + params.requires_grad = True + else: + params.requires_grad = False + elif clip_finetune == 'full': + params.requires_grad = True + else: + params.requires_grad = False + else: + params.requires_grad = False + + finetune_backbone = backbone_multiplier > 0. + for name, params in self.feature_extractor.named_parameters(): + if 'norm0' in name: + params.requires_grad = False + else: + params.requires_grad = finetune_backbone + + @torch.no_grad() + def class_embeddings(self, + classnames, + templates, + clip_model, + device='cpu'): + """Convert class names to text embeddings by clip model. + + Args: + classnames (list): loaded from json file. + templates (dict): text template. + clip_model (nn.Module): prepared clip model. + device (str | torch.device): loading device of text + encoder results. + """ + zeroshot_weights = [] + for classname in classnames: + if ', ' in classname: + classname_splits = classname.split(', ') + texts = [] + for template in templates: + for cls_split in classname_splits: + texts.append(template.format(cls_split)) + else: + texts = [template.format(classname) + for template in templates] # format with class + if self.tokenizer is not None: + texts = self.tokenizer(texts).to(device) + else: + texts = clip_wrapper.tokenize(texts).to(device) + class_embeddings = clip_model.encode_text(texts) + class_embeddings /= class_embeddings.norm(dim=-1, keepdim=True) + if len(templates) != class_embeddings.shape[0]: + class_embeddings = class_embeddings.reshape( + len(templates), -1, class_embeddings.shape[-1]).mean(dim=1) + class_embeddings /= class_embeddings.norm(dim=-1, keepdim=True) + class_embedding = class_embeddings + zeroshot_weights.append(class_embedding) + zeroshot_weights = torch.stack(zeroshot_weights, dim=1).to(device) + return zeroshot_weights + + def custom_normalize(self, inputs): + """Input normalization for clip model and feature extractor + respectively. + + Args: + inputs: batched input images. 
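+
+        Returns:
+            tuple[torch.Tensor]: the inputs normalized for the feature
+                extractor, and the inputs normalized and resized to
+                ``self.clip_resolution`` for the clip image encoder.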
+ """ + # clip images + batched_clip = (inputs - self.clip_pixel_mean) / self.clip_pixel_std + batched_clip = F.interpolate( + batched_clip, + size=self.clip_resolution, + mode='bilinear', + align_corners=False) + # feature extractor images + batched = (inputs - self.pixel_mean) / self.pixel_std + return batched, batched_clip + + def forward(self, inputs): + """ + Args: + inputs: minibatch image. (B, 3, H, W) + Returns: + outputs (dict): + 'appearance_feat': list[torch.Tensor], w.r.t. out_indices of + `self.feature_extractor`. + 'clip_text_feat': the text feature extracted by clip text encoder. + 'clip_text_feat_test': the text feature extracted by clip text + encoder for testing. + 'clip_img_feat': the image feature extracted clip image encoder. + """ + inputs, clip_inputs = self.custom_normalize(inputs) + outputs = dict() + # extract appearance guidance feature + outputs['appearance_feat'] = self.feature_extractor(inputs) + + # extract clip features + outputs['clip_text_feat'] = self.text_features + outputs['clip_text_feat_test'] = self.text_features_test + clip_features = self.clip_model.encode_image( + clip_inputs, dense=True) # B, 577(24x24+1), C + B = clip_features.size(0) + outputs['clip_img_feat'] = clip_features[:, 1:, :].permute( + 0, 2, 1).reshape(B, -1, *self.clip_img_feat_size) + + return outputs diff --git a/projects/CAT-Seg/cat_seg/utils/__init__.py b/projects/CAT-Seg/cat_seg/utils/__init__.py new file mode 100644 index 0000000000..88746b2cba --- /dev/null +++ b/projects/CAT-Seg/cat_seg/utils/__init__.py @@ -0,0 +1,10 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .clip_templates import (IMAGENET_TEMPLATES, IMAGENET_TEMPLATES_SELECT, + IMAGENET_TEMPLATES_SELECT_CLIP, ViLD_templates) +from .self_attention_block import FullAttention, LinearAttention + +__all__ = [ + 'FullAttention', 'LinearAttention', 'IMAGENET_TEMPLATES', + 'IMAGENET_TEMPLATES_SELECT', 'IMAGENET_TEMPLATES_SELECT_CLIP', + 'ViLD_templates' +] diff --git a/projects/CAT-Seg/cat_seg/utils/bpe_vocab/bpe_simple_vocab_16e6.txt.gz b/projects/CAT-Seg/cat_seg/utils/bpe_vocab/bpe_simple_vocab_16e6.txt.gz new file mode 100644 index 0000000000..7b5088a527 Binary files /dev/null and b/projects/CAT-Seg/cat_seg/utils/bpe_vocab/bpe_simple_vocab_16e6.txt.gz differ diff --git a/projects/CAT-Seg/cat_seg/utils/clip_model.py b/projects/CAT-Seg/cat_seg/utils/clip_model.py new file mode 100644 index 0000000000..977444f5b5 --- /dev/null +++ b/projects/CAT-Seg/cat_seg/utils/clip_model.py @@ -0,0 +1,651 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from collections import OrderedDict +from typing import Tuple, Union + +import torch +import torch.nn.functional as F +from torch import nn + + +class Bottleneck(nn.Module): + """Custom implementation of Bottleneck in ResNet.""" + expansion = 4 + + def __init__(self, inplanes, planes, stride=1): + super().__init__() + # all conv layers have stride 1. 
+ # an avgpool is performed after the second convolution when stride > 1 + self.conv1 = nn.Conv2d(inplanes, planes, 1, bias=False) + self.bn1 = nn.BatchNorm2d(planes) + + self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(planes) + + self.avgpool = nn.AvgPool2d(stride) if stride > 1 else nn.Identity() + + self.conv3 = nn.Conv2d(planes, planes * self.expansion, 1, bias=False) + self.bn3 = nn.BatchNorm2d(planes * self.expansion) + + self.relu = nn.ReLU(inplace=True) + self.downsample = None + self.stride = stride + + if stride > 1 or inplanes != planes * Bottleneck.expansion: + # downsampling layer is prepended with an avgpool, + # and the subsequent convolution has stride 1 + self.downsample = nn.Sequential( + OrderedDict([('-1', nn.AvgPool2d(stride)), + ('0', + nn.Conv2d( + inplanes, + planes * self.expansion, + 1, + stride=1, + bias=False)), + ('1', nn.BatchNorm2d(planes * self.expansion))])) + + def forward(self, x: torch.Tensor): + """ + Args: + x (torch.Tensor): the input feature. + """ + identity = x + + out = self.relu(self.bn1(self.conv1(x))) + out = self.relu(self.bn2(self.conv2(out))) + out = self.avgpool(out) + out = self.bn3(self.conv3(out)) + + if self.downsample is not None: + identity = self.downsample(x) + + out += identity + out = self.relu(out) + return out + + +class AttentionPool2d(nn.Module): + """Attention Pool2d.""" + + def __init__(self, + spacial_dim: int, + embed_dim: int, + num_heads: int, + output_dim: int = None): + super().__init__() + self.positional_embedding = nn.Parameter( + torch.randn(spacial_dim**2 + 1, embed_dim) / embed_dim**0.5) + self.k_proj = nn.Linear(embed_dim, embed_dim) + self.q_proj = nn.Linear(embed_dim, embed_dim) + self.v_proj = nn.Linear(embed_dim, embed_dim) + self.c_proj = nn.Linear(embed_dim, output_dim or embed_dim) + self.num_heads = num_heads + + def forward(self, x): + """ + Args: + x (torch.Tensor): the input feature. + """ + x = x.flatten(start_dim=2).permute(2, 0, 1) # NCHW -> (HW)NC + x = torch.cat([x.mean(dim=0, keepdim=True), x], dim=0) # (HW+1)NC + x = x + self.positional_embedding[:, None, :].to(x.dtype) # (HW+1)NC + x, _ = F.multi_head_attention_forward( + query=x[:1], + key=x, + value=x, + embed_dim_to_check=x.shape[-1], + num_heads=self.num_heads, + q_proj_weight=self.q_proj.weight, + k_proj_weight=self.k_proj.weight, + v_proj_weight=self.v_proj.weight, + in_proj_weight=None, + in_proj_bias=torch.cat( + [self.q_proj.bias, self.k_proj.bias, self.v_proj.bias]), + bias_k=None, + bias_v=None, + add_zero_attn=False, + dropout_p=0, + out_proj_weight=self.c_proj.weight, + out_proj_bias=self.c_proj.bias, + use_separate_proj_weight=True, + training=self.training, + need_weights=False) + return x.squeeze(0) + + +class ModifiedResNet(nn.Module): + """A ResNet class that is similar to torchvision's but contains the + following changes: + + - There are now 3 "stem" convolutions as opposed to 1, with an average + pool instead of a max pool. 
+ - Performs anti-aliasing strided convolutions, where an avgpool is + prepended to convolutions with stride > 1 + - The final pooling layer is a QKV attention instead of an average pool + """ + + def __init__(self, + layers, + output_dim, + heads, + input_resolution=224, + width=64): + super().__init__() + self.output_dim = output_dim + self.input_resolution = input_resolution + + # the 3-layer stem + self.conv1 = nn.Conv2d( + 3, width // 2, kernel_size=3, stride=2, padding=1, bias=False) + self.bn1 = nn.BatchNorm2d(width // 2) + self.relu1 = nn.ReLU(inplace=True) + self.conv2 = nn.Conv2d( + width // 2, width // 2, kernel_size=3, padding=1, bias=False) + self.bn2 = nn.BatchNorm2d(width // 2) + self.relu2 = nn.ReLU(inplace=True) + self.conv3 = nn.Conv2d( + width // 2, width, kernel_size=3, padding=1, bias=False) + self.bn3 = nn.BatchNorm2d(width) + self.relu3 = nn.ReLU(inplace=True) + self.avgpool = nn.AvgPool2d(2) + + # residual layers + # this is a *mutable* variable used during construction + self._inplanes = width + self.layer1 = self._make_layer(width, layers[0]) + self.layer2 = self._make_layer(width * 2, layers[1], stride=2) + self.layer3 = self._make_layer(width * 4, layers[2], stride=2) + self.layer4 = self._make_layer(width * 8, layers[3], stride=2) + + embed_dim = width * 32 # the ResNet feature dimension + self.attnpool = AttentionPool2d(input_resolution // 32, embed_dim, + heads, output_dim) + + def _make_layer(self, planes, blocks, stride=1): + """Build resnet layers.""" + layers = [Bottleneck(self._inplanes, planes, stride)] + + self._inplanes = planes * Bottleneck.expansion + for _ in range(1, blocks): + layers.append(Bottleneck(self._inplanes, planes)) + + return nn.Sequential(*layers) + + def forward(self, x): + """ + Args: + x (torch.Tensor): the input mini-batch images. + """ + + def stem(x): + x = self.relu1(self.bn1(self.conv1(x))) + x = self.relu2(self.bn2(self.conv2(x))) + x = self.relu3(self.bn3(self.conv3(x))) + x = self.avgpool(x) + return x + + x = x.type(self.conv1.weight.dtype) + x = stem(x) + x = self.layer1(x) + x = self.layer2(x) + x = self.layer3(x) + x = self.layer4(x) + x = self.attnpool(x) + + return x + + +class LayerNorm(nn.LayerNorm): + """Subclass torch's LayerNorm to handle fp16.""" + + def forward(self, x: torch.Tensor): + """ + Args: + x (torch.Tensor): the input feature. + """ + orig_type = x.dtype + ret = super().forward(x.type(torch.float32)) + return ret.type(orig_type) + + +class QuickGELU(nn.Module): + """Wrapper of GELU activation layer.""" + + def forward(self, x: torch.Tensor): + """ + Args: + x (torch.Tensor): the input feature. 
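+
+        Note: computes ``x * sigmoid(1.702 * x)``, the fast sigmoid
+        approximation of GELU used by the original CLIP implementation.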
+ """ + return x * torch.sigmoid(1.702 * x) + + +class ResidualAttentionBlock(nn.Module): + """Attention block with residual connection.""" + + def __init__(self, + d_model: int, + n_head: int, + attn_mask: torch.Tensor = None): + super().__init__() + + self.attn = nn.MultiheadAttention(d_model, n_head) + self.ln_1 = LayerNorm(d_model) + self.mlp = nn.Sequential( + OrderedDict([('c_fc', nn.Linear(d_model, d_model * 4)), + ('gelu', QuickGELU()), + ('c_proj', nn.Linear(d_model * 4, d_model))])) + self.ln_2 = LayerNorm(d_model) + self.attn_mask = attn_mask + self.mask_pre_mlp = True + + def attention(self, x: torch.Tensor): + """Calculate mask multi-head-attention.""" + self.attn_mask = self.attn_mask.to( + dtype=x.dtype, + device=x.device) if self.attn_mask is not None else None + return self.attn( + x, x, x, need_weights=False, attn_mask=self.attn_mask)[0] + + def forward(self, x: torch.Tensor): + """ + Args: + x (torch.Tensor): the input feature. + """ + x = x + self.attention(self.ln_1(x)) + x = x + self.mlp(self.ln_2(x)) + return x + + def forward_dense(self, x: torch.Tensor): + """Reinplementation of forward function for dense prediction of image + encoder in CLIP model. + + Args: + x (torch.Tensor): the input feature. + """ + y = self.ln_1(x) + y = F.linear(y, self.attn.in_proj_weight, self.attn.in_proj_bias) + L, N, D = y.shape # L N 3D + + y = y.reshape(L, N, 3, D // 3).permute(2, 1, 0, + 3).reshape(3 * N, L, D // 3) + y = F.linear(y, self.attn.out_proj.weight, self.attn.out_proj.bias) + + q, k, v = y.tensor_split(3, dim=0) + v = v.transpose(1, 0) + x # L N D + + v = v + self.mlp(self.ln_2(v)) + return v + + +class Transformer(nn.Module): + """General Transformer Architecture for both image and text encoder.""" + + def __init__(self, + width: int, + layers: int, + heads: int, + attn_mask: torch.Tensor = None, + prompt_length=0, + prompt_depth=0): + super().__init__() + self.width = width + self.layers = layers + self.resblocks = nn.Sequential(*[ + ResidualAttentionBlock(width, heads, attn_mask) + for _ in range(layers) + ]) + + self.prompt_length = prompt_length + self.prompt_depth = prompt_depth + self.prompt_tokens = nn.Parameter( + torch.zeros(prompt_depth, prompt_length, + width)) if prompt_length > 0 else None + if self.prompt_tokens is not None: + nn.init.xavier_uniform_(self.prompt_tokens) + + def forward(self, x: torch.Tensor, dense=False): + """ + Args: + x (torch.Tensor): input features. + dense (bool): whether use reimplemented dense forward + function in the last layer. 
+ """ + for i, resblock in enumerate(self.resblocks): + if self.prompt_length > 0 and i < self.prompt_depth: + length = self.prompt_length + 1 if i > 0 else 1 + x = torch.cat((x[0:1, :, :], self.prompt_tokens[i].repeat( + x.shape[1], 1, 1).permute(1, 0, 2), x[length:, :, :])) + + if i == self.layers - 1 and dense: + x = resblock.forward_dense(x) + x = torch.cat((x[0:1, :, :], x[self.prompt_length + 1::, :]), + dim=0) + else: + x = resblock(x) + + return x + + +class VisualTransformer(nn.Module): + """Visual encoder for CLIP model.""" + + def __init__(self, input_resolution: int, patch_size: int, width: int, + layers: int, heads: int, output_dim: int, prompt_depth: int, + prompt_length: int): + super().__init__() + self.output_dim = output_dim + self.conv1 = nn.Conv2d( + in_channels=3, + out_channels=width, + kernel_size=patch_size, + stride=patch_size, + bias=False) + + scale = width**-0.5 + self.class_embedding = nn.Parameter(scale * torch.randn(width)) + self.positional_embedding = nn.Parameter(scale * torch.randn( + (input_resolution // patch_size)**2 + 1, width)) + self.ln_pre = LayerNorm(width) + + self.transformer = Transformer( + width, + layers, + heads, + prompt_depth=prompt_depth, + prompt_length=prompt_length) + + self.ln_post = LayerNorm(width) + self.proj = nn.Parameter(scale * torch.randn(width, output_dim)) + + self.patch_size = patch_size + self.input_resolution = input_resolution + + def forward(self, x: torch.Tensor, dense=False): + """ + Args: + x (torch.Tensor): input features. + dense (bool): whether use reimplemented dense forward + function in the last layer. + """ + x = self.conv1(x) # shape = [*, width, grid, grid] + x = x.reshape(x.shape[0], x.shape[1], + -1) # shape = [*, width, grid ** 2] + x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] + x = torch.cat([ + self.class_embedding.to(x.dtype) + torch.zeros( + x.shape[0], 1, x.shape[-1], dtype=x.dtype, device=x.device), x + ], + dim=1) # shape = [*, grid ** 2 + 1, width] + + if dense and (x.shape[1] != self.positional_embedding.shape[0]): + x = x + self.resized_pos_embed(self.input_resolution, + x.shape[1]).to(x.dtype) + else: + x = x + self.positional_embedding.to(x.dtype) + + x = self.ln_pre(x) + + x = x.permute(1, 0, 2) # NLD -> LND + x = self.transformer(x, dense) + x = x.permute(1, 0, 2) # LND -> NLD + + if dense: + x = self.ln_post(x[:, :, :]) + else: + x = self.ln_post(x[:, 0, :]) + + if self.proj is not None: + x = x @ self.proj + + return x + + def resized_pos_embed(self, in_res, tgt_res, mode='bicubic'): + """Resize the position embedding.""" + # assert L == (input_resolution // self.patch_size) ** 2 + 1 + L, D = self.positional_embedding.shape + + in_side = in_res // self.patch_size + # tgt_side = tgt_res // self.patch_size + tgt_side = int((tgt_res - 1)**0.5) + + cls_pos = self.positional_embedding[0].unsqueeze(0) # 1 D + pos_embed = self.positional_embedding[1:].reshape( + 1, in_side, in_side, D).permute(0, 3, 1, 2) # L-1 D -> 1 D S S + resized_pos_embed = F.interpolate( + pos_embed, + size=(tgt_side, tgt_side), + mode=mode, + align_corners=False, + ) # 1 D S S -> 1 D S' S' + resized_pos_embed = resized_pos_embed.squeeze(0).reshape( + D, -1).T # L'-1 D + + return torch.cat((cls_pos, resized_pos_embed), dim=0) + + +class CLIP(nn.Module): + """Custom implementation of CLIP model. 
+ + Refer to: https://github.com/openai/CLIP + """ + + def __init__( + self, + embed_dim: int, + # vision + image_resolution: int, + vision_layers: Union[Tuple[int, int, int, int], int], + vision_width: int, + vision_patch_size: int, + # text + context_length: int, + vocab_size: int, + transformer_width: int, + transformer_heads: int, + transformer_layers: int, + # prompt + prompt_depth: int = 0, + prompt_length: int = 0, + ): + super().__init__() + + self.context_length = context_length + + self.image_resolution = image_resolution + + if isinstance(vision_layers, (tuple, list)): + assert prompt_length == 0 and prompt_depth == 0 + vision_heads = vision_width * 32 // 64 + self.visual = ModifiedResNet( + layers=vision_layers, + output_dim=embed_dim, + heads=vision_heads, + input_resolution=image_resolution, + width=vision_width) + else: + vision_heads = vision_width // 64 + self.visual = VisualTransformer( + input_resolution=image_resolution, + patch_size=vision_patch_size, + width=vision_width, + layers=vision_layers, + heads=vision_heads, + output_dim=embed_dim, + prompt_depth=prompt_depth, + prompt_length=prompt_length, + ) + + self.transformer = Transformer( + width=transformer_width, + layers=transformer_layers, + heads=transformer_heads, + attn_mask=self.build_attention_mask()) + + self.vocab_size = vocab_size + self.token_embedding = nn.Embedding(vocab_size, transformer_width) + self.positional_embedding = nn.Parameter( + torch.empty(self.context_length, transformer_width)) + self.ln_final = LayerNorm(transformer_width) + + self.text_projection = nn.Parameter( + torch.empty(transformer_width, embed_dim)) + self.logit_scale = nn.Parameter(torch.ones([])) + + def build_attention_mask(self): + """Create causal attention mask.""" + # lazily create causal attention mask, with full attention between + # the vision tokens pytorch uses additive attention mask; fill with + # -inf + mask = torch.empty(self.context_length, self.context_length) + mask.fill_(float('-inf')) + mask.triu_(1) # zero out the lower diagonal + return mask + + @property + def dtype(self): + """Return the dtype of the model.""" + return self.visual.conv1.weight.dtype + + def encode_image(self, image, masks=None, pool_mask=None, dense=False): + """Image encoding.""" + if pool_mask is not None: + return self.visual( + image.type(self.dtype), mask=pool_mask, dense=dense) + if masks is None: + return self.visual(image.type(self.dtype), dense=dense) + else: + return self.visual(image.type(self.dtype), masks.type(self.dtype)) + + def encode_text(self, text): + """Texts encoding.""" + x = self.token_embedding(text).type( + self.dtype) # [batch_size, n_ctx, d_model] + + x = x + self.positional_embedding.type(self.dtype) + x = x.permute(1, 0, 2) # NLD -> LND + x = self.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + x = self.ln_final(x).type(self.dtype) + + # x.shape = [batch_size, n_ctx, transformer.width] + # take features from the eot embedding (eot_token is the highest number + # in each sequence) + x = x[torch.arange(x.shape[0]), + text.argmax(dim=-1)] @ self.text_projection + + return x + + def forward(self, image, text): + """ + Args: + image (torch.Tensor): input images. + text (torch.Tensor): input text. 
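+
+        Returns:
+            tuple[torch.Tensor]: ``logits_per_image`` and
+                ``logits_per_text``, the scaled cosine similarities, each
+                with shape (batch_size, batch_size).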
+ """ + image_features = self.encode_image(image) + text_features = self.encode_text(text) + # import pdb; pdb.set_trace() + # normalized features + # image_features shape: [1, 1024] + image_features = image_features / image_features.norm( + dim=-1, keepdim=True) + text_features = text_features / text_features.norm( + dim=-1, keepdim=True) + + # cosine similarity as logits + logit_scale = self.logit_scale.exp() + logits_per_iamge = logit_scale * image_features @ text_features.t() + logits_per_text = logit_scale * text_features @ image_features.t() + + # shape = [global_batch_size, global_batch_size] + return logits_per_iamge, logits_per_text + + +def convert_weights(model: nn.Module): + """Convert applicable model parameters to fp16.""" + + def _convert_weights_to_fp16(layer): + if isinstance(layer, (nn.Conv1d, nn.Conv2d, nn.Linear)): + layer.weight.data = layer.weight.data.half() + if layer.bias is not None: + layer.bias.data = layer.bias.data.half() + + if isinstance(layer, nn.MultiheadAttention): + for attr in [ + *[f'{s}_proj_weight' for s in ['in', 'q', 'k', 'v']], + 'in_proj_bias', 'bias_k', 'bias_v' + ]: + tensor = getattr(layer, attr) + if tensor is not None: + tensor.data = tensor.data.half() + + for name in ['text_projection', 'proj']: + if hasattr(layer, name): + attr = getattr(layer, name) + if attr is not None: + attr.data = attr.data.half() + + model.apply(_convert_weights_to_fp16) + + +def build_model(state_dict: dict, prompt_depth=0, prompt_length=0): + """Build a CLIP model from given pretrained weights.""" + vit = 'visual.proj' in state_dict + + if vit: + vision_width = state_dict['visual.conv1.weight'].shape[0] + vision_layers = len([ + k for k in state_dict.keys() + if k.startswith('visual.') and k.endswith('.attn.in_proj_weight') + ]) + vision_patch_size = state_dict['visual.conv1.weight'].shape[-1] + grid_size = round( + (state_dict['visual.positional_embedding'].shape[0] - 1)**0.5) + image_resolution = vision_patch_size * grid_size + else: + counts: list = [ + len({ + k.split('.')[2] + for k in state_dict if k.startswith(f'visual.layer{b}') + }) for b in [1, 2, 3, 4] + ] + vision_layers = tuple(counts) + vision_width = state_dict['visual.layer1.0.conv1.weight'].shape[0] + output_width = round( + (state_dict['visual.attnpool.positional_embedding'].shape[0] - + 1)**0.5) + vision_patch_size = None + assert output_width**2 + 1 == state_dict[ + 'visual.attnpool.positional_embedding'].shape[0] + image_resolution = output_width * 32 + + embed_dim = state_dict['text_projection'].shape[1] + context_length = state_dict['positional_embedding'].shape[0] + vocab_size = state_dict['token_embedding.weight'].shape[0] + transformer_width = state_dict['ln_final.weight'].shape[0] + transformer_heads = transformer_width // 64 + transformer_layers = len({ + k.split('.')[2] + for k in state_dict if k.startswith('transformer.resblocks') + }) + + model = CLIP( + embed_dim, + image_resolution, + vision_layers, + vision_width, + vision_patch_size, + context_length, + vocab_size, + transformer_width, + transformer_heads, + transformer_layers, + prompt_depth=prompt_depth, + prompt_length=prompt_length, + ) + + for key in ['input_resolution', 'context_length', 'vocab_size']: + del state_dict[key] + + convert_weights(model) + model.load_state_dict(state_dict, strict=False) + return model.eval() diff --git a/projects/CAT-Seg/cat_seg/utils/clip_templates.py b/projects/CAT-Seg/cat_seg/utils/clip_templates.py new file mode 100644 index 0000000000..bfc32dfc56 --- /dev/null +++ 
b/projects/CAT-Seg/cat_seg/utils/clip_templates.py @@ -0,0 +1,204 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +# Source: https://github.com/openai/CLIP. + +IMAGENET_TEMPLATES = [ + 'a bad photo of a {}.', + 'a photo of many {}.', + 'a sculpture of a {}.', + 'a photo of the hard to see {}.', + 'a low resolution photo of the {}.', + 'a rendering of a {}.', + 'graffiti of a {}.', + 'a bad photo of the {}.', + 'a cropped photo of the {}.', + 'a tattoo of a {}.', + 'the embroidered {}.', + 'a photo of a hard to see {}.', + 'a bright photo of a {}.', + 'a photo of a clean {}.', + 'a photo of a dirty {}.', + 'a dark photo of the {}.', + 'a drawing of a {}.', + 'a photo of my {}.', + 'the plastic {}.', + 'a photo of the cool {}.', + 'a close-up photo of a {}.', + 'a black and white photo of the {}.', + 'a painting of the {}.', + 'a painting of a {}.', + 'a pixelated photo of the {}.', + 'a sculpture of the {}.', + 'a bright photo of the {}.', + 'a cropped photo of a {}.', + 'a plastic {}.', + 'a photo of the dirty {}.', + 'a jpeg corrupted photo of a {}.', + 'a blurry photo of the {}.', + 'a photo of the {}.', + 'a good photo of the {}.', + 'a rendering of the {}.', + 'a {} in a video game.', + 'a photo of one {}.', + 'a doodle of a {}.', + 'a close-up photo of the {}.', + 'a photo of a {}.', + 'the origami {}.', + 'the {} in a video game.', + 'a sketch of a {}.', + 'a doodle of the {}.', + 'a origami {}.', + 'a low resolution photo of a {}.', + 'the toy {}.', + 'a rendition of the {}.', + 'a photo of the clean {}.', + 'a photo of a large {}.', + 'a rendition of a {}.', + 'a photo of a nice {}.', + 'a photo of a weird {}.', + 'a blurry photo of a {}.', + 'a cartoon {}.', + 'art of a {}.', + 'a sketch of the {}.', + 'a embroidered {}.', + 'a pixelated photo of a {}.', + 'itap of the {}.', + 'a jpeg corrupted photo of the {}.', + 'a good photo of a {}.', + 'a plushie {}.', + 'a photo of the nice {}.', + 'a photo of the small {}.', + 'a photo of the weird {}.', + 'the cartoon {}.', + 'art of the {}.', + 'a drawing of the {}.', + 'a photo of the large {}.', + 'a black and white photo of a {}.', + 'the plushie {}.', + 'a dark photo of a {}.', + 'itap of a {}.', + 'graffiti of the {}.', + 'a toy {}.', + 'itap of my {}.', + 'a photo of a cool {}.', + 'a photo of a small {}.', + 'a tattoo of the {}.', + # 'A photo of a {} in the scene.', +] + +# v1: 59.0875 +IMAGENET_TEMPLATES_SELECT = [ + 'itap of a {}.', + 'a bad photo of the {}.', + 'a origami {}.', + 'a photo of the large {}.', + 'a {} in a video game.', + 'art of the {}.', + 'a photo of the small {}.', + 'A photo of a {} in the scene', +] + +# v9 +IMAGENET_TEMPLATES_SELECT_CLIP = [ + 'a bad photo of the {}.', + 'a photo of the large {}.', + 'a photo of the small {}.', + 'a cropped photo of a {}.', + 'This is a photo of a {}', + 'This is a photo of a small {}', + 'This is a photo of a medium {}', + 'This is a photo of a large {}', + 'This is a masked photo of a {}', + 'This is a masked photo of a small {}', + 'This is a masked photo of a medium {}', + 'This is a masked photo of a large {}', + 'This is a cropped photo of a {}', + 'This is a cropped photo of a small {}', + 'This is a cropped photo of a medium {}', + 'This is a cropped photo of a large {}', + 'A photo of a {} in the scene', + 'a bad photo of the {} in the scene', + 'a photo of the large {} in the scene', + 'a photo of the small {} in the scene', + 'a cropped photo of a {} in the scene', + 'a photo of a masked {} in the scene', + 'There is a {} in the scene', + 'There is the {} 
in the scene', + 'This is a {} in the scene', + 'This is the {} in the scene', + 'This is one {} in the scene', + 'There is a masked {} in the scene', + 'There is the masked {} in the scene', + 'This is a masked {} in the scene', + 'This is the masked {} in the scene', + 'This is one masked {} in the scene', +] + +# v10, for comparison +# IMAGENET_TEMPLATES_SELECT_CLIP = [ +# 'a photo of a {}.', +# +# 'This is a photo of a {}', +# 'This is a photo of a small {}', +# 'This is a photo of a medium {}', +# 'This is a photo of a large {}', +# +# 'This is a photo of a {}', +# 'This is a photo of a small {}', +# 'This is a photo of a medium {}', +# 'This is a photo of a large {}', +# +# 'a photo of a {} in the scene', +# 'a photo of a {} in the scene', +# +# 'There is a {} in the scene', +# 'There is the {} in the scene', +# 'This is a {} in the scene', +# 'This is the {} in the scene', +# 'This is one {} in the scene', +# ] + +ViLD_templates = [ + 'There is {article} {category} in the scene.', + 'There is the {category} in the scene.', + 'a photo of {article} {category} in the scene.', + 'a photo of the {category} in the scene.', + 'a photo of one {category} in the scene.', 'itap of {article} {category}.', + 'itap of my {category}.', 'itap of the {category}.', + 'a photo of {article} {category}.', 'a photo of my {category}.', + 'a photo of the {category}.', 'a photo of one {category}.', + 'a photo of many {category}.', 'a good photo of {article} {category}.', + 'a good photo of the {category}.', 'a bad photo of {article} {category}.', + 'a bad photo of the {category}.', 'a photo of a nice {category}.', + 'a photo of the nice {category}.', 'a photo of a cool {category}.', + 'a photo of the cool {category}.', 'a photo of a weird {category}.', + 'a photo of the weird {category}.', 'a photo of a small {category}.', + 'a photo of the small {category}.', 'a photo of a large {category}.', + 'a photo of the large {category}.', 'a photo of a clean {category}.', + 'a photo of the clean {category}.', 'a photo of a dirty {category}.', + 'a photo of the dirty {category}.', + 'a bright photo of {article} {category}.', + 'a bright photo of the {category}.', + 'a dark photo of {article} {category}.', 'a dark photo of the {category}.', + 'a photo of a hard to see {category}.', + 'a photo of the hard to see {category}.', + 'a low resolution photo of {article} {category}.', + 'a low resolution photo of the {category}.', + 'a cropped photo of {article} {category}.', + 'a cropped photo of the {category}.', + 'a close-up photo of {article} {category}.', + 'a close-up photo of the {category}.', + 'a jpeg corrupted photo of {article} {category}.', + 'a jpeg corrupted photo of the {category}.', + 'a blurry photo of {article} {category}.', + 'a blurry photo of the {category}.', + 'a pixelated photo of {article} {category}.', + 'a pixelated photo of the {category}.', + 'a black and white photo of the {category}.', + 'a black and white photo of {article} {category}.', + 'a plastic {category}.', 'the plastic {category}.', 'a toy {category}.', + 'the toy {category}.', 'a plushie {category}.', 'the plushie {category}.', + 'a cartoon {category}.', 'the cartoon {category}.', + 'an embroidered {category}.', 'the embroidered {category}.', + 'a painting of the {category}.', 'a painting of a {category}.' 
+]
diff --git a/projects/CAT-Seg/cat_seg/utils/clip_wrapper.py b/projects/CAT-Seg/cat_seg/utils/clip_wrapper.py
new file mode 100644
index 0000000000..f809d2b828
--- /dev/null
+++ b/projects/CAT-Seg/cat_seg/utils/clip_wrapper.py
@@ -0,0 +1,275 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+# Referred to: https://github.com/KU-CVLAB/CAT-Seg/blob/main/cat_seg/third_party/clip.py # noqa
+import hashlib
+import os
+import urllib
+import warnings
+from typing import List, Union
+
+import torch
+from PIL import Image
+from torchvision.transforms import (CenterCrop, Compose, Normalize, Resize,
+                                    ToTensor)
+from tqdm import tqdm
+
+from .clip_model import build_model
+from .tokenizer import SimpleTokenizer as _Tokenizer
+
+__all__ = ['available_models', 'load', 'tokenize']
+_tokenizer = _Tokenizer()
+
+_MODELS = {
+    'RN50':
+    'https://openaipublic.azureedge.net/clip/models/afeb0e10f9e5a86da6080e35cf09123aca3b358a0c3e3b6c78a7b63bc04b6762/RN50.pt',  # noqa
+    'RN101':
+    'https://openaipublic.azureedge.net/clip/models/8fa8567bab74a42d41c5915025a8e4538c3bdbe8804a470a72f30b0d94fab599/RN101.pt',  # noqa
+    'RN50x4':
+    'https://openaipublic.azureedge.net/clip/models/7e526bd135e493cef0776de27d5f42653e6b4c8bf9e0f653bb11773263205fdd/RN50x4.pt',  # noqa
+    'RN50x16':
+    'https://openaipublic.azureedge.net/clip/models/52378b407f34354e150460fe41077663dd5b39c54cd0bfd2b27167a4a06ec9aa/RN50x16.pt',  # noqa
+    'RN50x64':
+    'https://openaipublic.azureedge.net/clip/models/be1cfb55d75a9666199fb2206c106743da0f6468c9d327f3e0d0a543a9919d9c/RN50x64.pt',  # noqa
+    'ViT-B/32':
+    'https://openaipublic.azureedge.net/clip/models/40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/ViT-B-32.pt',  # noqa
+    'ViT-B/16':
+    'https://openaipublic.azureedge.net/clip/models/5806e77cd80f8b59890b7e101eabd078d9fb84e6937f9e85e4ecb61988df416f/ViT-B-16.pt',  # noqa
+    'ViT-L/14':
+    'https://openaipublic.azureedge.net/clip/models/b8cca3fd41ae0c99ba7e8951adf17d267cdb84cd88be6f7c2e0eca1737a03836/ViT-L-14.pt',  # noqa
+    'ViT-L/14@336px':
+    'https://openaipublic.azureedge.net/clip/models/3035c92b350959924f9f00213499208652fc7ea050643e8b385c2dac08641f02/ViT-L-14-336px.pt',  # noqa
+}
+
+
+def _download(url: str, root: str = os.path.expanduser('~/.cache/clip')):
+    """Download clip pretrained weights."""
+    os.makedirs(root, exist_ok=True)
+    filename = os.path.basename(url)
+
+    expected_sha256 = url.split('/')[-2]
+    download_target = os.path.join(root, filename)
+
+    if os.path.exists(download_target) and not os.path.isfile(download_target):
+        raise RuntimeError(
+            f'{download_target} exists and is not a regular file')
+
+    if os.path.isfile(download_target):
+        if hashlib.sha256(open(download_target,
+                               'rb').read()).hexdigest() == expected_sha256:
+            return download_target
+        else:
+            warnings.warn(
+                f'{download_target} exists, but the SHA256 checksum does not\
+                match; re-downloading the file')
+
+    with urllib.request.urlopen(url) as source, open(download_target,
+                                                     'wb') as output:
+        with tqdm(
+                total=int(source.info().get('Content-Length')),
+                ncols=80) as loop:
+            while True:
+                buffer = source.read(8192)
+                if not buffer:
+                    break
+
+                output.write(buffer)
+                loop.update(len(buffer))
+
+    if hashlib.sha256(open(download_target,
+                           'rb').read()).hexdigest() != expected_sha256:
+        raise RuntimeError(
+            'Model has been downloaded but the SHA256 checksum does not\
+            match')
+
+    return download_target
+
+
+def available_models():
+    """Returns a list of available models."""
+    return list(_MODELS.keys())
+
+
+def load(name: str,
+         device: Union[str, 
torch.device] = 'cuda' + if torch.cuda.is_available() else 'cpu', + jit=True, + prompt_depth=0, + prompt_length=0): + """Load target clip model.""" + if name not in _MODELS: + raise RuntimeError( + f'Model {name} not found; available models = {available_models()}') + + model_path = _download(_MODELS[name]) + model = torch.jit.load( + model_path, map_location=device if jit else 'cpu').eval() + n_px = model.input_resolution.item() + + transform = Compose([ + Resize(n_px, interpolation=Image.BICUBIC), + CenterCrop(n_px), + lambda image: image.convert('RGB'), + ToTensor(), + Normalize((0.48145466, 0.4578275, 0.40821073), + (0.26862954, 0.26130258, 0.27577711)), + ]) + + if not jit: + model = build_model(model.state_dict(), prompt_depth, + prompt_length).to(device) + return model, transform + + # patch the device names + device_holder = torch.jit.trace( + lambda: torch.ones([]).to(torch.device(device)), example_inputs=[]) + device_node = [ + n for n in device_holder.graph.findAllNodes('prim::Constant') + if 'Device' in repr(n) + ][-1] + + def patch_device(module): + graphs = [module.graph] if hasattr(module, 'graph') else [] + if hasattr(module, 'forward1'): + graphs.append(module.forward1.graph) + + for graph in graphs: + for node in graph.findAllNodes('prim::Constant'): + if 'value' in node.attributeNames() and str( + node['value']).startswith('cuda'): + node.copyAttributes(device_node) + + model.apply(patch_device) + patch_device(model.encode_image) + patch_device(model.encode_text) + + # patch dtype to float32 on CPU + if device == 'cpu': + float_holder = torch.jit.trace( + lambda: torch.ones([]).float(), example_inputs=[]) + float_input = list(float_holder.graph.findNode('aten::to').inputs())[1] + float_node = float_input.node() + + def patch_float(module): + graphs = [module.graph] if hasattr(module, 'graph') else [] + if hasattr(module, 'forward1'): + graphs.append(module.forward1.graph) + + for graph in graphs: + for node in graph.findAllNodes('aten::to'): + inputs = list(node.inputs()) + for i in [1, 2]: + # dtype can be the second or third argument to + # aten::to() + if inputs[i].node()['value'] == 5: + inputs[i].node().copyAttributes(float_node) + + model.apply(patch_float) + patch_float(model.encode_image) + patch_float(model.encode_text) + + model.float() + + return model, transform + + +def load_custom(name: str, + device: Union[str, torch.device] = 'cuda' + if torch.cuda.is_available() else 'cpu', + jit=True, + n_px=224): + """Load a customized clip model.""" + if name not in _MODELS: + raise RuntimeError( + f'Model {name} not found; available models = {available_models()}') + + model_path = _download(_MODELS[name]) + model = torch.jit.load( + model_path, map_location=device if jit else 'cpu').eval() + # n_px = model.input_resolution.item() + + transform = Compose([ + Resize(n_px, interpolation=Image.BICUBIC), + CenterCrop(n_px), + lambda image: image.convert('RGB'), + ToTensor(), + Normalize((0.48145466, 0.4578275, 0.40821073), + (0.26862954, 0.26130258, 0.27577711)), + ]) + + if not jit: + model = build_model(model.state_dict()).to(device) + return model, transform + + # patch the device names + device_holder = torch.jit.trace( + lambda: torch.ones([]).to(torch.device(device)), example_inputs=[]) + device_node = [ + n for n in device_holder.graph.findAllNodes('prim::Constant') + if 'Device' in repr(n) + ][-1] + + def patch_device(module): + graphs = [module.graph] if hasattr(module, 'graph') else [] + if hasattr(module, 'forward1'): + graphs.append(module.forward1.graph) + + 
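            # walk every constant node in the traced graphs and retarget
+            # hard-coded 'cuda' devices onto the device the caller requested
+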
for graph in graphs: + for node in graph.findAllNodes('prim::Constant'): + if 'value' in node.attributeNames() and str( + node['value']).startswith('cuda'): + node.copyAttributes(device_node) + + model.apply(patch_device) + patch_device(model.encode_image) + patch_device(model.encode_text) + + # patch dtype to float32 on CPU + if device == 'cpu': + float_holder = torch.jit.trace( + lambda: torch.ones([]).float(), example_inputs=[]) + float_input = list(float_holder.graph.findNode('aten::to').inputs())[1] + float_node = float_input.node() + + def patch_float(module): + graphs = [module.graph] if hasattr(module, 'graph') else [] + if hasattr(module, 'forward1'): + graphs.append(module.forward1.graph) + + for graph in graphs: + for node in graph.findAllNodes('aten::to'): + inputs = list(node.inputs()) + for i in [ + 1, 2 + ]: # dtype can be the second or third argument to + # aten::to() + if inputs[i].node()['value'] == 5: + inputs[i].node().copyAttributes(float_node) + + model.apply(patch_float) + patch_float(model.encode_image) + patch_float(model.encode_text) + + model.float() + + return model, transform + + +def tokenize(texts: Union[str, List[str]], context_length: int = 77): + """Convert texts to tokens.""" + if isinstance(texts, str): + texts = [texts] + + sot_token = _tokenizer.encoder['<|startoftext|>'] + eot_token = _tokenizer.encoder['<|endoftext|>'] + # encode each template text phrase + all_tokens = [[sot_token] + _tokenizer.encode(text) + [eot_token] + for text in texts] + result = torch.zeros(len(all_tokens), context_length, dtype=torch.long) + + for i, tokens in enumerate(all_tokens): + if len(tokens) > context_length: + raise RuntimeError( + f'Input {texts[i]} is too long for context length\ + {context_length}') + result[i, :len(tokens)] = torch.tensor(tokens) + + return result diff --git a/projects/CAT-Seg/cat_seg/utils/self_attention_block.py b/projects/CAT-Seg/cat_seg/utils/self_attention_block.py new file mode 100644 index 0000000000..1c06cbd99e --- /dev/null +++ b/projects/CAT-Seg/cat_seg/utils/self_attention_block.py @@ -0,0 +1,79 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from torch import nn as nn +from torch.nn import functional as F + + +class LinearAttention(nn.Module): + """Multi-Head linear attention proposed in "Transformers are RNNs". + + Source: https://github.com/KU-CVLAB/CAT-Seg/blob/main/cat_seg/modeling/transformer/model.py#L247 # noqa + """ + + def __init__(self, eps=1e-6): + super().__init__() + self.eps = eps + + def forward(self, queries, keys, values): + """ + Args: + queries: [N, L, H, D] + keys: [N, S, H, D] + values: [N, S, H, D] + q_mask: [N, L] + kv_mask: [N, S] + Returns: + queried_values: (N, L, H, D) + """ + Q = F.elu(queries) + 1 + K = F.elu(keys) + 1 + + v_length = values.size(1) + values = values / v_length # prevent fp16 overflow + KV = torch.einsum('nshd,nshv->nhdv', K, values) # (S,D)' @ S,V + Z = 1 / (torch.einsum('nlhd,nhd->nlh', Q, K.sum(dim=1)) + self.eps) + queried_values = torch.einsum('nlhd,nhdv,nlh->nlhv', Q, KV, + Z) * v_length + + return queried_values.contiguous() + + +class FullAttention(nn.Module): + """Multi-head scaled dot-product attention, a.k.a full attention. 
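+
+    Attention(Q, K, V) = softmax(Q @ K^T / sqrt(D)) @ V, applied per head.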
+ + Source: https://github.com/KU-CVLAB/CAT-Seg/blob/main/cat_seg/modeling/transformer/model.py#L276 # noqa + """ + + def __init__(self, use_dropout=False, attention_dropout=0.1): + super().__init__() + self.use_dropout = use_dropout + self.dropout = nn.Dropout(attention_dropout) + + def forward(self, queries, keys, values, q_mask=None, kv_mask=None): + """ + Args: + queries: [N, L, H, D] + keys: [N, S, H, D] + values: [N, S, H, D] + q_mask: [N, L] + kv_mask: [N, S] + Returns: + queried_values: (N, L, H, D) + """ + + # Compute the unnormalized attention and apply the masks + QK = torch.einsum('nlhd,nshd->nlsh', queries, keys) + if kv_mask is not None: + QK.masked_fill_( + ~(q_mask[:, :, None, None] * kv_mask[:, None, :, None]), + float('-inf')) + + # Compute the attention and the weighted average + softmax_temp = 1. / queries.size(3)**.5 # sqrt(D) + A = torch.softmax(softmax_temp * QK, dim=2) + if self.use_dropout: + A = self.dropout(A) + + queried_values = torch.einsum('nlsh,nshd->nlhd', A, values) + + return queried_values.contiguous() diff --git a/projects/CAT-Seg/cat_seg/utils/tokenizer.py b/projects/CAT-Seg/cat_seg/utils/tokenizer.py new file mode 100644 index 0000000000..c84711b067 --- /dev/null +++ b/projects/CAT-Seg/cat_seg/utils/tokenizer.py @@ -0,0 +1,160 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import gzip +import html +import os +from functools import lru_cache + +import ftfy +import regex as re + + +@lru_cache() +def default_bpe(): + """Return default BPE vocabulary path.""" + return os.path.join( + os.path.dirname(os.path.abspath(__file__)), + 'bpe_vocab/bpe_simple_vocab_16e6.txt.gz') + + +@lru_cache() +def bytes_to_unicode(): + """Returns list of utf-8 byte and a corresponding list of unicode strings. + + The reversible bpe codes work on unicode strings. This means you need a + large # of unicode characters in your vocab if you want to avoid UNKs. When + you're at something like a 10B token dataset you end up needing around 5K + for decent coverage. This is a significant percentage of your normal, say, + 32K bpe vocab. To avoid that, we want lookup tables between utf-8 bytes and + unicode strings. And avoids mapping to whitespace/control characters the + bpe code barfs on. + """ + bs = list(range(ord('!'), + ord('~') + 1)) + list(range( + ord('¡'), + ord('¬') + 1)) + list(range(ord('®'), + ord('ÿ') + 1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +def get_pairs(word): + """Return set of symbol pairs in a word. + + Word is represented as tuple of symbols (symbols being variable-length + strings). 
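+
+    Example: get_pairs(('h', 'e', 'l', 'l', 'o')) returns
+    {('h', 'e'), ('e', 'l'), ('l', 'l'), ('l', 'o')}.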
+ """ + pairs = set() + prev_char = word[0] + for char in word[1:]: + pairs.add((prev_char, char)) + prev_char = char + return pairs + + +def basic_clean(text): + """Clean string.""" + text = ftfy.fix_text(text) + text = html.unescape(html.unescape(text)) + return text.strip() + + +def whitespace_clean(text): + """Clean whitespace in string.""" + text = re.sub(r'\s+', ' ', text) + text = text.strip() + return text + + +class SimpleTokenizer: + """Customized Tokenizer implementation.""" + + def __init__(self, bpe_path: str = default_bpe()): + self.byte_encoder = bytes_to_unicode() + self.byte_decoder = {v: k for k, v in self.byte_encoder.items()} + merges = gzip.open(bpe_path).read().decode('utf-8').split('\n') + merges = merges[1:49152 - 256 - 2 + 1] + merges = [tuple(merge.split()) for merge in merges] + vocab = list(bytes_to_unicode().values()) + vocab = vocab + [v + '' for v in vocab] + for merge in merges: + vocab.append(''.join(merge)) + vocab.extend(['<|startoftext|>', '<|endoftext|>']) + self.encoder = dict(zip(vocab, range(len(vocab)))) + self.decoder = {v: k for k, v in self.encoder.items()} + self.bpe_ranks = dict(zip(merges, range(len(merges)))) + self.cache = { + '<|startoftext|>': '<|startoftext|>', + '<|endoftext|>': '<|endoftext|>' + } + self.pat = re.compile( + r"""<\|startoftext\|>|<\|endoftext\|>|'s|'t|'re|'ve|'m|\ + 'll|'d|[\p{L}]+|[\p{N}]|[^\s\p{L}\p{N}]+""", re.IGNORECASE) + + def bpe(self, token): + """Refer to bpe vocabulary dictionary.""" + if token in self.cache: + return self.cache[token] + word = tuple(token[:-1]) + (token[-1] + '', ) + pairs = get_pairs(word) + + if not pairs: + return token + '' + + while True: + bigram = min( + pairs, key=lambda pair: self.bpe_ranks.get(pair, float('inf'))) + if bigram not in self.bpe_ranks: + break + first, second = bigram + new_word = [] + i = 0 + while i < len(word): + try: + j = word.index(first, i) + new_word.extend(word[i:j]) + i = j + except ValueError: + new_word.extend(word[i:]) + break + + if word[i] == first and i < len(word) - 1 and word[ + i + 1] == second: + new_word.append(first + second) + i += 2 + else: + new_word.append(word[i]) + i += 1 + new_word = tuple(new_word) + word = new_word + if len(word) == 1: + break + else: + pairs = get_pairs(word) + word = ' '.join(word) + self.cache[token] = word + return word + + def encode(self, text): + """Encode text strings.""" + bpe_tokens = [] + text = whitespace_clean(basic_clean(text)).lower() + for token in re.findall(self.pat, text): + token = ''.join(self.byte_encoder[b] + for b in token.encode('utf-8')) + bpe_tokens.extend(self.encoder[bpe_token] + for bpe_token in self.bpe(token).split(' ')) + return bpe_tokens + + def decode(self, tokens): + """Decoder tokens to strings.""" + text = ''.join([self.decoder[token] for token in tokens]) + text = bytearray([self.byte_decoder[c] for c in text]).decode( + 'utf-8', errors='replace').replace('', ' ') + return text diff --git a/projects/CAT-Seg/configs/_base_/datasets/ade20k_384x384.py b/projects/CAT-Seg/configs/_base_/datasets/ade20k_384x384.py new file mode 100644 index 0000000000..488ba3d7f6 --- /dev/null +++ b/projects/CAT-Seg/configs/_base_/datasets/ade20k_384x384.py @@ -0,0 +1,68 @@ +# dataset settings +dataset_type = 'ADE20KDataset' +data_root = 'data/ade/ADEChallengeData2016' +crop_size = (384, 384) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', reduce_zero_label=True), + dict( + type='RandomResize', + scale=(2048, 512), + ratio_range=(0.5, 2.0), + keep_ratio=True), + 
dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(2048, 512), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type='LoadAnnotations', reduce_zero_label=True), + dict(type='PackSegInputs') +] +img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=None), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='Resize', scale_factor=r, keep_ratio=True) + for r in img_ratios + ], + [ + dict(type='RandomFlip', prob=0., direction='horizontal'), + dict(type='RandomFlip', prob=1., direction='horizontal') + ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] + ]) +] +train_dataloader = dict( + batch_size=4, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='images/training', seg_map_path='annotations/training'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='images/validation', + seg_map_path='annotations/validation'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) +test_evaluator = val_evaluator diff --git a/projects/CAT-Seg/configs/_base_/datasets/coco-stuff164k_384x384.py b/projects/CAT-Seg/configs/_base_/datasets/coco-stuff164k_384x384.py new file mode 100644 index 0000000000..dd051761d4 --- /dev/null +++ b/projects/CAT-Seg/configs/_base_/datasets/coco-stuff164k_384x384.py @@ -0,0 +1,62 @@ +# dataset settings +dataset_type = 'COCOStuffDataset' +data_root = 'data/coco_stuff164k' +crop_size = (384, 384) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(2048, 512), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=None), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='Resize', scale_factor=r, keep_ratio=True) + for r in img_ratios + ], + [ + dict(type='RandomFlip', prob=0., direction='horizontal'), + dict(type='RandomFlip', prob=1., direction='horizontal') + ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] + ]) +] +train_dataloader = dict( + batch_size=2, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='images/train2017', seg_map_path='annotations/train2017'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', 
shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='images/val2017', seg_map_path='annotations/val2017'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) +test_evaluator = val_evaluator diff --git a/projects/CAT-Seg/configs/_base_/datasets/pascal_context_59_384x384.py b/projects/CAT-Seg/configs/_base_/datasets/pascal_context_59_384x384.py new file mode 100644 index 0000000000..250c5990f6 --- /dev/null +++ b/projects/CAT-Seg/configs/_base_/datasets/pascal_context_59_384x384.py @@ -0,0 +1,72 @@ +# dataset settings +dataset_type = 'PascalContextDataset59' +data_root = 'data/VOCdevkit/VOC2010/' + +img_scale = (520, 520) +crop_size = (384, 384) + +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', reduce_zero_label=True), + dict( + type='RandomResize', + scale=img_scale, + ratio_range=(0.5, 2.0), + keep_ratio=True), + dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=img_scale, keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type='LoadAnnotations', reduce_zero_label=True), + dict(type='PackSegInputs') +] +img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=None), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='Resize', scale_factor=r, keep_ratio=True) + for r in img_ratios + ], + [ + dict(type='RandomFlip', prob=0., direction='horizontal'), + dict(type='RandomFlip', prob=1., direction='horizontal') + ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] + ]) +] +train_dataloader = dict( + batch_size=4, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='JPEGImages', seg_map_path='SegmentationClassContext'), + ann_file='ImageSets/SegmentationContext/train.txt', + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='JPEGImages', seg_map_path='SegmentationClassContext'), + ann_file='ImageSets/SegmentationContext/val.txt', + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) +test_evaluator = val_evaluator diff --git a/projects/CAT-Seg/configs/_base_/default_runtime.py b/projects/CAT-Seg/configs/_base_/default_runtime.py new file mode 100644 index 0000000000..272b4d2467 --- /dev/null +++ b/projects/CAT-Seg/configs/_base_/default_runtime.py @@ -0,0 +1,15 @@ +default_scope = 'mmseg' +env_cfg = dict( + cudnn_benchmark=True, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl'), +) +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='SegLocalVisualizer', vis_backends=vis_backends, name='visualizer') +log_processor = dict(by_epoch=False) +log_level = 'INFO' +load_from = None +resume = False + +tta_model = dict(type='SegTTAModel') diff --git a/projects/CAT-Seg/configs/_base_/schedules/schedule_80k.py 
b/projects/CAT-Seg/configs/_base_/schedules/schedule_80k.py new file mode 100644 index 0000000000..0dcd6c4d1b --- /dev/null +++ b/projects/CAT-Seg/configs/_base_/schedules/schedule_80k.py @@ -0,0 +1,24 @@ +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) +optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer, clip_grad=None) +# learning policy +param_scheduler = [ + dict( + type='PolyLR', + eta_min=1e-4, + power=0.9, + begin=0, + end=80000, + by_epoch=False) +] +# training schedule for 80k +train_cfg = dict(type='IterBasedTrainLoop', max_iters=80000, val_interval=8000) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=8000), + sampler_seed=dict(type='DistSamplerSeedHook'), + visualization=dict(type='SegVisualizationHook')) diff --git a/projects/CAT-Seg/configs/cat_seg/catseg_vitb-r101_4xb1-warmcoslr2e-4-adamw-80k_ade20k-384x384.py b/projects/CAT-Seg/configs/cat_seg/catseg_vitb-r101_4xb1-warmcoslr2e-4-adamw-80k_ade20k-384x384.py new file mode 100644 index 0000000000..bab43a6a39 --- /dev/null +++ b/projects/CAT-Seg/configs/cat_seg/catseg_vitb-r101_4xb1-warmcoslr2e-4-adamw-80k_ade20k-384x384.py @@ -0,0 +1,103 @@ +_base_ = [ + '../_base_/default_runtime.py', '../_base_/schedules/schedule_80k.py', + '../_base_/datasets/ade20k_384x384.py' +] + +custom_imports = dict(imports=['cat_seg']) + +norm_cfg = dict(type='SyncBN', requires_grad=True) +crop_size = (384, 384) +data_preprocessor = dict( + type='SegDataPreProcessor', + size=crop_size, + # due to the clip model, we do normalization in backbone forward() + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255) +# model_cfg +model = dict( + type='EncoderDecoder', + data_preprocessor=data_preprocessor, + backbone=dict( + type='CLIPOVCATSeg', + feature_extractor=dict( + type='ResNet', + depth=101, + # only use the first three layers + num_stages=3, + out_indices=(0, 1, 2), + dilations=(1, 1, 1), + strides=(1, 2, 2), + norm_cfg=norm_cfg, + norm_eval=False, + style='pytorch', + contract_dilation=True, + init_cfg=dict( + type='Pretrained', checkpoint='torchvision://resnet101'), + ), + train_class_json='data/ade150.json', + test_class_json='data/ade150.json', + clip_pretrained='ViT-B/16', + clip_finetune='attention', + ), + neck=dict( + type='CATSegAggregator', + appearance_guidance_dim=1024, + num_layers=2, + pooling_size=(1, 1), + ), + decode_head=dict( + type='CATSegHead', + in_channels=128, + channels=128, + num_classes=150, + embed_dims=128, + decoder_dims=(64, 32), + decoder_guidance_dims=(512, 256), + decoder_guidance_proj_dims=(32, 16), + loss_decode=dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0, + avg_non_ignore=True)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='slide', stride=crop_size, crop_size=crop_size)) + +# dataset settings +train_dataloader = dict( + batch_size=2, + num_workers=4, +) + +# training schedule for 80k +train_cfg = dict(type='IterBasedTrainLoop', max_iters=80000, val_interval=4000) + +default_hooks = dict( + checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=4000), + visualization=dict(type='SegVisualizationHook', draw=True, interval=4000)) + +# optimizer +optim_wrapper = dict( + _delete_=True, + type='OptimWrapper', + optimizer=dict( + 
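        # note: lr=2e-4 below matches the 'warmcoslr2e-4' tag in the config
+        # filename; lr_mult in paramwise_cfg scales it down further for the
+        # pre-trained backbone branches
+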
type='AdamW', lr=0.0002, betas=(0.9, 0.999), weight_decay=0.0001),
+    paramwise_cfg=dict(
+        custom_keys={
+            'backbone.feature_extractor': dict(lr_mult=0.01),
+            'backbone.clip_model.visual': dict(lr_mult=0.01)
+        }))
+
+# learning policy
+param_scheduler = [
+    # linear warm-up over the first 500 iterations
+    dict(type='LinearLR', start_factor=0.01, by_epoch=False, begin=0, end=500),
+    # followed by cosine annealing over [500, 80000) iterations
+    dict(
+        type='CosineAnnealingLR',
+        T_max=79500,
+        by_epoch=False,
+        begin=500,
+        end=80000),
+]
diff --git a/projects/CAT-Seg/configs/cat_seg/catseg_vitb-r101_4xb1-warmcoslr2e-4-adamw-80k_pascal-context-59-384x384.py b/projects/CAT-Seg/configs/cat_seg/catseg_vitb-r101_4xb1-warmcoslr2e-4-adamw-80k_pascal-context-59-384x384.py
new file mode 100644
index 0000000000..8b412cb86f
--- /dev/null
+++ b/projects/CAT-Seg/configs/cat_seg/catseg_vitb-r101_4xb1-warmcoslr2e-4-adamw-80k_pascal-context-59-384x384.py
@@ -0,0 +1,103 @@
+_base_ = [
+    '../_base_/default_runtime.py', '../_base_/schedules/schedule_80k.py',
+    '../_base_/datasets/pascal_context_59_384x384.py'
+]
+
+custom_imports = dict(imports=['cat_seg'])
+
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+crop_size = (384, 384)
+data_preprocessor = dict(
+    type='SegDataPreProcessor',
+    size=crop_size,
+    # due to the clip model, we do normalization in backbone forward()
+    bgr_to_rgb=True,
+    pad_val=0,
+    seg_pad_val=255)
+# model_cfg
+model = dict(
+    type='EncoderDecoder',
+    data_preprocessor=data_preprocessor,
+    backbone=dict(
+        type='CLIPOVCATSeg',
+        feature_extractor=dict(
+            type='ResNet',
+            depth=101,
+            # only use the first three layers
+            num_stages=3,
+            out_indices=(0, 1, 2),
+            dilations=(1, 1, 1),
+            strides=(1, 2, 2),
+            norm_cfg=norm_cfg,
+            norm_eval=False,
+            style='pytorch',
+            contract_dilation=True,
+            init_cfg=dict(
+                type='Pretrained', checkpoint='torchvision://resnet101'),
+        ),
+        train_class_json='data/pc59.json',
+        test_class_json='data/pc59.json',
+        clip_pretrained='ViT-B/16',
+        clip_finetune='attention',
+    ),
+    neck=dict(
+        type='CATSegAggregator',
+        appearance_guidance_dim=1024,
+        num_layers=2,
+        pooling_size=(1, 1),
+    ),
+    decode_head=dict(
+        type='CATSegHead',
+        in_channels=128,
+        channels=128,
+        num_classes=59,
+        embed_dims=128,
+        decoder_dims=(64, 32),
+        decoder_guidance_dims=(512, 256),
+        decoder_guidance_proj_dims=(32, 16),
+        loss_decode=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=False,
+            loss_weight=1.0,
+            avg_non_ignore=True)),
+    # model training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(mode='slide', stride=crop_size, crop_size=crop_size))
+
+# dataset settings
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=4,
+)
+
+# training schedule for 80k
+train_cfg = dict(type='IterBasedTrainLoop', max_iters=80000, val_interval=4000)
+
+default_hooks = dict(
+    checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=4000),
+    visualization=dict(type='SegVisualizationHook', draw=True, interval=4000))
+
+# optimizer
+optim_wrapper = dict(
+    _delete_=True,
+    type='OptimWrapper',
+    optimizer=dict(
+        type='AdamW', lr=0.0002, betas=(0.9, 0.999), weight_decay=0.0001),
+    paramwise_cfg=dict(
+        custom_keys={
+            'backbone.feature_extractor': dict(lr_mult=0.01),
+            'backbone.clip_model.visual': dict(lr_mult=0.01)
+        }))
+
+# learning policy
+param_scheduler = [
+    # linear warm-up over the first 500 iterations
+    dict(type='LinearLR', start_factor=0.01, by_epoch=False, begin=0, end=500),
+    # followed by cosine annealing over [500, 80000) iterations
+    dict(
+        type='CosineAnnealingLR',
+        T_max=79500,
+        by_epoch=False,
+        begin=500,
+        end=80000),
+]
diff --git a/projects/CAT-Seg/configs/cat_seg/catseg_vitb-r101_4xb2-warmcoslr2e-4-adamw-80k_coco-stuff164k-384x384.py b/projects/CAT-Seg/configs/cat_seg/catseg_vitb-r101_4xb2-warmcoslr2e-4-adamw-80k_coco-stuff164k-384x384.py
new file mode 100644
index 0000000000..52bf712fea
--- /dev/null
+++ b/projects/CAT-Seg/configs/cat_seg/catseg_vitb-r101_4xb2-warmcoslr2e-4-adamw-80k_coco-stuff164k-384x384.py
@@ -0,0 +1,102 @@
+_base_ = [
+    '../_base_/default_runtime.py', '../_base_/schedules/schedule_80k.py',
+    '../_base_/datasets/coco-stuff164k_384x384.py'
+]
+
+custom_imports = dict(imports=['cat_seg'])
+
+norm_cfg = dict(type='SyncBN', requires_grad=True)
+crop_size = (384, 384)
+data_preprocessor = dict(
+    type='SegDataPreProcessor',
+    size=crop_size,
+    # due to the clip model, we do normalization in backbone forward()
+    bgr_to_rgb=True,
+    pad_val=0,
+    seg_pad_val=255)
+# model_cfg
+model = dict(
+    type='EncoderDecoder',
+    data_preprocessor=data_preprocessor,
+    backbone=dict(
+        type='CLIPOVCATSeg',
+        feature_extractor=dict(
+            type='ResNet',
+            depth=101,
+            # only use the first three layers
+            num_stages=3,
+            out_indices=(0, 1, 2),
+            dilations=(1, 1, 1),
+            strides=(1, 2, 2),
+            norm_cfg=norm_cfg,
+            norm_eval=False,
+            style='pytorch',
+            contract_dilation=True,
+            init_cfg=dict(
+                type='Pretrained', checkpoint='torchvision://resnet101'),
+        ),
+        train_class_json='data/coco.json',
+        test_class_json='data/coco.json',
+        clip_pretrained='ViT-B/16',
+        clip_finetune='attention',
+    ),
+    neck=dict(
+        type='CATSegAggregator',
+        appearance_guidance_dim=1024,
+        num_layers=2,
+    ),
+    decode_head=dict(
+        type='CATSegHead',
+        in_channels=128,
+        channels=128,
+        num_classes=171,
+        embed_dims=128,
+        decoder_dims=(64, 32),
+        decoder_guidance_dims=(512, 256),
+        decoder_guidance_proj_dims=(32, 16),
+        loss_decode=dict(
+            type='CrossEntropyLoss',
+            use_sigmoid=False,
+            loss_weight=1.0,
+            avg_non_ignore=True)),
+    # model training and testing settings
+    train_cfg=dict(),
+    test_cfg=dict(mode='slide', stride=crop_size, crop_size=crop_size))
+
+# dataset settings
+train_dataloader = dict(
+    batch_size=2,
+    num_workers=4,
+)
+
+# training schedule for 80k
+train_cfg = dict(type='IterBasedTrainLoop', max_iters=80000, val_interval=4000)
+
+default_hooks = dict(
+    checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=4000),
+    visualization=dict(type='SegVisualizationHook', draw=True, interval=4000))
+
+# optimizer
+optim_wrapper = dict(
+    _delete_=True,
+    type='OptimWrapper',
+    optimizer=dict(
+        type='AdamW', lr=0.0002, betas=(0.9, 0.999), weight_decay=0.0001),
+    paramwise_cfg=dict(
+        custom_keys={
+            'backbone.feature_extractor': dict(lr_mult=0.01),
+            'backbone.clip_model.visual': dict(lr_mult=0.01)
+        }))
+
+# learning policy
+param_scheduler = [
+    # linear warm-up over the first 500 iterations
+    dict(type='LinearLR', start_factor=0.01, by_epoch=False, begin=0, end=500),
+    # followed by cosine annealing over [500, 80000) iterations
+    dict(
+        type='CosineAnnealingLR',
+        T_max=79500,
+        by_epoch=False,
+        begin=500,
+        end=80000),
+]
diff --git a/projects/CAT-Seg/configs/cat_seg/catseg_vitg-swin-b_4xb1-warmcoslr2e-4_adamw-80k_coco-stuff164k_384x384.py b/projects/CAT-Seg/configs/cat_seg/catseg_vitg-swin-b_4xb1-warmcoslr2e-4_adamw-80k_coco-stuff164k_384x384.py
new file mode 100644
index 0000000000..345945d028
--- /dev/null
+++ b/projects/CAT-Seg/configs/cat_seg/catseg_vitg-swin-b_4xb1-warmcoslr2e-4_adamw-80k_coco-stuff164k_384x384.py
@@ -0,0 +1,11 @@
+_base_ = './catseg_vitl-swin-b_4xb1-warmcoslr2e-4_adamw-80k_coco-stuff164k_384x384.py'  # noqa
+
+model = dict(
+    backbone=dict(
+        type='CLIPOVCATSeg',
+        clip_pretrained='ViT-G',
+        custom_clip_weights='~/CLIP-ViT-bigG-14-laion2B-39B-b160k'),
+    neck=dict(
+        text_guidance_dim=1280,
+        appearance_guidance_dim=512,
+    ))
diff --git a/projects/CAT-Seg/configs/cat_seg/catseg_vith-swin-b_4xb1-warmcoslr2e-4_adamw-80k_coco-stuff164k_384x384.py b/projects/CAT-Seg/configs/cat_seg/catseg_vith-swin-b_4xb1-warmcoslr2e-4_adamw-80k_coco-stuff164k_384x384.py
new file mode 100644
index 0000000000..2f09b8c9ca
--- /dev/null
+++ b/projects/CAT-Seg/configs/cat_seg/catseg_vith-swin-b_4xb1-warmcoslr2e-4_adamw-80k_coco-stuff164k_384x384.py
@@ -0,0 +1,11 @@
+_base_ = './catseg_vitl-swin-b_4xb1-warmcoslr2e-4_adamw-80k_coco-stuff164k_384x384.py'  # noqa
+
+model = dict(
+    backbone=dict(
+        type='CLIPOVCATSeg',
+        clip_pretrained='ViT-H',
+        custom_clip_weights='~/CLIP-ViT-H-14-laion2B-s32B-b79K'),
+    neck=dict(
+        text_guidance_dim=1024,
+        appearance_guidance_dim=512,
+    ))
diff --git a/projects/CAT-Seg/configs/cat_seg/catseg_vitl-swin-b_4xb1-warmcoslr2e-4_adamw-80k_coco-stuff164k_384x384.py b/projects/CAT-Seg/configs/cat_seg/catseg_vitl-swin-b_4xb1-warmcoslr2e-4_adamw-80k_coco-stuff164k_384x384.py
new file mode 100644
index 0000000000..bb4d57ae21
--- /dev/null
+++ b/projects/CAT-Seg/configs/cat_seg/catseg_vitl-swin-b_4xb1-warmcoslr2e-4_adamw-80k_coco-stuff164k_384x384.py
@@ -0,0 +1,72 @@
+_base_ = './catseg_vitb-r101_4xb2-warmcoslr2e-4-adamw-80k_coco-stuff164k-384x384.py'  # noqa
+
+pretrained = 'https://download.openmmlab.com/mmsegmentation/v0.5/pretrain/swin/swin_base_patch4_window12_384_20220317-55b0104a.pth'  # noqa
+crop_size = (384, 384)
+data_preprocessor = dict(size=crop_size)
+model = dict(
+    backbone=dict(
+        type='CLIPOVCATSeg',
+        feature_extractor=dict(
+            _delete_=True,
+            type='SwinTransformer',
+            pretrain_img_size=384,
+            embed_dims=128,
+            depths=[2, 2, 18],
+            num_heads=[4, 8, 16],
+            window_size=12,
+            mlp_ratio=4,
+            qkv_bias=True,
+            qk_scale=None,
+            drop_rate=0.,
+            attn_drop_rate=0.,
+            drop_path_rate=0.3,
+            patch_norm=True,
+            out_indices=(0, 1, 2),
+            init_cfg=dict(type='Pretrained', checkpoint=pretrained)),
+        clip_pretrained='ViT-L/14@336px',
+    ),
+    neck=dict(
+        text_guidance_dim=768,
+        appearance_guidance_dim=512,
+    ),
+    decode_head=dict(
+        embed_dims=128,
+        decoder_guidance_dims=(256, 128),
+    ))
+
+# dataset settings
+train_dataloader = dict(
+    batch_size=1,
+    num_workers=2,
+)
+
+# training schedule for 80k
+train_cfg = dict(type='IterBasedTrainLoop', max_iters=80000, val_interval=4000)
+
+default_hooks = dict(
+    visualization=dict(type='SegVisualizationHook', draw=True, interval=4000))
+
+# optimizer
+optim_wrapper = dict(
+    _delete_=True,
+    type='OptimWrapper',
+    optimizer=dict(
+        type='AdamW', lr=0.0002, betas=(0.9, 0.999), weight_decay=0.0001),
+    paramwise_cfg=dict(
+        custom_keys={
+            'backbone.feature_extractor': dict(lr_mult=0.01),
+            'backbone.clip_model.visual': dict(lr_mult=0.01)
+        }))
+
+# learning policy
+param_scheduler = [
+    # linear warm-up over the first 500 iterations
+    dict(type='LinearLR', start_factor=0.01, by_epoch=False, begin=0, end=500),
+    # followed by cosine annealing over [500, 80000) iterations
+    dict(
+        type='CosineAnnealingLR',
+        T_max=79500,
+        by_epoch=False,
+        begin=500,
+        end=80000),
+]
diff --git a/projects/CAT-Seg/utils/__init__.py b/projects/CAT-Seg/utils/__init__.py
new file mode 100644
index 0000000000..02d85f29cb
--- /dev/null
+++ b/projects/CAT-Seg/utils/__init__.py
@@ -0,0 +1,7 @@
+from .clip_templates import (IMAGENET_TEMPLATES, IMAGENET_TEMPLATES_SELECT,
+                             IMAGENET_TEMPLATES_SELECT_CLIP, ViLD_templates)
+
+__all__ = [
+    'IMAGENET_TEMPLATES', 'IMAGENET_TEMPLATES_SELECT',
+    'IMAGENET_TEMPLATES_SELECT_CLIP', 'ViLD_templates'
+]
diff --git a/projects/README.md b/projects/README.md
new file mode 100644
index 0000000000..5482c479aa
--- /dev/null
+++ b/projects/README.md
@@ -0,0 +1,19 @@
+# Projects
+
+The OpenMMLab ecosystem can only grow through the contributions of the community.
+Everyone is welcome to post their implementations of great ideas in this folder! If you wish to start your own project, please go through the [example project](example_project/) for best practices. For common questions about projects, please read our [faq](faq.md).
+
+## External Projects
+
+There are also selected external projects released by the community that are built on MMSegmentation:
+
+- [SegNeXt: Rethinking Convolutional Attention Design for Semantic Segmentation](https://github.com/visual-attention-network/segnext)
+- [Vision Transformer Adapter for Dense Predictions](https://github.com/czczup/ViT-Adapter)
+- [UniFormer: Unifying Convolution and Self-attention for Visual Recognition](https://github.com/Sense-X/UniFormer)
+- [Multi-Scale High-Resolution Vision Transformer for Semantic Segmentation](https://github.com/facebookresearch/HRViT)
+- [ViTAE: Vision Transformer Advanced by Exploring Intrinsic Inductive Bias](https://github.com/ViTAE-Transformer/ViTAE-Transformer)
+- [DAFormer: Improving Network Architectures and Training Strategies for Domain-Adaptive Semantic Segmentation](https://github.com/lhoyer/DAFormer)
+- [MPViT: Multi-Path Vision Transformer for Dense Prediction](https://github.com/youngwanLEE/MPViT)
+- [TopFormer: Token Pyramid Transformer for Mobile Semantic Segmentation](https://github.com/hustvl/TopFormer)
+
+Note: These projects are supported and maintained by their own contributors. The core maintainers of MMSegmentation only ensure that the results are reproducible and that the code quality met its claims at the time each project was submitted, but they may not be responsible for future maintenance.
diff --git a/projects/XDecoder/README.md b/projects/XDecoder/README.md
new file mode 100644
index 0000000000..3d55575c6b
--- /dev/null
+++ b/projects/XDecoder/README.md
@@ -0,0 +1,17 @@
+# X-Decoder
+
+> [X-Decoder: Generalized Decoding for Pixel, Image, and Language](https://arxiv.org/pdf/2212.11270.pdf)
+
+
+
+## Abstract
+
+We present X-Decoder, a generalized decoding model that can predict pixel-level segmentation and language tokens seamlessly. X-Decoder takes as input two types of queries: (i) generic non-semantic queries and (ii) semantic queries induced from text inputs, to decode different pixel-level and token-level outputs in the same semantic space. With such a novel design, X-Decoder is the first work that provides a unified way to support all types of image segmentation and a variety of vision-language (VL) tasks. Further, our design enables seamless interactions across tasks at different granularities and brings mutual benefits by learning a common and rich pixel-level visual-semantic understanding space, without any pseudo-labeling. After pretraining on a mixed set of a limited amount of segmentation data and millions of image-text pairs, X-Decoder exhibits strong transferability to a wide range of downstream tasks in both zero-shot and finetuning settings.
Notably, it achieves (1) state-of-the-art results on open-vocabulary segmentation and referring segmentation on eight datasets; (2) better or competitive finetuned performance to other generalist and specialist models on segmentation and VL tasks; and (3) flexibility for efficient finetuning and novel task composition (e.g., referring captioning and image editing). + +
+ +
+
+## Usage
+
+We implement it based on [mmdetection](https://github.com/open-mmlab/mmdetection/); please refer to [mmdetection/projects/XDecoder](https://github.com/open-mmlab/mmdetection/tree/main/projects/XDecoder) for more details.
diff --git a/projects/bdd100k_dataset/README.md b/projects/bdd100k_dataset/README.md
new file mode 100644
index 0000000000..c774525844
--- /dev/null
+++ b/projects/bdd100k_dataset/README.md
@@ -0,0 +1,50 @@
+# BDD100K Dataset
+
+Support **`BDD100K Dataset`**
+
+## Description
+
+Author: CastleDream
+
+This project adds support for the **`BDD100K Dataset`**.
+
+### Dataset preparation
+
+Prepare the `BDD100K` dataset following the [BDD100K Dataset Preparation Guide](https://github.com/open-mmlab/mmsegmentation/tree/main/projects/bdd100k_dataset/docs/en/user_guides/2_dataset_prepare.md#bdd100k), so that it is organized as follows:
+
+```none
+mmsegmentation/data
+└── bdd100k
+    ├── images
+    │   └── 10k
+    │       ├── test [2000 entries exceeds filelimit, not opening dir]
+    │       ├── train [7000 entries exceeds filelimit, not opening dir]
+    │       └── val [1000 entries exceeds filelimit, not opening dir]
+    └── labels
+        └── sem_seg
+            ├── colormaps
+            │   ├── train [7000 entries exceeds filelimit, not opening dir]
+            │   └── val [1000 entries exceeds filelimit, not opening dir]
+            ├── masks
+            │   ├── train [7000 entries exceeds filelimit, not opening dir]
+            │   └── val [1000 entries exceeds filelimit, not opening dir]
+            ├── polygons
+            │   ├── sem_seg_train.json
+            │   └── sem_seg_val.json
+            └── rles
+                ├── sem_seg_train.json
+                └── sem_seg_val.json
+```
+
+### Training commands
+
+```bash
+cd mmsegmentation
+python tools/train.py projects/bdd100k_dataset/configs/pspnet_r50-d8_4xb2-80k_bdd100k-512x1024.py \
+    --work-dir your_work_dir
+```
+
+## Thanks
+
+- [\[Datasets\] Add Mapillary Vistas Datasets to MMSeg Core Package.
#2576](https://github.com/open-mmlab/mmsegmentation/pull/2576/files) +- [\[Feature\] Support CIHP dataset #1493](https://github.com/open-mmlab/mmsegmentation/pull/1493/files) diff --git a/projects/bdd100k_dataset/configs/_base_/datasets/bdd100k.py b/projects/bdd100k_dataset/configs/_base_/datasets/bdd100k.py new file mode 100644 index 0000000000..24cec69bfe --- /dev/null +++ b/projects/bdd100k_dataset/configs/_base_/datasets/bdd100k.py @@ -0,0 +1,70 @@ +# dataset settings +dataset_type = 'BDD100KDataset' +data_root = 'data/bdd100k/' + +crop_size = (512, 1024) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict( + type='RandomResize', + scale=(2048, 1024), + ratio_range=(0.5, 2.0), + keep_ratio=True), + dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(2048, 1024), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=None), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='Resize', scale_factor=r, keep_ratio=True) + for r in img_ratios + ], + [ + dict(type='RandomFlip', prob=0., direction='horizontal'), + dict(type='RandomFlip', prob=1., direction='horizontal') + ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] + ]) +] +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='images/10k/train', + seg_map_path='labels/sem_seg/masks/train'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='images/10k/val', + seg_map_path='labels/sem_seg/masks/val'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) +test_evaluator = val_evaluator diff --git a/projects/bdd100k_dataset/configs/pspnet_r50-d8_4xb2-80k_bdd100k-512x1024.py b/projects/bdd100k_dataset/configs/pspnet_r50-d8_4xb2-80k_bdd100k-512x1024.py new file mode 100644 index 0000000000..456d4c7983 --- /dev/null +++ b/projects/bdd100k_dataset/configs/pspnet_r50-d8_4xb2-80k_bdd100k-512x1024.py @@ -0,0 +1,11 @@ +_base_ = [ + '../../../configs/_base_/models/pspnet_r50-d8.py', + './_base_/datasets/bdd100k.py', + '../../../configs/_base_/default_runtime.py', + '../../../configs/_base_/schedules/schedule_80k.py' +] +custom_imports = dict( + imports=['projects.bdd100k_dataset.mmseg.datasets.bdd100k']) +crop_size = (512, 1024) +data_preprocessor = dict(size=crop_size) +model = dict(data_preprocessor=data_preprocessor) diff --git a/projects/bdd100k_dataset/docs/en/user_guides/2_dataset_prepare.md b/projects/bdd100k_dataset/docs/en/user_guides/2_dataset_prepare.md new file mode 100644 index 0000000000..f2383cfcac --- /dev/null +++ b/projects/bdd100k_dataset/docs/en/user_guides/2_dataset_prepare.md @@ -0,0 +1,40 @@ +## BDD100K + +- You could download BDD100k datasets from 
[here](https://bdd-data.berkeley.edu/) after registration. + +- You can download images and masks by clicking `10K Images` button and `Segmentation` button. + +- After download, unzip by the following instructions: + + ```bash + unzip ~/bdd100k_images_10k.zip -d ~/mmsegmentation/data/ + unzip ~/bdd100k_sem_seg_labels_trainval.zip -d ~/mmsegmentation/data/ + ``` + +```none +mmsegmentation +├── mmseg +├── tools +├── configs +├── data +│ ├── bdd100k +│ │ ├── images +│ │ │ └── 10k +| │ │ │ ├── test +| │ │ │ ├── train +| │   │   │ └── val +│ │ └── labels +│ │ │ └── sem_seg +| │ │ │ ├── colormaps +| │ │ │ │ ├──train +| │ │ │ │ └──val +| │ │ │ ├── masks +| │ │ │ │ ├──train +| │ │ │ │ └──val +| │ │ │ ├── polygons +| │ │ │ │ ├──sem_seg_train.json +| │ │ │ │ └──sem_seg_val.json +| │   │   │ └── rles +| │ │ │ │ ├──sem_seg_train.json +| │ │ │ │ └──sem_seg_val.json +``` diff --git a/projects/bdd100k_dataset/docs/zh_cn/user_guides/2_dataset_prepare.md b/projects/bdd100k_dataset/docs/zh_cn/user_guides/2_dataset_prepare.md new file mode 100644 index 0000000000..64fb763db4 --- /dev/null +++ b/projects/bdd100k_dataset/docs/zh_cn/user_guides/2_dataset_prepare.md @@ -0,0 +1,42 @@ +## BDD100K + +- 可以从[官方网站](https://bdd-data.berkeley.edu/) 下载 BDD100K数据集(语义分割任务主要是10K数据集),按照官网要求注册并登陆后,数据可以在[这里](https://bdd-data.berkeley.edu/portal.html#download)找到。 + +- 图像数据对应的名称是是`10K Images`, 语义分割标注对应的名称是`Segmentation` + +- 下载后,可以使用以下代码进行解压 + + ```bash + unzip ~/bdd100k_images_10k.zip -d ~/mmsegmentation/data/ + unzip ~/bdd100k_sem_seg_labels_trainval.zip -d ~/mmsegmentation/data/ + ``` + +就可以得到以下文件结构了: + +```none +mmsegmentation +├── mmseg +├── tools +├── configs +├── data +│ ├── bdd100k +│ │ ├── images +│ │ │ └── 10k +| │ │ │ ├── test +| │ │ │ ├── train +| │   │   │ └── val +│ │ └── labels +│ │ │ └── sem_seg +| │ │ │ ├── colormaps +| │ │ │ │ ├──train +| │ │ │ │ └──val +| │ │ │ ├── masks +| │ │ │ │ ├──train +| │ │ │ │ └──val +| │ │ │ ├── polygons +| │ │ │ │ ├──sem_seg_train.json +| │ │ │ │ └──sem_seg_val.json +| │   │   │ └── rles +| │ │ │ │ ├──sem_seg_train.json +| │ │ │ │ └──sem_seg_val.json +``` diff --git a/projects/bdd100k_dataset/mmseg/datasets/bdd100k.py b/projects/bdd100k_dataset/mmseg/datasets/bdd100k.py new file mode 100644 index 0000000000..e536de7461 --- /dev/null +++ b/projects/bdd100k_dataset/mmseg/datasets/bdd100k.py @@ -0,0 +1,31 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
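+# NOTE: the BDD100K semantic segmentation labels use the same 19 classes
+# and palette as Cityscapes, as reflected in METAINFO below.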
+
+from mmseg.datasets.basesegdataset import BaseSegDataset
+
+# from mmseg.registry import DATASETS
+# @DATASETS.register_module()
+
+
+class BDD100KDataset(BaseSegDataset):
+    METAINFO = dict(
+        classes=('road', 'sidewalk', 'building', 'wall', 'fence', 'pole',
+                 'traffic light', 'traffic sign', 'vegetation', 'terrain',
+                 'sky', 'person', 'rider', 'car', 'truck', 'bus', 'train',
+                 'motorcycle', 'bicycle'),
+        palette=[[128, 64, 128], [244, 35, 232], [70, 70, 70],
+                 [102, 102, 156], [190, 153, 153], [153, 153, 153],
+                 [250, 170, 30], [220, 220, 0], [107, 142, 35],
+                 [152, 251, 152], [70, 130, 180], [220, 20, 60], [255, 0, 0],
+                 [0, 0, 142], [0, 0, 70], [0, 60, 100], [0, 80, 100],
+                 [0, 0, 230], [119, 11, 32]])
+
+    def __init__(self,
+                 img_suffix='.jpg',
+                 seg_map_suffix='.png',
+                 reduce_zero_label=False,
+                 **kwargs) -> None:
+        super().__init__(
+            img_suffix=img_suffix,
+            seg_map_suffix=seg_map_suffix,
+            reduce_zero_label=reduce_zero_label,
+            **kwargs)
diff --git a/projects/example_project/README.md b/projects/example_project/README.md
new file mode 100644
index 0000000000..e4fd03cf4a
--- /dev/null
+++ b/projects/example_project/README.md
@@ -0,0 +1,134 @@
+# Dummy ResNet Wrapper
+
+> A README.md template for releasing a project.
+>
+> All the fields in this README are **mandatory** for others to understand what you have achieved in this implementation.
+> Please read our [Projects FAQ](../faq.md) if you still feel unclear about the requirements, or open an [issue](https://github.com/open-mmlab/mmsegmentation/issues)!
+
+## Description
+
+> Share any information you would like others to know. For example:
+>
+> Author: @xxx.
+>
+> This is an implementation of \[XXX\].
+
+Author: @xxx.
+
+This project implements a dummy ResNet wrapper, which literally does nothing new but prints "Hello world!" during initialization.
+
+## Usage
+
+> For a typical model, this section should contain the commands for training and testing.
+> You are also encouraged to dump your environment specification to `env.yml` via `conda env export > env.yml`.
+
+### Prerequisites
+
+- Python 3.7
+- PyTorch 1.6 or higher
+- [MIM](https://github.com/open-mmlab/mim) v0.33 or higher
+- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation) v1.0.0rc2 or higher
+
+All the commands below rely on the correct configuration of `PYTHONPATH`, which should point to the project's directory so that Python can locate the module files. In the `example_project/` root directory, run the following command to add the current directory to `PYTHONPATH`:
+
+```shell
+export PYTHONPATH=`pwd`:$PYTHONPATH
+```
+
+### Training commands
+
+```shell
+mim train mmsegmentation configs/fcn_dummy-r50-d8_4xb2-40k_cityscapes-512x1024.py --work-dir work_dirs/dummy_resnet
+```
+
+To train on multiple GPUs, e.g. 8 GPUs, run the following command:
+
+```shell
+mim train mmsegmentation configs/fcn_dummy-r50-d8_4xb2-40k_cityscapes-512x1024.py --work-dir work_dirs/dummy_resnet --launcher pytorch --gpus 8
+```
+
+### Testing commands
+
+```shell
+mim test mmsegmentation configs/fcn_dummy-r50-d8_4xb2-40k_cityscapes-512x1024.py --work-dir work_dirs/dummy_resnet --checkpoint ${CHECKPOINT_PATH}
+```
+
+> List the results as usually done in other models' READMEs.
\[Example\](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/fcn#results-and-models)
+> State whether the results are based on pre-trained weights converted from the official release, or reproduced by retraining the model in this project.
+
+| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) |  mIoU | mIoU(ms+flip) | config | download |
+| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | ------ | -------- |
+| FCN | R-50-D8 | 512x1024 | 40000 | 5.7 | 4.17 | 72.25 | 73.36 | [config](configs/fcn_dummy-r50-d8_4xb2-40k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x1024_40k_cityscapes/fcn_r50-d8_512x1024_40k_cityscapes_20200604_192608-efe53f0d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/fcn/fcn_r50-d8_512x1024_40k_cityscapes/fcn_r50-d8_512x1024_40k_cityscapes_20200604_192608.log.json) |
+
+## Citation
+
+> You may remove this section if not applicable.
+
+```bibtex
+@misc{mmseg2020,
+    title={{MMSegmentation}: OpenMMLab Semantic Segmentation Toolbox and Benchmark},
+    author={MMSegmentation Contributors},
+    howpublished = {\url{https://github.com/open-mmlab/mmsegmentation}},
+    year={2020}
+}
+```
+
+## Checklist
+
+Here is a checklist illustrating a usual development workflow of a successful project, which also serves as an overview of this project's progress.
+
+> The PIC (person in charge) or contributors of this project should check all the items that they believe have been finished, which will further be verified by codebase maintainers via a PR.
+
+> OpenMMLab's maintainer will review the code to ensure the project's quality. Reaching the first milestone means that this project satisfies the minimum requirements for being merged into `projects/`. But this project is only eligible to become a part of the core package upon attaining the last milestone.
+
+> Note that keeping this section up-to-date is crucial not only for this project's developers but for the entire community, since there might be some other contributors joining this project and deciding their starting point from this list. It also helps maintainers accurately estimate time and effort on further code polishing, if needed.
+
+> A project does not necessarily have to be finished in a single PR, but it's essential for the project to at least reach the first milestone in its very first PR.
+
+- [ ] Milestone 1: PR-ready, and acceptable to be one of the `projects/`.
+
+  - [ ] Finish the code
+
+> The code's design shall follow existing interfaces and conventions. For example, each model component should be registered into `mmseg.registry.MODELS` and configurable via a config file.
+
+- [ ] Basic docstrings & proper citation
+
+> Each major object should contain a docstring, describing its functionality and arguments. If you have adapted the code from other open-source projects, don't forget to cite the source project in the docstring and make sure your behavior is not against its license. Typically, we do not accept any code snippet under a GPL license.
[A Short Guide to Open Source Licenses](https://medium.com/nationwide-technology/a-short-guide-to-open-source-licenses-cf5b1c329edd) + +- [ ] Test-time correctness + +> If you are reproducing the result from a paper, make sure your model's inference-time performance matches that in the original paper. The weights usually could be obtained by simply renaming the keys in the official pre-trained weights. This test could be skipped though, if you are able to prove the training-time correctness and check the second milestone. + +- [ ] A full README + +> As this template does. + +- [ ] Milestone 2: Indicates a successful model implementation. + + - [ ] Training-time correctness + +> If you are reproducing the result from a paper, checking this item means that you should have trained your model from scratch based on the original paper's specification and verified that the final result matches the report within a minor error range. + +- [ ] Milestone 3: Good to be a part of our core package! + + - [ ] Type hints and docstrings + +> Ideally *all* the methods should have [type hints](https://www.pythontutorial.net/python-basics/python-type-hints/) and [docstrings](https://google.github.io/styleguide/pyguide.html#381-docstrings). [Example](https://github.com/open-mmlab/mmsegmentation/blob/main/mmseg/utils/io.py#L9) + +- [ ] Unit tests + +> Unit tests for each module are required. [Example](https://github.com/open-mmlab/mmsegmentation/blob/main/tests/test_utils/test_io.py#L14) + +- [ ] Code polishing + +> Refactor your code according to reviewer's comment. + +- [ ] Metafile.yml + +> It will be parsed by MIM and Inferencer. [Example](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/fcn.yml) + +- [ ] Move your modules into the core package following the codebase's file hierarchy structure. + +> In particular, you may have to refactor this README into a standard one. [Example](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/fcn/README.md) + +- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure. diff --git a/projects/example_project/configs/fcn_dummy-r50-d8_4xb2-40k_cityscapes-512x1024.py b/projects/example_project/configs/fcn_dummy-r50-d8_4xb2-40k_cityscapes-512x1024.py new file mode 100644 index 0000000000..43015364eb --- /dev/null +++ b/projects/example_project/configs/fcn_dummy-r50-d8_4xb2-40k_cityscapes-512x1024.py @@ -0,0 +1,8 @@ +_base_ = ['mmseg::fcn/fcn_r50-d8_4xb2-40k_cityscapes-512x1024.py'] + +custom_imports = dict(imports=['dummy']) + +crop_size = (512, 1024) +data_preprocessor = dict(size=crop_size) +model = dict( + data_preprocessor=data_preprocessor, backbone=dict(type='DummyResNet')) diff --git a/projects/example_project/dummy/__init__.py b/projects/example_project/dummy/__init__.py new file mode 100644 index 0000000000..70df7896d6 --- /dev/null +++ b/projects/example_project/dummy/__init__.py @@ -0,0 +1,3 @@ +from .dummy_resnet import DummyResNet + +__all__ = ['DummyResNet'] diff --git a/projects/example_project/dummy/dummy_resnet.py b/projects/example_project/dummy/dummy_resnet.py new file mode 100644 index 0000000000..a510eafd52 --- /dev/null +++ b/projects/example_project/dummy/dummy_resnet.py @@ -0,0 +1,14 @@ +from mmseg.models.backbones import ResNetV1c +from mmseg.registry import MODELS + + +@MODELS.register_module() +class DummyResNet(ResNetV1c): + """Implements a dummy ResNet wrapper for demonstration purpose. + Args: + **kwargs: All the arguments are passed to the parent class. 
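+
+    Example (the depth value is illustrative):
+        >>> model = DummyResNet(depth=50)  # prints 'Hello world!'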
+ """ + + def __init__(self, **kwargs) -> None: + print('Hello world!') + super().__init__(**kwargs) diff --git a/projects/faq.md b/projects/faq.md new file mode 100644 index 0000000000..74b292b050 --- /dev/null +++ b/projects/faq.md @@ -0,0 +1,19 @@ +Q1: Why set up `projects/` folder? + +Implementing new models and features into OpenMMLab's algorithm libraries could be troublesome due to the rigorous requirements on code quality, which could hinder the fast iteration of SOTA models and might discourage our members from sharing their latest outcomes here. And that's why we have this `projects/` folder now, where some experimental features, frameworks and models are placed, only needed to satisfy the minimum requirement on the code quality, and can be used as standalone libraries. Users are welcome to use them if they [use MMSegmentation from source](https://mmsegmentation.readthedocs.io/en/latest/get_started.html#best-practices). + +Q2: Why should there be a checklist for a project? + +This checkelist is crucial not only for this project's developers but the entire community, since there might be some other contributors joining this project and deciding their starting point from this list. It also helps maintainers accurately estimate time and effort on further code polishing, if needed. + +Q3: What kind of PR will be merged? + +Reaching the first milestone means that this project suffices the minimum requirement of being merged into 'projects/'. That is, the very first PR of a project must have all the terms in the first milestone checked. We do not have any extra requirements on the project's following PRs, so they can be a minor bug fix or update, and do not have to achieve one milestone at once. But keep in mind that this project is only eligible to become a part of the core package upon attaining the last milestone. + +Q4: Compared to other models in the core packages, why do the model implementations in projects have different training/testing commands? + +Projects are organized independently from the core package, and therefore their modules cannot be directly imported by train.py and test.py. Each model implementation in projects should either use `mim` for training/testing as suggested in the example project or provide a custom train.py/test.py. + +Q5: How to debug a project with a debugger? + +Debugger makes our lives easier, but using it becomes a bit tricky if we have to train/test a model via `mim`. The way to circumvent that is that we can take advantage of relative path to import these modules. Assuming that we are developing a project X and the core modules are placed under `projects/X/modules`, then simply adding `custom_imports = dict(imports='projects.X.modules')` to the config allows us to debug from usual entrypoints (e.g. `tools/train.py`) from the root directory of the algorithm library. Just don't forget to remove 'projects.X' before project publishment. 
diff --git a/projects/gid_dataset/configs/_base_/datasets/gid.py b/projects/gid_dataset/configs/_base_/datasets/gid.py
new file mode 100644
index 0000000000..f7218105f2
--- /dev/null
+++ b/projects/gid_dataset/configs/_base_/datasets/gid.py
@@ -0,0 +1,67 @@
+# dataset settings
+dataset_type = 'GID_Dataset'  # registered dataset class name
+data_root = 'data/gid/'  # root directory of the dataset
+crop_size = (256, 256)  # crop size of the images
+train_pipeline = [
+    dict(type='LoadImageFromFile'),  # load the image from file
+    dict(type='LoadAnnotations'),  # load the annotation from file
+    dict(
+        type='RandomResize',  # random resize
+        scale=(512, 512),  # resize scale
+        ratio_range=(0.5, 2.0),  # ratio range of the resize
+        keep_ratio=True),  # whether to keep the aspect ratio
+    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75),  # random crop
+    dict(type='RandomFlip', prob=0.5),  # random flip
+    dict(type='PhotoMetricDistortion'),  # photometric distortion augmentation
+    dict(type='PackSegInputs')  # pack the data
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),  # load the image from file
+    dict(type='Resize', scale=(256, 256), keep_ratio=True),  # resize
+    # add loading annotation after ``Resize`` because ground truth
+    # does not need to do resize data transform
+    dict(type='LoadAnnotations'),  # load the annotation from file
+    dict(type='PackSegInputs')  # pack the data
+]
+img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]  # scale ratios for multi-scale prediction
+tta_pipeline = [  # multi-scale test-time augmentation
+    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
+    dict(
+        type='TestTimeAug',
+        transforms=[
+            [
+                dict(type='Resize', scale_factor=r, keep_ratio=True)
+                for r in img_ratios
+            ],
+            [
+                dict(type='RandomFlip', prob=0., direction='horizontal'),
+                dict(type='RandomFlip', prob=1., direction='horizontal')
+            ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')]
+        ])
+]
+train_dataloader = dict(  # dataloader for training
+    batch_size=2,  # batch size during training
+    num_workers=4,  # number of data-loading workers
+    persistent_workers=True,  # whether to keep the workers alive between epochs
+    sampler=dict(type='InfiniteSampler', shuffle=True),  # infinite sampler
+    dataset=dict(
+        type=dataset_type,  # dataset class name
+        data_root=data_root,  # root directory of the dataset
+        data_prefix=dict(
+            img_path='img_dir/train',
+            seg_map_path='ann_dir/train'),  # paths of the training images and annotations
+        pipeline=train_pipeline))
+val_dataloader = dict(
+    batch_size=1,  # batch size during validation
+    num_workers=4,  # number of data-loading workers
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        data_prefix=dict(img_path='img_dir/val', seg_map_path='ann_dir/val'),
+        pipeline=test_pipeline))
+test_dataloader = val_dataloader
+
+val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU'])
+test_evaluator = val_evaluator
diff --git a/projects/gid_dataset/configs/deeplabv3plus_r101-d8_4xb2-240k_gid-256x256.py b/projects/gid_dataset/configs/deeplabv3plus_r101-d8_4xb2-240k_gid-256x256.py
new file mode 100644
index 0000000000..70cb6005f8
--- /dev/null
+++ b/projects/gid_dataset/configs/deeplabv3plus_r101-d8_4xb2-240k_gid-256x256.py
@@ -0,0 +1,15 @@
+_base_ = [
+    '../../../configs/_base_/models/deeplabv3plus_r50-d8.py',
+    './_base_/datasets/gid.py', '../../../configs/_base_/default_runtime.py',
+    '../../../configs/_base_/schedules/schedule_240k.py'
+]
+custom_imports = dict(imports=['projects.gid_dataset.mmseg.datasets.gid'])
+
+crop_size = (256, 256)
+data_preprocessor = dict(size=crop_size)
+model = dict(
+    data_preprocessor=data_preprocessor,
+    pretrained='open-mmlab://resnet101_v1c',
+    backbone=dict(depth=101),
+    decode_head=dict(num_classes=6),
+    auxiliary_head=dict(num_classes=6))
diff --git a/projects/gid_dataset/mmseg/datasets/gid.py b/projects/gid_dataset/mmseg/datasets/gid.py
new file mode 100644
index 0000000000..a9e8c510b4
--- /dev/null
+++ b/projects/gid_dataset/mmseg/datasets/gid.py
@@ -0,0 +1,55 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmseg.datasets.basesegdataset import BaseSegDataset
+from mmseg.registry import DATASETS
+
+
+# register the dataset class
+@DATASETS.register_module()
+class GID_Dataset(BaseSegDataset):
+    """Gaofen Image Dataset (GID).
+
+    Dataset paper link:
+    https://www.sciencedirect.com/science/article/pii/S0034425719303414
+    https://x-ytong.github.io/project/GID.html
+
+    GID 6 classes: others, built-up, farmland, forest, meadow, water
+
+    In this example, 15 images are selected from the GID dataset, and
+    their clipped patches are split into a training set and a
+    validation set.
+    The selected images are listed as follows:
+
+    GF2_PMS1__L1A0000647767-MSS1
+    GF2_PMS1__L1A0001064454-MSS1
+    GF2_PMS1__L1A0001348919-MSS1
+    GF2_PMS1__L1A0001680851-MSS1
+    GF2_PMS1__L1A0001680853-MSS1
+    GF2_PMS1__L1A0001680857-MSS1
+    GF2_PMS1__L1A0001757429-MSS1
+    GF2_PMS2__L1A0000607681-MSS2
+    GF2_PMS2__L1A0000635115-MSS2
+    GF2_PMS2__L1A0000658637-MSS2
+    GF2_PMS2__L1A0001206072-MSS2
+    GF2_PMS2__L1A0001471436-MSS2
+    GF2_PMS2__L1A0001642620-MSS2
+    GF2_PMS2__L1A0001787089-MSS2
+    GF2_PMS2__L1A0001838560-MSS2
+
+    The ``img_suffix`` and ``seg_map_suffix`` are both fixed to '.png'
+    for GID, since the patches generated by the conversion script are
+    saved as '.png' files.
+    """
+    METAINFO = dict(
+        classes=('Others', 'Built-up', 'Farmland', 'Forest', 'Meadow',
+                 'Water'),
+        palette=[[0, 0, 0], [255, 0, 0], [0, 255, 0], [0, 255, 255],
+                 [255, 255, 0], [0, 0, 255]])
+
+    def __init__(self,
+                 img_suffix='.png',
+                 seg_map_suffix='.png',
+                 reduce_zero_label=None,
+                 **kwargs) -> None:
+        super().__init__(
+            img_suffix=img_suffix,
+            seg_map_suffix=seg_map_suffix,
+            reduce_zero_label=reduce_zero_label,
+            **kwargs)
diff --git a/projects/gid_dataset/tools/dataset_converters/gid.py b/projects/gid_dataset/tools/dataset_converters/gid.py
new file mode 100644
index 0000000000..d95654aa14
--- /dev/null
+++ b/projects/gid_dataset/tools/dataset_converters/gid.py
@@ -0,0 +1,181 @@
+import argparse
+import glob
+import math
+import os
+import os.path as osp
+
+import mmcv
+import numpy as np
+from mmengine.utils import ProgressBar, mkdir_or_exist
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Convert GID dataset to mmsegmentation format')
+    parser.add_argument('dataset_img_path', help='GID images folder path')
+    parser.add_argument('dataset_label_path', help='GID labels folder path')
+    parser.add_argument('--tmp_dir', help='path of the temporary directory')
+    parser.add_argument(
+        '-o', '--out_dir', help='output path', default='data/gid')
+    parser.add_argument(
+        '--clip_size',
+        type=int,
+        help='clipped size of image after preparation',
+        default=256)
+    parser.add_argument(
+        '--stride_size',
+        type=int,
+        help='stride of clipping original images',
+        default=256)
+    args = parser.parse_args()
+    return args
+
+
+GID_COLORMAP = dict(
+    Background=(0, 0, 0),  # 0-background-black
+    Building=(255, 0, 0),  # 1-building-red
+    Farmland=(0, 255, 0),  # 2-farmland-green
+    Forest=(0, 255, 255),  # 3-forest-cyan (matches the METAINFO palette)
+    Meadow=(255, 255, 0),  # 4-meadow-yellow
+    Water=(0, 0, 255)  # 5-water-blue
+)
+palette = list(GID_COLORMAP.values())
+classes = list(GID_COLORMAP.keys())
+
+
+# use a flat list to store the mapping from an RGB value to a class index
+def colormap2label(palette):
+    colormap2label_list = np.zeros(256**3, dtype=np.longlong)
+    for i, colormap in enumerate(palette):
+        colormap2label_list[(colormap[0] * 256 + colormap[1]) * 256 +
+                            colormap[2]] = i
+    return colormap2label_list
+
+
+# given that list and an RGB label image, generate the class-index mask
+def label_indices(RGB_label, colormap2label_list):
+    RGB_label = RGB_label.astype('int32')
+    idx = (RGB_label[:, :, 0] * 256 +
+           RGB_label[:, :, 1]) * 256 + RGB_label[:, :, 2]
+    return colormap2label_list[idx]
+
+
+def RGB2mask(RGB_label, colormap2label_list):
+    mask_label = label_indices(RGB_label, colormap2label_list)
+    return mask_label
+
+
+colormap2label_list = colormap2label(palette)
+
+
+def clip_big_image(image_path, clip_save_dir, args, to_label=False):
+    """Clip a large GID image into small patches.
+
+    The original images of the GID dataset are very large, so they are
+    pre-processed here. Given a fixed clip size and stride size, clipped
+    patches covering the whole width and height are generated. For example,
+    for one 6800 x 7200 original image with a clip size of 256 and a stride
+    size of 256, 29 x 27 = 783 patches of size 256 x 256 are generated.
+    """
+
+    image = mmcv.imread(image_path, channel_order='rgb')
+    # image = mmcv.bgr2gray(image)
+
+    h, w, c = image.shape
+    clip_size = args.clip_size
+    stride_size = args.stride_size
+
+    num_rows = math.ceil((h - clip_size) / stride_size) if math.ceil(
+        (h - clip_size) /
+        stride_size) * stride_size + clip_size >= h else math.ceil(
+            (h - clip_size) / stride_size) + 1
+    num_cols = math.ceil((w - clip_size) / stride_size) if math.ceil(
+        (w - clip_size) /
+        stride_size) * stride_size + clip_size >= w else math.ceil(
+            (w - clip_size) / stride_size) + 1
+
+    x, y = np.meshgrid(np.arange(num_cols + 1), np.arange(num_rows + 1))
+    xmin = x * clip_size
+    ymin = y * clip_size
+
+    xmin = xmin.ravel()
+    ymin = ymin.ravel()
+    xmin_offset = np.where(xmin + clip_size > w, w - xmin - clip_size,
+                           np.zeros_like(xmin))
+    ymin_offset = np.where(ymin + clip_size > h, h - ymin - clip_size,
+                           np.zeros_like(ymin))
+    boxes = np.stack([
+        xmin + xmin_offset, ymin + ymin_offset,
+        np.minimum(xmin + clip_size, w),
+        np.minimum(ymin + clip_size, h)
+    ],
+                     axis=1)
+
+    if to_label:
+        image = RGB2mask(image, colormap2label_list)
+
+    for count, box in enumerate(boxes):
+        start_x, start_y, end_x, end_y = box
+        clipped_image = image[start_y:end_y,
+                              start_x:end_x] if to_label else image[
+                                  start_y:end_y, start_x:end_x, :]
+        img_name = osp.basename(image_path).replace('.tif', '')
+        img_name = img_name.replace('_label', '')
+        # every third patch goes to the validation split
+        if count % 3 == 0:
+            mmcv.imwrite(
+                clipped_image.astype(np.uint8),
+                osp.join(
+                    clip_save_dir.replace('train', 'val'),
+                    f'{img_name}_{start_x}_{start_y}_{end_x}_{end_y}.png'))
+        else:
+            mmcv.imwrite(
+                clipped_image.astype(np.uint8),
+                osp.join(
+                    clip_save_dir,
+                    f'{img_name}_{start_x}_{start_y}_{end_x}_{end_y}.png'))
+
+
+def main():
+    args = parse_args()
+    """According to the paper https://ieeexplore.ieee.org/document/9343296/,
+    15 images contained in GID, which cover all six categories, are selected
+    to generate the training set and validation set.
+    """
+
+    if args.out_dir is None:
+        out_dir = osp.join('data', 'gid')
+    else:
+        out_dir = args.out_dir
+
+    print('Making directories...')
+    mkdir_or_exist(osp.join(out_dir, 'img_dir', 'train'))
+    mkdir_or_exist(osp.join(out_dir, 'img_dir', 'val'))
+    mkdir_or_exist(osp.join(out_dir, 'ann_dir', 'train'))
+    mkdir_or_exist(osp.join(out_dir, 'ann_dir', 'val'))
+
+    src_path_list = glob.glob(os.path.join(args.dataset_img_path, '*.tif'))
+    print(f'Find {len(src_path_list)} pictures')
+
+    prog_bar = ProgressBar(len(src_path_list))
+
+    dst_img_dir = osp.join(out_dir, 'img_dir', 'train')
+    dst_label_dir = osp.join(out_dir, 'ann_dir', 'train')
+
+    for i, img_path in enumerate(src_path_list):
+        label_path = osp.join(
+            args.dataset_label_path,
+            osp.basename(img_path.replace('.tif', '_label.tif')))
+
+        clip_big_image(img_path, dst_img_dir, args, to_label=False)
+        clip_big_image(label_path, dst_label_dir, args, to_label=True)
+        prog_bar.update()
+
+    print('Done!')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/projects/gid_dataset/tools/dataset_converters/gid_select15imgFromAll.py b/projects/gid_dataset/tools/dataset_converters/gid_select15imgFromAll.py
new file mode 100644
index 0000000000..d3eeff2690
--- /dev/null
+++ b/projects/gid_dataset/tools/dataset_converters/gid_select15imgFromAll.py
@@ -0,0 +1,75 @@
+import argparse
+import os
+import shutil
+
+# select 15 images from the GID dataset
+
+img_list = [
+    'GF2_PMS1__L1A0000647767-MSS1.tif', 'GF2_PMS1__L1A0001064454-MSS1.tif',
+    'GF2_PMS1__L1A0001348919-MSS1.tif', 'GF2_PMS1__L1A0001680851-MSS1.tif',
+    'GF2_PMS1__L1A0001680853-MSS1.tif', 'GF2_PMS1__L1A0001680857-MSS1.tif',
+    'GF2_PMS1__L1A0001757429-MSS1.tif', 'GF2_PMS2__L1A0000607681-MSS2.tif',
+    'GF2_PMS2__L1A0000635115-MSS2.tif', 'GF2_PMS2__L1A0000658637-MSS2.tif',
+    'GF2_PMS2__L1A0001206072-MSS2.tif', 'GF2_PMS2__L1A0001471436-MSS2.tif',
+    'GF2_PMS2__L1A0001642620-MSS2.tif', 'GF2_PMS2__L1A0001787089-MSS2.tif',
+    'GF2_PMS2__L1A0001838560-MSS2.tif'
+]
+
+labels_list = [
+    'GF2_PMS1__L1A0000647767-MSS1_label.tif',
+    'GF2_PMS1__L1A0001064454-MSS1_label.tif',
+    'GF2_PMS1__L1A0001348919-MSS1_label.tif',
+    'GF2_PMS1__L1A0001680851-MSS1_label.tif',
+    'GF2_PMS1__L1A0001680853-MSS1_label.tif',
+    'GF2_PMS1__L1A0001680857-MSS1_label.tif',
+    'GF2_PMS1__L1A0001757429-MSS1_label.tif',
+    'GF2_PMS2__L1A0000607681-MSS2_label.tif',
+    'GF2_PMS2__L1A0000635115-MSS2_label.tif',
+    'GF2_PMS2__L1A0000658637-MSS2_label.tif',
+    'GF2_PMS2__L1A0001206072-MSS2_label.tif',
+    'GF2_PMS2__L1A0001471436-MSS2_label.tif',
+    'GF2_PMS2__L1A0001642620-MSS2_label.tif',
+    'GF2_PMS2__L1A0001787089-MSS2_label.tif',
+    'GF2_PMS2__L1A0001838560-MSS2_label.tif'
+]
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Select 15 images from the 150 images of the GID dataset')
+    parser.add_argument('dataset_img_dir', help='150 GID images folder path')
+    parser.add_argument('dataset_label_dir', help='150 GID labels folder path')
+
+    parser.add_argument('dest_img_dir', help='15 GID images folder path')
+    parser.add_argument('dest_label_dir', help='15 GID labels folder path')
+
+    args = parser.parse_args()
+
+    return args
+
+
+def main():
+    """Select 15 images from the GID dataset, according to the paper
+    https://ieeexplore.ieee.org/document/9343296/."""
+    args = parse_args()
+
+    img_path = args.dataset_img_dir
+    label_path = args.dataset_label_dir
+
+    dest_img_dir = args.dest_img_dir
+    dest_label_dir = args.dest_label_dir
+
+    # copy the images of 'img_list' to 'dest_img_dir'
+    print('Copying images of img_list to dest_img_dir...')
+    for img in img_list:
+        shutil.copy(os.path.join(img_path, img), dest_img_dir)
+    print('Done!')
+
+    # copy the labels of 'labels_list' to 'dest_label_dir'
+    print('Copying labels of labels_list to dest_label_dir...')
+    for label in labels_list:
+        shutil.copy(os.path.join(label_path, label), dest_label_dir)
+    print('Done!')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/projects/gid_dataset/user_guides/2_dataset_prepare.md b/projects/gid_dataset/user_guides/2_dataset_prepare.md
new file mode 100644
index 0000000000..63bd4d46fc
--- /dev/null
+++ b/projects/gid_dataset/user_guides/2_dataset_prepare.md
@@ -0,0 +1,53 @@
+## Gaofen Image Dataset (GID)
+
+- The GID dataset can be downloaded [here](https://x-ytong.github.io/project/GID.html).
+- The GID dataset contains 150 large-size images of 6800x7200 pixels with RGB labels.
+- Following the [paper](https://ieeexplore.ieee.org/document/9343296/), 15 images covering all six categories are selected here to generate the training set and validation set. The names of the selected images are as follows:
+
+```None
+    GF2_PMS1__L1A0000647767-MSS1
+    GF2_PMS1__L1A0001064454-MSS1
+    GF2_PMS1__L1A0001348919-MSS1
+    GF2_PMS1__L1A0001680851-MSS1
+    GF2_PMS1__L1A0001680853-MSS1
+    GF2_PMS1__L1A0001680857-MSS1
+    GF2_PMS1__L1A0001757429-MSS1
+    GF2_PMS2__L1A0000607681-MSS2
+    GF2_PMS2__L1A0000635115-MSS2
+    GF2_PMS2__L1A0000658637-MSS2
+    GF2_PMS2__L1A0001206072-MSS2
+    GF2_PMS2__L1A0001471436-MSS2
+    GF2_PMS2__L1A0001642620-MSS2
+    GF2_PMS2__L1A0001787089-MSS2
+    GF2_PMS2__L1A0001838560-MSS2
+```
+
+A script is also provided to conveniently select these 15 images:
+
+```
+python projects/gid_dataset/tools/dataset_converters/gid_select15imgFromAll.py {path to the 150 images} {path to the 150 labels} {path to the 15 images} {path to the 15 labels}
+```
+
+After selecting the 15 images, run the following command to clip the images and convert the labels, replacing the arguments with the paths where your 15 images and labels are stored:
+
+```
+python projects/gid_dataset/tools/dataset_converters/gid.py {path to the 15 images} {path to the 15 labels}
+```
+
+The directory structure of GID after clipping is as follows:
+
+```none
+mmsegmentation
+├── mmseg
+├── tools
+├── configs
+├── data
+│   ├── gid
+│   │   ├── ann_dir
+│   │   │   ├── train
+│   │   │   ├── val
+│   │   ├── img_dir
+│   │   │   ├── train
+│   │   │   ├── val
+
+```
diff --git a/projects/hssn/README.md b/projects/hssn/README.md
new file mode 100644
index 0000000000..9dcbf37de0
--- /dev/null
+++ b/projects/hssn/README.md
@@ -0,0 +1,91 @@
+# HSSN
+
+## Description
+
+Author: AI-Tianlong
+
+This project implements `Deep Hierarchical Semantic Segmentation` inference on the `cityscapes` dataset.
+
+## Usage
+
+### Prerequisites
+
+- Python 3.8
+- PyTorch 1.6 or higher
+- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation) v1.0.0rc5
+- mmcv v2.0.0rc4
+- mmengine >= 0.4.0
+
+### Dataset preparing
+
+Prepare the `cityscapes` dataset following this [Dataset Preparing Guide](https://github.com/open-mmlab/mmsegmentation/blob/master/docs/en/dataset_prepare.md#prepare-datasets).
+
+### Testing commands
+
+Please put [`hieraseg_deeplabv3plus_r101-d8_4xb2-80k_cityscapes-512x1024_20230112_125023-bc59a3d1.pth`](https://download.openmmlab.com/mmsegmentation/v0.5/hieraseg/hieraseg_deeplabv3plus_r101-d8_4xb2-80k_cityscapes-512x1024_20230112_125023-bc59a3d1.pth) into `mmsegmentation/checkpoints`.
+
+#### Multi-GPU Test
+
+```bash
+# --tta is optional for multi-scale test and requires mmengine >= 0.4.0
+bash tools/dist_test.sh [config] [model weights] [number of gpus] --tta
+```
+
+#### Example
+
+```shell
+bash tools/dist_test.sh projects/hssn/configs/hssn/hieraseg_deeplabv3plus_r101-d8_4xb2-80l_cityscapes-512x1024.py checkpoints/hieraseg_deeplabv3plus_r101-d8_4xb2-80k_cityscapes-512x1024_20230112_125023-bc59a3d1.pth 2 --tta
+```
+
+## Results
+
+### Cityscapes
+
+| Method | Backbone | Crop Size | mIoU | mIoU (ms+flip) | config | model |
+| :--------: | :------: | :-------: | :---: | :------------: |
:----------------------------------------------------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------------------------------------------------------------------: | +| DeeplabV3+ | R-101-D8 | 512x1024 | 81.61 | 82.71 | [config](https://github.com/open-mmlab/mmsegmentation/tree/main/projects/HieraSeg/configs/hieraseg/hieraseg_deeplabv3plus_r101-d8_4xb2-80l_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/hieraseg/hieraseg_deeplabv3plus_r101-d8_4xb2-80k_cityscapes-512x1024_20230112_125023-bc59a3d1.pth) | + + + +## Citation + +This project is modified from [qhanghu/HSSN_pytorch](https://github.com/qhanghu/HSSN_pytorch) + +```bibtex +@article{li2022deep, + title={Deep Hierarchical Semantic Segmentation}, + author={Li, Liulei and Zhou, Tianfei and Wang, Wenguan and Li, Jianwu and Yang, Yi}, + journal={CVPR}, + year={2022} +} +``` + +## Checklist + +- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. + + - [x] Finish the code + + - [x] Basic docstrings & proper citation + + - [x] Test-time correctness + + - [x] A full README + +- [ ] Milestone 2: Indicates a successful model implementation. + + - [ ] Training-time correctness + +- [ ] Milestone 3: Good to be a part of our core package! + + - [ ] Type hints and docstrings + + - [ ] Unit tests + + - [ ] Code polishing + + - [ ] Metafile.yml + +- [ ] Move your modules into the core package following the codebase's file hierarchy structure. + +- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure. diff --git a/projects/hssn/configs/_base_/datasets/cityscapes.py b/projects/hssn/configs/_base_/datasets/cityscapes.py new file mode 100644 index 0000000000..1698e04721 --- /dev/null +++ b/projects/hssn/configs/_base_/datasets/cityscapes.py @@ -0,0 +1,67 @@ +# dataset settings +dataset_type = 'CityscapesDataset' +data_root = 'data/cityscapes/' +crop_size = (512, 1024) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict( + type='RandomResize', + scale=(2048, 1024), + ratio_range=(0.5, 2.0), + keep_ratio=True), + dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(2048, 1024), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] +tta_pipeline = [ + dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='Resize', scale_factor=r, keep_ratio=True) + for r in img_ratios + ], + [ + dict(type='RandomFlip', prob=0., direction='horizontal'), + dict(type='RandomFlip', prob=1., direction='horizontal') + ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] + ]) +] +train_dataloader = dict( + batch_size=2, + num_workers=2, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='leftImg8bit/train', seg_map_path='gtFine/train'), + pipeline=train_pipeline)) +val_dataloader = dict( + 
batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='leftImg8bit/val', seg_map_path='gtFine/val'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) +test_evaluator = val_evaluator diff --git a/projects/hssn/configs/_base_/default_runtime.py b/projects/hssn/configs/_base_/default_runtime.py new file mode 100644 index 0000000000..272b4d2467 --- /dev/null +++ b/projects/hssn/configs/_base_/default_runtime.py @@ -0,0 +1,15 @@ +default_scope = 'mmseg' +env_cfg = dict( + cudnn_benchmark=True, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl'), +) +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='SegLocalVisualizer', vis_backends=vis_backends, name='visualizer') +log_processor = dict(by_epoch=False) +log_level = 'INFO' +load_from = None +resume = False + +tta_model = dict(type='SegTTAModel') diff --git a/projects/hssn/configs/_base_/models/deeplabv3plus_r50-d8_vd_contrast.py b/projects/hssn/configs/_base_/models/deeplabv3plus_r50-d8_vd_contrast.py new file mode 100644 index 0000000000..a6af45ce84 --- /dev/null +++ b/projects/hssn/configs/_base_/models/deeplabv3plus_r50-d8_vd_contrast.py @@ -0,0 +1,55 @@ +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +data_preprocessor = dict( + type='SegDataPreProcessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255) +model = dict( + type='EncoderDecoder', + data_preprocessor=data_preprocessor, + pretrained=None, + backbone=dict( + type='ResNetV1d', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + dilations=(1, 1, 2, 4), + strides=(1, 2, 1, 1), + norm_cfg=norm_cfg, + norm_eval=False, + style='pytorch', + contract_dilation=True), + decode_head=dict( + type='DepthwiseSeparableASPPContrastHead', + in_channels=2048, + in_index=3, + channels=512, + dilations=(1, 12, 24, 36), + c1_in_channels=256, + c1_channels=48, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + proj='convmlp', + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + auxiliary_head=dict( + type='FCNHead', + in_channels=1024, + in_index=2, + channels=256, + num_convs=1, + concat_input=False, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/projects/hssn/configs/_base_/schedules/schedule_80k.py b/projects/hssn/configs/_base_/schedules/schedule_80k.py new file mode 100644 index 0000000000..0dcd6c4d1b --- /dev/null +++ b/projects/hssn/configs/_base_/schedules/schedule_80k.py @@ -0,0 +1,24 @@ +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) +optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer, clip_grad=None) +# learning policy +param_scheduler = [ + dict( + type='PolyLR', + eta_min=1e-4, + power=0.9, + begin=0, + end=80000, + by_epoch=False) +] +# training schedule for 80k +train_cfg = dict(type='IterBasedTrainLoop', max_iters=80000, val_interval=8000) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +default_hooks = dict( + timer=dict(type='IterTimerHook'), + 
logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False),
+    param_scheduler=dict(type='ParamSchedulerHook'),
+    checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=8000),
+    sampler_seed=dict(type='DistSamplerSeedHook'),
+    visualization=dict(type='SegVisualizationHook'))
diff --git a/projects/hssn/configs/hssn/hieraseg_deeplabv3plus_r101-d8_4xb2-80l_cityscapes-512x1024.py b/projects/hssn/configs/hssn/hieraseg_deeplabv3plus_r101-d8_4xb2-80l_cityscapes-512x1024.py
new file mode 100644
index 0000000000..8f04a2d656
--- /dev/null
+++ b/projects/hssn/configs/hssn/hieraseg_deeplabv3plus_r101-d8_4xb2-80l_cityscapes-512x1024.py
@@ -0,0 +1,21 @@
+_base_ = [
+    '../_base_/models/deeplabv3plus_r50-d8_vd_contrast.py',
+    '../_base_/datasets/cityscapes.py', '../_base_/default_runtime.py',
+    '../_base_/schedules/schedule_80k.py'
+]
+
+custom_imports = dict(imports=[
+    'projects.hssn.decode_head.sep_aspp_contrast_head',
+    'projects.hssn.losses.hiera_triplet_loss_cityscape'
+])
+
+model = dict(
+    pretrained=None,
+    backbone=dict(depth=101),
+    decode_head=dict(
+        num_classes=26,
+        loss_decode=dict(
+            type='HieraTripletLossCityscape', num_classes=19,
+            loss_weight=1.0)),
+    auxiliary_head=dict(num_classes=19),
+    test_cfg=dict(mode='whole', is_hiera=True, hiera_num_classes=7))
diff --git a/projects/hssn/decode_head/__init__.py b/projects/hssn/decode_head/__init__.py
new file mode 100644
index 0000000000..da454ea339
--- /dev/null
+++ b/projects/hssn/decode_head/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .sep_aspp_contrast_head import DepthwiseSeparableASPPContrastHead
+
+__all__ = ['DepthwiseSeparableASPPContrastHead']
diff --git a/projects/hssn/decode_head/sep_aspp_contrast_head.py b/projects/hssn/decode_head/sep_aspp_contrast_head.py
new file mode 100644
index 0000000000..331af30de4
--- /dev/null
+++ b/projects/hssn/decode_head/sep_aspp_contrast_head.py
@@ -0,0 +1,193 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from typing import List, Tuple
+
+import torch
+import torch.nn as nn
+from mmcv.cnn import build_norm_layer
+from torch import Tensor
+
+from mmseg.models.decode_heads.sep_aspp_head import DepthwiseSeparableASPPHead
+from mmseg.models.losses import accuracy
+from mmseg.models.utils import resize
+from mmseg.registry import MODELS
+from mmseg.utils import SampleList
+
+
+class ProjectionHead(nn.Module):
+    """ProjectionHead, project feature map to specific channels.
+
+    Args:
+        dim_in (int): Input channels.
+        norm_cfg (dict): Config of norm layer.
+        proj_dim (int): Output channels. Default: 256.
+        proj (str): Projection type, 'linear' or 'convmlp'. Default: 'convmlp'.
+    """
+
+    def __init__(self,
+                 dim_in: int,
+                 norm_cfg: dict,
+                 proj_dim: int = 256,
+                 proj: str = 'convmlp'):
+        super().__init__()
+        assert proj in ['convmlp', 'linear']
+        if proj == 'linear':
+            self.proj = nn.Conv2d(dim_in, proj_dim, kernel_size=1)
+        elif proj == 'convmlp':
+            self.proj = nn.Sequential(
+                nn.Conv2d(dim_in, dim_in, kernel_size=1),
+                build_norm_layer(norm_cfg, dim_in)[1], nn.ReLU(inplace=True),
+                nn.Conv2d(dim_in, proj_dim, kernel_size=1))
+
+    def forward(self, x):
+        return torch.nn.functional.normalize(self.proj(x), p=2, dim=1)
+
+
+@MODELS.register_module()
+class DepthwiseSeparableASPPContrastHead(DepthwiseSeparableASPPHead):
+    """Deep Hierarchical Semantic Segmentation. This head is the
+    implementation of `HSSN <https://arxiv.org/abs/2203.14335>`_.
+
+    Based on Encoder-Decoder with Atrous Separable Convolution for
+    Semantic Image Segmentation,
+    `DeepLabV3+ <https://arxiv.org/abs/1802.02611>`_.
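+
+    At inference time the head outputs ``num_classes + hiera_num_classes``
+    channels; ``predict_by_feat`` folds the trailing hierarchy-level logits
+    back into their child class channels and strips them before resizing.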
+ + Args: + proj (str): The type of ProjectionHead, 'linear' or 'convmlp', + default 'convmlp' + """ + + def __init__(self, proj: str = 'convmlp', **kwargs): + super().__init__(**kwargs) + self.proj_head = ProjectionHead( + dim_in=2048, norm_cfg=self.norm_cfg, proj=proj) + self.register_buffer('step', torch.zeros(1)) + + def forward(self, inputs) -> Tuple[Tensor]: + """Forward function.""" + output = super().forward(inputs) + + self.step += 1 + embedding = self.proj_head(inputs[-1]) + + return output, embedding + + def predict_by_feat(self, seg_logits: Tuple[Tensor], + batch_img_metas: List[dict]) -> Tensor: + """Transform a batch of output seg_logits to the input shape. + + Args: + seg_logits (Tensor): The output from decode head forward function. + batch_img_metas (list[dict]): Meta information of each image, e.g., + image size, scaling factor, etc. + + Returns: + Tensor: Outputs segmentation logits map. + """ + # HSSN decode_head output is: (out, embedding): tuple + # only need 'out' here. + if isinstance(seg_logits, tuple): + seg_logit = seg_logits[0] + + if seg_logit.size(1) == 26: # For cityscapes dataset,19 + 7 + hiera_num_classes = 7 + seg_logit[:, 0:2] += seg_logit[:, -7] + seg_logit[:, 2:5] += seg_logit[:, -6] + seg_logit[:, 5:8] += seg_logit[:, -5] + seg_logit[:, 8:10] += seg_logit[:, -4] + seg_logit[:, 10:11] += seg_logit[:, -3] + seg_logit[:, 11:13] += seg_logit[:, -2] + seg_logit[:, 13:19] += seg_logit[:, -1] + + elif seg_logit.size(1) == 12: # For Pascal_person dataset, 7 + 5 + hiera_num_classes = 5 + seg_logit[:, 0:1] = seg_logit[:, 0:1] + \ + seg_logit[:, 7] + seg_logit[:, 10] + seg_logit[:, 1:5] = seg_logit[:, 1:5] + \ + seg_logit[:, 8] + seg_logit[:, 11] + seg_logit[:, 5:7] = seg_logit[:, 5:7] + \ + seg_logit[:, 9] + seg_logit[:, 11] + + elif seg_logit.size(1) == 25: # For LIP dataset, 20 + 5 + hiera_num_classes = 5 + seg_logit[:, 0:1] = seg_logit[:, 0:1] + \ + seg_logit[:, 20] + seg_logit[:, 23] + seg_logit[:, 1:8] = seg_logit[:, 1:8] + \ + seg_logit[:, 21] + seg_logit[:, 24] + seg_logit[:, 10:12] = seg_logit[:, 10:12] + \ + seg_logit[:, 21] + seg_logit[:, 24] + seg_logit[:, 13:16] = seg_logit[:, 13:16] + \ + seg_logit[:, 21] + seg_logit[:, 24] + seg_logit[:, 8:10] = seg_logit[:, 8:10] + \ + seg_logit[:, 22] + seg_logit[:, 24] + seg_logit[:, 12:13] = seg_logit[:, 12:13] + \ + seg_logit[:, 22] + seg_logit[:, 24] + seg_logit[:, 16:20] = seg_logit[:, 16:20] + \ + seg_logit[:, 22] + seg_logit[:, 24] + + # elif seg_logit.size(1) == 144 # For Mapillary dataset, 124+16+4 + # unofficial repository not release mapillary until 2023/2/6 + + if isinstance(batch_img_metas[0]['img_shape'], torch.Size): + # slide inference + size = batch_img_metas[0]['img_shape'] + elif 'pad_shape' in batch_img_metas[0]: + size = batch_img_metas[0]['pad_shape'][:2] + else: + size = batch_img_metas[0]['img_shape'] + seg_logit = seg_logit[:, :-hiera_num_classes] + seg_logit = resize( + input=seg_logit, + size=size, + mode='bilinear', + align_corners=self.align_corners) + + return seg_logit + + def loss_by_feat( + self, + seg_logits: Tuple[Tensor], # (out, embedding) + batch_data_samples: SampleList) -> dict: + """Compute segmentation loss. Will fix in future. + + Args: + seg_logits (Tuple[Tensor]): The output from decode head + forward function. + For this decode_head output are (out, embedding): tuple + batch_data_samples (List[:obj:`SegDataSample`]): The seg + data samples. It usually includes information such + as `metainfo` and `gt_sem_seg`. 
+ + Returns: + dict[str, Tensor]: a dictionary of loss components + """ + seg_logit_before = seg_logits[0] + embedding = seg_logits[1] + seg_label = self._stack_batch_gt(batch_data_samples) + + loss = dict() + seg_logit = resize( + input=seg_logit_before, + size=seg_label.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + if self.sampler is not None: + seg_weight = self.sampler.sample(seg_logit, seg_label) + else: + seg_weight = None + seg_label = seg_label.squeeze(1) + seg_logit_before = resize( + input=seg_logit_before, + scale_factor=0.5, + mode='bilinear', + align_corners=self.align_corners) + + loss['loss_seg'] = self.loss_decode( + self.step, + embedding, + seg_logit_before, + seg_logit, + seg_label, + weight=seg_weight, + ignore_index=self.ignore_index) + loss['acc_seg'] = accuracy(seg_logit, seg_label) + return loss diff --git a/projects/hssn/losses/__init__.py b/projects/hssn/losses/__init__.py new file mode 100644 index 0000000000..47d2686482 --- /dev/null +++ b/projects/hssn/losses/__init__.py @@ -0,0 +1,4 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .hiera_triplet_loss_cityscape import HieraTripletLossCityscape + +__all__ = ['HieraTripletLossCityscape'] diff --git a/projects/hssn/losses/hiera_triplet_loss_cityscape.py b/projects/hssn/losses/hiera_triplet_loss_cityscape.py new file mode 100644 index 0000000000..a784f13e62 --- /dev/null +++ b/projects/hssn/losses/hiera_triplet_loss_cityscape.py @@ -0,0 +1,218 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import math + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from mmseg.models.builder import LOSSES +from mmseg.models.losses.cross_entropy_loss import CrossEntropyLoss +from .tree_triplet_loss import TreeTripletLoss + +hiera_map = [0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5, 5, 6, 6, 6, 6, 6, 6] +hiera_index = [[0, 2], [2, 5], [5, 8], [8, 10], [10, 11], [11, 13], [13, 19]] + +hiera = { + 'hiera_high': { + 'flat': [0, 2], + 'construction': [2, 5], + 'object': [5, 8], + 'nature': [8, 10], + 'sky': [10, 11], + 'human': [11, 13], + 'vehicle': [13, 19] + } +} + + +def prepare_targets(targets): + b, h, w = targets.shape + targets_high = torch.ones( + (b, h, w), dtype=targets.dtype, device=targets.device) * 255 + indices_high = [] + for index, high in enumerate(hiera['hiera_high'].keys()): + indices = hiera['hiera_high'][high] + for ii in range(indices[0], indices[1]): + targets_high[targets == ii] = index + indices_high.append(indices) + + return targets, targets_high, indices_high + + +def losses_hiera(predictions, + targets, + targets_top, + num_classes, + indices_high, + eps=1e-8): + """Implementation of hiera loss. + + Args: + predictions (torch.Tensor): seg logits produced by decode head. + targets (torch.Tensor): The learning label of the prediction. + targets_top (torch.Tensor): The hierarchy ground truth of the learning + label. + num_classes (int): Number of categories. + indices_high (List[List[int]]): Hierarchy indices of each hierarchy. + eps (float):Term added to the Logarithm to improve numerical stability. 
+ """ + b, _, h, w = predictions.shape + predictions = torch.sigmoid(predictions.float()) + void_indices = (targets == 255) + targets[void_indices] = 0 + targets = F.one_hot(targets, num_classes=num_classes).permute(0, 3, 1, 2) + void_indices2 = (targets_top == 255) + targets_top[void_indices2] = 0 + targets_top = F.one_hot(targets_top, num_classes=7).permute(0, 3, 1, 2) + + MCMA = predictions[:, :num_classes, :, :] + MCMB = torch.zeros((b, 7, h, w)).to(predictions) + for ii in range(7): + MCMB[:, ii:ii + 1, :, :] = torch.max( + torch.cat([ + predictions[:, indices_high[ii][0]:indices_high[ii][1], :, :], + predictions[:, num_classes + ii:num_classes + ii + 1, :, :] + ], + dim=1), 1, True)[0] + + MCLB = predictions[:, num_classes:num_classes + 7, :, :] + MCLA = predictions[:, :num_classes, :, :].clone() + for ii in range(7): + for jj in range(indices_high[ii][0], indices_high[ii][1]): + MCLA[:, jj:jj + 1, :, :] = torch.min( + torch.cat([ + predictions[:, jj:jj + 1, :, :], MCLB[:, ii:ii + 1, :, :] + ], + dim=1), 1, True)[0] + + valid_indices = (~void_indices).unsqueeze(1) + num_valid = valid_indices.sum() + valid_indices2 = (~void_indices2).unsqueeze(1) + num_valid2 = valid_indices2.sum() + # channel_num*sum()/one_channel_valid already has a weight + loss = ( + (-targets[:, :num_classes, :, :] * torch.log(MCLA + eps) - + (1.0 - targets[:, :num_classes, :, :]) * torch.log(1.0 - MCMA + eps)) + * valid_indices).sum() / num_valid / num_classes + loss += ((-targets_top[:, :, :, :] * torch.log(MCLB + eps) - + (1.0 - targets_top[:, :, :, :]) * torch.log(1.0 - MCMB + eps)) * + valid_indices2).sum() / num_valid2 / 7 + + return 5 * loss + + +def losses_hiera_focal(predictions, + targets, + targets_top, + num_classes, + indices_high, + eps=1e-8, + gamma=2): + """Implementation of hiera loss. + + Args: + predictions (torch.Tensor): seg logits produced by decode head. + targets (torch.Tensor): The learning label of the prediction. + targets_top (torch.Tensor): The hierarchy ground truth of the learning + label. + num_classes (int): Number of categories. + indices_high (List[List[int]]): Hierarchy indices of each hierarchy. + eps (float):Term added to the Logarithm to improve numerical stability. + Defaults: 1e-8. + gamma (int): The exponent value. Defaults: 2. 
+ """ + b, _, h, w = predictions.shape + predictions = torch.sigmoid(predictions.float()) + void_indices = (targets == 255) + targets[void_indices] = 0 + targets = F.one_hot(targets, num_classes=num_classes).permute(0, 3, 1, 2) + void_indices2 = (targets_top == 255) + targets_top[void_indices2] = 0 + targets_top = F.one_hot(targets_top, num_classes=7).permute(0, 3, 1, 2) + + MCMA = predictions[:, :num_classes, :, :] + MCMB = torch.zeros((b, 7, h, w), + dtype=predictions.dtype, + device=predictions.device) + for ii in range(7): + MCMB[:, ii:ii + 1, :, :] = torch.max( + torch.cat([ + predictions[:, indices_high[ii][0]:indices_high[ii][1], :, :], + predictions[:, num_classes + ii:num_classes + ii + 1, :, :] + ], + dim=1), 1, True)[0] + + MCLB = predictions[:, num_classes:num_classes + 7, :, :] + MCLA = predictions[:, :num_classes, :, :].clone() + for ii in range(7): + for jj in range(indices_high[ii][0], indices_high[ii][1]): + MCLA[:, jj:jj + 1, :, :] = torch.min( + torch.cat([ + predictions[:, jj:jj + 1, :, :], MCLB[:, ii:ii + 1, :, :] + ], + dim=1), 1, True)[0] + + valid_indices = (~void_indices).unsqueeze(1) + num_valid = valid_indices.sum() + valid_indices2 = (~void_indices2).unsqueeze(1) + num_valid2 = valid_indices2.sum() + # channel_num*sum()/one_channel_valid already has a weight + loss = ((-targets[:, :num_classes, :, :] * torch.pow( + (1.0 - MCLA), gamma) * torch.log(MCLA + eps) - + (1.0 - targets[:, :num_classes, :, :]) * torch.pow(MCMA, gamma) * + torch.log(1.0 - MCMA + eps)) * + valid_indices).sum() / num_valid / num_classes + loss += ( + (-targets_top[:, :, :, :] * torch.pow( + (1.0 - MCLB), gamma) * torch.log(MCLB + eps) - + (1.0 - targets_top[:, :, :, :]) * torch.pow(MCMB, gamma) * + torch.log(1.0 - MCMB + eps)) * valid_indices2).sum() / num_valid2 / 7 + + return 5 * loss + + +@LOSSES.register_module() +class HieraTripletLossCityscape(nn.Module): + """Modified from https://github.com/qhanghu/HSSN_pytorch/blob/main/mmseg/mo + dels/losses/hiera_triplet_loss_cityscape.py.""" + + def __init__(self, num_classes, use_sigmoid=False, loss_weight=1.0): + super().__init__() + self.num_classes = num_classes + self.loss_weight = loss_weight + self.treetripletloss = TreeTripletLoss(num_classes, hiera_map, + hiera_index) + self.ce = CrossEntropyLoss() + + def forward(self, + step, + embedding, + cls_score_before, + cls_score, + label, + weight=None, + **kwargs): + targets, targets_top, indices_top = prepare_targets(label) + + loss = losses_hiera(cls_score, targets, targets_top, self.num_classes, + indices_top) + ce_loss = self.ce(cls_score[:, :-7], label) + ce_loss2 = self.ce(cls_score[:, -7:], targets_top) + loss = loss + ce_loss + ce_loss2 + + loss_triplet, class_count = self.treetripletloss(embedding, label) + class_counts = [ + torch.ones_like(class_count) + for _ in range(torch.distributed.get_world_size()) + ] + torch.distributed.all_gather(class_counts, class_count, async_op=False) + class_counts = torch.cat(class_counts, dim=0) + + if torch.distributed.get_world_size() == torch.nonzero( + class_counts, as_tuple=False).size(0): + factor = 1 / 4 * (1 + torch.cos( + torch.tensor((step.item() - 80000) / 80000 * + math.pi))) if step.item() < 80000 else 0.5 + loss += factor * loss_triplet + + return loss * self.loss_weight diff --git a/projects/hssn/losses/tree_triplet_loss.py b/projects/hssn/losses/tree_triplet_loss.py new file mode 100644 index 0000000000..ccc0937405 --- /dev/null +++ b/projects/hssn/losses/tree_triplet_loss.py @@ -0,0 +1,86 @@ +# Copyright (c) OpenMMLab. 
All rights reserved.
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from mmseg.models.builder import LOSSES
+
+
+@LOSSES.register_module()
+class TreeTripletLoss(nn.Module):
+    """TreeTripletLoss. Modified from https://github.com/qhanghu/HSSN_pytorch/b
+    lob/main/mmseg/models/losses/tree_triplet_loss.py.
+
+    Args:
+        num_classes (int): Number of categories.
+        hiera_map (List[int]): Hierarchy map of each category.
+        hiera_index (List[List[int]]): Hierarchy indices of each hierarchy.
+        ignore_index (int): Specifies a target value that is ignored and
+            does not contribute to the input gradients. Defaults: 255.
+
+    Examples:
+        >>> num_classes = 19
+        >>> hiera_map = [0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 5, 5, 6, 6, 6, 6, 6, 6]
+        >>> hiera_index = [[0, 2], [2, 5], [5, 8], [8, 10], [10, 11],
+        ...                [11, 13], [13, 19]]
+    """
+
+    def __init__(self, num_classes, hiera_map, hiera_index, ignore_index=255):
+        super().__init__()
+
+        self.ignore_label = ignore_index
+        self.num_classes = num_classes
+        self.hiera_map = hiera_map
+        self.hiera_index = hiera_index
+
+    def forward(self, feats: torch.Tensor, labels=None, max_triplet=200):
+        labels = labels.unsqueeze(1).float().clone()
+        labels = torch.nn.functional.interpolate(
+            labels, (feats.shape[2], feats.shape[3]), mode='nearest')
+        labels = labels.squeeze(1).long()
+        assert labels.shape[-1] == feats.shape[-1], '{} {}'.format(
+            labels.shape, feats.shape)
+
+        labels = labels.view(-1)
+        feats = feats.permute(0, 2, 3, 1)
+        feats = feats.contiguous().view(-1, feats.shape[-1])
+
+        triplet_loss = 0
+        exist_classes = torch.unique(labels)
+        exist_classes = [x for x in exist_classes if x != 255]
+        class_count = 0
+
+        for ii in exist_classes:
+            index_range = self.hiera_index[self.hiera_map[ii]]
+            index_anchor = labels == ii
+            index_pos = (labels >= index_range[0]) & (
+                labels < index_range[-1]) & (~index_anchor)
+            index_neg = (labels < index_range[0]) | (labels >= index_range[-1])
+
+            min_size = min(
+                torch.sum(index_anchor), torch.sum(index_pos),
+                torch.sum(index_neg), max_triplet)
+
+            feats_anchor = feats[index_anchor][:min_size]
+            feats_pos = feats[index_pos][:min_size]
+            feats_neg = feats[index_neg][:min_size]
+
+            distance = torch.zeros(min_size, 2).to(feats)
+            distance[:, 0:1] = 1 - (feats_anchor * feats_pos).sum(1, True)
+            distance[:, 1:2] = 1 - (feats_anchor * feats_neg).sum(1, True)
+
+            # margin is always 0.1 + (4 - 2) / 4 = 0.6 since the hierarchy
+            # has three levels
+            # TODO: handle the case where the label of pos equals the anchor's
+            margin = 0.6 * torch.ones(min_size).to(feats)
+
+            tl = distance[:, 0] - distance[:, 1] + margin
+            tl = F.relu(tl)
+
+            if tl.size(0) > 0:
+                triplet_loss += tl.mean()
+                class_count += 1
+        if class_count == 0:
+            return None, torch.tensor([0]).to(feats)
+        triplet_loss /= class_count
+        return triplet_loss, torch.tensor([class_count]).to(feats)
diff --git a/projects/isnet/README.md b/projects/isnet/README.md
new file mode 100644
index 0000000000..0a79ad6a4f
--- /dev/null
+++ b/projects/isnet/README.md
@@ -0,0 +1,117 @@
+# ISNet
+
+[ISNet: Integrate Image-Level and Semantic-Level Context for Semantic Segmentation](https://arxiv.org/pdf/2108.12382.pdf)
+
+## Description
+
+This is an implementation of [ISNet](https://arxiv.org/pdf/2108.12382.pdf).
+[Official Repo](https://github.com/SegmentationBLWX/sssegmentation) + +## Usage + +### Prerequisites + +- Python 3.7 +- PyTorch 1.6 or higher +- [MIM](https://github.com/open-mmlab/mim) v0.33 or higher +- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation) v1.0.0rc2 or higher + +All the commands below rely on the correct configuration of `PYTHONPATH`, which should point to the project's directory so that Python can locate the module files. In `isnet/` root directory, run the following line to add the current directory to `PYTHONPATH`: + +```shell +export PYTHONPATH=`pwd`:$PYTHONPATH +``` + +### Training commands + +```shell +mim train mmsegmentation configs/isnet_r50-d8_8xb2-160k_cityscapes-512x1024.py --work-dir work_dirs/isnet +``` + +To train on multiple GPUs, e.g. 8 GPUs, run the following command: + +```shell +mim train mmsegmentation configs/isnet_r50-d8_8xb2-160k_cityscapes-512x1024.py --work-dir work_dirs/isnet --launcher pytorch --gpus 8 +``` + +### Testing commands + +```shell +mim test mmsegmentation configs/isnet_r50-d8_8xb2-160k_cityscapes-512x1024.py --work-dir work_dirs/isnet --checkpoint ${CHECKPOINT_PATH} +``` + +| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config | download | +| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------: | --------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------ | +| ISNet | R-50-D8 | 512x1024 | - | - | - | 79.32 | 80.88 | [config](configs/isnet_r50-d8_8xb2-160k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/isnet/isnet_r50-d8_cityscapes-512x1024_20230104-a7a8ccf2.pth) | + +## Citation + +```bibtex +@article{Jin2021ISNetII, + title={ISNet: Integrate Image-Level and Semantic-Level Context for Semantic Segmentation}, + author={Zhenchao Jin and B. Liu and Qi Chu and Nenghai Yu}, + journal={2021 IEEE/CVF International Conference on Computer Vision (ICCV)}, + year={2021}, + pages={7169-7178} +} +``` + +## Checklist + +The progress of ISNet. + + + +- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. + + - [x] Finish the code + + + + - [x] Basic docstrings & proper citation + + + + - [x] Test-time correctness + + + + - [x] A full README + + + +- [ ] Milestone 2: Indicates a successful model implementation. + + - [ ] Training-time correctness + + + +- [ ] Milestone 3: Good to be a part of our core package! + + - [ ] Type hints and docstrings + + + + - [ ] Unit tests + + + + - [ ] Code polishing + + + + - [ ] Metafile.yml + + + +- [ ] Move your modules into the core package following the codebase's file hierarchy structure. + + + +- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure. 
diff --git a/projects/isnet/configs/isnet_r50-d8_8xb2-160k_cityscapes-512x1024.py b/projects/isnet/configs/isnet_r50-d8_8xb2-160k_cityscapes-512x1024.py new file mode 100644 index 0000000000..a00d39237d --- /dev/null +++ b/projects/isnet/configs/isnet_r50-d8_8xb2-160k_cityscapes-512x1024.py @@ -0,0 +1,80 @@ +_base_ = [ + '../../../configs/_base_/datasets/cityscapes.py', + '../../../configs/_base_/default_runtime.py', + '../../../configs/_base_/schedules/schedule_80k.py' +] + +data_root = '../../data/cityscapes/' +train_dataloader = dict(dataset=dict(data_root=data_root)) +val_dataloader = dict(dataset=dict(data_root=data_root)) +test_dataloader = dict(dataset=dict(data_root=data_root)) + +custom_imports = dict(imports=['projects.isnet.decode_heads']) + +norm_cfg = dict(type='SyncBN', requires_grad=True) +data_preprocessor = dict( + type='SegDataPreProcessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255) + +model = dict( + type='EncoderDecoder', + data_preprocessor=data_preprocessor, + pretrained='open-mmlab://resnet50_v1c', + backbone=dict( + type='ResNetV1c', + depth=50, + num_stages=4, + out_indices=(0, 1, 2, 3), + dilations=(1, 1, 2, 4), + strides=(1, 2, 1, 1), + norm_cfg=norm_cfg, + norm_eval=False, + style='pytorch', + contract_dilation=True), + decode_head=dict( + type='ISNetHead', + in_channels=(256, 512, 1024, 2048), + input_transform='multiple_select', + in_index=(0, 1, 2, 3), + channels=512, + dropout_ratio=0.1, + transform_channels=256, + concat_input=True, + with_shortcut=False, + shortcut_in_channels=256, + shortcut_feat_channels=48, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=[ + dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=1.0, + loss_name='loss_o'), + dict( + type='CrossEntropyLoss', + use_sigmoid=False, + loss_weight=0.4, + loss_name='loss_d'), + ]), + auxiliary_head=dict( + type='FCNHead', + in_channels=1024, + in_index=2, + channels=512, + num_convs=1, + concat_input=False, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + train_cfg=dict(), + # test_cfg=dict(mode='slide', crop_size=(769, 769), stride=(513, 513)) + test_cfg=dict(mode='whole')) diff --git a/projects/isnet/decode_heads/__init__.py b/projects/isnet/decode_heads/__init__.py new file mode 100644 index 0000000000..a451629c4c --- /dev/null +++ b/projects/isnet/decode_heads/__init__.py @@ -0,0 +1,3 @@ +from .isnet_head import ISNetHead + +__all__ = ['ISNetHead'] diff --git a/projects/isnet/decode_heads/isnet_head.py b/projects/isnet/decode_heads/isnet_head.py new file mode 100644 index 0000000000..9c8df540ee --- /dev/null +++ b/projects/isnet/decode_heads/isnet_head.py @@ -0,0 +1,337 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch +import torch.nn as nn +from mmcv.cnn import ConvModule +from torch import Tensor + +from mmseg.models.decode_heads.decode_head import BaseDecodeHead +from mmseg.models.losses import accuracy +from mmseg.models.utils import SelfAttentionBlock, resize +from mmseg.registry import MODELS +from mmseg.utils import SampleList + + +class ImageLevelContext(nn.Module): + """ Image-Level Context Module + Args: + feats_channels (int): Input channels of query/key feature. + transform_channels (int): Output channels of key/query transform. + concat_input (bool): whether to concat input feature. 
+ align_corners (bool): align_corners argument of F.interpolate. + conv_cfg (dict|None): Config of conv layers. + norm_cfg (dict|None): Config of norm layers. + act_cfg (dict): Config of activation layers. + """ + + def __init__(self, + feats_channels, + transform_channels, + concat_input=False, + align_corners=False, + conv_cfg=None, + norm_cfg=None, + act_cfg=None): + super().__init__() + self.align_corners = align_corners + self.global_avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.correlate_net = SelfAttentionBlock( + key_in_channels=feats_channels * 2, + query_in_channels=feats_channels, + channels=transform_channels, + out_channels=feats_channels, + share_key_query=False, + query_downsample=None, + key_downsample=None, + key_query_num_convs=2, + value_out_num_convs=1, + key_query_norm=True, + value_out_norm=True, + matmul_norm=True, + with_out=True, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + ) + if concat_input: + self.bottleneck = ConvModule( + feats_channels * 2, + feats_channels, + kernel_size=3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + ) + + '''forward''' + + def forward(self, x): + x_global = self.global_avgpool(x) + x_global = resize( + x_global, + size=x.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + feats_il = self.correlate_net(x, torch.cat([x_global, x], dim=1)) + if hasattr(self, 'bottleneck'): + feats_il = self.bottleneck(torch.cat([x, feats_il], dim=1)) + return feats_il + + +class SemanticLevelContext(nn.Module): + """ Semantic-Level Context Module + Args: + feats_channels (int): Input channels of query/key feature. + transform_channels (int): Output channels of key/query transform. + concat_input (bool): whether to concat input feature. + conv_cfg (dict|None): Config of conv layers. + norm_cfg (dict|None): Config of norm layers. + act_cfg (dict): Config of activation layers. 
+ """ + + def __init__(self, + feats_channels, + transform_channels, + concat_input=False, + conv_cfg=None, + norm_cfg=None, + act_cfg=None): + super().__init__() + self.correlate_net = SelfAttentionBlock( + key_in_channels=feats_channels, + query_in_channels=feats_channels, + channels=transform_channels, + out_channels=feats_channels, + share_key_query=False, + query_downsample=None, + key_downsample=None, + key_query_num_convs=2, + value_out_num_convs=1, + key_query_norm=True, + value_out_norm=True, + matmul_norm=True, + with_out=True, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + ) + if concat_input: + self.bottleneck = ConvModule( + feats_channels * 2, + feats_channels, + kernel_size=3, + stride=1, + padding=1, + conv_cfg=conv_cfg, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + ) + + '''forward''' + + def forward(self, x, preds, feats_il): + inputs = x + batch_size, num_channels, h, w = x.size() + num_classes = preds.size(1) + feats_sl = torch.zeros(batch_size, h * w, num_channels).type_as(x) + for batch_idx in range(batch_size): + # (C, H, W), (num_classes, H, W) --> (H*W, C), (H*W, num_classes) + feats_iter, preds_iter = x[batch_idx], preds[batch_idx] + feats_iter, preds_iter = feats_iter.reshape( + num_channels, -1), preds_iter.reshape(num_classes, -1) + feats_iter, preds_iter = feats_iter.permute(1, + 0), preds_iter.permute( + 1, 0) + # (H*W, ) + argmax = preds_iter.argmax(1) + for clsid in range(num_classes): + mask = (argmax == clsid) + if mask.sum() == 0: + continue + feats_iter_cls = feats_iter[mask] + preds_iter_cls = preds_iter[:, clsid][mask] + weight = torch.softmax(preds_iter_cls, dim=0) + feats_iter_cls = feats_iter_cls * weight.unsqueeze(-1) + feats_iter_cls = feats_iter_cls.sum(0) + feats_sl[batch_idx][mask] = feats_iter_cls + feats_sl = feats_sl.reshape(batch_size, h, w, num_channels) + feats_sl = feats_sl.permute(0, 3, 1, 2).contiguous() + feats_sl = self.correlate_net(inputs, feats_sl) + if hasattr(self, 'bottleneck'): + feats_sl = self.bottleneck(torch.cat([feats_il, feats_sl], dim=1)) + return feats_sl + + +@MODELS.register_module() +class ISNetHead(BaseDecodeHead): + """ISNet: Integrate Image-Level and Semantic-Level + Context for Semantic Segmentation + + This head is the implementation of `ISNet` + `_. + + Args: + transform_channels (int): Output channels of key/query transform. + concat_input (bool): whether to concat input feature. + with_shortcut (bool): whether to use shortcut connection. + shortcut_in_channels (int): Input channels of shortcut. + shortcut_feat_channels (int): Output channels of shortcut. + dropout_ratio (float): Ratio of dropout. 
+ """ + + def __init__(self, transform_channels, concat_input, with_shortcut, + shortcut_in_channels, shortcut_feat_channels, dropout_ratio, + **kwargs): + super().__init__(**kwargs) + + self.in_channels = self.in_channels[-1] + + self.bottleneck = ConvModule( + self.in_channels, + self.channels, + kernel_size=3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + self.ilc_net = ImageLevelContext( + feats_channels=self.channels, + transform_channels=transform_channels, + concat_input=concat_input, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg, + align_corners=self.align_corners) + self.slc_net = SemanticLevelContext( + feats_channels=self.channels, + transform_channels=transform_channels, + concat_input=concat_input, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + + self.decoder_stage1 = nn.Sequential( + ConvModule( + self.channels, + self.channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + nn.Dropout2d(dropout_ratio), + nn.Conv2d( + self.channels, + self.num_classes, + kernel_size=1, + stride=1, + padding=0, + bias=True), + ) + + if with_shortcut: + self.shortcut = ConvModule( + shortcut_in_channels, + shortcut_feat_channels, + kernel_size=1, + stride=1, + padding=0, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg) + self.decoder_stage2 = nn.Sequential( + ConvModule( + self.channels + shortcut_feat_channels, + self.channels, + kernel_size=3, + stride=1, + padding=1, + conv_cfg=self.conv_cfg, + norm_cfg=self.norm_cfg, + act_cfg=self.act_cfg), + nn.Dropout2d(dropout_ratio), + nn.Conv2d( + self.channels, + self.num_classes, + kernel_size=1, + stride=1, + padding=0, + bias=True), + ) + else: + self.decoder_stage2 = nn.Sequential( + nn.Dropout2d(dropout_ratio), + nn.Conv2d( + self.channels, + self.num_classes, + kernel_size=1, + stride=1, + padding=0, + bias=True), + ) + + self.conv_seg = None + self.dropout = None + + def forward(self, inputs): + x = self._transform_inputs(inputs) + feats = self.bottleneck(x[-1]) + + feats_il = self.ilc_net(feats) + + preds_stage1 = self.decoder_stage1(feats) + preds_stage1 = resize( + preds_stage1, + size=feats.size()[2:], + mode='bilinear', + align_corners=self.align_corners) + + feats_sl = self.slc_net(feats, preds_stage1, feats_il) + + if hasattr(self, 'shortcut'): + shortcut_out = self.shortcut(x[0]) + feats_sl = resize( + feats_sl, + size=shortcut_out.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + feats_sl = torch.cat([feats_sl, shortcut_out], dim=1) + preds_stage2 = self.decoder_stage2(feats_sl) + + return preds_stage1, preds_stage2 + + def loss_by_feat(self, seg_logits: Tensor, + batch_data_samples: SampleList) -> dict: + seg_label = self._stack_batch_gt(batch_data_samples) + loss = dict() + + if self.sampler is not None: + seg_weight = self.sampler.sample(seg_logits[-1], seg_label) + else: + seg_weight = None + seg_label = seg_label.squeeze(1) + + for seg_logit, loss_decode in zip(seg_logits, self.loss_decode): + seg_logit = resize( + input=seg_logit, + size=seg_label.shape[2:], + mode='bilinear', + align_corners=self.align_corners) + loss[loss_decode.name] = loss_decode( + seg_logit, + seg_label, + seg_weight, + ignore_index=self.ignore_index) + + loss['acc_seg'] = accuracy( + seg_logits[-1], seg_label, ignore_index=self.ignore_index) + return loss + + def predict_by_feat(self, seg_logits: Tensor, + batch_img_metas: List[dict]) -> Tensor: + _, seg_logits_stage2 = 
seg_logits
+        return super().predict_by_feat(seg_logits_stage2, batch_img_metas)
diff --git a/projects/mapillary_dataset/README.md b/projects/mapillary_dataset/README.md
new file mode 100644
index 0000000000..44a1e33ef9
--- /dev/null
+++ b/projects/mapillary_dataset/README.md
@@ -0,0 +1,86 @@
+# Mapillary Vistas Dataset
+
+Support for the **`Mapillary Vistas Dataset`**
+
+## Description
+
+Author: AI-Tianlong
+
+This project implements support for the **`Mapillary Vistas Dataset`**.
+
+### Dataset preparing
+
+Prepare the `Mapillary Vistas Dataset` following the [Mapillary Vistas Dataset Preparing Guide](https://github.com/open-mmlab/mmsegmentation/tree/main/projects/mapillary_dataset/docs/en/user_guides/2_dataset_prepare.md).
+
+```none
+mmsegmentation
+├── mmseg
+├── tools
+├── configs
+├── data
+│   ├── mapillary
+│   │   ├── training
+│   │   │   ├── images
+│   │   │   ├── v1.2
+│   │   │   │   ├── instances
+│   │   │   │   ├── labels
+│   │   │   │   ├── labels_mask
+│   │   │   │   └── panoptic
+│   │   │   ├── v2.0
+│   │   │   │   ├── instances
+│   │   │   │   ├── labels
+│   │   │   │   ├── labels_mask
+│   │   │   │   ├── panoptic
+│   │   │   │   └── polygons
+│   │   ├── validation
+│   │   │   ├── images
+│   │   │   ├── v1.2
+│   │   │   │   ├── instances
+│   │   │   │   ├── labels
+│   │   │   │   ├── labels_mask
+│   │   │   │   └── panoptic
+│   │   │   ├── v2.0
+│   │   │   │   ├── instances
+│   │   │   │   ├── labels
+│   │   │   │   ├── labels_mask
+│   │   │   │   ├── panoptic
+│   │   │   │   └── polygons
+```
+
+### Training commands
+
+```bash
+# Dataset training command
+# run at the `mmsegmentation` root folder
+bash tools/dist_train.sh projects/mapillary_dataset/configs/deeplabv3plus_r101-d8_4xb2-240k_mapillay_v1-512x1024.py 4
+```
+
+## Checklist
+
+- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`.
+
+  - [x] Finish the code
+
+  - [x] Basic docstrings & proper citation
+
+  - [ ] Test-time correctness
+
+  - [x] A full README
+
+- [x] Milestone 2: Indicates a successful model implementation.
+
+  - [x] Training-time correctness
+
+- [x] Milestone 3: Good to be a part of our core package!
+
+  - [x] Type hints and docstrings
+
+  - [x] Unit tests
+
+  - [x] Code polishing
+
+  - [x] Metafile.yml
+
+- [x] Move your modules into the core package following the codebase's file hierarchy structure.
+
+- [x] Refactor your modules into the core package following the codebase's file hierarchy structure.
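+
+## Quick sanity check
+
+Before launching a full training run, it may help to check that the dataset loads correctly. The snippet below is a minimal sketch (not part of the project files); it assumes the `data/mapillary` layout above and an mmseg 1.x environment, and the import path follows the `custom_imports` used in the configs:
+
+```python
+from mmengine.registry import init_default_scope
+
+from projects.mapillary_dataset.mmseg.datasets.mapillary import \
+    MapillaryDataset_v1
+
+# let transform types such as 'LoadImageFromFile' resolve in the mmseg scope
+init_default_scope('mmseg')
+
+# build the dataset directly, without a full config
+dataset = MapillaryDataset_v1(
+    data_root='data/mapillary/',
+    data_prefix=dict(
+        img_path='training/images', seg_map_path='training/v1.2/labels'),
+    pipeline=[dict(type='LoadImageFromFile'), dict(type='LoadAnnotations')])
+
+sample = dataset[0]
+print(len(dataset), sample['img'].shape, sample['gt_seg_map'].shape)
+```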
diff --git a/projects/mapillary_dataset/configs/_base_/datasets/mapillary_v1.py b/projects/mapillary_dataset/configs/_base_/datasets/mapillary_v1.py new file mode 100644 index 0000000000..611aa4741b --- /dev/null +++ b/projects/mapillary_dataset/configs/_base_/datasets/mapillary_v1.py @@ -0,0 +1,68 @@ +# dataset settings +dataset_type = 'MapillaryDataset_v1' +data_root = 'data/mapillary/' +crop_size = (512, 1024) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict( + type='RandomResize', + scale=(2048, 1024), + ratio_range=(0.5, 2.0), + keep_ratio=True), + dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(2048, 1024), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] +tta_pipeline = [ + dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='Resize', scale_factor=r, keep_ratio=True) + for r in img_ratios + ], + [ + dict(type='RandomFlip', prob=0., direction='horizontal'), + dict(type='RandomFlip', prob=1., direction='horizontal') + ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] + ]) +] +train_dataloader = dict( + batch_size=2, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='training/images', seg_map_path='training/v1.2/labels'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='validation/images', + seg_map_path='validation/v1.2/labels'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) +test_evaluator = val_evaluator diff --git a/projects/mapillary_dataset/configs/_base_/datasets/mapillary_v1_65.py b/projects/mapillary_dataset/configs/_base_/datasets/mapillary_v1_65.py new file mode 100644 index 0000000000..f594f37333 --- /dev/null +++ b/projects/mapillary_dataset/configs/_base_/datasets/mapillary_v1_65.py @@ -0,0 +1,37 @@ +# dataset settings +_base_ = './mapillary_v1.py' +metainfo = dict( + classes=('Bird', 'Ground Animal', 'Curb', 'Fence', 'Guard Rail', 'Barrier', + 'Wall', 'Bike Lane', 'Crosswalk - Plain', 'Curb Cut', 'Parking', + 'Pedestrian Area', 'Rail Track', 'Road', 'Service Lane', + 'Sidewalk', 'Bridge', 'Building', 'Tunnel', 'Person', 'Bicyclist', + 'Motorcyclist', 'Other Rider', 'Lane Marking - Crosswalk', + 'Lane Marking - General', 'Mountain', 'Sand', 'Sky', 'Snow', + 'Terrain', 'Vegetation', 'Water', 'Banner', 'Bench', 'Bike Rack', + 'Billboard', 'Catch Basin', 'CCTV Camera', 'Fire Hydrant', + 'Junction Box', 'Mailbox', 'Manhole', 'Phone Booth', 'Pothole', + 'Street Light', 'Pole', 'Traffic Sign Frame', 'Utility Pole', + 'Traffic Light', 'Traffic Sign (Back)', 'Traffic Sign (Front)', + 'Trash Can', 'Bicycle', 'Boat', 'Bus', 'Car', 'Caravan', + 'Motorcycle', 'On Rails', 'Other Vehicle', 'Trailer', 'Truck', + 'Wheeled Slow', 'Car Mount', 
'Ego Vehicle'), + palette=[[165, 42, 42], [0, 192, 0], [196, 196, 196], [190, 153, 153], + [180, 165, 180], [90, 120, 150], [102, 102, 156], [128, 64, 255], + [140, 140, 200], [170, 170, 170], [250, 170, 160], [96, 96, 96], + [230, 150, 140], [128, 64, 128], [110, 110, 110], [244, 35, 232], + [150, 100, 100], [70, 70, 70], [150, 120, 90], [220, 20, 60], + [255, 0, 0], [255, 0, 100], [255, 0, 200], [200, 128, 128], + [255, 255, 255], [64, 170, 64], [230, 160, 50], [70, 130, 180], + [190, 255, 255], [152, 251, 152], [107, 142, 35], [0, 170, 30], + [255, 255, 128], [250, 0, 30], [100, 140, 180], [220, 220, 220], + [220, 128, 128], [222, 40, 40], [100, 170, 30], [40, 40, 40], + [33, 33, 33], [100, 128, 160], [142, 0, 0], [70, 100, 150], + [210, 170, 100], [153, 153, 153], [128, 128, 128], [0, 0, 80], + [250, 170, 30], [192, 192, 192], [220, 220, 0], [140, 140, 20], + [119, 11, 32], [150, 0, 255], [0, 60, 100], [0, 0, 142], + [0, 0, 90], [0, 0, 230], [0, 80, 100], [128, 64, 64], [0, 0, 110], + [0, 0, 70], [0, 0, 192], [32, 32, 32], [120, 10, 10]]) + +train_dataloader = dict(dataset=dict(metainfo=metainfo)) +val_dataloader = dict(dataset=dict(metainfo=metainfo)) +test_dataloader = val_dataloader diff --git a/projects/mapillary_dataset/configs/_base_/datasets/mapillary_v2.py b/projects/mapillary_dataset/configs/_base_/datasets/mapillary_v2.py new file mode 100644 index 0000000000..7cb7a958e5 --- /dev/null +++ b/projects/mapillary_dataset/configs/_base_/datasets/mapillary_v2.py @@ -0,0 +1,68 @@ +# dataset settings +dataset_type = 'MapillaryDataset_v2' +data_root = 'data/mapillary/' +crop_size = (512, 1024) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict( + type='RandomResize', + scale=(2048, 1024), + ratio_range=(0.5, 2.0), + keep_ratio=True), + dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(2048, 1024), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] +tta_pipeline = [ + dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='Resize', scale_factor=r, keep_ratio=True) + for r in img_ratios + ], + [ + dict(type='RandomFlip', prob=0., direction='horizontal'), + dict(type='RandomFlip', prob=1., direction='horizontal') + ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] + ]) +] +train_dataloader = dict( + batch_size=2, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='training/images', seg_map_path='training/v2.0/labels'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='validation/images', + seg_map_path='validation/v2.0/labels'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) +test_evaluator = val_evaluator diff --git 
a/projects/mapillary_dataset/configs/deeplabv3plus_r101-d8_4xb2-240k_mapillay_v1-512x1024.py b/projects/mapillary_dataset/configs/deeplabv3plus_r101-d8_4xb2-240k_mapillay_v1-512x1024.py new file mode 100644 index 0000000000..b559e0d6aa --- /dev/null +++ b/projects/mapillary_dataset/configs/deeplabv3plus_r101-d8_4xb2-240k_mapillay_v1-512x1024.py @@ -0,0 +1,17 @@ +_base_ = [ + '../../../configs/_base_/models/deeplabv3plus_r50-d8.py', + './_base_/datasets/mapillary_v1.py', + '../../../configs/_base_/default_runtime.py', + '../../../configs/_base_/schedules/schedule_240k.py' +] +custom_imports = dict( + imports=['projects.mapillary_dataset.mmseg.datasets.mapillary']) + +crop_size = (512, 1024) +data_preprocessor = dict(size=crop_size) +model = dict( + data_preprocessor=data_preprocessor, + pretrained='open-mmlab://resnet101_v1c', + backbone=dict(depth=101), + decode_head=dict(num_classes=66), + auxiliary_head=dict(num_classes=66)) diff --git a/projects/mapillary_dataset/configs/deeplabv3plus_r101-d8_4xb2-240k_mapillay_v2-512x1024.py b/projects/mapillary_dataset/configs/deeplabv3plus_r101-d8_4xb2-240k_mapillay_v2-512x1024.py new file mode 100644 index 0000000000..cfe31a2c12 --- /dev/null +++ b/projects/mapillary_dataset/configs/deeplabv3plus_r101-d8_4xb2-240k_mapillay_v2-512x1024.py @@ -0,0 +1,16 @@ +_base_ = [ + '../../../configs/_base_/models/deeplabv3plus_r50-d8.py', + './_base_/datasets/mapillary_v2.py', + '../../../configs/_base_/default_runtime.py', + '../../../configs/_base_/schedules/schedule_240k.py' +] +custom_imports = dict( + imports=['projects.mapillary_dataset.mmseg.datasets.mapillary']) +crop_size = (512, 1024) +data_preprocessor = dict(size=crop_size) +model = dict( + data_preprocessor=data_preprocessor, + pretrained='open-mmlab://resnet101_v1c', + backbone=dict(depth=101), + decode_head=dict(num_classes=124), + auxiliary_head=dict(num_classes=124)) diff --git a/projects/mapillary_dataset/configs/pspnet_r101-d8_4xb2-240k_mapillay_v1-512x1024.py b/projects/mapillary_dataset/configs/pspnet_r101-d8_4xb2-240k_mapillay_v1-512x1024.py new file mode 100644 index 0000000000..1ca2b57f73 --- /dev/null +++ b/projects/mapillary_dataset/configs/pspnet_r101-d8_4xb2-240k_mapillay_v1-512x1024.py @@ -0,0 +1,16 @@ +_base_ = [ + '../../../configs/_base_/models/pspnet_r50-d8.py', + './_base_/datasets/mapillary_v1.py', + '../../../configs/_base_/default_runtime.py', + '../../../configs/_base_/schedules/schedule_240k.py' +] +custom_imports = dict( + imports=['projects.mapillary_dataset.mmseg.datasets.mapillary']) +crop_size = (512, 1024) +data_preprocessor = dict(size=crop_size) +model = dict( + data_preprocessor=data_preprocessor, + pretrained='open-mmlab://resnet101_v1c', + backbone=dict(depth=101), + decode_head=dict(num_classes=66), + auxiliary_head=dict(num_classes=66)) diff --git a/projects/mapillary_dataset/configs/pspnet_r101-d8_4xb2-240k_mapillay_v2-512x1024.py b/projects/mapillary_dataset/configs/pspnet_r101-d8_4xb2-240k_mapillay_v2-512x1024.py new file mode 100644 index 0000000000..c04746a3dc --- /dev/null +++ b/projects/mapillary_dataset/configs/pspnet_r101-d8_4xb2-240k_mapillay_v2-512x1024.py @@ -0,0 +1,16 @@ +_base_ = [ + '../../../configs/_base_/models/pspnet_r50-d8.py', + './_base_/datasets/mapillary_v2.py', + '../../../configs/_base_/default_runtime.py', + '../../../configs/_base_/schedules/schedule_240k.py' +] +custom_imports = dict( + imports=['projects.mapillary_dataset.mmseg.datasets.mapillary']) +crop_size = (512, 1024) +data_preprocessor = dict(size=crop_size) +model = dict( + 
data_preprocessor=data_preprocessor,
+    pretrained='open-mmlab://resnet101_v1c',
+    backbone=dict(depth=101),
+    decode_head=dict(num_classes=124),
+    auxiliary_head=dict(num_classes=124))
diff --git a/projects/mapillary_dataset/docs/en/user_guides/2_dataset_prepare.md b/projects/mapillary_dataset/docs/en/user_guides/2_dataset_prepare.md
new file mode 100644
index 0000000000..c5cbc0f9b8
--- /dev/null
+++ b/projects/mapillary_dataset/docs/en/user_guides/2_dataset_prepare.md
@@ -0,0 +1,255 @@
+## Mapillary Vistas Datasets
+
+- The dataset can be downloaded [here](https://www.mapillary.com/dataset/vistas) after registration.
+
+- The Mapillary Vistas Dataset uses 8-bit labels with a color palette, so no conversion operation is required.
+
+- Assuming you have put the dataset zip file in `mmsegmentation/data/mapillary`.
+
+- Please run the following commands to unzip the dataset.
+
+  ```bash
+  cd data/mapillary
+  unzip An-ZjB1Zm61yAZG0ozTymz8I8NqI4x0MrYrh26dq7kPgfu8vf9ImrdaOAVOFYbJ2pNAgUnVGBmbue9lTgdBOb5BbKXIpFs0fpYWqACbrQDChAA2fdX0zS9PcHu7fY8c-FOvyBVxPNYNFQuM.zip
+  ```
+
+- After unzipping, you will get the Mapillary Vistas Dataset in the following structure. The semantic segmentation mask labels are in the `labels` folders.
+
+  ```none
+  mmsegmentation
+  ├── mmseg
+  ├── tools
+  ├── configs
+  ├── data
+  │   ├── mapillary
+  │   │   ├── training
+  │   │   │   ├── images
+  │   │   │   ├── v1.2
+  │   │   │   │   ├── instances
+  │   │   │   │   ├── labels
+  │   │   │   │   └── panoptic
+  │   │   │   ├── v2.0
+  │   │   │   │   ├── instances
+  │   │   │   │   ├── labels
+  │   │   │   │   ├── panoptic
+  │   │   │   │   └── polygons
+  │   │   ├── validation
+  │   │   │   ├── images
+  │   │   │   ├── v1.2
+  │   │   │   │   ├── instances
+  │   │   │   │   ├── labels
+  │   │   │   │   └── panoptic
+  │   │   │   ├── v2.0
+  │   │   │   │   ├── instances
+  │   │   │   │   ├── labels
+  │   │   │   │   ├── panoptic
+  │   │   │   │   └── polygons
+  ```
+
+- You can set the dataset version with `MapillaryDataset_v1` or `MapillaryDataset_v2` in your configs.
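+
+  For example, a minimal config fragment might look like this (a sketch, assuming the directory layout above; switch the dataset type and the version folder together):
+
+  ```python
+  dataset_type = 'MapillaryDataset_v1'  # or 'MapillaryDataset_v2'
+  train_dataloader = dict(
+      dataset=dict(
+          type=dataset_type,
+          data_root='data/mapillary/',
+          data_prefix=dict(
+              img_path='training/images',
+              # use 'training/v2.0/labels' for MapillaryDataset_v2
+              seg_map_path='training/v1.2/labels')))
+  ```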
+ View the Mapillary Vistas Datasets config file here [V1.2](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/_base_/datasets/mapillary_v1.py) and [V2.0](https://github.com/open-mmlab/mmsegmentation/blob/main/configs/_base_/datasets/mapillary_v2.py) + +- **View datasets labels index and palette** + +- **Mapillary Vistas Datasets labels information** + **v1.2 information** + + ```none + There are 66 labels classes in v1.2 + 0--Bird--[165, 42, 42], + 1--Ground Animal--[0, 192, 0], + 2--Curb--[196, 196, 196], + 3--Fence--[190, 153, 153], + 4--Guard Rail--[180, 165, 180], + 5--Barrier--[90, 120, 150], + 6--Wall--[102, 102, 156], + 7--Bike Lane--[128, 64, 255], + 8--Crosswalk - Plain--[140, 140, 200], + 9--Curb Cut--[170, 170, 170], + 10--Parking--[250, 170, 160], + 11--Pedestrian Area--[96, 96, 96], + 12--Rail Track--[230, 150, 140], + 13--Road--[128, 64, 128], + 14--Service Lane--[110, 110, 110], + 15--Sidewalk--[244, 35, 232], + 16--Bridge--[150, 100, 100], + 17--Building--[70, 70, 70], + 18--Tunnel--[150, 120, 90], + 19--Person--[220, 20, 60], + 20--Bicyclist--[255, 0, 0], + 21--Motorcyclist--[255, 0, 100], + 22--Other Rider--[255, 0, 200], + 23--Lane Marking - Crosswalk--[200, 128, 128], + 24--Lane Marking - General--[255, 255, 255], + 25--Mountain--[64, 170, 64], + 26--Sand--[230, 160, 50], + 27--Sky--[70, 130, 180], + 28--Snow--[190, 255, 255], + 29--Terrain--[152, 251, 152], + 30--Vegetation--[107, 142, 35], + 31--Water--[0, 170, 30], + 32--Banner--[255, 255, 128], + 33--Bench--[250, 0, 30], + 34--Bike Rack--[100, 140, 180], + 35--Billboard--[220, 220, 220], + 36--Catch Basin--[220, 128, 128], + 37--CCTV Camera--[222, 40, 40], + 38--Fire Hydrant--[100, 170, 30], + 39--Junction Box--[40, 40, 40], + 40--Mailbox--[33, 33, 33], + 41--Manhole--[100, 128, 160], + 42--Phone Booth--[142, 0, 0], + 43--Pothole--[70, 100, 150], + 44--Street Light--[210, 170, 100], + 45--Pole--[153, 153, 153], + 46--Traffic Sign Frame--[128, 128, 128], + 47--Utility Pole--[0, 0, 80], + 48--Traffic Light--[250, 170, 30], + 49--Traffic Sign (Back)--[192, 192, 192], + 50--Traffic Sign (Front)--[220, 220, 0], + 51--Trash Can--[140, 140, 20], + 52--Bicycle--[119, 11, 32], + 53--Boat--[150, 0, 255], + 54--Bus--[0, 60, 100], + 55--Car--[0, 0, 142], + 56--Caravan--[0, 0, 90], + 57--Motorcycle--[0, 0, 230], + 58--On Rails--[0, 80, 100], + 59--Other Vehicle--[128, 64, 64], + 60--Trailer--[0, 0, 110], + 61--Truck--[0, 0, 70], + 62--Wheeled Slow--[0, 0, 192], + 63--Car Mount--[32, 32, 32], + 64--Ego Vehicle--[120, 10, 10], + 65--Unlabeled--[0, 0, 0] + ``` + + **v2.0 information** + + ```none + There are 124 labels classes in v2.0 + 0--Bird--[165, 42, 42], + 1--Ground Animal--[0, 192, 0], + 2--Ambiguous Barrier--[250, 170, 31], + 3--Concrete Block--[250, 170, 32], + 4--Curb--[196, 196, 196], + 5--Fence--[190, 153, 153], + 6--Guard Rail--[180, 165, 180], + 7--Barrier--[90, 120, 150], + 8--Road Median--[250, 170, 33], + 9--Road Side--[250, 170, 34], + 10--Lane Separator--[128, 128, 128], + 11--Temporary Barrier--[250, 170, 35], + 12--Wall--[102, 102, 156], + 13--Bike Lane--[128, 64, 255], + 14--Crosswalk - Plain--[140, 140, 200], + 15--Curb Cut--[170, 170, 170], + 16--Driveway--[250, 170, 36], + 17--Parking--[250, 170, 160], + 18--Parking Aisle--[250, 170, 37], + 19--Pedestrian Area--[96, 96, 96], + 20--Rail Track--[230, 150, 140], + 21--Road--[128, 64, 128], + 22--Road Shoulder--[110, 110, 110], + 23--Service Lane--[110, 110, 110], + 24--Sidewalk--[244, 35, 232], + 25--Traffic Island--[128, 196, 128], + 
26--Bridge--[150, 100, 100], + 27--Building--[70, 70, 70], + 28--Garage--[150, 150, 150], + 29--Tunnel--[150, 120, 90], + 30--Person--[220, 20, 60], + 31--Person Group--[220, 20, 60], + 32--Bicyclist--[255, 0, 0], + 33--Motorcyclist--[255, 0, 100], + 34--Other Rider--[255, 0, 200], + 35--Lane Marking - Dashed Line--[255, 255, 255], + 36--Lane Marking - Straight Line--[255, 255, 255], + 37--Lane Marking - Zigzag Line--[250, 170, 29], + 38--Lane Marking - Ambiguous--[250, 170, 28], + 39--Lane Marking - Arrow (Left)--[250, 170, 26], + 40--Lane Marking - Arrow (Other)--[250, 170, 25], + 41--Lane Marking - Arrow (Right)--[250, 170, 24], + 42--Lane Marking - Arrow (Split Left or Straight)--[250, 170, 22], + 43--Lane Marking - Arrow (Split Right or Straight)--[250, 170, 21], + 44--Lane Marking - Arrow (Straight)--[250, 170, 20], + 45--Lane Marking - Crosswalk--[255, 255, 255], + 46--Lane Marking - Give Way (Row)--[250, 170, 19], + 47--Lane Marking - Give Way (Single)--[250, 170, 18], + 48--Lane Marking - Hatched (Chevron)--[250, 170, 12], + 49--Lane Marking - Hatched (Diagonal)--[250, 170, 11], + 50--Lane Marking - Other--[255, 255, 255], + 51--Lane Marking - Stop Line--[255, 255, 255], + 52--Lane Marking - Symbol (Bicycle)--[250, 170, 16], + 53--Lane Marking - Symbol (Other)--[250, 170, 15], + 54--Lane Marking - Text--[250, 170, 15], + 55--Lane Marking (only) - Dashed Line--[255, 255, 255], + 56--Lane Marking (only) - Crosswalk--[255, 255, 255], + 57--Lane Marking (only) - Other--[255, 255, 255], + 58--Lane Marking (only) - Test--[255, 255, 255], + 59--Mountain--[64, 170, 64], + 60--Sand--[230, 160, 50], + 61--Sky--[70, 130, 180], + 62--Snow--[190, 255, 255], + 63--Terrain--[152, 251, 152], + 64--Vegetation--[107, 142, 35], + 65--Water--[0, 170, 30], + 66--Banner--[255, 255, 128], + 67--Bench--[250, 0, 30], + 68--Bike Rack--[100, 140, 180], + 69--Catch Basin--[220, 128, 128], + 70--CCTV Camera--[222, 40, 40], + 71--Fire Hydrant--[100, 170, 30], + 72--Junction Box--[40, 40, 40], + 73--Mailbox--[33, 33, 33], + 74--Manhole--[100, 128, 160], + 75--Parking Meter--[20, 20, 255], + 76--Phone Booth--[142, 0, 0], + 77--Pothole--[70, 100, 150], + 78--Signage - Advertisement--[250, 171, 30], + 79--Signage - Ambiguous--[250, 172, 30], + 80--Signage - Back--[250, 173, 30], + 81--Signage - Information--[250, 174, 30], + 82--Signage - Other--[250, 175, 30], + 83--Signage - Store--[250, 176, 30], + 84--Street Light--[210, 170, 100], + 85--Pole--[153, 153, 153], + 86--Pole Group--[153, 153, 153], + 87--Traffic Sign Frame--[128, 128, 128], + 88--Utility Pole--[0, 0, 80], + 89--Traffic Cone--[210, 60, 60], + 90--Traffic Light - General (Single)--[250, 170, 30], + 91--Traffic Light - Pedestrians--[250, 170, 30], + 92--Traffic Light - General (Upright)--[250, 170, 30], + 93--Traffic Light - General (Horizontal)--[250, 170, 30], + 94--Traffic Light - Cyclists--[250, 170, 30], + 95--Traffic Light - Other--[250, 170, 30], + 96--Traffic Sign - Ambiguous--[192, 192, 192], + 97--Traffic Sign (Back)--[192, 192, 192], + 98--Traffic Sign - Direction (Back)--[192, 192, 192], + 99--Traffic Sign - Direction (Front)--[220, 220, 0], + 100--Traffic Sign (Front)--[220, 220, 0], + 101--Traffic Sign - Parking--[0, 0, 196], + 102--Traffic Sign - Temporary (Back)--[192, 192, 192], + 103--Traffic Sign - Temporary (Front)--[220, 220, 0], + 104--Trash Can--[140, 140, 20], + 105--Bicycle--[119, 11, 32], + 106--Boat--[150, 0, 255], + 107--Bus--[0, 60, 100], + 108--Car--[0, 0, 142], + 109--Caravan--[0, 0, 90], + 110--Motorcycle--[0, 0, 230], 
+ 111--On Rails--[0, 80, 100], + 112--Other Vehicle--[128, 64, 64], + 113--Trailer--[0, 0, 110], + 114--Truck--[0, 0, 70], + 115--Vehicle Group--[0, 0, 142], + 116--Wheeled Slow--[0, 0, 192], + 117--Water Valve--[170, 170, 170], + 118--Car Mount--[32, 32, 32], + 119--Dynamic--[111, 74, 0], + 120--Ego Vehicle--[120, 10, 10], + 121--Ground--[81, 0, 81], + 122--Static--[111, 111, 0], + 123--Unlabeled--[0, 0, 0] + ``` diff --git a/projects/mapillary_dataset/mmseg/datasets/mapillary.py b/projects/mapillary_dataset/mmseg/datasets/mapillary.py new file mode 100644 index 0000000000..f49bd54451 --- /dev/null +++ b/projects/mapillary_dataset/mmseg/datasets/mapillary.py @@ -0,0 +1,177 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmseg.datasets.basesegdataset import BaseSegDataset + +# from mmseg.registry import DATASETS + + +# @DATASETS.register_module() +class MapillaryDataset_v1(BaseSegDataset): + """Mapillary Vistas Dataset. + + Dataset paper link: + http://ieeexplore.ieee.org/document/8237796/ + + v1.2 contain 66 object classes. + (37 instance-specific) + + v2.0 contain 124 object classes. + (70 instance-specific, 46 stuff, 8 void or crowd). + + The ``img_suffix`` is fixed to '.jpg' and ``seg_map_suffix`` is + fixed to '.png' for Mapillary Vistas Dataset. + """ + METAINFO = dict( + classes=('Bird', 'Ground Animal', 'Curb', 'Fence', 'Guard Rail', + 'Barrier', 'Wall', 'Bike Lane', 'Crosswalk - Plain', + 'Curb Cut', 'Parking', 'Pedestrian Area', 'Rail Track', + 'Road', 'Service Lane', 'Sidewalk', 'Bridge', 'Building', + 'Tunnel', 'Person', 'Bicyclist', 'Motorcyclist', + 'Other Rider', 'Lane Marking - Crosswalk', + 'Lane Marking - General', 'Mountain', 'Sand', 'Sky', 'Snow', + 'Terrain', 'Vegetation', 'Water', 'Banner', 'Bench', + 'Bike Rack', 'Billboard', 'Catch Basin', 'CCTV Camera', + 'Fire Hydrant', 'Junction Box', 'Mailbox', 'Manhole', + 'Phone Booth', 'Pothole', 'Street Light', 'Pole', + 'Traffic Sign Frame', 'Utility Pole', 'Traffic Light', + 'Traffic Sign (Back)', 'Traffic Sign (Front)', 'Trash Can', + 'Bicycle', 'Boat', 'Bus', 'Car', 'Caravan', 'Motorcycle', + 'On Rails', 'Other Vehicle', 'Trailer', 'Truck', + 'Wheeled Slow', 'Car Mount', 'Ego Vehicle', 'Unlabeled'), + palette=[[165, 42, 42], [0, 192, 0], [196, 196, 196], [190, 153, 153], + [180, 165, 180], [90, 120, 150], [102, 102, 156], + [128, 64, 255], [140, 140, 200], [170, 170, 170], + [250, 170, 160], [96, 96, 96], + [230, 150, 140], [128, 64, 128], [110, 110, 110], + [244, 35, 232], [150, 100, 100], [70, 70, 70], [150, 120, 90], + [220, 20, 60], [255, 0, 0], [255, 0, 100], [255, 0, 200], + [200, 128, 128], [255, 255, 255], [64, 170, + 64], [230, 160, 50], + [70, 130, 180], [190, 255, 255], [152, 251, 152], + [107, 142, 35], [0, 170, 30], [255, 255, 128], [250, 0, 30], + [100, 140, 180], [220, 220, 220], [220, 128, 128], + [222, 40, 40], [100, 170, 30], [40, 40, 40], [33, 33, 33], + [100, 128, 160], [142, 0, 0], [70, 100, 150], [210, 170, 100], + [153, 153, 153], [128, 128, 128], [0, 0, 80], [250, 170, 30], + [192, 192, 192], [220, 220, 0], [140, 140, 20], [119, 11, 32], + [150, 0, 255], [0, 60, 100], [0, 0, 142], [0, 0, 90], + [0, 0, 230], [0, 80, 100], [128, 64, 64], [0, 0, 110], + [0, 0, 70], [0, 0, 192], [32, 32, 32], [120, 10, + 10], [0, 0, 0]]) + + def __init__(self, + img_suffix='.jpg', + seg_map_suffix='.png', + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, seg_map_suffix=seg_map_suffix, **kwargs) + + +# @DATASETS.register_module() +class MapillaryDataset_v2(BaseSegDataset): + """Mapillary 
Vistas Dataset. + + Dataset paper link: + http://ieeexplore.ieee.org/document/8237796/ + + v1.2 contain 66 object classes. + (37 instance-specific) + + v2.0 contain 124 object classes. + (70 instance-specific, 46 stuff, 8 void or crowd). + + The ``img_suffix`` is fixed to '.jpg' and ``seg_map_suffix`` is + fixed to '.png' for Mapillary Vistas Dataset. + """ + METAINFO = dict( + classes=( + 'Bird', 'Ground Animal', 'Ambiguous Barrier', 'Concrete Block', + 'Curb', 'Fence', 'Guard Rail', 'Barrier', 'Road Median', + 'Road Side', 'Lane Separator', 'Temporary Barrier', 'Wall', + 'Bike Lane', 'Crosswalk - Plain', 'Curb Cut', 'Driveway', + 'Parking', 'Parking Aisle', 'Pedestrian Area', 'Rail Track', + 'Road', 'Road Shoulder', 'Service Lane', 'Sidewalk', + 'Traffic Island', 'Bridge', 'Building', 'Garage', 'Tunnel', + 'Person', 'Person Group', 'Bicyclist', 'Motorcyclist', + 'Other Rider', 'Lane Marking - Dashed Line', + 'Lane Marking - Straight Line', 'Lane Marking - Zigzag Line', + 'Lane Marking - Ambiguous', 'Lane Marking - Arrow (Left)', + 'Lane Marking - Arrow (Other)', 'Lane Marking - Arrow (Right)', + 'Lane Marking - Arrow (Split Left or Straight)', + 'Lane Marking - Arrow (Split Right or Straight)', + 'Lane Marking - Arrow (Straight)', 'Lane Marking - Crosswalk', + 'Lane Marking - Give Way (Row)', + 'Lane Marking - Give Way (Single)', + 'Lane Marking - Hatched (Chevron)', + 'Lane Marking - Hatched (Diagonal)', 'Lane Marking - Other', + 'Lane Marking - Stop Line', 'Lane Marking - Symbol (Bicycle)', + 'Lane Marking - Symbol (Other)', 'Lane Marking - Text', + 'Lane Marking (only) - Dashed Line', + 'Lane Marking (only) - Crosswalk', 'Lane Marking (only) - Other', + 'Lane Marking (only) - Test', 'Mountain', 'Sand', 'Sky', 'Snow', + 'Terrain', 'Vegetation', 'Water', 'Banner', 'Bench', 'Bike Rack', + 'Catch Basin', 'CCTV Camera', 'Fire Hydrant', 'Junction Box', + 'Mailbox', 'Manhole', 'Parking Meter', 'Phone Booth', 'Pothole', + 'Signage - Advertisement', 'Signage - Ambiguous', 'Signage - Back', + 'Signage - Information', 'Signage - Other', 'Signage - Store', + 'Street Light', 'Pole', 'Pole Group', 'Traffic Sign Frame', + 'Utility Pole', 'Traffic Cone', 'Traffic Light - General (Single)', + 'Traffic Light - Pedestrians', 'Traffic Light - General (Upright)', + 'Traffic Light - General (Horizontal)', 'Traffic Light - Cyclists', + 'Traffic Light - Other', 'Traffic Sign - Ambiguous', + 'Traffic Sign (Back)', 'Traffic Sign - Direction (Back)', + 'Traffic Sign - Direction (Front)', 'Traffic Sign (Front)', + 'Traffic Sign - Parking', 'Traffic Sign - Temporary (Back)', + 'Traffic Sign - Temporary (Front)', 'Trash Can', 'Bicycle', 'Boat', + 'Bus', 'Car', 'Caravan', 'Motorcycle', 'On Rails', 'Other Vehicle', + 'Trailer', 'Truck', 'Vehicle Group', 'Wheeled Slow', 'Water Valve', + 'Car Mount', 'Dynamic', 'Ego Vehicle', 'Ground', 'Static', + 'Unlabeled'), + palette=[[165, 42, 42], [0, 192, 0], [250, 170, 31], [250, 170, 32], + [196, 196, 196], [190, 153, 153], [180, 165, 180], + [90, 120, 150], [250, 170, 33], [250, 170, 34], + [128, 128, 128], [250, 170, 35], [102, 102, 156], + [128, 64, 255], [140, 140, 200], [170, 170, 170], + [250, 170, 36], [250, 170, 160], [250, 170, 37], [96, 96, 96], + [230, 150, 140], [128, 64, 128], [110, 110, 110], + [110, 110, 110], [244, 35, 232], [128, 196, + 128], [150, 100, 100], + [70, 70, 70], [150, 150, 150], [150, 120, 90], [220, 20, 60], + [220, 20, 60], [255, 0, 0], [255, 0, 100], [255, 0, 200], + [255, 255, 255], [255, 255, 255], [250, 170, 29], + [250, 170, 28], [250, 
170, 26], [250, 170, 25], [250, 170, 24],
+                 [250, 170, 22], [250, 170, 21], [250, 170, 20],
+                 [255, 255, 255], [250, 170, 19], [250, 170, 18],
+                 [250, 170, 12], [250, 170, 11], [255, 255, 255],
+                 [255, 255, 255], [250, 170, 16], [250, 170, 15],
+                 [250, 170, 15], [255, 255, 255], [255, 255, 255],
+                 [255, 255, 255], [255, 255, 255], [64, 170, 64],
+                 [230, 160, 50], [70, 130, 180], [190, 255, 255],
+                 [152, 251, 152], [107, 142, 35], [0, 170, 30],
+                 [255, 255, 128], [250, 0, 30], [100, 140, 180],
+                 [220, 128, 128], [222, 40, 40], [100, 170, 30],
+                 [40, 40, 40], [33, 33, 33], [100, 128, 160],
+                 [20, 20, 255], [142, 0, 0], [70, 100, 150],
+                 [250, 171, 30], [250, 172, 30], [250, 173, 30],
+                 [250, 174, 30], [250, 175, 30], [250, 176, 30],
+                 [210, 170, 100], [153, 153, 153], [153, 153, 153],
+                 [128, 128, 128], [0, 0, 80], [210, 60, 60],
+                 [250, 170, 30], [250, 170, 30], [250, 170, 30],
+                 [250, 170, 30], [250, 170, 30], [250, 170, 30],
+                 [192, 192, 192], [192, 192, 192], [192, 192, 192],
+                 [220, 220, 0], [220, 220, 0], [0, 0, 196],
+                 [192, 192, 192], [220, 220, 0], [140, 140, 20],
+                 [119, 11, 32], [150, 0, 255], [0, 60, 100],
+                 [0, 0, 142], [0, 0, 90], [0, 0, 230], [0, 80, 100],
+                 [128, 64, 64], [0, 0, 110], [0, 0, 70], [0, 0, 142],
+                 [0, 0, 192], [170, 170, 170], [32, 32, 32],
+                 [111, 74, 0], [120, 10, 10], [81, 0, 81],
+                 [111, 111, 0], [0, 0, 0]])
+
+    def __init__(self,
+                 img_suffix='.jpg',
+                 seg_map_suffix='.png',
+                 **kwargs) -> None:
+        super().__init__(
+            img_suffix=img_suffix, seg_map_suffix=seg_map_suffix, **kwargs)
diff --git a/projects/medical/2d_image/ct/cranium/README.md b/projects/medical/2d_image/ct/cranium/README.md
new file mode 100644
index 0000000000..d3fa64ea40
--- /dev/null
+++ b/projects/medical/2d_image/ct/cranium/README.md
@@ -0,0 +1,142 @@
+# Brain CT Images with Intracranial Hemorrhage Masks (Cranium)
+
+## Description
+
+This project supports **`Brain CT Images with Intracranial Hemorrhage Masks (Cranium)`**, which can be downloaded from [here](https://www.kaggle.com/datasets/vbookshelf/computed-tomography-ct-images).
+
+### Dataset Overview
+
+This dataset consists of head CT (Computed Tomography) images in jpg format. There are 2500 brain window images and 2500 bone window images, for 82 patients. There are approximately 30 image slices per patient. 318 images have associated intracranial hemorrhage masks. Also included are csv files containing hemorrhage diagnosis data and patient data.
+This is version 1.0.0 of this dataset. A full description of this dataset as well as updated versions can be found here:
+https://physionet.org/content/ct-ich/1.0.0/
+
+### Statistic Information
+
+| Dataset Name | Anatomical Region | Task Type | Modality | Num. Classes | Train/Val/Test Images | Train/Val/Test Labeled | Release Date | License |
+| ------------ | ----------------- | --------- | -------- | ------------ | --------------------- | ---------------------- | ------------ | ------- |
+| [Cranium](https://www.kaggle.com/datasets/vbookshelf/computed-tomography-ct-images) | head_and_neck | segmentation | ct | 2 | 2501/-/- | yes/-/- | 2020 | [CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/) |
+
+| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test |
+| :--------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: |
+| background | 2501       | 99.93      | -        | -        | -         | -         |
+| hemorrhage | 318        | 0.07       | -        | -        | -         | -         |
+
+Note:
+
+- `Pct` means percentage of pixels in this category in all pixels.
+
+### Visualization
+
+![cranium](https://raw.githubusercontent.com/uni-medical/medical-datasets-visualization/main/2d/semantic_seg/ct/cranium/cranium_dataset.png?raw=true)
+
+## Dataset Citation
+
+```
+@article{hssayeni2020computed,
+  title={Computed tomography images for intracranial hemorrhage detection and segmentation},
+  author={Hssayeni, Murtadha and Croock, MS and Salman, AD and Al-khafaji, HF and Yahya, ZA and Ghoraani, B},
+  journal={Intracranial Hemorrhage Segmentation Using A Deep Convolutional Model. Data},
+  volume={5},
+  number={1},
+  pages={179},
+  year={2020}
+}
+```
+
+### Prerequisites
+
+- Python v3.8
+- PyTorch v1.10.0
+- pillow(PIL) v9.3.0
+- scikit-learn(sklearn) v1.2.0
+- [MIM](https://github.com/open-mmlab/mim) v0.3.4
+- [MMCV](https://github.com/open-mmlab/mmcv) v2.0.0rc4
+- [MMEngine](https://github.com/open-mmlab/mmengine) v0.2.0 or higher
+- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation) v1.0.0rc5
+
+All the commands below rely on the correct configuration of `PYTHONPATH`, which should point to the project's directory so that Python can locate the module files. In the `cranium/` root directory, run the following line to add the current directory to `PYTHONPATH`:
+
+```shell
+export PYTHONPATH=`pwd`:$PYTHONPATH
+```
+
+### Dataset Preparing
+
+- Download the dataset from [here](https://www.kaggle.com/datasets/vbookshelf/computed-tomography-ct-images) and decompress the data to the path `'data/'`.
+- Run the script `"python tools/prepare_dataset.py"` to format the data and change the folder structure as below.
+- Run the script `"python ../../tools/split_seg_dataset.py"` to split the dataset and generate `train.txt`, `val.txt` and `test.txt`. If the labels of the official validation and test sets cannot be obtained, we randomly generate `train.txt` and `val.txt` from the training set.
+
+```none
+mmsegmentation
+├── mmseg
+├── projects
+│   ├── medical
+│   │   ├── 2d_image
+│   │   │   ├── ct
+│   │   │   │   ├── cranium
+│   │   │   │   │   ├── configs
+│   │   │   │   │   ├── datasets
+│   │   │   │   │   ├── tools
+│   │   │   │   │   ├── data
+│   │   │   │   │   │   ├── train.txt
+│   │   │   │   │   │   ├── val.txt
+│   │   │   │   │   │   ├── images
+│   │   │   │   │   │   │   ├── train
+│   │   │   │   │   │   │   │   ├── xxx.png
+│   │   │   │   │   │   │   │   ├── ...
+│   │   │   │   │   │   │   │   └── xxx.png
+│   │   │   │   │   │   ├── masks
+│   │   │   │   │   │   │   ├── train
+│   │   │   │   │   │   │   │   ├── xxx.png
+│   │   │   │   │   │   │   │   ├── ...
+│   │   │   │   │   │   │   │   └── xxx.png
+```
+
+### Divided Dataset Information
+
+***Note: The table information below is based on our own split of the training set.***
+
+| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test |
+| :--------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: |
+| background | 2000       | 99.93      | 501      | 99.92    | -         | -         |
+| hemorrhage | 260        | 0.07       | 260      | 0.08     | -         | -         |
+
+### Training commands
+
+To train models on a single server with one GPU (default):
+
+```shell
+mim train mmseg ./configs/${CONFIG_FILE}
+```
+
+### Testing commands
+
+To test models on a single server with one GPU (default):
+
+```shell
+mim test mmseg ./configs/${CONFIG_FILE} --checkpoint ${CHECKPOINT_PATH}
+```
+
+## Checklist
+
+- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`.
+ + - [x] Finish the code + - [x] Basic docstrings & proper citation + - [ ] Test-time correctness + - [x] A full README + +- [ ] Milestone 2: Indicates a successful model implementation. + + - [ ] Training-time correctness + +- [ ] Milestone 3: Good to be a part of our core package! + + - [ ] Type hints and docstrings + - [ ] Unit tests + - [ ] Code polishing + - [ ] Metafile.yml + +- [ ] Move your modules into the core package following the codebase's file hierarchy structure. + +- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure. diff --git a/projects/medical/2d_image/ct/cranium/configs/cranium_512x512.py b/projects/medical/2d_image/ct/cranium/configs/cranium_512x512.py new file mode 100644 index 0000000000..d9b44362a5 --- /dev/null +++ b/projects/medical/2d_image/ct/cranium/configs/cranium_512x512.py @@ -0,0 +1,42 @@ +dataset_type = 'CraniumDataset' +data_root = 'data/' +img_scale = (512, 512) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +train_dataloader = dict( + batch_size=16, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='train.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='val.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) +test_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) diff --git a/projects/medical/2d_image/ct/cranium/configs/fcn-unet-s5-d16_unet-{use-sigmoid}_1xb16-0.01-20k_cranium-512x512.py b/projects/medical/2d_image/ct/cranium/configs/fcn-unet-s5-d16_unet-{use-sigmoid}_1xb16-0.01-20k_cranium-512x512.py new file mode 100644 index 0000000000..ac013a215a --- /dev/null +++ b/projects/medical/2d_image/ct/cranium/configs/fcn-unet-s5-d16_unet-{use-sigmoid}_1xb16-0.01-20k_cranium-512x512.py @@ -0,0 +1,18 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './cranium_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.cranium_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.01) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict( + num_classes=2, loss_decode=dict(use_sigmoid=True), out_channels=1), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/ct/cranium/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_cranium-512x512.py b/projects/medical/2d_image/ct/cranium/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_cranium-512x512.py new file mode 100644 index 0000000000..c71110a21f --- 
/dev/null +++ b/projects/medical/2d_image/ct/cranium/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_cranium-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './cranium_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.cranium_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.0001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/ct/cranium/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_cranium-512x512.py b/projects/medical/2d_image/ct/cranium/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_cranium-512x512.py new file mode 100644 index 0000000000..abbdac285b --- /dev/null +++ b/projects/medical/2d_image/ct/cranium/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_cranium-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './cranium_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.cranium_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/ct/cranium/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_cranium-512x512.py b/projects/medical/2d_image/ct/cranium/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_cranium-512x512.py new file mode 100644 index 0000000000..418595268f --- /dev/null +++ b/projects/medical/2d_image/ct/cranium/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_cranium-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './cranium_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.cranium_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.01) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/ct/cranium/datasets/cranium_dataset.py b/projects/medical/2d_image/ct/cranium/datasets/cranium_dataset.py new file mode 100644 index 0000000000..d65f1cbfc6 --- /dev/null +++ b/projects/medical/2d_image/ct/cranium/datasets/cranium_dataset.py @@ -0,0 +1,31 @@ +from mmseg.datasets import BaseSegDataset +from mmseg.registry import DATASETS + + +@DATASETS.register_module() +class CraniumDataset(BaseSegDataset): + """CraniumDataset dataset. + + In segmentation map annotation for CraniumDataset, + 0 stands for background, which is included in 2 categories. + ``reduce_zero_label`` is fixed to False. The ``img_suffix`` + is fixed to '.png' and ``seg_map_suffix`` is fixed to '.png'. + + Args: + img_suffix (str): Suffix of images. Default: '.png' + seg_map_suffix (str): Suffix of segmentation maps. 
Default: '.png'
+        reduce_zero_label (bool): Whether to mark label zero as ignored.
+            Default to False.
+    """
+    METAINFO = dict(classes=('background', 'hemorrhage'))
+
+    def __init__(self,
+                 img_suffix='.png',
+                 seg_map_suffix='.png',
+                 reduce_zero_label=False,
+                 **kwargs) -> None:
+        super().__init__(
+            img_suffix=img_suffix,
+            seg_map_suffix=seg_map_suffix,
+            reduce_zero_label=reduce_zero_label,
+            **kwargs)
diff --git a/projects/medical/2d_image/ct/cranium/tools/prepare_dataset.py b/projects/medical/2d_image/ct/cranium/tools/prepare_dataset.py
new file mode 100644
index 0000000000..1aa4e43561
--- /dev/null
+++ b/projects/medical/2d_image/ct/cranium/tools/prepare_dataset.py
@@ -0,0 +1,66 @@
+import os
+
+import numpy as np
+from PIL import Image
+
+root_path = 'data/'
+img_suffix = '.png'
+seg_map_suffix = '.png'
+save_img_suffix = '.png'
+save_seg_map_suffix = '.png'
+tgt_img_dir = os.path.join(root_path, 'images/train/')
+tgt_mask_dir = os.path.join(root_path, 'masks/train/')
+os.system('mkdir -p ' + tgt_img_dir)
+os.system('mkdir -p ' + tgt_mask_dir)
+
+
+def read_single_array_from_pil(path):
+    return np.asarray(Image.open(path))
+
+
+def save_png_from_array(arr, save_path, mode=None):
+    Image.fromarray(arr, mode=mode).save(save_path)
+
+
+def convert_label(img, convert_dict):
+    arr = np.zeros_like(img, dtype=np.uint8)
+    for c, i in convert_dict.items():
+        arr[img == c] = i
+    return arr
+
+
+patients_dir = os.path.join(
+    root_path, 'Cranium/computed-tomography-images-for-' +
+    'intracranial-hemorrhage-detection-and-segmentation-1.0.0' +
+    '/Patients_CT')
+
+patients = sorted(os.listdir(patients_dir))
+for p in patients:
+    data_dir = os.path.join(patients_dir, p, 'brain')
+    file_names = os.listdir(data_dir)
+    img_w_mask_names = [
+        _.replace('_HGE_Seg', '') for _ in file_names if 'Seg' in _
+    ]
+    img_wo_mask_names = [
+        _ for _ in file_names if _ not in img_w_mask_names and 'Seg' not in _
+    ]
+
+    for file_name in file_names:
+        path = os.path.join(data_dir, file_name)
+        img = read_single_array_from_pil(path)
+        tgt_name = file_name.replace('.jpg', img_suffix)
+        tgt_name = p + '_' + tgt_name
+        if 'Seg' in file_name:  # is a mask
+            tgt_name = tgt_name.replace('_HGE_Seg', '')
+            mask_path = os.path.join(tgt_mask_dir, tgt_name)
+            mask = convert_label(img, convert_dict={0: 0, 255: 1})
+            save_png_from_array(mask, mask_path)
+        else:
+            img_path = os.path.join(tgt_img_dir, tgt_name)
+            pil = Image.fromarray(img).convert('RGB')
+            pil.save(img_path)
+
+            if file_name in img_wo_mask_names:
+                mask = np.zeros_like(img, dtype=np.uint8)
+                mask_path = os.path.join(tgt_mask_dir, tgt_name)
+                save_png_from_array(mask, mask_path)
diff --git a/projects/medical/2d_image/dermoscopy/isic2016_task1/README.md b/projects/medical/2d_image/dermoscopy/isic2016_task1/README.md
new file mode 100644
index 0000000000..6e44e415ed
--- /dev/null
+++ b/projects/medical/2d_image/dermoscopy/isic2016_task1/README.md
@@ -0,0 +1,149 @@
+# ISIC-2016 Task1
+
+## Description
+
+This project supports **`ISIC-2016 Task1`**, and the dataset used in this project can be downloaded from [here](https://challenge.isic-archive.com/data/#2016).
+
+### Dataset Overview
+
+The overarching goal of the challenge is to develop image analysis tools to enable the automated diagnosis of melanoma from dermoscopic images.
+
+This challenge provides training data (~900 images) for participants to engage in all 3 components of lesion image analysis. A separate test dataset (~350 images) will be provided for participants to generate and submit automated results.
+
+### Original Statistic Information
+
+| Dataset name | Anatomical region | Task type | Modality | Num. Classes | Train/Val/Test Images | Train/Val/Test Labeled | Release Date | License |
+| ------------ | ----------------- | --------- | -------- | ------------ | --------------------- | ---------------------- | ------------ | ------- |
+| [ISIC-2016 Task1](https://challenge.isic-archive.com/data/#2016) | full body | segmentation | dermoscopy | 2 | 900/-/379 | yes/-/yes | 2016 | [CC-0](https://creativecommons.org/share-your-work/public-domain/cc0/) |
+
+| Class Name  | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test |
+| :---------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: |
+| background  | 900        | 82.08      | -        | -        | 379       | 81.98     |
+| skin lesion | 900        | 17.92      | -        | -        | 379       | 18.02     |
+
+Note:
+
+- `Pct` means percentage of pixels in this category in all pixels.
+
+### Visualization
+
+![bac](https://raw.githubusercontent.com/uni-medical/medical-datasets-visualization/main/2d/semantic_seg/dermoscopy/isic2016_task1/isic2016_task1.png)
+
+### Prerequisites
+
+- Python 3.8
+- PyTorch 1.10.0
+- pillow(PIL) 9.3.0
+- scikit-learn(sklearn) 1.2.0
+- [MIM](https://github.com/open-mmlab/mim) v0.3.4
+- [MMCV](https://github.com/open-mmlab/mmcv) v2.0.0rc4
+- [MMEngine](https://github.com/open-mmlab/mmengine) v0.2.0 or higher
+- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation) v1.0.0rc5
+
+All the commands below rely on the correct configuration of `PYTHONPATH`, which should point to the project's directory so that Python can locate the module files. In the `isic2016_task1/` root directory, run the following line to add the current directory to `PYTHONPATH`:
+
+```shell
+export PYTHONPATH=`pwd`:$PYTHONPATH
+```
+
+### Dataset preparing
+
+- Download the dataset from [here](https://challenge.isic-archive.com/data/#2016) and decompress the data to the path `'data/'`.
+- Run the script `"python tools/prepare_dataset.py"` to split the dataset and change the folder structure as below.
+- Run the script `"python ../../tools/split_seg_dataset.py"` to split the dataset and generate `train.txt` and `test.txt`. If the labels of the official validation and test sets cannot be obtained, we randomly generate `train.txt` and `val.txt` from the training set.
+
+```none
+mmsegmentation
+├── mmseg
+├── projects
+│   ├── medical
+│   │   ├── 2d_image
+│   │   │   ├── dermoscopy
+│   │   │   │   ├── isic2016_task1
+│   │   │   │   │   ├── configs
+│   │   │   │   │   ├── datasets
+│   │   │   │   │   ├── tools
+│   │   │   │   │   ├── data
+│   │   │   │   │   │   ├── train.txt
+│   │   │   │   │   │   ├── test.txt
+│   │   │   │   │   │   ├── images
+│   │   │   │   │   │   │   ├── train
+│   │   │   │   │   │   │   │   ├── xxx.png
+│   │   │   │   │   │   │   │   ├── ...
+│   │   │   │   │   │   │   │   └── xxx.png
+│   │   │   │   │   │   │   ├── test
+│   │   │   │   │   │   │   │   ├── yyy.png
+│   │   │   │   │   │   │   │   ├── ...
+│   │   │   │   │   │   │   │   └── yyy.png
+│   │   │   │   │   │   ├── masks
+│   │   │   │   │   │   │   ├── train
+│   │   │   │   │   │   │   │   ├── xxx.png
+│   │   │   │   │   │   │   │   ├── ...
+│   │   │   │   │   │   │   │   └── xxx.png
+│   │   │   │   │   │   │   ├── test
+│   │   │   │   │   │   │   │   ├── yyy.png
+│   │   │   │   │   │   │   │   ├── ...
+│   │   │   │   │   │   │   │   └── yyy.png
+```
+
+### Training commands
+
+```shell
+mim train mmseg ./configs/${CONFIG_PATH}
+```
+
+To train on multiple GPUs, e.g.
8 GPUs, run the following command: + +```shell +mim train mmseg ./configs/${CONFIG_PATH} --launcher pytorch --gpus 8 +``` + +### Testing commands + +```shell +mim test mmseg ./configs/${CONFIG_PATH} --checkpoint ${CHECKPOINT_PATH} +``` + + + +## Results + +### ISIC-2016 Task1 + +| Method | Backbone | Crop Size | lr | mIoU | mDice | config | +| :-------------: | :------: | :-------: | :----: | :--: | :---: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| fcn_unet_s5-d16 | unet | 512x512 | 0.01 | - | - | [config](https://github.com/open-mmlab/mmsegmentation/tree/dev-1.x/projects/medical/2d_image/dermoscopy/isic2016_task1/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_isic2016-task1-512x512.py) | +| fcn_unet_s5-d16 | unet | 512x512 | 0.001 | - | - | [config](https://github.com/open-mmlab/mmsegmentation/tree/dev-1.x/projects/medical/2d_image/dermoscopy/isic2016_task1/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_isic2016-task1-512x512.py) | +| fcn_unet_s5-d16 | unet | 512x512 | 0.0001 | - | - | [config](https://github.com/open-mmlab/mmsegmentation/tree/dev-1.x/projects/medical/2d_image/dermoscopy/isic2016_task1/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_isic2016-task1-512x512.py) | + +## Checklist + +- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. + + - [x] Finish the code + + - [x] Basic docstrings & proper citation + + - [x] Test-time correctness + + - [x] A full README + +- [x] Milestone 2: Indicates a successful model implementation. + + - [x] Training-time correctness + +- [ ] Milestone 3: Good to be a part of our core package! + + - [ ] Type hints and docstrings + + - [ ] Unit tests + + - [ ] Code polishing + + - [ ] Metafile.yml + +- [ ] Move your modules into the core package following the codebase's file hierarchy structure. + +- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure. 
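+
+## Mask sanity check
+
+After running the dataset preparation above, the converted masks should contain only the label ids 0 (normal) and 1 (skin lesion). The snippet below is a hypothetical quick check (not part of the project files); it assumes the `data/masks/train` layout produced by `tools/prepare_dataset.py`:
+
+```python
+import glob
+
+import numpy as np
+from PIL import Image
+
+# inspect a few converted masks and verify their label ids
+for path in sorted(glob.glob('data/masks/train/*.png'))[:10]:
+    ids = np.unique(np.asarray(Image.open(path)))
+    assert set(ids.tolist()) <= {0, 1}, f'unexpected label ids {ids} in {path}'
+print('mask label ids look correct')
+```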
diff --git a/projects/medical/2d_image/dermoscopy/isic2016_task1/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_isic2016-task1-512x512.py b/projects/medical/2d_image/dermoscopy/isic2016_task1/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_isic2016-task1-512x512.py new file mode 100644 index 0000000000..5638de4d56 --- /dev/null +++ b/projects/medical/2d_image/dermoscopy/isic2016_task1/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_isic2016-task1-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './isic2016-task1_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.isic2016-task1_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.0001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/dermoscopy/isic2016_task1/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_isic2016-task1-512x512.py b/projects/medical/2d_image/dermoscopy/isic2016_task1/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_isic2016-task1-512x512.py new file mode 100644 index 0000000000..bf17faa538 --- /dev/null +++ b/projects/medical/2d_image/dermoscopy/isic2016_task1/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_isic2016-task1-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './isic2016-task1_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.isic2016-task1_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/dermoscopy/isic2016_task1/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_isic2016-task1-512x512.py b/projects/medical/2d_image/dermoscopy/isic2016_task1/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_isic2016-task1-512x512.py new file mode 100644 index 0000000000..f7bfcf6158 --- /dev/null +++ b/projects/medical/2d_image/dermoscopy/isic2016_task1/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_isic2016-task1-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './isic2016-task1_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.isic2016-task1_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.01) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/dermoscopy/isic2016_task1/configs/isic2016-task1_512x512.py b/projects/medical/2d_image/dermoscopy/isic2016_task1/configs/isic2016-task1_512x512.py new file mode 100644 index 0000000000..029f5d4d7e --- /dev/null +++ 
b/projects/medical/2d_image/dermoscopy/isic2016_task1/configs/isic2016-task1_512x512.py @@ -0,0 +1,42 @@ +dataset_type = 'ISIC2016Task1' +data_root = 'data/' +img_scale = (512, 512) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +train_dataloader = dict( + batch_size=16, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='train.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='test.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) +test_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) diff --git a/projects/medical/2d_image/dermoscopy/isic2016_task1/datasets/isic2016-task1_dataset.py b/projects/medical/2d_image/dermoscopy/isic2016_task1/datasets/isic2016-task1_dataset.py new file mode 100644 index 0000000000..8f11bdd0ba --- /dev/null +++ b/projects/medical/2d_image/dermoscopy/isic2016_task1/datasets/isic2016-task1_dataset.py @@ -0,0 +1,30 @@ +from mmseg.datasets import BaseSegDataset +from mmseg.registry import DATASETS + + +@DATASETS.register_module() +class ISIC2016Task1(BaseSegDataset): + """ISIC2016Task1 dataset. + + In segmentation map annotation for ISIC2016Task1, + ``reduce_zero_label`` is fixed to False. The ``img_suffix`` + is fixed to '.png' and ``seg_map_suffix`` is fixed to '.png'. + + Args: + img_suffix (str): Suffix of images. Default: '.png' + seg_map_suffix (str): Suffix of segmentation maps. Default: '.png' + reduce_zero_label (bool): Whether to mark label zero as ignored. + Default to False.
+ """ + METAINFO = dict(classes=('normal', 'skin lesion')) + + def __init__(self, + img_suffix='.png', + seg_map_suffix='.png', + reduce_zero_label=False, + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, + **kwargs) diff --git a/projects/medical/2d_image/dermoscopy/isic2016_task1/tools/prepare_dataset.py b/projects/medical/2d_image/dermoscopy/isic2016_task1/tools/prepare_dataset.py new file mode 100755 index 0000000000..ef4dad5408 --- /dev/null +++ b/projects/medical/2d_image/dermoscopy/isic2016_task1/tools/prepare_dataset.py @@ -0,0 +1,120 @@ +import glob +import os +import shutil + +import numpy as np
+from PIL import Image + + +def check_maskid(mask_list): + # print the unique label ids found in each mask + for mask_file in mask_list: + img = Image.open(mask_file) + print(np.unique(np.array(img))) + + +def reformulate_file(image_list, mask_list): + # pair each image path with its corresponding mask path + file_list = [] + for imgp, maskp in zip(sorted(image_list), sorted(mask_list)): + item = {'image': imgp, 'label': maskp} + file_list.append(item) + return file_list + + +def check_file_exist(pair_list): + rel_path = os.getcwd() + for sample in pair_list: + image_path = sample['image'] + assert os.path.exists(os.path.join(rel_path, image_path)) + if 'label' in sample: + mask_path = sample['label'] + assert os.path.exists(os.path.join(rel_path, mask_path)) + print('all file path ok!') + + +def convert_maskid(mask): + # map the foreground value 255 to label id 1 + arr_mask = np.array(mask).astype(np.uint8) + arr_mask[arr_mask == 255] = 1 + return Image.fromarray(arr_mask) + + +def process_dataset(file_lists, part_dir_dict): + for ith, part in enumerate(file_lists): + part_dir = part_dir_dict[ith] + for sample in part: + # read image and mask paths; mask_path stays None if unlabeled + image_path = sample['image'] + mask_path = sample.get('label') + + basename = os.path.basename(image_path) + targetname = basename.split('.')[0] # from image name + + # save the image as png unless it already exists + img_save_path = os.path.join(root_path, 'images', part_dir, + targetname + save_img_suffix) + if not os.path.exists(img_save_path): + if not image_path.endswith('.png'): + src = Image.open(image_path) + src.save(img_save_path) + else: + shutil.copy(image_path, img_save_path) + + if mask_path is not None: + mask_save_path = os.path.join(root_path, 'masks', part_dir, + targetname + save_seg_map_suffix) + if not os.path.exists(mask_save_path): + # read the mask as grayscale and convert its label ids + mask = Image.open(mask_path).convert('L') + mask = convert_maskid(mask) + mask.save(mask_save_path) + + # print image num + part_dir_folder = os.path.join(root_path, 'images', part_dir) + print( + f'{part_dir} has {len(os.listdir(part_dir_folder))} images completed!'
# noqa + ) + + +if __name__ == '__main__': + + root_path = 'data/'  # root of the original data + img_suffix = '.jpg' + seg_map_suffix = '.png' + save_img_suffix = '.png' + save_seg_map_suffix = '.png' + + train_imgs = glob.glob('data/ISBI2016_ISIC_Part1_Training_Data/*' # noqa + + img_suffix) + train_masks = glob.glob( + 'data/ISBI2016_ISIC_Part1_Training_GroundTruth/*' # noqa + + seg_map_suffix) + + test_imgs = glob.glob('data/ISBI2016_ISIC_Part1_Test_Data/*' + img_suffix) + test_masks = glob.glob( + 'data/ISBI2016_ISIC_Part1_Test_GroundTruth/*' # noqa + + seg_map_suffix) + + assert len(train_imgs) == len(train_masks) + assert len(test_imgs) == len(test_masks) + + print(f'training images: {len(train_imgs)}, test images: {len(test_imgs)}') + + os.system('mkdir -p ' + root_path + 'images/train/') + os.system('mkdir -p ' + root_path + 'images/test/') + os.system('mkdir -p ' + root_path + 'masks/train/') + os.system('mkdir -p ' + root_path + 'masks/test/') + + train_pair_list = reformulate_file(train_imgs, train_masks) + test_pair_list = reformulate_file(test_imgs, test_masks) + + check_file_exist(train_pair_list) + check_file_exist(test_pair_list) + + part_dir_dict = {0: 'train/', 1: 'test/'} + process_dataset([train_pair_list, test_pair_list], part_dir_dict) diff --git a/projects/medical/2d_image/dermoscopy/isic2017_task1/README.md b/projects/medical/2d_image/dermoscopy/isic2017_task1/README.md new file mode 100644 index 0000000000..c7cc27096b --- /dev/null +++ b/projects/medical/2d_image/dermoscopy/isic2017_task1/README.md @@ -0,0 +1,158 @@ +# ISIC-2017 Task1 + +## Description + +This project supports **`ISIC-2017 Task1`**, and the dataset used in this project can be downloaded from [here](https://challenge.isic-archive.com/data/#2017). + +### Dataset Overview + +The goal of the challenge is to help participants develop image analysis tools to enable the automated diagnosis of melanoma from dermoscopic images. + +This challenge provides training data (~2000 images) for participants to engage in all 3 components of lesion image analysis. A separate public validation dataset (~150 images) and a blind held-out test dataset (~600 images) will be provided for participants to generate and submit automated results. + +### Original Statistic Information + +| Dataset name | Anatomical region | Task type | Modality | Num. Classes | Train/Val/Test Images | Train/Val/Test Labeled | Release Date | License | +| ---------------------------------------------------------------- | ----------------- | ------------ | ---------- | ------------ | --------------------- | ---------------------- | ------------ | ---------------------------------------------------------------------- | +| [ISIC-2017 Task1](https://challenge.isic-archive.com/data/#2017) | full body | segmentation | dermoscopy | 2 | 2000/150/600 | yes/yes/yes | 2017 | [CC-0](https://creativecommons.org/share-your-work/public-domain/cc0/) | + +| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test | +| :---------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: | +| normal | 2000 | 82.86 | 150 | 73.88 | 600 | 70.62 | +| skin lesion | 2000 | 17.14 | 150 | 26.12 | 600 | 29.38 | + +Note: + +- `Pct` means percentage of pixels in this category in all pixels.
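The `Pct` columns are pixel-level statistics rather than image counts. For readers who want to reproduce such numbers on prepared data, the following is a rough sketch under two assumptions: the masks have already been converted to label ids {0, 1} by `tools/prepare_dataset.py`, and they live under the illustrative path `data/masks/train/`.

```python
# Rough sketch: per-class pixel percentages over prepared mask PNGs.
# Assumes masks already contain label ids {0, 1}; paths are illustrative.
import glob

import numpy as np
from PIL import Image

counts = np.zeros(2, dtype=np.int64)
for path in glob.glob('data/masks/train/*.png'):
    mask = np.array(Image.open(path))
    counts += np.bincount(mask.ravel(), minlength=2)[:2]

percentages = 100 * counts / counts.sum()
for name, pct in zip(('normal', 'skin lesion'), percentages):
    print(f'{name}: {pct:.2f}%')
```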
+ +### Visualization + +![bac](https://raw.githubusercontent.com/uni-medical/medical-datasets-visualization/main/2d/semantic_seg/dermoscopy/isic2017_task1/isic2017_task1.png) + +### Prerequisites + +- Python v3.8 +- PyTorch v1.10.0 +- pillow(PIL) v9.3.0 +- scikit-learn(sklearn) v1.2.0 +- [MIM](https://github.com/open-mmlab/mim) v0.3.4 +- [MMCV](https://github.com/open-mmlab/mmcv) v2.0.0rc4 +- [MMEngine](https://github.com/open-mmlab/mmengine) v0.2.0 or higher +- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation) v1.0.0rc5 + +All the commands below rely on the correct configuration of `PYTHONPATH`, which should point to the project's directory so that Python can locate the module files. In `isic2017_task1/` root directory, run the following line to add the current directory to `PYTHONPATH`: + +```shell +export PYTHONPATH=`pwd`:$PYTHONPATH +``` + +### Dataset preparing + +- download dataset from [here](https://challenge.isic-archive.com/data/#2017) and decompress the data to the path `'data/'`. +- run script `"python tools/prepare_dataset.py"` to split the dataset and change the folder structure as below. +- run script `"python ../../tools/split_seg_dataset.py"` to generate `train.txt` and `test.txt`. If the labels of the official validation and test sets cannot be obtained, we randomly generate `train.txt` and `val.txt` from the training set. + +```none + mmsegmentation + ├── mmseg + ├── projects + │ ├── medical + │ │ ├── 2d_image + │ │ │ ├── dermoscopy + │ │ │ │ ├── isic2017_task1 + │ │ │ │ │ ├── configs + │ │ │ │ │ ├── datasets + │ │ │ │ │ ├── tools + │ │ │ │ │ ├── data + │ │ │ │ │ │ ├── train.txt + │ │ │ │ │ │ ├── val.txt + │ │ │ │ │ │ ├── test.txt + │ │ │ │ │ │ ├── images + │ │ │ │ │ │ │ ├── train + │ │ │ │ | │ │ │ ├── xxx.png + │ │ │ │ | │ │ │ ├── ... + │ │ │ │ | │   │   │ └── xxx.png + │ │ │ │ │ │ │ ├── val + │ │ │ │ | │ │ │ ├── yyy.png + │ │ │ │ | │ │ │ ├── ... + │ │ │ │ | │   │   │ └── yyy.png + │ │ │ │ │ │ │ ├── test + │ │ │ │ | │ │ │ ├── yyy.png + │ │ │ │ | │ │ │ ├── ... + │ │ │ │ | │   │   │ └── yyy.png + │ │ │ │ │ │ ├── masks + │ │ │ │ │ │ │ ├── train + │ │ │ │ | │ │ │ ├── xxx.png + │ │ │ │ | │ │ │ ├── ... + │ │ │ │ | │   │   │ └── xxx.png + │ │ │ │ │ │ │ ├── val + │ │ │ │ | │ │ │ ├── yyy.png + │ │ │ │ | │ │ │ ├── ... + │ │ │ │ | │   │   │ └── yyy.png + │ │ │ │ │ │ │ ├── test + │ │ │ │ | │ │ │ ├── yyy.png + │ │ │ │ | │ │ │ ├── ... + │ │ │ │ | │   │   │ └── yyy.png +``` + +### Training commands + +```shell +mim train mmseg ./configs/${CONFIG_PATH} +``` + +To train on multiple GPUs, e.g.
8 GPUs, run the following command: + +```shell +mim train mmseg ./configs/${CONFIG_PATH} --launcher pytorch --gpus 8 +``` + +### Testing commands + +```shell +mim test mmseg ./configs/${CONFIG_PATH} --checkpoint ${CHECKPOINT_PATH} +``` + + + +## Results + +### ISIC-2017 Task1 + +| Method | Backbone | Crop Size | lr | mIoU | mDice | config | +| :-------------: | :------: | :-------: | :----: | :--: | :---: | :---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| fcn_unet_s5-d16 | unet | 512x512 | 0.01 | - | - | [config](https://github.com/open-mmlab/mmsegmentation/tree/dev-1.x/projects/medical/2d_image/dermoscopy/isic2017_task1/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_isic2017-task1-512x512.py) | +| fcn_unet_s5-d16 | unet | 512x512 | 0.001 | - | - | [config](https://github.com/open-mmlab/mmsegmentation/tree/dev-1.x/projects/medical/2d_image/dermoscopy/isic2017_task1/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_isic2017-task1-512x512.py) | +| fcn_unet_s5-d16 | unet | 512x512 | 0.0001 | - | - | [config](https://github.com/open-mmlab/mmsegmentation/tree/dev-1.x/projects/medical/2d_image/dermoscopy/isic2017_task1/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_isic2017-task1-512x512.py) | + +## Checklist + +- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. + + - [x] Finish the code + + - [x] Basic docstrings & proper citation + + - [ ] Test-time correctness + + - [x] A full README + +- [ ] Milestone 2: Indicates a successful model implementation. + + - [ ] Training-time correctness + +- [ ] Milestone 3: Good to be a part of our core package! + + - [ ] Type hints and docstrings + + - [ ] Unit tests + + - [ ] Code polishing + + - [ ] Metafile.yml + +- [ ] Move your modules into the core package following the codebase's file hierarchy structure. + +- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure. 
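Since the dataset class above lives inside the project rather than in the mmseg core package, it only becomes visible to the registry after the `custom_imports` module is imported. The following is a minimal sketch for verifying the registration, assuming it is run from the `isic2017_task1/` root with `PYTHONPATH` set as described above; the paths and the trimmed pipeline are illustrative only.

```python
# Minimal registry check for the project-local dataset class.
# Paths and pipeline are illustrative; run from the isic2017_task1/ root.
import importlib

from mmseg.registry import DATASETS
from mmseg.utils import register_all_modules

register_all_modules()  # register the core mmseg datasets/transforms
# the module name contains a hyphen, so import it the way the configs do
importlib.import_module('datasets.isic2017-task1_dataset')

dataset = DATASETS.build(
    dict(
        type='ISIC2017Task1',
        data_root='data/',
        data_prefix=dict(
            img_path='images/train/', seg_map_path='masks/train/'),
        pipeline=[
            dict(type='LoadImageFromFile'),
            dict(type='LoadAnnotations'),
            dict(type='PackSegInputs')
        ]))
print(len(dataset), dataset.metainfo['classes'])
```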
diff --git a/projects/medical/2d_image/dermoscopy/isic2017_task1/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_isic2017-task1-512x512.py b/projects/medical/2d_image/dermoscopy/isic2017_task1/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_isic2017-task1-512x512.py new file mode 100644 index 0000000000..58d0a125d3 --- /dev/null +++ b/projects/medical/2d_image/dermoscopy/isic2017_task1/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_isic2017-task1-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './isic2017-task1_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.isic2017-task1_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.0001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/dermoscopy/isic2017_task1/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_isic2017-task1-512x512.py b/projects/medical/2d_image/dermoscopy/isic2017_task1/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_isic2017-task1-512x512.py new file mode 100644 index 0000000000..3becacf64f --- /dev/null +++ b/projects/medical/2d_image/dermoscopy/isic2017_task1/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_isic2017-task1-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './isic2017-task1_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.isic2017-task1_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/dermoscopy/isic2017_task1/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_isic2017-task1-512x512.py b/projects/medical/2d_image/dermoscopy/isic2017_task1/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_isic2017-task1-512x512.py new file mode 100644 index 0000000000..654ef4dc3d --- /dev/null +++ b/projects/medical/2d_image/dermoscopy/isic2017_task1/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_isic2017-task1-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './isic2017-task1_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.isic2017-task1_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.01) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/dermoscopy/isic2017_task1/configs/isic2017-task1_512x512.py b/projects/medical/2d_image/dermoscopy/isic2017_task1/configs/isic2017-task1_512x512.py new file mode 100644 index 0000000000..95997a1099 --- /dev/null +++ 
b/projects/medical/2d_image/dermoscopy/isic2017_task1/configs/isic2017-task1_512x512.py @@ -0,0 +1,41 @@ +dataset_type = 'ISIC2017Task1' +data_root = 'data/' +img_scale = (512, 512) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +train_dataloader = dict( + batch_size=16, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='images/train/', seg_map_path='masks/train/'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict(img_path='images/val/', seg_map_path='masks/val/'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) +test_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) diff --git a/projects/medical/2d_image/dermoscopy/isic2017_task1/datasets/isic2017-task1_dataset.py b/projects/medical/2d_image/dermoscopy/isic2017_task1/datasets/isic2017-task1_dataset.py new file mode 100644 index 0000000000..8f11bdd0ba --- /dev/null +++ b/projects/medical/2d_image/dermoscopy/isic2017_task1/datasets/isic2017-task1_dataset.py @@ -0,0 +1,30 @@ +from mmseg.datasets import BaseSegDataset +from mmseg.registry import DATASETS + + +@DATASETS.register_module() +class ISIC2017Task1(BaseSegDataset): + """ISIC2017Task1 dataset. + + In segmentation map annotation for ISIC2017Task1, + ``reduce_zero_label`` is fixed to False. The ``img_suffix`` + is fixed to '.png' and ``seg_map_suffix`` is fixed to '.png'. + + Args: + img_suffix (str): Suffix of images. Default: '.png' + seg_map_suffix (str): Suffix of segmentation maps. Default: '.png' + reduce_zero_label (bool): Whether to mark label zero as ignored. + Default to False. 
+ """ + METAINFO = dict(classes=('normal', 'skin lesion')) + + def __init__(self, + img_suffix='.png', + seg_map_suffix='.png', + reduce_zero_label=False, + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, + **kwargs) diff --git a/projects/medical/2d_image/dermoscopy/isic2017_task1/tools/prepare_dataset.py b/projects/medical/2d_image/dermoscopy/isic2017_task1/tools/prepare_dataset.py new file mode 100755 index 0000000000..b3643c9359 --- /dev/null +++ b/projects/medical/2d_image/dermoscopy/isic2017_task1/tools/prepare_dataset.py @@ -0,0 +1,127 @@ +import glob +import os +import shutil + +import numpy as np +from PIL import Image + + +def check_maskid(mask_list): + # print the unique label ids found in each mask + for mask_file in mask_list: + img = Image.open(mask_file) + print(np.unique(np.array(img))) + + +def reformulate_file(image_list, mask_list): + # pair each image path with its corresponding mask path + file_list = [] + for imgp, maskp in zip(sorted(image_list), sorted(mask_list)): + item = {'image': imgp, 'label': maskp} + file_list.append(item) + return file_list + + +def convert_maskid(mask): + # map the foreground value 255 to label id 1 + arr_mask = np.array(mask).astype(np.uint8) + arr_mask[arr_mask == 255] = 1 + return Image.fromarray(arr_mask) + + +def check_file_exist(pair_list): + rel_path = os.getcwd() + for sample in pair_list: + image_path = sample['image'] + assert os.path.exists(os.path.join(rel_path, image_path)) + if 'label' in sample: + mask_path = sample['label'] + assert os.path.exists(os.path.join(rel_path, mask_path)) + print('all file path ok!') + + +def process_dataset(file_lists, part_dir_dict): + for ith, part in enumerate(file_lists): + part_dir = part_dir_dict[ith] + for sample in part: + # read image and mask paths; mask_path stays None if unlabeled + image_path = sample['image'] + mask_path = sample.get('label') + + basename = os.path.basename(image_path) + targetname = basename.split('.')[0] # from image name + + # save the image as png unless it already exists + img_save_path = os.path.join(root_path, 'images', part_dir, + targetname + save_img_suffix) + if not os.path.exists(img_save_path): + if not image_path.endswith('.png'): + src = Image.open(image_path) + src.save(img_save_path) + else: + shutil.copy(image_path, img_save_path) + + if mask_path is not None: + mask_save_path = os.path.join(root_path, 'masks', part_dir, + targetname + save_seg_map_suffix) + if not os.path.exists(mask_save_path): + # read the mask as grayscale and convert its label ids + mask = Image.open(mask_path).convert('L') + mask = convert_maskid(mask) + mask.save(mask_save_path) + + # print image num + part_dir_folder = os.path.join(root_path, 'images', part_dir) + print( + f'{part_dir} has {len(os.listdir(part_dir_folder))} images completed!'
# noqa + ) + + +if __name__ == '__main__': + + root_path = 'data/'  # root of the original data + img_suffix = '.jpg' + seg_map_suffix = '.png' + save_img_suffix = '.png' + save_seg_map_suffix = '.png' + + train_imgs = glob.glob('data/ISIC-2017_Training_Data/*' + img_suffix) + train_masks = glob.glob('data/ISIC-2017_Training_Part1_GroundTruth/*' + + seg_map_suffix) + + val_imgs = glob.glob('data/ISIC-2017_Validation_Data/*' + img_suffix) + val_masks = glob.glob('data/ISIC-2017_Validation_Part1_GroundTruth/*' + + seg_map_suffix) + + test_imgs = glob.glob('data/ISIC-2017_Test_v2_Data/*' + img_suffix) + test_masks = glob.glob('data/ISIC-2017_Test_v2_Part1_GroundTruth/*' + + seg_map_suffix) + + assert len(train_imgs) == len(train_masks) + assert len(val_imgs) == len(val_masks) + assert len(test_imgs) == len(test_masks) + + os.system('mkdir -p ' + root_path + 'images/train/') + os.system('mkdir -p ' + root_path + 'images/val/') + os.system('mkdir -p ' + root_path + 'images/test/') + os.system('mkdir -p ' + root_path + 'masks/train/') + os.system('mkdir -p ' + root_path + 'masks/val/') + os.system('mkdir -p ' + root_path + 'masks/test/') + + train_pair_list = reformulate_file(train_imgs, train_masks) + val_pair_list = reformulate_file(val_imgs, val_masks) + test_pair_list = reformulate_file(test_imgs, test_masks) + + check_file_exist(train_pair_list) + check_file_exist(val_pair_list) + check_file_exist(test_pair_list) + + part_dir_dict = {0: 'train/', 1: 'val/', 2: 'test/'} + process_dataset([train_pair_list, val_pair_list, test_pair_list], + part_dir_dict) diff --git a/projects/medical/2d_image/endoscopy/kvasir_seg/README.md b/projects/medical/2d_image/endoscopy/kvasir_seg/README.md new file mode 100644 index 0000000000..ea597bc440 --- /dev/null +++ b/projects/medical/2d_image/endoscopy/kvasir_seg/README.md @@ -0,0 +1,145 @@ +# Kvasir-Sessile Dataset (Kvasir SEG) + +## Description + +This project supports **`Kvasir-Sessile Dataset (Kvasir SEG)`**, which can be downloaded from [here](https://opendatalab.com/Kvasir-Sessile_dataset). + +### Dataset Overview + +The Kvasir-SEG dataset contains polyp images and their corresponding ground truth from the Kvasir Dataset v2. The resolution of the images contained in Kvasir-SEG varies from 332x487 to 1920x1072 pixels. + + + +### Information Statistics + +| Dataset Name | Anatomical Region | Task Type | Modality | Num. Classes | Train/Val/Test Images | Train/Val/Test Labeled | Release Date | License | +| ------------------------------------------------------------- | ----------------- | ------------ | --------- | ------------ | --------------------- | ---------------------- | ------------ | --------------------------------------------------------- | +| [Kvasir-SEG](https://opendatalab.com/Kvasir-Sessile_dataset) | abdomen | segmentation | endoscopy | 2 | 196/-/- | yes/-/- | 2020 | [CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/) | + +| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test | +| :--------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: | +| background | 196 | 92.31 | - | - | - | - | +| polyp | 196 | 7.69 | - | - | - | - | + +Note: + +- `Pct` means percentage of pixels in this category in all pixels.
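Because the raw ground-truth masks ship as JPEGs and are converted to single-channel PNGs with label ids {0, 1} during preparation, a quick sanity check on the converted masks can save debugging time later. Below is a small sketch under those assumptions; the path is illustrative.

```python
# Quick sanity check: every prepared mask should contain only ids {0, 1}.
# Run after tools/prepare_dataset.py; the path is illustrative.
import glob

import numpy as np
from PIL import Image

for path in glob.glob('data/masks/train/*.png'):
    ids = np.unique(np.array(Image.open(path)))
    assert set(ids.tolist()) <= {0, 1}, f'{path} has unexpected ids {ids}'
print('all masks contain only label ids 0/1')
```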
+ +### Visualization + +![kvasir-seg](https://raw.githubusercontent.com/uni-medical/medical-datasets-visualization/main/2d/semantic_seg/endoscopy_images/kvasir_seg/kvasir_seg_dataset.png?raw=true) + +### Dataset Citation + +```bibtex +@inproceedings{jha2020kvasir, + title={Kvasir-seg: A segmented polyp dataset}, + author={Jha, Debesh and Smedsrud, Pia H and Riegler, Michael A and Halvorsen, P{\aa}l and Lange, Thomas de and Johansen, Dag and Johansen, H{\aa}vard D}, + booktitle={International Conference on Multimedia Modeling}, + pages={451--462}, + year={2020}, + organization={Springer} + } +``` + +### Prerequisites + +- Python v3.8 +- PyTorch v1.10.0 +- pillow(PIL) v9.3.0 +- scikit-learn(sklearn) v1.2.0 +- [MIM](https://github.com/open-mmlab/mim) v0.3.4 +- [MMCV](https://github.com/open-mmlab/mmcv) v2.0.0rc4 +- [MMEngine](https://github.com/open-mmlab/mmengine) v0.2.0 or higher +- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation) v1.0.0rc5 + +All the commands below rely on the correct configuration of `PYTHONPATH`, which should point to the project's directory so that Python can locate the module files. In `kvasir_seg/` root directory, run the following line to add the current directory to `PYTHONPATH`: + +```shell +export PYTHONPATH=`pwd`:$PYTHONPATH +``` + +### Dataset Preparing + +- download dataset from [here](https://opendatalab.com/Kvasir-Sessile_dataset) and decompress the data to the path `'data/'`. +- run script `"python tools/prepare_dataset.py"` to format the data and change the folder structure as below. +- run script `"python ../../tools/split_seg_dataset.py"` to split the dataset and generate `train.txt`, `val.txt` and `test.txt`. If the labels of the official validation and test sets cannot be obtained, we randomly generate `train.txt` and `val.txt` from the training set. + +```none + mmsegmentation + ├── mmseg + ├── projects + │ ├── medical + │ │ ├── 2d_image + │ │ │ ├── endoscopy + │ │ │ │ ├── kvasir_seg + │ │ │ │ │ ├── configs + │ │ │ │ │ ├── datasets + │ │ │ │ │ ├── tools + │ │ │ │ │ ├── data + │ │ │ │ │ │ ├── train.txt + │ │ │ │ │ │ ├── val.txt + │ │ │ │ │ │ ├── images + │ │ │ │ │ │ │ ├── train + │ │ │ │ | │ │ │ ├── xxx.png + │ │ │ │ | │ │ │ ├── ... + │ │ │ │ | │ │ │ └── xxx.png + │ │ │ │ │ │ ├── masks + │ │ │ │ │ │ │ ├── train + │ │ │ │ | │ │ │ ├── xxx.png + │ │ │ │ | │ │ │ ├── ... + │ │ │ │ | │ │ │ └── xxx.png +``` + +### Divided Dataset Information + +***Note: The table below is based on our own dataset split.*** + +| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test | +| :--------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: | +| background | 156 | 92.28 | 40 | 92.41 | - | - | +| polyp | 156 | 7.72 | 40 | 7.59 | - | - | + +### Training commands + +To train models on a single server with one GPU (default): + +```shell +mim train mmseg ./configs/${CONFIG_FILE} +``` + +### Testing commands + +To test models on a single server with one GPU (default): + +```shell +mim test mmseg ./configs/${CONFIG_FILE} --checkpoint ${CHECKPOINT_PATH} +``` + + + +## Checklist + +- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. + + - [x] Finish the code + - [x] Basic docstrings & proper citation + - [ ] Test-time correctness + - [x] A full README + +- [x] Milestone 2: Indicates a successful model implementation. + + - [x] Training-time correctness + +- [ ] Milestone 3: Good to be a part of our core package!
+ + - [ ] Type hints and docstrings + - [ ] Unit tests + - [ ] Code polishing + - [ ] Metafile.yml + +- [ ] Move your modules into the core package following the codebase's file hierarchy structure. + +- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure. diff --git a/projects/medical/2d_image/endoscopy/kvasir_seg/configs/fcn-unet-s5-d16_unet-{use-sigmoid}_1xb16-0.01-20k_kvasir-seg-512x512.py b/projects/medical/2d_image/endoscopy/kvasir_seg/configs/fcn-unet-s5-d16_unet-{use-sigmoid}_1xb16-0.01-20k_kvasir-seg-512x512.py new file mode 100644 index 0000000000..145d5a7a17 --- /dev/null +++ b/projects/medical/2d_image/endoscopy/kvasir_seg/configs/fcn-unet-s5-d16_unet-{use-sigmoid}_1xb16-0.01-20k_kvasir-seg-512x512.py @@ -0,0 +1,18 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './kvasir-seg_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.kvasir-seg_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.01) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict( + num_classes=2, loss_decode=dict(use_sigmoid=True), out_channels=1), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/endoscopy/kvasir_seg/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_kvasir-seg-512x512.py b/projects/medical/2d_image/endoscopy/kvasir_seg/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_kvasir-seg-512x512.py new file mode 100644 index 0000000000..3ea05c5109 --- /dev/null +++ b/projects/medical/2d_image/endoscopy/kvasir_seg/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_kvasir-seg-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './kvasir-seg_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.kvasir-seg_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.0001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/endoscopy/kvasir_seg/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_kvasir-seg-512x512.py b/projects/medical/2d_image/endoscopy/kvasir_seg/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_kvasir-seg-512x512.py new file mode 100644 index 0000000000..7e064a716a --- /dev/null +++ b/projects/medical/2d_image/endoscopy/kvasir_seg/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_kvasir-seg-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './kvasir-seg_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.kvasir-seg_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git 
a/projects/medical/2d_image/endoscopy/kvasir_seg/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_kvasir-seg-512x512.py b/projects/medical/2d_image/endoscopy/kvasir_seg/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_kvasir-seg-512x512.py new file mode 100644 index 0000000000..0fc1d6e99d --- /dev/null +++ b/projects/medical/2d_image/endoscopy/kvasir_seg/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_kvasir-seg-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './kvasir-seg_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.kvasir-seg_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.01) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/endoscopy/kvasir_seg/configs/kvasir-seg_512x512.py b/projects/medical/2d_image/endoscopy/kvasir_seg/configs/kvasir-seg_512x512.py new file mode 100644 index 0000000000..e8b2467f8c --- /dev/null +++ b/projects/medical/2d_image/endoscopy/kvasir_seg/configs/kvasir-seg_512x512.py @@ -0,0 +1,42 @@ +dataset_type = 'KvasirSEGDataset' +data_root = 'data/' +img_scale = (512, 512) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +train_dataloader = dict( + batch_size=16, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='train.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='val.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) +test_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) diff --git a/projects/medical/2d_image/endoscopy/kvasir_seg/datasets/kvasir-seg_dataset.py b/projects/medical/2d_image/endoscopy/kvasir_seg/datasets/kvasir-seg_dataset.py new file mode 100644 index 0000000000..9d601328eb --- /dev/null +++ b/projects/medical/2d_image/endoscopy/kvasir_seg/datasets/kvasir-seg_dataset.py @@ -0,0 +1,30 @@ +from mmseg.datasets import BaseSegDataset +from mmseg.registry import DATASETS + + +@DATASETS.register_module() +class KvasirSEGDataset(BaseSegDataset): + """KvasirSEGDataset dataset. + + In segmentation map annotation for KvasirSEGDataset, 0 stands for + background, which is included in 2 categories. + ``reduce_zero_label`` is fixed to False. The ``img_suffix`` is + fixed to '.png' and ``seg_map_suffix`` is fixed to '.png'. + Args: + img_suffix (str): Suffix of images. Default: '.png' + seg_map_suffix (str): Suffix of segmentation maps. 
Default: '.png' + reduce_zero_label (bool): Whether to mark label zero as ignored. + Defaults to False. + """ + METAINFO = dict(classes=('background', 'polyp')) + + def __init__(self, + img_suffix='.png', + seg_map_suffix='.png', + reduce_zero_label=False, + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, + **kwargs) diff --git a/projects/medical/2d_image/endoscopy/kvasir_seg/tools/prepare_dataset.py b/projects/medical/2d_image/endoscopy/kvasir_seg/tools/prepare_dataset.py new file mode 100644 index 0000000000..74c43e9635 --- /dev/null +++ b/projects/medical/2d_image/endoscopy/kvasir_seg/tools/prepare_dataset.py @@ -0,0 +1,87 @@ +import glob +import os + +import numpy as np +from PIL import Image + +root_path = 'data/' +img_suffix = '.jpg' +seg_map_suffix = '.jpg' +save_img_suffix = '.png' +save_seg_map_suffix = '.png' +tgt_img_dir = os.path.join(root_path, 'images/train/') +tgt_mask_dir = os.path.join(root_path, 'masks/train/') +os.system('mkdir -p ' + tgt_img_dir) +os.system('mkdir -p ' + tgt_mask_dir) + + +def filter_suffix_recursive(src_dir, suffix): + # filter out file names and paths in source directory + suffix = '.' + suffix if '.' not in suffix else suffix + file_paths = glob.glob( + os.path.join(src_dir, '**', '*' + suffix), recursive=True) + file_names = [_.split('/')[-1] for _ in file_paths] + return sorted(file_paths), sorted(file_names) + + +def convert_label(img, convert_dict): + # map raw pixel values to label ids according to convert_dict + arr = np.zeros_like(img, dtype=np.uint8) + for c, i in convert_dict.items(): + arr[img == c] = i + return arr + + +def convert_pics_into_pngs(src_dir, tgt_dir, suffix, convert='RGB'): + if not os.path.exists(tgt_dir): + os.makedirs(tgt_dir) + + src_paths, src_names = filter_suffix_recursive(src_dir, suffix=suffix) + num = len(src_paths) + for i, (src_name, src_path) in enumerate(zip(src_names, src_paths)): + tgt_name = src_name.replace(suffix, save_img_suffix) + tgt_path = os.path.join(tgt_dir, tgt_name) + img = np.array(Image.open(src_path)) + if len(img.shape) == 2: + pil = Image.fromarray(img).convert(convert) + elif len(img.shape) == 3: + pil = Image.fromarray(img) + else: + raise ValueError('Input image not 2D/3D: ', img.shape) + + pil.save(tgt_path) + print(f'processed {i+1}/{num}.') + + +def convert_label_pics_into_pngs(src_dir, + tgt_dir, + suffix, + convert_dict={ + 0: 0, + 255: 1 + }): + if not os.path.exists(tgt_dir): + os.makedirs(tgt_dir) + + src_paths, src_names = filter_suffix_recursive(src_dir, suffix=suffix) + num = len(src_paths) + for i, (src_name, src_path) in enumerate(zip(src_names, src_paths)): + tgt_name = src_name.replace(suffix, save_seg_map_suffix) + tgt_path = os.path.join(tgt_dir, tgt_name) + + # read the mask as grayscale so the array is 2-D + img = np.array(Image.open(src_path).convert('L')) + img = convert_label(img, convert_dict) + Image.fromarray(img).save(tgt_path) + print(f'processed {i+1}/{num}.') + + +if __name__ == '__main__': + + convert_pics_into_pngs( + os.path.join(root_path, 'sessile-main-Kvasir-SEG/images'), + tgt_img_dir, + suffix=img_suffix) + + convert_label_pics_into_pngs( + os.path.join(root_path, 'sessile-main-Kvasir-SEG/masks'), + tgt_mask_dir, + suffix=seg_map_suffix) diff --git a/projects/medical/2d_image/endoscopy/kvasir_seg_aliyun/README.md b/projects/medical/2d_image/endoscopy/kvasir_seg_aliyun/README.md new file mode 100644 index 0000000000..80eb00f51b --- /dev/null +++ b/projects/medical/2d_image/endoscopy/kvasir_seg_aliyun/README.md @@ -0,0 +1,145 @@ +# Kvasir-SEG Segmented Polyp Dataset from Aliyun (Kvasir SEG
Aliyun) + +## Description + +This project supports **`Kvasir-SEG Segmented Polyp Dataset from Aliyun (Kvasir SEG Aliyun)`**, which can be downloaded from [here](https://tianchi.aliyun.com/dataset/84385). + +### Dataset Overview + +Colorectal cancer is the second most common cancer type among women and the third most common among men. Polyps are precursors to colorectal cancer and are therefore important to detect and remove at an early stage. Polyps are found in nearly half of the individuals at age 50 who undergo a colonoscopy screening, and their frequency increases with age. Polyps are abnormal tissue growths on the mucous membrane, which lines the inside of the GI tract, and can sometimes be cancerous. Colonoscopy is the gold standard for the detection and assessment of these polyps, with subsequent biopsy and removal of the polyps. Early disease detection has a huge impact on survival from colorectal cancer. Increasing the detection of polyps has been shown to decrease the risk of colorectal cancer. Thus, automatic detection of more polyps at an early stage can play a crucial role in prevention of and survival from colorectal cancer. + +The Kvasir-SEG dataset is based on the previous Kvasir dataset, which is the first multi-class dataset for gastrointestinal (GI) tract disease detection and classification. It contains annotated polyp images and their corresponding masks. The pixels depicting polyp tissue, the ROI, are represented by the foreground (white mask), while the background (in black) does not contain positive pixels. These images were collected and verified by experienced gastroenterologists from Vestre Viken Health Trust in Norway. The classes include anatomical landmarks, pathological findings and endoscopic procedures. + +### Information Statistics + +| Dataset Name | Anatomical Region | Task Type | Modality | Num. Classes | Train/Val/Test Images | Train/Val/Test Labeled | Release Date | License | +| ------------------------------------------------------ | ----------------- | ------------ | --------- | ------------ | --------------------- | ---------------------- | ------------ | --------------------------------------------------------- | +| [Kvasir-SEG](https://tianchi.aliyun.com/dataset/84385) | abdomen | segmentation | endoscopy | 2 | 1000/-/- | yes/-/- | 2020 | [CC-BY 4.0](https://creativecommons.org/licenses/by/4.0/) | + +| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test | +| :--------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: | +| background | 1000 | 84.72 | - | - | - | - | +| polyp | 1000 | 15.28 | - | - | - | - | + +Note: + +- `Pct` means percentage of pixels in this category in all pixels.
+ +### Visualization + +![kvasir_seg_aliyun](https://raw.githubusercontent.com/uni-medical/medical-datasets-visualization/main/2d/semantic_seg/endoscopy_images/kvasir_seg_aliyun/kvasir_seg_aliyun_dataset.png?raw=true) + +### Dataset Citation + +```bibtex +@inproceedings{jha2020kvasir, + title={Kvasir-seg: A segmented polyp dataset}, + author={Jha, Debesh and Smedsrud, Pia H and Riegler, Michael A and Halvorsen, P{\aa}l and Lange, Thomas de and Johansen, Dag and Johansen, H{\aa}vard D}, + booktitle={International Conference on Multimedia Modeling}, + pages={451--462}, + year={2020}, + organization={Springer} + } +``` + +### Prerequisites + +- Python v3.8 +- PyTorch v1.10.0 +- pillow(PIL) v9.3.0 +- scikit-learn(sklearn) v1.2.0 +- [MIM](https://github.com/open-mmlab/mim) v0.3.4 +- [MMCV](https://github.com/open-mmlab/mmcv) v2.0.0rc4 +- [MMEngine](https://github.com/open-mmlab/mmengine) v0.2.0 or higher +- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation) v1.0.0rc5 + +All the commands below rely on the correct configuration of `PYTHONPATH`, which should point to the project's directory so that Python can locate the module files. In `kvasir_seg_aliyun/` root directory, run the following line to add the current directory to `PYTHONPATH`: + +```shell +export PYTHONPATH=`pwd`:$PYTHONPATH +``` + +### Dataset Preparing + +- download dataset from [here](https://tianchi.aliyun.com/dataset/84385) and decompress the data to the path `'data/'`. +- run script `"python tools/prepare_dataset.py"` to format the data and change the folder structure as below. +- run script `"python ../../tools/split_seg_dataset.py"` to split the dataset and generate `train.txt`, `val.txt` and `test.txt`. If the labels of the official validation and test sets cannot be obtained, we randomly generate `train.txt` and `val.txt` from the training set. + +```none + mmsegmentation + ├── mmseg + ├── projects + │ ├── medical + │ │ ├── 2d_image + │ │ │ ├── endoscopy + │ │ │ │ ├── kvasir_seg_aliyun + │ │ │ │ │ ├── configs + │ │ │ │ │ ├── datasets + │ │ │ │ │ ├── tools + │ │ │ │ │ ├── data + │ │ │ │ │ │ ├── train.txt + │ │ │ │ │ │ ├── val.txt + │ │ │ │ │ │ ├── images + │ │ │ │ │ │ │ ├── train + │ │ │ │ | │ │ │ ├── xxx.png + │ │ │ │ | │ │ │ ├── ... + │ │ │ │ | │ │ │ └── xxx.png + │ │ │ │ │ │ ├── masks + │ │ │ │ │ │ │ ├── train + │ │ │ │ | │ │ │ ├── xxx.png + │ │ │ │ | │ │ │ ├── ... + │ │ │ │ | │ │ │ └── xxx.png +``` + +### Divided Dataset Information + +***Note: The table below is based on our own dataset split.*** + +| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test | +| :--------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: | +| background | 800 | 84.66 | 200 | 84.94 | - | - | +| polyp | 800 | 15.34 | 200 | 15.06 | - | - | + +### Training commands + +To train models on a single server with one GPU (default): + +```shell +mim train mmseg ./configs/${CONFIG_FILE} +``` + +### Testing commands + +To test models on a single server with one GPU (default): + +```shell +mim test mmseg ./configs/${CONFIG_FILE} --checkpoint ${CHECKPOINT_PATH} +``` + + + +## Checklist + +- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. + + - [x] Finish the code + - [x] Basic docstrings & proper citation + - [ ] Test-time correctness + - [x] A full README + +- [ ] Milestone 2: Indicates a successful model implementation. + + - [ ] Training-time correctness + +- [ ] Milestone 3: Good to be a part of our core package!
+ + - [ ] Type hints and docstrings + - [ ] Unit tests + - [ ] Code polishing + - [ ] Metafile.yml + +- [ ] Move your modules into the core package following the codebase's file hierarchy structure. + +- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure. diff --git a/projects/medical/2d_image/endoscopy/kvasir_seg_aliyun/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_kvasir-seg-aliyun-512x512.py b/projects/medical/2d_image/endoscopy/kvasir_seg_aliyun/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_kvasir-seg-aliyun-512x512.py new file mode 100644 index 0000000000..b59db95232 --- /dev/null +++ b/projects/medical/2d_image/endoscopy/kvasir_seg_aliyun/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_kvasir-seg-aliyun-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', + './kvasir-seg-aliyun_512x512.py', 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.kvasir-seg-aliyun_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.0001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/endoscopy/kvasir_seg_aliyun/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_kvasir-seg-aliyun-512x512.py b/projects/medical/2d_image/endoscopy/kvasir_seg_aliyun/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_kvasir-seg-aliyun-512x512.py new file mode 100644 index 0000000000..6c526680cd --- /dev/null +++ b/projects/medical/2d_image/endoscopy/kvasir_seg_aliyun/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_kvasir-seg-aliyun-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', + './kvasir-seg-aliyun_512x512.py', 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.kvasir-seg-aliyun_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/endoscopy/kvasir_seg_aliyun/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_kvasir-seg-aliyun-512x512.py b/projects/medical/2d_image/endoscopy/kvasir_seg_aliyun/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_kvasir-seg-aliyun-512x512.py new file mode 100644 index 0000000000..a192a5bd24 --- /dev/null +++ b/projects/medical/2d_image/endoscopy/kvasir_seg_aliyun/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_kvasir-seg-aliyun-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', + './kvasir-seg-aliyun_512x512.py', 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.kvasir-seg-aliyun_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.01) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = 
dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/endoscopy/kvasir_seg_aliyun/configs/fcn-unet-s5-d16_unet_1xb16-0.01lr-sigmoid-20k_kvasir-seg-aliyun-512x512.py b/projects/medical/2d_image/endoscopy/kvasir_seg_aliyun/configs/fcn-unet-s5-d16_unet_1xb16-0.01lr-sigmoid-20k_kvasir-seg-aliyun-512x512.py new file mode 100644 index 0000000000..5325e1f080 --- /dev/null +++ b/projects/medical/2d_image/endoscopy/kvasir_seg_aliyun/configs/fcn-unet-s5-d16_unet_1xb16-0.01lr-sigmoid-20k_kvasir-seg-aliyun-512x512.py @@ -0,0 +1,18 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', + './kvasir-seg-aliyun_512x512.py', 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.kvasir-seg-aliyun_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.01) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict( + num_classes=2, loss_decode=dict(use_sigmoid=True), out_channels=1), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/endoscopy/kvasir_seg_aliyun/configs/kvasir-seg-aliyun_512x512.py b/projects/medical/2d_image/endoscopy/kvasir_seg_aliyun/configs/kvasir-seg-aliyun_512x512.py new file mode 100644 index 0000000000..5f86880467 --- /dev/null +++ b/projects/medical/2d_image/endoscopy/kvasir_seg_aliyun/configs/kvasir-seg-aliyun_512x512.py @@ -0,0 +1,42 @@ +dataset_type = 'KvasirSEGAliyunDataset' +data_root = 'data/' +img_scale = (512, 512) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +train_dataloader = dict( + batch_size=16, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='train.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='val.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) +test_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) diff --git a/projects/medical/2d_image/endoscopy/kvasir_seg_aliyun/datasets/kvasir-seg-aliyun_dataset.py b/projects/medical/2d_image/endoscopy/kvasir_seg_aliyun/datasets/kvasir-seg-aliyun_dataset.py new file mode 100644 index 0000000000..198caf07bc --- /dev/null +++ b/projects/medical/2d_image/endoscopy/kvasir_seg_aliyun/datasets/kvasir-seg-aliyun_dataset.py @@ -0,0 +1,30 @@ +from mmseg.datasets import BaseSegDataset +from mmseg.registry import DATASETS + + +@DATASETS.register_module() +class KvasirSEGAliyunDataset(BaseSegDataset): + """KvasirSEGAliyunDataset dataset. 
+ + In segmentation map annotation for KvasirSEGAliyunDataset, + 0 stands for background, which is included in 2 categories. + ``reduce_zero_label`` is fixed to False. The ``img_suffix`` + is fixed to '.png' and ``seg_map_suffix`` is fixed to '.png'. + Args: + img_suffix (str): Suffix of images. Default: '.png' + seg_map_suffix (str): Suffix of segmentation maps. Default: '.png' + reduce_zero_label (bool): Whether to mark label zero as ignored. + Defaults to False. + """ + METAINFO = dict(classes=('background', 'polyp')) + + def __init__(self, + img_suffix='.png', + seg_map_suffix='.png', + reduce_zero_label=False, + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, + **kwargs) diff --git a/projects/medical/2d_image/endoscopy/kvasir_seg_aliyun/tools/prepare_dataset.py b/projects/medical/2d_image/endoscopy/kvasir_seg_aliyun/tools/prepare_dataset.py new file mode 100644 index 0000000000..b230e7fef5 --- /dev/null +++ b/projects/medical/2d_image/endoscopy/kvasir_seg_aliyun/tools/prepare_dataset.py @@ -0,0 +1,86 @@ +import glob +import os + +import numpy as np +from PIL import Image + +root_path = 'data/' +img_suffix = '.jpg' +seg_map_suffix = '.jpg' +save_img_suffix = '.png' +save_seg_map_suffix = '.png' +tgt_img_dir = os.path.join(root_path, 'images/train/') +tgt_mask_dir = os.path.join(root_path, 'masks/train/') +os.system('mkdir -p ' + tgt_img_dir) +os.system('mkdir -p ' + tgt_mask_dir) + + +def filter_suffix_recursive(src_dir, suffix): + # filter out file names and paths in source directory + suffix = '.' + suffix if '.' not in suffix else suffix + file_paths = glob.glob( + os.path.join(src_dir, '**', '*' + suffix), recursive=True) + file_names = [_.split('/')[-1] for _ in file_paths] + return sorted(file_paths), sorted(file_names) + + +def convert_label(img, convert_dict): + # map raw pixel values to label ids according to convert_dict + arr = np.zeros_like(img, dtype=np.uint8) + for c, i in convert_dict.items(): + arr[img == c] = i + return arr + + +def convert_pics_into_pngs(src_dir, tgt_dir, suffix, convert='RGB'): + if not os.path.exists(tgt_dir): + os.makedirs(tgt_dir) + + src_paths, src_names = filter_suffix_recursive(src_dir, suffix=suffix) + num = len(src_paths) + for i, (src_name, src_path) in enumerate(zip(src_names, src_paths)): + tgt_name = src_name.replace(suffix, save_img_suffix) + tgt_path = os.path.join(tgt_dir, tgt_name) + img = np.array(Image.open(src_path)) + if len(img.shape) == 2: + pil = Image.fromarray(img).convert(convert) + elif len(img.shape) == 3: + pil = Image.fromarray(img) + else: + raise ValueError('Input image not 2D/3D: ', img.shape) + + pil.save(tgt_path) + print(f'processed {i+1}/{num}.') + + +def convert_label_pics_into_pngs(src_dir, + tgt_dir, + suffix, + convert_dict={ + 0: 0, + 255: 1 + }): + if not os.path.exists(tgt_dir): + os.makedirs(tgt_dir) + + src_paths, src_names = filter_suffix_recursive(src_dir, suffix=suffix) + num = len(src_paths) + for i, (src_name, src_path) in enumerate(zip(src_names, src_paths)): + tgt_name = src_name.replace(suffix, save_seg_map_suffix) + tgt_path = os.path.join(tgt_dir, tgt_name) + + # read the mask as grayscale so the array is 2-D + img = np.array(Image.open(src_path).convert('L')) + img = convert_label(img, convert_dict) + Image.fromarray(img).save(tgt_path) + print(f'processed {i+1}/{num}.') + + +if __name__ == '__main__': + convert_pics_into_pngs( + os.path.join(root_path, 'Kvasir-SEG/images'), + tgt_img_dir, + suffix=img_suffix) + + convert_label_pics_into_pngs( + os.path.join(root_path, 'Kvasir-SEG/masks'), + tgt_mask_dir, +
suffix=seg_map_suffix) diff --git a/projects/medical/2d_image/fluorescein_angriogram/vampire/README.md b/projects/medical/2d_image/fluorescein_angriogram/vampire/README.md new file mode 100644 index 0000000000..c2c61c46a0 --- /dev/null +++ b/projects/medical/2d_image/fluorescein_angriogram/vampire/README.md @@ -0,0 +1,158 @@ +# Vessel Assessment and Measurement Platform for Images of the REtina + +## Description + +This project supports **`Vessel Assessment and Measurement Platform for Images of the REtina`**, and the dataset used in this project can be downloaded from [here](https://vampire.computing.dundee.ac.uk/vesselseg.html). + +### Dataset Overview + +In order to promote the evaluation of vessel segmentation on ultra-wide field-of-view (UWFV) fluorescein angiogram (FA) frames, we make public 8 frames from two different sequences, together with the manually annotated images and the results of our automatic vessel segmentation algorithm. + +### Original Statistic Information + +| Dataset name | Anatomical region | Task type | Modality | Num. Classes | Train/Val/Test Images | Train/Val/Test Labeled | Release Date | License | +| ---------------------------------------------------------------- | ----------------- | ------------ | ---------------------- | ------------ | --------------------- | ---------------------- | ------------ | --------------------------------------------------------------- | +| [Vampire](https://vampire.computing.dundee.ac.uk/vesselseg.html) | vessel | segmentation | fluorescein angiogram | 2 | 8/-/- | yes/-/- | 2017 | [CC-BY-NC 4.0](https://creativecommons.org/licenses/by-nc/4.0/) | + +| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test | +| :--------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: | +| background | 8 | 96.75 | - | - | - | - | +| vessel | 8 | 3.25 | - | - | - | - | + +Note: + +- `Pct` means percentage of pixels in this category in all pixels. + +### Visualization + +![bac](https://raw.githubusercontent.com/uni-medical/medical-datasets-visualization/main/2d/semantic_seg/fluorescein_angriogram/vampire/vampire_dataset.png) + +### Dataset Citation + +```bibtex + +@inproceedings{perez2011improving, + title={Improving vessel segmentation in ultra-wide field-of-view retinal fluorescein angiograms}, + author={Perez-Rovira, Adria and Zutis, K and Hubschman, Jean Pierre and Trucco, Emanuele}, + booktitle={2011 Annual International Conference of the IEEE Engineering in Medicine and Biology Society}, + pages={2614--2617}, + year={2011}, + organization={IEEE} +} + +@article{perez2011rerbee, + title={RERBEE: robust efficient registration via bifurcations and elongated elements applied to retinal fluorescein angiogram sequences}, + author={Perez-Rovira, Adria and Cabido, Raul and Trucco, Emanuele and McKenna, Stephen J and Hubschman, Jean Pierre}, + journal={IEEE Transactions on Medical Imaging}, + volume={31}, + number={1}, + pages={140--150}, + year={2011}, + publisher={IEEE} +} + +``` + +### Prerequisites + +- Python v3.8 +- PyTorch v1.10.0 +- pillow(PIL) v9.3.0 +- scikit-learn(sklearn) v1.2.0 +- [MIM](https://github.com/open-mmlab/mim) v0.3.4 +- [MMCV](https://github.com/open-mmlab/mmcv) v2.0.0rc4 +- [MMEngine](https://github.com/open-mmlab/mmengine) v0.2.0 or higher +- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation) v1.0.0rc5 + +All the commands below rely on the correct configuration of `PYTHONPATH`, which should point to the project's directory so that Python can locate the module files.
In `vampire/` root directory, run the following line to add the current directory to `PYTHONPATH`:
+
+```shell
+export PYTHONPATH=`pwd`:$PYTHONPATH
+```
+
+### Dataset preparing
+
+- download dataset from [here](https://vampire.computing.dundee.ac.uk/vesselseg.html) and decompress the data to the path `'data/'`.
+- run script `"python tools/prepare_dataset.py"` to format the data and change the folder structure as below.
+- run script `python ../../tools/split_seg_dataset.py` to split the dataset. Since the Vampire dataset has no separate test or validation set, we randomly sample 20% of the images as the validation set and use the remaining 80% for training, writing the two filename lists `train.txt` and `val.txt`. Because the random seed is hard-coded in the script, the split is reproducible (a minimal sketch of this split logic is given at the end of this README).
+
+```none
+ mmsegmentation
+ ├── mmseg
+ ├── projects
+ │ ├── medical
+ │ │ ├── 2d_image
+ │ │ │ ├── fluorescein_angriogram
+ │ │ │ │ ├── vampire
+ │ │ │ │ │ ├── configs
+ │ │ │ │ │ ├── datasets
+ │ │ │ │ │ ├── tools
+ │ │ │ │ │ ├── data
+ │ │ │ │ │ │ ├── train.txt
+ │ │ │ │ │ │ ├── val.txt
+ │ │ │ │ │ │ ├── images
+ │ │ │ │ │ │ │ ├── train
+ │ │ │ │ | │ │ │ ├── xxx.png
+ │ │ │ │ | │ │ │ ├── ...
+ │ │ │ │ | │ │ │ └── xxx.png
+ │ │ │ │ │ │ ├── masks
+ │ │ │ │ │ │ │ ├── train
+ │ │ │ │ | │ │ │ ├── xxx.png
+ │ │ │ │ | │ │ │ ├── ...
+ │ │ │ │ | │ │ │ └── xxx.png
+```
+
+### Divided Dataset Information
+
+***Note: The table information below is divided by ourselves.***
+
+| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test |
+| :--------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: |
+| background | 6 | 97.48 | 2 | 94.54 | - | - |
+| vessel | 6 | 2.52 | 2 | 5.46 | - | - |
+
+### Training commands
+
+To train models on a single server with one GPU. (default)
+
+```shell
+mim train mmseg ./configs/${CONFIG_PATH}
+```
+
+### Testing commands
+
+To test models on a single server with one GPU. (default)
+
+```shell
+mim test mmseg ./configs/${CONFIG_PATH} --checkpoint ${CHECKPOINT_PATH}
+```
+
+## Checklist
+
+- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`.
+
+  - [x] Finish the code
+
+  - [x] Basic docstrings & proper citation
+
+  - [ ] Test-time correctness
+
+  - [x] A full README
+
+- [ ] Milestone 2: Indicates a successful model implementation.
+
+  - [ ] Training-time correctness
+
+- [ ] Milestone 3: Good to be a part of our core package!
+
+  - [ ] Type hints and docstrings
+
+  - [ ] Unit tests
+
+  - [ ] Code polishing
+
+  - [ ] Metafile.yml
+
+- [ ] Move your modules into the core package following the codebase's file hierarchy structure.
+
+- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure.
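+
+## Dataset split sketch
+
+For reference, the 80%/20% split described in the dataset preparing section
+could be reproduced with logic along the following lines. This is a minimal
+sketch, not the actual `split_seg_dataset.py`; the seed value and the exact
+format of the filename lists are assumptions.
+
+```python
+import glob
+import os
+import random
+
+random.seed(0)  # assumed hard-coded seed; keeps the split reproducible
+
+# collect the stem names of all prepared training images
+names = sorted(
+    os.path.splitext(os.path.basename(p))[0]
+    for p in glob.glob('data/images/train/*.png'))
+random.shuffle(names)
+
+val_num = int(len(names) * 0.2)  # 20% validation, 80% training
+with open('data/val.txt', 'w') as f:
+    f.write('\n'.join('train/' + n for n in names[:val_num]) + '\n')
+with open('data/train.txt', 'w') as f:
+    f.write('\n'.join('train/' + n for n in names[val_num:]) + '\n')
+```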
diff --git a/projects/medical/2d_image/fluorescein_angriogram/vampire/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_vampire-512x512.py b/projects/medical/2d_image/fluorescein_angriogram/vampire/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_vampire-512x512.py new file mode 100755 index 0000000000..7f5273aaff --- /dev/null +++ b/projects/medical/2d_image/fluorescein_angriogram/vampire/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_vampire-512x512.py @@ -0,0 +1,19 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './vampire_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.vampire_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.0001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + type='EncoderDecoder', + data_preprocessor=data_preprocessor, + pretrained=None, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/fluorescein_angriogram/vampire/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_vampire-512x512.py b/projects/medical/2d_image/fluorescein_angriogram/vampire/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_vampire-512x512.py new file mode 100755 index 0000000000..4382229989 --- /dev/null +++ b/projects/medical/2d_image/fluorescein_angriogram/vampire/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_vampire-512x512.py @@ -0,0 +1,19 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './vampire_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.vampire_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + type='EncoderDecoder', + data_preprocessor=dict(size=img_scale), + pretrained=None, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/fluorescein_angriogram/vampire/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_vampire-512x512.py b/projects/medical/2d_image/fluorescein_angriogram/vampire/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_vampire-512x512.py new file mode 100755 index 0000000000..8d93e17627 --- /dev/null +++ b/projects/medical/2d_image/fluorescein_angriogram/vampire/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_vampire-512x512.py @@ -0,0 +1,22 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './vampire_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.vampire_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.01) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + type='EncoderDecoder', + data_preprocessor=data_preprocessor, + pretrained=None, + decode_head=dict( + num_classes=2, + loss_decode=dict(type='CrossEntropyLoss', use_sigmoid=True), + out_channels=1), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/fluorescein_angriogram/vampire/configs/vampire_512x512.py 
b/projects/medical/2d_image/fluorescein_angriogram/vampire/configs/vampire_512x512.py new file mode 100755 index 0000000000..4eda92f9f2 --- /dev/null +++ b/projects/medical/2d_image/fluorescein_angriogram/vampire/configs/vampire_512x512.py @@ -0,0 +1,42 @@ +dataset_type = 'VampireDataset' +data_root = 'data' +img_scale = (512, 512) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +train_dataloader = dict( + batch_size=16, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='train.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='val.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) +test_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) diff --git a/projects/medical/2d_image/fluorescein_angriogram/vampire/datasets/__init__.py b/projects/medical/2d_image/fluorescein_angriogram/vampire/datasets/__init__.py new file mode 100755 index 0000000000..93f9cbf050 --- /dev/null +++ b/projects/medical/2d_image/fluorescein_angriogram/vampire/datasets/__init__.py @@ -0,0 +1,3 @@ +from .vampire_dataset import VampireDataset + +__all__ = ['VampireDataset'] diff --git a/projects/medical/2d_image/fluorescein_angriogram/vampire/datasets/vampire_dataset.py b/projects/medical/2d_image/fluorescein_angriogram/vampire/datasets/vampire_dataset.py new file mode 100755 index 0000000000..4d38040f7f --- /dev/null +++ b/projects/medical/2d_image/fluorescein_angriogram/vampire/datasets/vampire_dataset.py @@ -0,0 +1,28 @@ +from mmseg.datasets import BaseSegDataset +from mmseg.registry import DATASETS + + +@DATASETS.register_module() +class VampireDataset(BaseSegDataset): + """VampireDataset dataset. + + In segmentation map annotation for VampireDataset, 0 stands for background, + which is included in 2 categories. ``reduce_zero_label`` is fixed to + False. The ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is + fixed to '.png'. + Args: + img_suffix (str): Suffix of images. Default: '.png' + seg_map_suffix (str): Suffix of segmentation maps. 
Default: '.png'
+        reduce_zero_label (bool): Whether to mark label zero as ignored.
+            Defaults to False.
+    """
+    METAINFO = dict(classes=('background', 'vessel'))
+
+    def __init__(self,
+                 img_suffix='.png',
+                 seg_map_suffix='.png',
+                 reduce_zero_label=False,
+                 **kwargs) -> None:
+        super().__init__(
+            img_suffix=img_suffix,
+            seg_map_suffix=seg_map_suffix,
+            reduce_zero_label=reduce_zero_label,
+            **kwargs)
diff --git a/projects/medical/2d_image/fluorescein_angriogram/vampire/tools/prepare_dataset.py b/projects/medical/2d_image/fluorescein_angriogram/vampire/tools/prepare_dataset.py
new file mode 100644
index 0000000000..2755b5d28b
--- /dev/null
+++ b/projects/medical/2d_image/fluorescein_angriogram/vampire/tools/prepare_dataset.py
@@ -0,0 +1,44 @@
+import os
+import shutil
+
+from PIL import Image
+
+path = 'data'
+
+if not os.path.exists(os.path.join(path, 'images', 'train')):
+    os.system(f'mkdir -p {os.path.join(path, "images", "train")}')
+
+if not os.path.exists(os.path.join(path, 'masks', 'train')):
+    os.system(f'mkdir -p {os.path.join(path, "masks", "train")}')
+
+origin_data_path = os.path.join(path, 'vesselSegmentation')
+
+imgs_amd14 = os.listdir(os.path.join(origin_data_path, 'AMD14'))
+imgs_ger7 = os.listdir(os.path.join(origin_data_path, 'GER7'))
+
+for img in imgs_amd14:
+    shutil.copy(
+        os.path.join(origin_data_path, 'AMD14', img),
+        os.path.join(path, 'images', 'train', img))
+    # copy GT
+    img_gt = img.replace('.png', '-GT.png')
+    shutil.copy(
+        os.path.join(origin_data_path, 'AMD14-GT', f'{img_gt}'),
+        os.path.join(path, 'masks', 'train', img))
+
+for img in imgs_ger7:
+    shutil.copy(
+        os.path.join(origin_data_path, 'GER7', img),
+        os.path.join(path, 'images', 'train', img))
+    # copy GT
+    img_gt = img.replace('.bmp', '-GT.png')
+    img = img.replace('bmp', 'png')
+    shutil.copy(
+        os.path.join(origin_data_path, 'GER7-GT', img_gt),
+        os.path.join(path, 'masks', 'train', img))
+
+imgs = os.listdir(os.path.join(path, 'images', 'train'))
+for img in imgs:
+    if not img.endswith('.png'):
+        # re-save the copied .bmp frames as .png; the original .bmp files are
+        # left in place but are ignored by the dataset (img_suffix='.png')
+        im = Image.open(os.path.join(path, 'images', 'train', img))
+        im.save(os.path.join(path, 'images', 'train', img[:-4] + '.png'))
diff --git a/projects/medical/2d_image/fundus_photography/dr_hagis/README.md b/projects/medical/2d_image/fundus_photography/dr_hagis/README.md
new file mode 100644
index 0000000000..85d8a3e271
--- /dev/null
+++ b/projects/medical/2d_image/fundus_photography/dr_hagis/README.md
@@ -0,0 +1,155 @@
+# DR HAGIS: Diabetic Retinopathy, Hypertension, Age-related macular degeneration and Glaucoma ImageS
+
+## Description
+
+This project supports **`DR HAGIS: Diabetic Retinopathy, Hypertension, Age-related macular degeneration and Glaucoma ImageS`**, which can be downloaded from [here](https://paperswithcode.com/dataset/dr-hagis).
+
+### Dataset Overview
+
+The DR HAGIS database has been created to aid the development of vessel extraction algorithms suitable for retinal screening programmes. Researchers are encouraged to test their segmentation algorithms using this database. All forty fundus images were obtained from a diabetic retinopathy screening programme in the UK. Hence, all images were taken from diabetic patients.
+
+Besides the fundus images, the manual segmentation of the retinal surface vessels is provided by an expert grader. These manually segmented images can be used as the ground truth to compare and assess the automatic vessel extraction algorithms. Masks of the FOV are provided as well to quantify the accuracy of vessel extraction within the FOV only.
The images were acquired in different screening centers, therefore reflecting the range of image resolutions, digital cameras and fundus cameras used in the clinic. The fundus images were captured using a Topcon TRC-NW6s, Topcon TRC-NW8 or a Canon CR DGi fundus camera with a horizontal 45-degree field-of-view (FOV). The images are 4752x3168 pixels, 3456x2304 pixels, 3126x2136 pixels, 2896x1944 pixels or 2816x1880 pixels in size.
+
+### Original Statistic Information
+
+| Dataset name | Anatomical region | Task type | Modality | Num. Classes | Train/Val/Test Images | Train/Val/Test Labeled | Release Date | License |
+| ------------------------------------------------------- | ----------------- | ------------ | ------------------ | ------------ | --------------------- | ---------------------- | ------------ | ------- |
+| [DR HAGIS](https://paperswithcode.com/dataset/dr-hagis) | head and neck | segmentation | fundus photography | 2 | 40/-/- | yes/-/- | 2017 | - |
+
+| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test |
+| :--------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: |
+| background | 40 | 96.38 | - | - | - | - |
+| vessel | 40 | 3.62 | - | - | - | - |
+
+Note:
+
+- `Pct` means percentage of pixels in this category in all pixels.
+
+### Visualization
+
+![bac](https://raw.githubusercontent.com/uni-medical/medical-datasets-visualization/main/2d/semantic_seg/fundus_photography/dr_hagis/dr_hagis_dataset.png)
+
+## Usage
+
+### Prerequisites
+
+- Python v3.8
+- PyTorch v1.10.0
+- [MIM](https://github.com/open-mmlab/mim) v0.3.4
+- [MMCV](https://github.com/open-mmlab/mmcv) v2.0.0rc4
+- [MMEngine](https://github.com/open-mmlab/mmengine) v0.2.0 or higher
+- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation) v1.0.0rc5
+
+All the commands below rely on the correct configuration of `PYTHONPATH`, which should point to the project's directory so that Python can locate the module files. In `dr_hagis/` root directory, run the following line to add the current directory to `PYTHONPATH`:
+
+```shell
+export PYTHONPATH=`pwd`:$PYTHONPATH
+```
+
+### Dataset preparing
+
+- download dataset from [here](https://paperswithcode.com/dataset/dr-hagis) and decompress the data to the path `'data/'`.
+- run script `"python tools/prepare_dataset.py"` to format the data and change the folder structure as below.
+- run script `"python ../../tools/split_seg_dataset.py"` to split the dataset and generate `train.txt`, `val.txt` and `test.txt`. If the labels of the official validation and test sets are not available, we randomly generate `train.txt` and `val.txt` from the training set.
+
+```none
+ mmsegmentation
+ ├── mmseg
+ ├── projects
+ │ ├── medical
+ │ │ ├── 2d_image
+ │ │ │ ├── fundus_photography
+ │ │ │ │ ├── dr_hagis
+ │ │ │ │ │ ├── configs
+ │ │ │ │ │ ├── datasets
+ │ │ │ │ │ ├── tools
+ │ │ │ │ │ ├── data
+ │ │ │ │ │ │ ├── train.txt
+ │ │ │ │ │ │ ├── val.txt
+ │ │ │ │ │ │ ├── images
+ │ │ │ │ │ │ │ ├── train
+ │ │ │ │ | │ │ │ ├── xxx.png
+ │ │ │ │ | │ │ │ ├── ...
+ │ │ │ │ | │ │ │ └── xxx.png
+ │ │ │ │ │ │ ├── masks
+ │ │ │ │ │ │ │ ├── train
+ │ │ │ │ | │ │ │ ├── xxx.png
+ │ │ │ │ | │ │ │ ├── ...
+ │ │ │ │ | │ │ │ └── xxx.png
+```
+
+### Divided Dataset Information
+
+***Note: The table information below is divided by ourselves.***
+
+| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct.
Test | +| :--------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: | +| background | 32 | 96.21 | 8 | 97.12 | - | - | +| vessel | 32 | 3.79 | 8 | 2.88 | - | - | + +### Training commands + +Train models on a single server with one GPU. + +```shell +mim train mmseg ./configs/${CONFIG_FILE} +``` + +### Testing commands + +Test models on a single server with one GPU. + +```shell +mim test mmseg ./configs/${CONFIG_FILE} --checkpoint ${CHECKPOINT_PATH} +``` + + + +## Dataset Citation + +If this work is helpful for your research, please consider citing the below paper. + +``` +@article{holm2017dr, + title={DR HAGIS—a fundus image database for the automatic extraction of retinal surface vessels from diabetic patients}, + author={Holm, Sven and Russell, Greg and Nourrit, Vincent and McLoughlin, Niall}, + journal={Journal of Medical Imaging}, + volume={4}, + number={1}, + pages={014503--014503}, + year={2017}, + publisher={Society of Photo-Optical Instrumentation Engineers} +} +``` + +## Checklist + +- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. + + - [x] Finish the code + + - [x] Basic docstrings & proper citation + + - [ ] Test-time correctness + + - [x] A full README + +- [ ] Milestone 2: Indicates a successful model implementation. + + - [ ] Training-time correctness + +- [ ] Milestone 3: Good to be a part of our core package! + + - [ ] Type hints and docstrings + + - [ ] Unit tests + + - [ ] Code polishing + + - [ ] Metafile.yml + +- [ ] Move your modules into the core package following the codebase's file hierarchy structure. + +- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure. diff --git a/projects/medical/2d_image/fundus_photography/dr_hagis/configs/dr-hagis_512x512.py b/projects/medical/2d_image/fundus_photography/dr_hagis/configs/dr-hagis_512x512.py new file mode 100644 index 0000000000..93b9638410 --- /dev/null +++ b/projects/medical/2d_image/fundus_photography/dr_hagis/configs/dr-hagis_512x512.py @@ -0,0 +1,42 @@ +dataset_type = 'DRHAGISDataset' +data_root = 'data/' +img_scale = (512, 512) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +train_dataloader = dict( + batch_size=16, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='train.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='val.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) +test_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) diff --git a/projects/medical/2d_image/fundus_photography/dr_hagis/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_dr-hagis-512x512.py 
b/projects/medical/2d_image/fundus_photography/dr_hagis/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_dr-hagis-512x512.py
new file mode 100644
index 0000000000..9d14427c45
--- /dev/null
+++ b/projects/medical/2d_image/fundus_photography/dr_hagis/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_dr-hagis-512x512.py
@@ -0,0 +1,17 @@
+_base_ = [
+    './dr-hagis_512x512.py', 'mmseg::_base_/models/fcn_unet_s5-d16.py',
+    'mmseg::_base_/default_runtime.py',
+    'mmseg::_base_/schedules/schedule_20k.py'
+]
+custom_imports = dict(imports='datasets.dr_hagis_dataset')
+img_scale = (512, 512)
+data_preprocessor = dict(size=img_scale)
+optimizer = dict(lr=0.0001)
+optim_wrapper = dict(optimizer=optimizer)
+model = dict(
+    data_preprocessor=data_preprocessor,
+    decode_head=dict(num_classes=2),
+    auxiliary_head=None,
+    test_cfg=dict(mode='whole', _delete_=True))
+vis_backends = None
+visualizer = dict(vis_backends=vis_backends)
diff --git a/projects/medical/2d_image/fundus_photography/dr_hagis/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_dr-hagis-512x512.py b/projects/medical/2d_image/fundus_photography/dr_hagis/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_dr-hagis-512x512.py
new file mode 100644
index 0000000000..507ec748bf
--- /dev/null
+++ b/projects/medical/2d_image/fundus_photography/dr_hagis/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_dr-hagis-512x512.py
@@ -0,0 +1,17 @@
+_base_ = [
+    './dr-hagis_512x512.py', 'mmseg::_base_/models/fcn_unet_s5-d16.py',
+    'mmseg::_base_/default_runtime.py',
+    'mmseg::_base_/schedules/schedule_20k.py'
+]
+custom_imports = dict(imports='datasets.dr_hagis_dataset')
+img_scale = (512, 512)
+data_preprocessor = dict(size=img_scale)
+optimizer = dict(lr=0.001)
+optim_wrapper = dict(optimizer=optimizer)
+model = dict(
+    data_preprocessor=data_preprocessor,
+    decode_head=dict(num_classes=2),
+    auxiliary_head=None,
+    test_cfg=dict(mode='whole', _delete_=True))
+vis_backends = None
+visualizer = dict(vis_backends=vis_backends)
diff --git a/projects/medical/2d_image/fundus_photography/dr_hagis/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_dr-hagis-512x512.py b/projects/medical/2d_image/fundus_photography/dr_hagis/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_dr-hagis-512x512.py
new file mode 100644
index 0000000000..092ae00a7d
--- /dev/null
+++ b/projects/medical/2d_image/fundus_photography/dr_hagis/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_dr-hagis-512x512.py
@@ -0,0 +1,17 @@
+_base_ = [
+    './dr-hagis_512x512.py', 'mmseg::_base_/models/fcn_unet_s5-d16.py',
+    'mmseg::_base_/default_runtime.py',
+    'mmseg::_base_/schedules/schedule_20k.py'
+]
+custom_imports = dict(imports='datasets.dr_hagis_dataset')
+img_scale = (512, 512)
+data_preprocessor = dict(size=img_scale)
+optimizer = dict(lr=0.01)
+optim_wrapper = dict(optimizer=optimizer)
+model = dict(
+    data_preprocessor=data_preprocessor,
+    decode_head=dict(num_classes=2),
+    auxiliary_head=None,
+    test_cfg=dict(mode='whole', _delete_=True))
+vis_backends = None
+visualizer = dict(vis_backends=vis_backends)
diff --git a/projects/medical/2d_image/fundus_photography/dr_hagis/datasets/dr_hagis_dataset.py b/projects/medical/2d_image/fundus_photography/dr_hagis/datasets/dr_hagis_dataset.py
new file mode 100644
index 0000000000..9659f0b8d7
--- /dev/null
+++ b/projects/medical/2d_image/fundus_photography/dr_hagis/datasets/dr_hagis_dataset.py
@@ -0,0 +1,27 @@
+from mmseg.datasets import BaseSegDataset
+from mmseg.registry import DATASETS
+
+
+@DATASETS.register_module()
+class DRHAGISDataset(BaseSegDataset):
+    """DRHAGISDataset dataset.
+
+    In segmentation map annotation for DRHAGISDataset,
+    ``reduce_zero_label`` is fixed to False. The ``img_suffix``
+    is fixed to '.png' and ``seg_map_suffix`` is fixed to '.png'.
+
+    Args:
+        img_suffix (str): Suffix of images. Default: '.png'
+        seg_map_suffix (str): Suffix of segmentation maps. Default: '.png'
+    """
+    METAINFO = dict(classes=('background', 'vessel'))
+
+    def __init__(self,
+                 img_suffix='.png',
+                 seg_map_suffix='.png',
+                 **kwargs) -> None:
+        super().__init__(
+            img_suffix=img_suffix,
+            seg_map_suffix=seg_map_suffix,
+            reduce_zero_label=False,
+            **kwargs)
diff --git a/projects/medical/2d_image/fundus_photography/dr_hagis/tools/prepare_dataset.py b/projects/medical/2d_image/fundus_photography/dr_hagis/tools/prepare_dataset.py
new file mode 100755
index 0000000000..51f4df7dac
--- /dev/null
+++ b/projects/medical/2d_image/fundus_photography/dr_hagis/tools/prepare_dataset.py
@@ -0,0 +1,41 @@
+import glob
+import os
+import shutil
+
+import mmengine
+import numpy as np
+from PIL import Image
+
+root_path = 'data/'
+img_suffix = '.jpg'
+seg_map_suffix = '_manual_orig.png'
+save_img_suffix = '.png'
+save_seg_map_suffix = '.png'
+
+# without recursive=True, '**' only matches one directory level here,
+# i.e. data/DRHAGIS/<subdir>/*.jpg
+x_train = glob.glob(os.path.join('data/DRHAGIS/**/*' + img_suffix))
+
+mmengine.mkdir_or_exist(root_path + 'images/train/')
+mmengine.mkdir_or_exist(root_path + 'masks/train/')
+
+part_dir_dict = {0: 'train/', 1: 'val/'}
+for ith, part in enumerate([x_train]):
+    part_dir = part_dir_dict[ith]
+    for img in part:
+        basename = os.path.basename(img)
+        shutil.copy(
+            img, root_path + 'images/' + part_dir + basename.split('.')[0] +
+            save_img_suffix)
+        mask_path = root_path + 'DRHAGIS/Manual_Segmentations/' + basename.split(  # noqa
+            '.')[0] + seg_map_suffix
+
+        save_mask_path = root_path + 'masks/' + part_dir + basename.split(
+            '.')[0] + save_seg_map_suffix  # noqa
+        # binarize the 0/255 manual segmentation into train IDs {0, 1}
+        mask = np.array(Image.open(mask_path)).astype(np.uint8)
+        mask[mask == 255] = 1
+        mask = Image.fromarray(mask)
+        mask.save(save_mask_path)
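+
+# A quick post-conversion check one might run (hypothetical, not part of the
+# original script): every saved mask should now contain only train IDs {0, 1}.
+#   for p in glob.glob(root_path + 'masks/train/*' + save_seg_map_suffix):
+#       assert set(np.unique(np.array(Image.open(p)))) <= {0, 1}, p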
diff --git a/projects/medical/2d_image/fundus_photography/gamma3/README.md b/projects/medical/2d_image/fundus_photography/gamma3/README.md
new file mode 100644
index 0000000000..e834508fcb
--- /dev/null
+++ b/projects/medical/2d_image/fundus_photography/gamma3/README.md
@@ -0,0 +1,167 @@
+# Glaucoma grAding from Multi-Modality imAges Task3
+
+## Description
+
+This project supports **`Glaucoma grAding from Multi-Modality imAges Task3`**, and the dataset used in this project can be downloaded from [here](https://aistudio.baidu.com/aistudio/competition/detail/121/0/datasets).
+
+### Dataset Overview
+
+This regular-challenge dataset was provided by Sun Yat-sen Ophthalmic Center, Sun Yat-sen University, Guangzhou, China. The dataset contains 200 fundus color images: 100 pairs in the training set and 100 pairs in the test set.
+
+### Original Statistic Information
+
+| Dataset name | Anatomical region | Task type | Modality | Num. Classes | Train/Val/Test Images | Train/Val/Test Labeled | Release Date | License |
+| ----------------------------------------------------------------------------------- | ----------------- | ------------ | ------------------ | ------------ | --------------------- | ---------------------- | ------------ | --------------------------------------------------------------- |
+| [GammaTask3](https://aistudio.baidu.com/aistudio/competition/detail/121/0/datasets) | eye | segmentation | fundus photography | 3 | 100/-/100 | yes/-/- | 2021 | [CC-BY-NC 4.0](https://creativecommons.org/licenses/by-sa/4.0/) |
+
+| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test |
+| :--------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: |
+| background | 100 | 99.02 | - | - | - | - |
+| optic disc | 100 | 0.67 | - | - | - | - |
+| optic cup | 100 | 0.31 | - | - | - | - |
+
+Note:
+
+- `Pct` means percentage of pixels in this category in all pixels.
+
+### Visualization
+
+![bac](https://raw.githubusercontent.com/uni-medical/medical-datasets-visualization/main/2d/semantic_seg/fundus_photography/gamma3/gamma3_dataset.png)
+
+## Dataset Citation
+
+```bibtex
+@article{fu2018joint,
+  title={Joint optic disc and cup segmentation based on multi-label deep network and polar transformation},
+  author={Fu, Huazhu and Cheng, Jun and Xu, Yanwu and Wong, Damon Wing Kee and Liu, Jiang and Cao, Xiaochun},
+  journal={IEEE transactions on medical imaging},
+  volume={37},
+  number={7},
+  pages={1597--1605},
+  year={2018},
+  publisher={IEEE}
+}
+
+@article{sevastopolsky2017optic,
+  title={Optic disc and cup segmentation methods for glaucoma detection with modification of U-Net convolutional neural network},
+  author={Sevastopolsky, Artem},
+  journal={Pattern Recognition and Image Analysis},
+  volume={27},
+  pages={618--624},
+  year={2017},
+  publisher={Springer}
+}
+```
+
+### Prerequisites
+
+- Python v3.8
+- PyTorch v1.10.0
+- pillow (PIL) v9.3.0
+- scikit-learn (sklearn) v1.2.0
+- [MIM](https://github.com/open-mmlab/mim) v0.3.4
+- [MMCV](https://github.com/open-mmlab/mmcv) v2.0.0rc4
+- [MMEngine](https://github.com/open-mmlab/mmengine) v0.2.0 or higher
+- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation) v1.0.0rc5
+
+All the commands below rely on the correct configuration of `PYTHONPATH`, which should point to the project's directory so that Python can locate the module files. In `gamma3/` root directory, run the following line to add the current directory to `PYTHONPATH`:
+
+```shell
+export PYTHONPATH=`pwd`:$PYTHONPATH
+```
+
+### Dataset preparing
+
+- download dataset from [here](https://aistudio.baidu.com/aistudio/competition/detail/121/0/datasets) and decompress the data to the path `'data/'`.
+- run script `"python tools/prepare_dataset.py"` to format the data and change the folder structure as below.
+- run script `"python ../../tools/split_seg_dataset.py"` to split the dataset and generate `train.txt`, `val.txt` and `test.txt`. If the labels of the official validation and test sets are not available, we randomly generate `train.txt` and `val.txt` from the training set.
+
+```none
+ mmsegmentation
+ ├── mmseg
+ ├── projects
+ │ ├── medical
+ │ │ ├── 2d_image
+ │ │ │ ├── fundus_photography
+ │ │ │ │ ├── gamma3
+ │ │ │ │ │ ├── configs
+ │ │ │ │ │ ├── datasets
+ │ │ │ │ │ ├── tools
+ │ │ │ │ │ ├── data
+ │ │ │ │ │ │ ├── train.txt
+ │ │ │ │ │ │ ├── val.txt
+ │ │ │ │ │ │ ├── images
+ │ │ │ │ │ │ │ ├── train
+ │ │ │ │ | │ │ │ ├── xxx.png
+ │ │ │ │ | │ │ │ ├── ...
+ │ │ │ │ | │ │ │ └── xxx.png + │ │ │ │ │ │ │ ├── test + │ │ │ │ | │ │ │ ├── yyy.png + │ │ │ │ | │ │ │ ├── ... + │ │ │ │ | │ │ │ └── yyy.png + │ │ │ │ │ │ ├── masks + │ │ │ │ │ │ │ ├── train + │ │ │ │ | │ │ │ ├── xxx.png + │ │ │ │ | │ │ │ ├── ... + │ │ │ │ | │ │ │ └── xxx.png +``` + +### Divided Dataset Information + +***Note: The table information below is divided by ourselves.*** + +| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test | +| :--------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: | +| background | 80 | 99.01 | 20 | 99.07 | - | - | +| optic disc | 80 | 0.68 | 20 | 0.63 | - | - | +| optic cup | 80 | 0.32 | 20 | 0.31 | - | - | + +### Training commands + +To train models on a single server with one GPU. (default) + +```shell +mim train mmseg ./configs/${CONFIG_PATH} +``` + +### Testing commands + +To test models on a single server with one GPU. (default) + +```shell +mim test mmseg ./configs/${CONFIG_PATH} --checkpoint ${CHECKPOINT_PATH} +``` + + + +## Checklist + +- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. + + - [x] Finish the code + + - [x] Basic docstrings & proper citation + + - [ ] Test-time correctness + + - [x] A full README + +- [ ] Milestone 2: Indicates a successful model implementation. + + - [ ] Training-time correctness + +- [ ] Milestone 3: Good to be a part of our core package! + + - [ ] Type hints and docstrings + + - [ ] Unit tests + + - [ ] Code polishing + + - [ ] Metafile.yml + +- [ ] Move your modules into the core package following the codebase's file hierarchy structure. + +- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure. diff --git a/projects/medical/2d_image/fundus_photography/gamma3/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_gamma3-512x512.py b/projects/medical/2d_image/fundus_photography/gamma3/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_gamma3-512x512.py new file mode 100644 index 0000000000..0daac51e10 --- /dev/null +++ b/projects/medical/2d_image/fundus_photography/gamma3/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_gamma3-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './gamma3_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.gamma3_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.0001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=3), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/fundus_photography/gamma3/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_gamma3-512x512.py b/projects/medical/2d_image/fundus_photography/gamma3/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_gamma3-512x512.py new file mode 100644 index 0000000000..8a25cd0d26 --- /dev/null +++ b/projects/medical/2d_image/fundus_photography/gamma3/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_gamma3-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './gamma3_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.gamma3_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.001) +optim_wrapper = 
dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=3), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/fundus_photography/gamma3/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_gamma3-512x512.py b/projects/medical/2d_image/fundus_photography/gamma3/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_gamma3-512x512.py new file mode 100644 index 0000000000..ea64843867 --- /dev/null +++ b/projects/medical/2d_image/fundus_photography/gamma3/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_gamma3-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './gamma3_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.gamma3_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.01) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=3), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/fundus_photography/gamma3/configs/gamma3_512x512.py b/projects/medical/2d_image/fundus_photography/gamma3/configs/gamma3_512x512.py new file mode 100644 index 0000000000..d23ab55ca7 --- /dev/null +++ b/projects/medical/2d_image/fundus_photography/gamma3/configs/gamma3_512x512.py @@ -0,0 +1,42 @@ +dataset_type = 'Gamma3Dataset' +data_root = 'data/' +img_scale = (512, 512) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +train_dataloader = dict( + batch_size=16, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='train.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='val.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) +test_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) diff --git a/projects/medical/2d_image/fundus_photography/gamma3/datasets/gamma3_dataset.py b/projects/medical/2d_image/fundus_photography/gamma3/datasets/gamma3_dataset.py new file mode 100644 index 0000000000..56cbdd63e6 --- /dev/null +++ b/projects/medical/2d_image/fundus_photography/gamma3/datasets/gamma3_dataset.py @@ -0,0 +1,30 @@ +from mmseg.datasets import BaseSegDataset +from mmseg.registry import DATASETS + + +@DATASETS.register_module() +class Gamma3Dataset(BaseSegDataset): + """Gamma3Dataset dataset. + + In segmentation map annotation for Gamma3Dataset, + ``reduce_zero_label`` is fixed to False. 
The ``img_suffix``
+    is fixed to '.png' and ``seg_map_suffix`` is fixed to '.png'.
+
+    Args:
+        img_suffix (str): Suffix of images. Default: '.png'
+        seg_map_suffix (str): Suffix of segmentation maps. Default: '.png'
+        reduce_zero_label (bool): Whether to mark label zero as ignored.
+            Defaults to False.
+    """
+    METAINFO = dict(classes=('background', 'disc', 'cup'))
+
+    def __init__(self,
+                 img_suffix='.png',
+                 seg_map_suffix='.png',
+                 reduce_zero_label=False,
+                 **kwargs) -> None:
+        super().__init__(
+            img_suffix=img_suffix,
+            seg_map_suffix=seg_map_suffix,
+            reduce_zero_label=reduce_zero_label,
+            **kwargs)
diff --git a/projects/medical/2d_image/fundus_photography/gamma3/tools/prepare_dataset.py b/projects/medical/2d_image/fundus_photography/gamma3/tools/prepare_dataset.py
new file mode 100644
index 0000000000..eb820b6b74
--- /dev/null
+++ b/projects/medical/2d_image/fundus_photography/gamma3/tools/prepare_dataset.py
@@ -0,0 +1,107 @@
+import glob
+import os
+
+import numpy as np
+from PIL import Image
+
+root_path = 'data/'
+img_suffix = '.jpg'
+seg_map_suffix = '.png'
+save_img_suffix = '.png'
+save_seg_map_suffix = '.png'
+tgt_img_train_dir = os.path.join(root_path, 'images/train/')
+tgt_mask_train_dir = os.path.join(root_path, 'masks/train/')
+tgt_img_test_dir = os.path.join(root_path, 'images/test/')
+os.system('mkdir -p ' + tgt_img_train_dir)
+os.system('mkdir -p ' + tgt_mask_train_dir)
+os.system('mkdir -p ' + tgt_img_test_dir)
+
+
+def filter_suffix_recursive(src_dir, suffix):
+    # filter out file names and paths in source directory
+    suffix = '.' + suffix if '.' not in suffix else suffix
+    file_paths = glob.glob(
+        os.path.join(src_dir, '**/*' + suffix), recursive=True)
+    file_names = [_.split('/')[-1] for _ in file_paths]
+    return sorted(file_paths), sorted(file_names)
+
+
+def convert_label(img, convert_dict):
+    # map raw grayscale values to contiguous train IDs
+    arr = np.zeros_like(img, dtype=np.uint8)
+    for c, i in convert_dict.items():
+        arr[img == c] = i
+    return arr
+
+
+def convert_pics_into_pngs(src_dir, tgt_dir, suffix, convert='RGB'):
+    if not os.path.exists(tgt_dir):
+        os.makedirs(tgt_dir)
+    src_paths, src_names = filter_suffix_recursive(src_dir, suffix=suffix)
+
+    for i, (src_name, src_path) in enumerate(zip(src_names, src_paths)):
+        tgt_name = src_name.replace(suffix, save_img_suffix)
+        tgt_path = os.path.join(tgt_dir, tgt_name)
+        num = len(src_paths)
+        img = np.array(Image.open(src_path))
+        if len(img.shape) == 2:
+            pil = Image.fromarray(img).convert(convert)
+        elif len(img.shape) == 3:
+            pil = Image.fromarray(img)
+        else:
+            raise ValueError('Input image not 2D/3D: ', img.shape)
+
+        pil.save(tgt_path)
+        print(f'processed {i+1}/{num}.')
+
+
+def convert_label_pics_into_pngs(src_dir,
+                                 tgt_dir,
+                                 suffix,
+                                 convert_dict={
+                                     0: 2,
+                                     128: 1,
+                                     255: 0
+                                 }):
+    if not os.path.exists(tgt_dir):
+        os.makedirs(tgt_dir)
+
+    src_paths, src_names = filter_suffix_recursive(src_dir, suffix=suffix)
+    num = len(src_paths)
+    for i, (src_name, src_path) in enumerate(zip(src_names, src_paths)):
+        tgt_name = src_name.replace(suffix, save_seg_map_suffix)
+        tgt_path = os.path.join(tgt_dir, tgt_name)
+
+        img = np.array(Image.open(src_path))
+        img = convert_label(img, convert_dict)
+        Image.fromarray(img).save(tgt_path)
+        print(f'processed {i+1}/{num}.')
+
+
+if __name__ == '__main__':
+
+    convert_pics_into_pngs(
+        os.path.join(
+            root_path,
+            'task3_disc_cup_segmentation/training/fundus color images/'),
+        tgt_img_train_dir,
+        suffix=img_suffix)
+
+    convert_pics_into_pngs(
+        os.path.join(
+            root_path,
+            'task3_disc_cup_segmentation/testing/fundus color images/'),
+        tgt_img_test_dir,
+        suffix=img_suffix)
+
+    convert_label_pics_into_pngs(
+        os.path.join(root_path,
+                     'task3_disc_cup_segmentation/training/Disc_Cup_Mask/'),
+        tgt_mask_train_dir,
+        suffix=seg_map_suffix,
+        convert_dict={
+            0: 2,
+            128: 1,
+            255: 0
+        })
+    # original: [0, 128, 255] for ['optic cup', 'optic disc', 'background']
+    # converted: [0, 1, 2] for ['background', 'optic disc', 'optic cup']
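+    # Doctest-style illustration of the remapping above (illustrative only,
+    # not part of the original script):
+    #   >>> convert_label(np.array([0, 128, 255]), {0: 2, 128: 1, 255: 0})
+    #   array([2, 1, 0], dtype=uint8)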
diff --git a/projects/medical/2d_image/fundus_photography/orvs/README.md b/projects/medical/2d_image/fundus_photography/orvs/README.md
new file mode 100644
index 0000000000..6f09203ac4
--- /dev/null
+++ b/projects/medical/2d_image/fundus_photography/orvs/README.md
@@ -0,0 +1,140 @@
+# ORVS (Online Retinal image for Vessel Segmentation)
+
+## Description
+
+This project supports **`ORVS (Online Retinal image for Vessel Segmentation)`**, which can be downloaded from [here](https://opendatalab.org.cn/ORVS).
+
+### Dataset Overview
+
+The ORVS dataset was newly established through a collaboration between the Department of Computer Science and the Department of Vision Science at the University of Calgary. The dataset contains 49 images collected from a clinic in Calgary, Canada, consisting of 42 training images and 7 testing images. All images were obtained using a Zeiss Visucam 200 with a 30-degree field of view (FOV). The image size is 1444×1444 pixels with 24 bits per pixel. The images are stored in JPEG format with low compression, which is common in ophthalmic practice. All images were manually traced by an expert who has been working in the field of retinal image analysis and has been trained to mark all pixels belonging to retinal vessels. The Windows Paint 3D tool was used for manual image annotation.
+
+### Original Statistic Information
+
+| Dataset name | Anatomical region | Task type | Modality | Num. Classes | Train/Val/Test Images | Train/Val/Test Labeled | Release Date | License |
+| ------------------------------------------- | ----------------- | ------------ | ------------------ | ------------ | --------------------- | ---------------------- | ------------ | ------- |
+| [ORVS](https://opendatalab.org.cn/ORVS) | eye | segmentation | fundus photography | 2 | 130/-/72 | yes/-/yes | 2020 | - |
+
+| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test |
+| :--------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: |
+| background | 130 | 94.83 | - | - | 72 | 94.25 |
+| vessel | 130 | 5.17 | - | - | 72 | 5.75 |
+
+Note:
+
+- `Pct` means percentage of pixels in this category in all pixels.
+
+### Visualization
+
+![bac](https://raw.githubusercontent.com/uni-medical/medical-datasets-visualization/main/2d/semantic_seg/fundus_photography/orvs/ORVS_dataset.png)
+
+### Prerequisites
+
+- Python v3.8
+- PyTorch v1.10.0
+- [MIM](https://github.com/open-mmlab/mim) v0.3.4
+- [MMCV](https://github.com/open-mmlab/mmcv) v2.0.0rc4
+- [MMEngine](https://github.com/open-mmlab/mmengine) v0.2.0 or higher
+- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation) v1.0.0rc5
+
+All the commands below rely on the correct configuration of `PYTHONPATH`, which should point to the project's directory so that Python can locate the module files.
In `orvs/` root directory, run the following line to add the current directory to `PYTHONPATH`:
+
+```shell
+export PYTHONPATH=`pwd`:$PYTHONPATH
+```
+
+### Dataset preparing
+
+- Clone this [repository](https://github.com/AbdullahSarhan/ICPRVessels), then move `Vessels-Datasets` to `data/`.
+- run script `"python tools/prepare_dataset.py"` to format the data and change the folder structure as below.
+- run script `"python ../../tools/split_seg_dataset.py"` to split the dataset and generate `train.txt`, `val.txt` and `test.txt`. If the labels of the official validation and test sets are not available, we randomly generate `train.txt` and `val.txt` from the training set.
+
+```none
+ mmsegmentation
+ ├── mmseg
+ ├── projects
+ │ ├── medical
+ │ │ ├── 2d_image
+ │ │ │ ├── fundus_photography
+ │ │ │ │ ├── orvs
+ │ │ │ │ │ ├── configs
+ │ │ │ │ │ ├── datasets
+ │ │ │ │ │ ├── tools
+ │ │ │ │ │ ├── data
+ │ │ │ │ │ │ ├── train.txt
+ │ │ │ │ │ │ ├── test.txt
+ │ │ │ │ │ │ ├── images
+ │ │ │ │ │ │ │ ├── train
+ │ │ │ │ | │ │ │ ├── xxx.png
+ │ │ │ │ | │ │ │ ├── ...
+ │ │ │ │ | │ │ │ └── xxx.png
+ │ │ │ │ │ │ ├── masks
+ │ │ │ │ │ │ │ ├── train
+ │ │ │ │ | │ │ │ ├── xxx.png
+ │ │ │ │ | │ │ │ ├── ...
+ │ │ │ │ | │ │ │ └── xxx.png
+```
+
+### Training commands
+
+Train models on a single server with one GPU.
+
+```shell
+mim train mmseg ./configs/${CONFIG_FILE}
+```
+
+### Testing commands
+
+Test models on a single server with one GPU.
+
+```shell
+mim test mmseg ./configs/${CONFIG_FILE} --checkpoint ${CHECKPOINT_PATH}
+```
+
+## Dataset Citation
+
+If this work is helpful for your research, please consider citing the below paper.
+
+```
+@inproceedings{sarhan2021transfer,
+  title={Transfer learning through weighted loss function and group normalization for vessel segmentation from retinal images},
+  author={Sarhan, Abdullah and Rokne, Jon and Alhajj, Reda and Crichton, Andrew},
+  booktitle={2020 25th International Conference on Pattern Recognition (ICPR)},
+  pages={9211--9218},
+  year={2021},
+  organization={IEEE}
+}
+```
+
+## Checklist
+
+- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`.
+
+  - [x] Finish the code
+
+  - [x] Basic docstrings & proper citation
+
+  - [ ] Test-time correctness
+
+  - [x] A full README
+
+- [ ] Milestone 2: Indicates a successful model implementation.
+
+  - [ ] Training-time correctness
+
+- [ ] Milestone 3: Good to be a part of our core package!
+
+  - [ ] Type hints and docstrings
+
+  - [ ] Unit tests
+
+  - [ ] Code polishing
+
+  - [ ] Metafile.yml
+
+- [ ] Move your modules into the core package following the codebase's file hierarchy structure.
+
+- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure.
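+
+## Registry sanity check
+
+As a quick way to verify that `PYTHONPATH` and the prepared `data/` layout
+are set up correctly, the dataset class registered by this project can be
+built directly from the registry. This is a hypothetical snippet for local
+debugging, not part of the project code:
+
+```python
+from mmengine.registry import init_default_scope
+
+from mmseg.registry import DATASETS
+
+import datasets.orvs_dataset  # noqa: F401  (runs @DATASETS.register_module())
+
+init_default_scope('mmseg')
+dataset = DATASETS.build(
+    dict(
+        type='ORVSDataset',
+        data_root='data/',
+        ann_file='train.txt',
+        data_prefix=dict(img_path='images/', seg_map_path='masks/'),
+        pipeline=[]))
+print(len(dataset), dataset.METAINFO['classes'])
+```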
diff --git a/projects/medical/2d_image/fundus_photography/orvs/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_orvs-512x512.py b/projects/medical/2d_image/fundus_photography/orvs/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_orvs-512x512.py new file mode 100644 index 0000000000..662f837158 --- /dev/null +++ b/projects/medical/2d_image/fundus_photography/orvs/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_orvs-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + './orvs_512x512.py', 'mmseg::_base_/models/fcn_unet_s5-d16.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.orvs_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.0001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/fundus_photography/orvs/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_orvs-512x512.py b/projects/medical/2d_image/fundus_photography/orvs/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_orvs-512x512.py new file mode 100644 index 0000000000..c47cdb6b24 --- /dev/null +++ b/projects/medical/2d_image/fundus_photography/orvs/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_orvs-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + './orvs_512x512.py', 'mmseg::_base_/models/fcn_unet_s5-d16.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.orvs_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/fundus_photography/orvs/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_orvs-512x512.py b/projects/medical/2d_image/fundus_photography/orvs/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_orvs-512x512.py new file mode 100644 index 0000000000..1097aade28 --- /dev/null +++ b/projects/medical/2d_image/fundus_photography/orvs/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_orvs-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + './orvs_512x512.py', 'mmseg::_base_/models/fcn_unet_s5-d16.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.orvs_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.01) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/fundus_photography/orvs/configs/orvs_512x512.py b/projects/medical/2d_image/fundus_photography/orvs/configs/orvs_512x512.py new file mode 100644 index 0000000000..a5594dec38 --- /dev/null +++ b/projects/medical/2d_image/fundus_photography/orvs/configs/orvs_512x512.py @@ -0,0 +1,42 @@ +dataset_type = 'ORVSDataset' +data_root = 'data/' +img_scale = (512, 512) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='Resize', 
scale=img_scale, keep_ratio=False), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +train_dataloader = dict( + batch_size=16, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='train.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='test.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) +test_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) diff --git a/projects/medical/2d_image/fundus_photography/orvs/datasets/orvs_dataset.py b/projects/medical/2d_image/fundus_photography/orvs/datasets/orvs_dataset.py new file mode 100644 index 0000000000..e915ae4cd2 --- /dev/null +++ b/projects/medical/2d_image/fundus_photography/orvs/datasets/orvs_dataset.py @@ -0,0 +1,27 @@ +from mmseg.datasets import BaseSegDataset +from mmseg.registry import DATASETS + + +@DATASETS.register_module() +class ORVSDataset(BaseSegDataset): + """ORVSDataset dataset. + + In segmentation map annotation for ORVSDataset, + ``reduce_zero_label`` is fixed to False. The ``img_suffix`` + is fixed to '.png' and ``seg_map_suffix`` is fixed to '.png'. + + Args: + img_suffix (str): Suffix of images. Default: '.png' + seg_map_suffix (str): Suffix of segmentation maps. 
Default: '.png'
+    """
+    METAINFO = dict(classes=('background', 'vessel'))
+
+    def __init__(self,
+                 img_suffix='.png',
+                 seg_map_suffix='.png',
+                 **kwargs) -> None:
+        super().__init__(
+            img_suffix=img_suffix,
+            seg_map_suffix=seg_map_suffix,
+            reduce_zero_label=False,
+            **kwargs)
diff --git a/projects/medical/2d_image/fundus_photography/orvs/tools/prepare_dataset.py b/projects/medical/2d_image/fundus_photography/orvs/tools/prepare_dataset.py
new file mode 100755
index 0000000000..f902d87101
--- /dev/null
+++ b/projects/medical/2d_image/fundus_photography/orvs/tools/prepare_dataset.py
@@ -0,0 +1,55 @@
+import glob
+import os
+
+import numpy as np
+from PIL import Image
+
+root_path = 'data/'
+img_suffix = '.jpg'
+seg_map_suffix_list = ['.jpg', '.png', '.tif']
+save_img_suffix = '.png'
+save_seg_map_suffix = '.png'
+
+x_train = glob.glob(
+    os.path.join('data/Vessels-Datasets/*/Train/Original/Images/*' +
+                 img_suffix))
+x_test = glob.glob(
+    os.path.join('data/Vessels-Datasets/*/Test/Original/Images/*' +
+                 img_suffix))
+
+os.system('mkdir -p ' + root_path + 'images/train/')
+os.system('mkdir -p ' + root_path + 'images/test/')
+os.system('mkdir -p ' + root_path + 'masks/train/')
+os.system('mkdir -p ' + root_path + 'masks/test/')
+
+part_dir_dict = {0: 'train/', 1: 'test/'}
+for ith, part in enumerate([x_train, x_test]):
+    part_dir = part_dir_dict[ith]
+    for img in part:
+        type_name = img.split('/')[-5]
+        basename = type_name + '_' + os.path.basename(img)
+        save_img_path = root_path + 'images/' + part_dir + basename.split(
+            '.')[0] + save_img_suffix
+        Image.open(img).save(save_img_path)
+
+        # probe the label directory ('Labels' or 'labels') and the candidate
+        # mask suffixes until an existing mask file is found
+        for seg_map_suffix in seg_map_suffix_list:
+            if os.path.exists('/'.join(img.split('/')[:-1]).replace(
+                    'Images', 'Labels')):
+                mask_path = img.replace('Images', 'Labels').replace(
+                    img_suffix, seg_map_suffix)
+            else:
+                mask_path = img.replace('Images', 'labels').replace(
+                    img_suffix, seg_map_suffix)
+            if os.path.exists(mask_path):
+                break
+        save_mask_path = root_path + 'masks/' + part_dir + basename.split(
+            '.')[0] + save_seg_map_suffix
+        masks = np.array(Image.open(mask_path).convert('L')).astype(np.uint8)
+        if len(np.unique(masks)) == 2 and 1 in np.unique(masks):
+            # mask is already encoded with train IDs {0, 1}
+            print(np.unique(masks))
+        else:
+            # binarize a 0-255 grayscale mask at the 128 threshold
+            masks[masks < 128] = 0
+            masks[masks >= 128] = 1
+        masks = Image.fromarray(masks)
+        masks.save(save_mask_path)
diff --git a/projects/medical/2d_image/fundus_photography/rite/README.md b/projects/medical/2d_image/fundus_photography/rite/README.md
new file mode 100644
index 0000000000..0aea9b00d1
--- /dev/null
+++ b/projects/medical/2d_image/fundus_photography/rite/README.md
@@ -0,0 +1,135 @@
+# Retinal Images vessel Tree Extraction (RITE)
+
+## Description
+
+This project supports **`Retinal Images vessel Tree Extraction (RITE)`**, which can be downloaded from [here](https://opendatalab.com/RITE).
+
+### Dataset Overview
+
+RITE (Retinal Images vessel Tree Extraction) is a database that enables comparative studies on the segmentation or classification of arteries and veins in retinal fundus images. It is established based on the publicly available DRIVE database (Digital Retinal Images for Vessel Extraction). RITE contains 40 sets of images, equally separated into a training subset and a test subset, the same as DRIVE. The two subsets are built from the corresponding two subsets in DRIVE. For each set, there is a fundus photograph and a vessel reference standard. The fundus photograph is inherited from DRIVE.
For the training set, the vessel reference standard is a modified version of 1st_manual from DRIVE. For the test set, the vessel reference standard is 2nd_manual from DRIVE.
+
+### Statistic Information
+
+| Dataset Name | Anatomical Region | Task Type | Modality | Num. Classes | Train/Val/Test Images | Train/Val/Test Labeled | Release Date | License |
+| ------------------------------------ | ----------------- | ------------ | ------------------ | ------------ | --------------------- | ---------------------- | ------------ | --------------------------------------------------------------- |
+| [Rite](https://opendatalab.com/RITE) | head_and_neck | segmentation | fundus_photography | 2 | 20/-/20 | yes/-/yes | 2013 | [CC-BY-NC 4.0](https://creativecommons.org/licenses/by-sa/4.0/) |
+
+| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test |
+| :--------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: |
+| background | 20 | 91.61 | - | - | 20 | 91.58 |
+| vessel | 20 | 8.39 | - | - | 20 | 8.42 |
+
+Note:
+
+- `Pct` means percentage of pixels in this category in all pixels.
+
+### Visualization
+
+![rite](https://raw.githubusercontent.com/uni-medical/medical-datasets-visualization/main/2d/semantic_seg/fundus_photography/rite/rite_dataset.png?raw=true)
+
+### Dataset Citation
+
+```
+@InProceedings{10.1007/978-3-642-40763-5_54,
+  author={Hu, Qiao and Abr{\`a}moff, Michael D. and Garvin, Mona K.},
+  title={Automated Separation of Binary Overlapping Trees in Low-Contrast Color Retinal Images},
+  booktitle={Medical Image Computing and Computer-Assisted Intervention -- MICCAI 2013},
+  year={2013},
+  pages={436--443},
+}
+```
+
+### Prerequisites
+
+- Python v3.8
+- PyTorch v1.10.0
+- pillow (PIL) v9.3.0
+- scikit-learn (sklearn) v1.2.0
+- [MIM](https://github.com/open-mmlab/mim) v0.3.4
+- [MMCV](https://github.com/open-mmlab/mmcv) v2.0.0rc4
+- [MMEngine](https://github.com/open-mmlab/mmengine) v0.2.0 or higher
+- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation) v1.0.0rc5
+
+All the commands below rely on the correct configuration of `PYTHONPATH`, which should point to the project's directory so that Python can locate the module files. In `rite/` root directory, run the following line to add the current directory to `PYTHONPATH`:
+
+```shell
+export PYTHONPATH=`pwd`:$PYTHONPATH
+```
+
+### Dataset Preparing
+
+- download dataset from [here](https://opendatalab.com/RITE) and decompress the data to the path `'data/'`.
+- run script `"python tools/prepare_dataset.py"` to format the data and change the folder structure as below.
+- run script `"python ../../tools/split_seg_dataset.py"` to split the dataset and generate `train.txt`, `val.txt` and `test.txt`. If the labels of the official validation and test sets are not available, we randomly generate `train.txt` and `val.txt` from the training set.
+
+```none
+ mmsegmentation
+ ├── mmseg
+ ├── projects
+ │ ├── medical
+ │ │ ├── 2d_image
+ │ │ │ ├── fundus_photography
+ │ │ │ │ ├── rite
+ │ │ │ │ │ ├── configs
+ │ │ │ │ │ ├── datasets
+ │ │ │ │ │ ├── tools
+ │ │ │ │ │ ├── data
+ │ │ │ │ │ │ ├── train.txt
+ │ │ │ │ │ │ ├── val.txt
+ │ │ │ │ │ │ ├── images
+ │ │ │ │ │ │ │ ├── train
+ │ │ │ │ | │ │ │ ├── xxx.png
+ │ │ │ │ | │ │ │ ├── ...
+ │ │ │ │ | │ │ │ └── xxx.png
+ │ │ │ │ │ │ ├── masks
+ │ │ │ │ │ │ │ ├── train
+ │ │ │ │ | │ │ │ ├── xxx.png
+ │ │ │ │ | │ │ │ ├── ...
+ │ │ │ │ | │ │ │ └── xxx.png
+```
+
+### Training commands
+
+To train models on a single server with one GPU.
(default) + +```shell +mim train mmseg ./configs/${CONFIG_FILE} +``` + +### Testing commands + +To test models on a single server with one GPU. (default) + +```shell +mim test mmseg ./configs/${CONFIG_FILE} --checkpoint ${CHECKPOINT_PATH} +``` + + + +## Checklist + +- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. + + - [x] Finish the code + - [x] Basic docstrings & proper citation + - [ ] Test-time correctness + - [x] A full README + +- [ ] Milestone 2: Indicates a successful model implementation. + + - [ ] Training-time correctness + +- [ ] Milestone 3: Good to be a part of our core package! + + - [ ] Type hints and docstrings + - [ ] Unit tests + - [ ] Code polishing + - [ ] Metafile.yml + +- [ ] Move your modules into the core package following the codebase's file hierarchy structure. + +- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure. diff --git a/projects/medical/2d_image/fundus_photography/rite/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_rite-512x512.py b/projects/medical/2d_image/fundus_photography/rite/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_rite-512x512.py new file mode 100644 index 0000000000..27dd4363b1 --- /dev/null +++ b/projects/medical/2d_image/fundus_photography/rite/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_rite-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './rite_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.rite_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.0001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/fundus_photography/rite/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_rite-512x512.py b/projects/medical/2d_image/fundus_photography/rite/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_rite-512x512.py new file mode 100644 index 0000000000..48f6f973a1 --- /dev/null +++ b/projects/medical/2d_image/fundus_photography/rite/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_rite-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './rite_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.rite_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/fundus_photography/rite/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_rite-512x512.py b/projects/medical/2d_image/fundus_photography/rite/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_rite-512x512.py new file mode 100644 index 0000000000..5f5b24ba6a --- /dev/null +++ b/projects/medical/2d_image/fundus_photography/rite/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_rite-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './rite_512x512.py', + 'mmseg::_base_/default_runtime.py', + 
'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.rite_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.01) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/fundus_photography/rite/configs/fcn-unet-s5-d16_unet_1xb16-0.01lr-sigmoid-20k_rite-512x512.py b/projects/medical/2d_image/fundus_photography/rite/configs/fcn-unet-s5-d16_unet_1xb16-0.01lr-sigmoid-20k_rite-512x512.py new file mode 100644 index 0000000000..bf66b6f320 --- /dev/null +++ b/projects/medical/2d_image/fundus_photography/rite/configs/fcn-unet-s5-d16_unet_1xb16-0.01lr-sigmoid-20k_rite-512x512.py @@ -0,0 +1,18 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './rite_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.rite_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.01) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict( + num_classes=2, loss_decode=dict(use_sigmoid=True), out_channels=1), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/fundus_photography/rite/configs/rite_512x512.py b/projects/medical/2d_image/fundus_photography/rite/configs/rite_512x512.py new file mode 100644 index 0000000000..02f620c665 --- /dev/null +++ b/projects/medical/2d_image/fundus_photography/rite/configs/rite_512x512.py @@ -0,0 +1,42 @@ +dataset_type = 'RITEDataset' +data_root = 'data/' +img_scale = (512, 512) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +train_dataloader = dict( + batch_size=16, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='train.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='test.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) +test_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) diff --git a/projects/medical/2d_image/fundus_photography/rite/datasets/rite_dataset.py b/projects/medical/2d_image/fundus_photography/rite/datasets/rite_dataset.py new file mode 100644 index 0000000000..99f688de94 --- /dev/null +++ b/projects/medical/2d_image/fundus_photography/rite/datasets/rite_dataset.py @@ -0,0 +1,31 @@ +from mmseg.datasets import 
BaseSegDataset +from mmseg.registry import DATASETS + + +@DATASETS.register_module() +class RITEDataset(BaseSegDataset): + """RITEDataset dataset. + + In segmentation map annotation for RITEDataset, + 0 stands for background, which is included in 2 categories. + ``reduce_zero_label`` is fixed to False. The ``img_suffix`` + is fixed to '.png' and ``seg_map_suffix`` is fixed to '.png'. + + Args: + img_suffix (str): Suffix of images. Default: '.png' + seg_map_suffix (str): Suffix of segmentation maps. Default: '.png' + reduce_zero_label (bool): Whether to mark label zero as ignored. + Default to False. + """ + METAINFO = dict(classes=('background', 'vessel')) + + def __init__(self, + img_suffix='.png', + seg_map_suffix='.png', + reduce_zero_label=False, + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, + **kwargs) diff --git a/projects/medical/2d_image/fundus_photography/rite/tools/prepare_dataset.py b/projects/medical/2d_image/fundus_photography/rite/tools/prepare_dataset.py new file mode 100644 index 0000000000..ca7e996961 --- /dev/null +++ b/projects/medical/2d_image/fundus_photography/rite/tools/prepare_dataset.py @@ -0,0 +1,98 @@ +import glob +import os + +import numpy as np +from PIL import Image + +root_path = 'data/' +img_suffix = '.tif' +seg_map_suffix = '.png' +save_img_suffix = '.png' +save_seg_map_suffix = '.png' +src_img_train_dir = os.path.join(root_path, 'AV_groundTruth/training/images/') +src_img_test_dir = os.path.join(root_path, 'AV_groundTruth/test/images/') +src_mask_train_dir = os.path.join(root_path, 'AV_groundTruth/training/vessel/') +src_mask_test_dir = os.path.join(root_path, 'AV_groundTruth/test/vessel/') + +tgt_img_train_dir = os.path.join(root_path, 'images/train/') +tgt_mask_train_dir = os.path.join(root_path, 'masks/train/') +tgt_img_test_dir = os.path.join(root_path, 'images/test/') +tgt_mask_test_dir = os.path.join(root_path, 'masks/test/') +os.system('mkdir -p ' + tgt_img_train_dir) +os.system('mkdir -p ' + tgt_mask_train_dir) +os.system('mkdir -p ' + tgt_img_test_dir) +os.system('mkdir -p ' + tgt_mask_test_dir) + + +def filter_suffix_recursive(src_dir, suffix): + # filter out file names and paths in source directory + suffix = '.' + suffix if '.' 
not in suffix else suffix
+    file_paths = glob.glob(
+        os.path.join(src_dir, '**', '*' + suffix), recursive=True)
+    file_names = [_.split('/')[-1] for _ in file_paths]
+    return sorted(file_paths), sorted(file_names)
+
+
+def convert_label(img, convert_dict):
+    arr = np.zeros_like(img, dtype=np.uint8)
+    for c, i in convert_dict.items():
+        arr[img == c] = i
+    return arr
+
+
+def convert_pics_into_pngs(src_dir, tgt_dir, suffix, convert='RGB'):
+    if not os.path.exists(tgt_dir):
+        os.makedirs(tgt_dir)
+
+    src_paths, src_names = filter_suffix_recursive(src_dir, suffix=suffix)
+    num = len(src_paths)
+    for i, (src_name, src_path) in enumerate(zip(src_names, src_paths)):
+        tgt_name = src_name.replace(suffix, save_img_suffix)
+        tgt_path = os.path.join(tgt_dir, tgt_name)
+        img = np.array(Image.open(src_path))
+        if len(img.shape) == 2:
+            pil = Image.fromarray(img).convert(convert)
+        elif len(img.shape) == 3:
+            pil = Image.fromarray(img)
+        else:
+            raise ValueError('Input image not 2D/3D: ', img.shape)
+
+        pil.save(tgt_path)
+        print(f'processed {i+1}/{num}.')
+
+
+def convert_label_pics_into_pngs(src_dir,
+                                 tgt_dir,
+                                 suffix,
+                                 convert_dict={
+                                     0: 0,
+                                     255: 1
+                                 }):
+    if not os.path.exists(tgt_dir):
+        os.makedirs(tgt_dir)
+
+    src_paths, src_names = filter_suffix_recursive(src_dir, suffix=suffix)
+    num = len(src_paths)
+    for i, (src_name, src_path) in enumerate(zip(src_names, src_paths)):
+        tgt_name = src_name.replace(suffix, save_seg_map_suffix)
+        tgt_path = os.path.join(tgt_dir, tgt_name)
+
+        # map the original gray values to train ids, e.g. {0: 0, 255: 1}
+        img = np.array(Image.open(src_path))
+        img = convert_label(img, convert_dict)
+        Image.fromarray(img).save(tgt_path)
+        print(f'processed {i+1}/{num}.')
+
+
+if __name__ == '__main__':
+
+    convert_pics_into_pngs(
+        src_img_train_dir, tgt_img_train_dir, suffix=img_suffix)
+
+    convert_pics_into_pngs(
+        src_img_test_dir, tgt_img_test_dir, suffix=img_suffix)
+
+    convert_label_pics_into_pngs(
+        src_mask_train_dir, tgt_mask_train_dir, suffix=seg_map_suffix)
+
+    convert_label_pics_into_pngs(
+        src_mask_test_dir, tgt_mask_test_dir, suffix=seg_map_suffix)
diff --git a/projects/medical/2d_image/histopathology/breastCancerCellSegmentation/README.md b/projects/medical/2d_image/histopathology/breastCancerCellSegmentation/README.md
new file mode 100644
index 0000000000..97c4a0f0e5
--- /dev/null
+++ b/projects/medical/2d_image/histopathology/breastCancerCellSegmentation/README.md
@@ -0,0 +1,123 @@
+# breastCancerCellSegmentation
+
+## Description
+
+This project supports **`breastCancerCellSegmentation`**, which can be downloaded from [here](https://www.heywhale.com/mw/dataset/5e9e9b35ebb37f002c625423).
+
+### Dataset Overview
+
+This dataset contains 58 H&E-stained histopathology images that were used for breast cancer cell detection, together with the associated ground truth data.
+Conventional histology uses a combination of hematoxylin and eosin stains, commonly referred to as H&E. These images are stained because most cells are inherently transparent with little or no intrinsic pigment.
+Certain special stains selectively bind to specific components and can be used to identify biological structures such as cells.
+
+### Original Statistic Information
+
+| Dataset name | Anatomical region | Task type | Modality | Num. Classes | Train/Val/Test Images | Train/Val/Test Labeled | Release Date | License |
+| ------------ | ----------------- | --------- | -------- | ------------ | --------------------- | ---------------------- | ------------ | ------- |
+| [breastCancerCellSegmentation](https://www.heywhale.com/mw/dataset/5e9e9b35ebb37f002c625423) | cell | segmentation | histopathology | 2 | 58/-/- | yes/-/- | 2020 | [CC-BY-NC 4.0](https://creativecommons.org/licenses/by-sa/4.0/) |
+
+| Class Name       | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test |
+| :--------------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: |
+| background       |     58     |   98.37    |    -     |    -     |     -     |     -     |
+| breastCancerCell |     58     |    1.63    |    -     |    -     |     -     |     -     |
+
+Note:
+
+- `Pct` means the percentage of pixels in this category among all pixels.
+
+### Visualization
+
+![bac](https://raw.githubusercontent.com/uni-medical/medical-datasets-visualization/main/2d/semantic_seg/histopathology/breastCancerCellSegmentation/breastCancerCellSegmentation_dataset.png)
+
+## Usage
+
+### Prerequisites
+
+- Python v3.8
+- PyTorch v1.10.0
+- pillow (PIL) v9.3.0
+- scikit-learn (sklearn) v1.2.0
+- [MIM](https://github.com/open-mmlab/mim) v0.3.4
+- [MMCV](https://github.com/open-mmlab/mmcv) v2.0.0rc4
+- [MMEngine](https://github.com/open-mmlab/mmengine) v0.2.0 or higher
+- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation) v1.0.0
+
+All the commands below rely on the correct configuration of `PYTHONPATH`, which should point to the project's directory so that Python can locate the module files. In the `breastCancerCellSegmentation/` root directory, run the following line to add the current directory to `PYTHONPATH`:
+
+```shell
+export PYTHONPATH=`pwd`:$PYTHONPATH
+```
+
+### Dataset Preparing
+
+- Download dataset from [here](https://www.heywhale.com/mw/dataset/5e9e9b35ebb37f002c625423) and save it to the `data/` directory.
+- Decompress data to path `data/`. This will create a new folder named `data/breastCancerCellSegmentation/`, which contains the original image data.
+- Run script `python tools/prepare_dataset.py` to format data and change folder structure as below (see the note after the folder structure).
+
+```none
+  mmsegmentation
+  ├── mmseg
+  ├── projects
+  │   ├── medical
+  │   │   ├── 2d_image
+  │   │   │   ├── histopathology
+  │   │   │   │   ├── breastCancerCellSegmentation
+  │   │   │   │   │   ├── configs
+  │   │   │   │   │   ├── datasets
+  │   │   │   │   │   ├── tools
+  │   │   │   │   │   ├── data
+  │   │   │   │   │   │   ├── breastCancerCellSegmentation
+  │   │   │   │   │   │   │   ├── train.txt
+  │   │   │   │   │   │   │   ├── val.txt
+  │   │   │   │   │   │   │   ├── images
+  │   │   │   │   │   │   │   │   ├── xxx.tif
+  │   │   │   │   │   │   │   ├── masks
+  │   │   │   │   │   │   │   │   ├── xxx.TIF
+```
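+
+Note that, unlike most datasets under `projects/medical`, the images and masks keep their original TIFF suffixes (`_ccd.tif` for images, `.TIF` for masks), and the configs therefore read them with the `tifffile` backend. A minimal way to eyeball one sample pair (the sample name is a placeholder for any entry of `train.txt`):
+
+```python
+import numpy as np
+import tifffile
+
+name = 'xxx'  # placeholder: any name listed in train.txt
+img = tifffile.imread(
+    f'data/breastCancerCellSegmentation/images/{name}_ccd.tif')
+mask = tifffile.imread(
+    f'data/breastCancerCellSegmentation/masks/{name}.TIF')
+print(img.shape, mask.shape, np.unique(mask))
+```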
+
+### Training commands
+
+Train models on a single server with one GPU.
+
+```shell
+mim train mmseg ./configs/${CONFIG_FILE}
+```
+
+### Testing commands
+
+Test models on a single server with one GPU.
+
+```shell
+mim test mmseg ./configs/${CONFIG_FILE} --checkpoint ${CHECKPOINT_PATH}
+```
+
+## Checklist
+
+- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`.
+
+  - [x] Finish the code
+
+  - [x] Basic docstrings & proper citation
+
+  - [x] Test-time correctness
+
+  - [x] A full README
+
+- [ ] Milestone 2: Indicates a successful model implementation.
+
+  - [ ] Training-time correctness
+
+- [ ] Milestone 3: Good to be a part of our core package!
+ + - [ ] Type hints and docstrings + + - [ ] Unit tests + + - [ ] Code polishing + + - [ ] Metafile.yml + +- [ ] Move your modules into the core package following the codebase's file hierarchy structure. + +- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure. diff --git a/projects/medical/2d_image/histopathology/breastCancerCellSegmentation/configs/breastCancerCellSegmentation_512x512.py b/projects/medical/2d_image/histopathology/breastCancerCellSegmentation/configs/breastCancerCellSegmentation_512x512.py new file mode 100644 index 0000000000..1cf0fccf5b --- /dev/null +++ b/projects/medical/2d_image/histopathology/breastCancerCellSegmentation/configs/breastCancerCellSegmentation_512x512.py @@ -0,0 +1,42 @@ +dataset_type = 'breastCancerCellSegmentationDataset' +data_root = 'data/breastCancerCellSegmentation' +img_scale = (512, 512) +train_pipeline = [ + dict(type='LoadImageFromFile', imdecode_backend='tifffile'), + dict(type='LoadAnnotations', imdecode_backend='tifffile'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile', imdecode_backend='tifffile'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='LoadAnnotations', imdecode_backend='tifffile'), + dict(type='PackSegInputs') +] +train_dataloader = dict( + batch_size=16, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='train.txt', + data_prefix=dict(img_path='images', seg_map_path='masks'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='val.txt', + data_prefix=dict(img_path='images', seg_map_path='masks'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) +test_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) diff --git a/projects/medical/2d_image/histopathology/breastCancerCellSegmentation/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_breastCancerCellSegmentation-512x512.py b/projects/medical/2d_image/histopathology/breastCancerCellSegmentation/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_breastCancerCellSegmentation-512x512.py new file mode 100644 index 0000000000..55d1708968 --- /dev/null +++ b/projects/medical/2d_image/histopathology/breastCancerCellSegmentation/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_breastCancerCellSegmentation-512x512.py @@ -0,0 +1,18 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', + './breastCancerCellSegmentation_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.breastCancerCellSegmentation_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.0001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git 
a/projects/medical/2d_image/histopathology/breastCancerCellSegmentation/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_breastCancerCellSegmentation-512x512.py b/projects/medical/2d_image/histopathology/breastCancerCellSegmentation/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_breastCancerCellSegmentation-512x512.py
new file mode 100644
index 0000000000..cf28aad739
--- /dev/null
+++ b/projects/medical/2d_image/histopathology/breastCancerCellSegmentation/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_breastCancerCellSegmentation-512x512.py
@@ -0,0 +1,18 @@
+_base_ = [
+    'mmseg::_base_/models/fcn_unet_s5-d16.py',
+    './breastCancerCellSegmentation_512x512.py',
+    'mmseg::_base_/default_runtime.py',
+    'mmseg::_base_/schedules/schedule_20k.py'
+]
+custom_imports = dict(imports='datasets.breastCancerCellSegmentation_dataset')
+img_scale = (512, 512)
+data_preprocessor = dict(size=img_scale)
+optimizer = dict(lr=0.001)
+optim_wrapper = dict(optimizer=optimizer)
+model = dict(
+    data_preprocessor=data_preprocessor,
+    decode_head=dict(num_classes=2),
+    auxiliary_head=None,
+    test_cfg=dict(mode='whole', _delete_=True))
+vis_backends = None
+visualizer = dict(vis_backends=vis_backends)
diff --git a/projects/medical/2d_image/histopathology/breastCancerCellSegmentation/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_breastCancerCellSegmentation-512x512.py b/projects/medical/2d_image/histopathology/breastCancerCellSegmentation/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_breastCancerCellSegmentation-512x512.py
new file mode 100644
index 0000000000..29aaff3894
--- /dev/null
+++ b/projects/medical/2d_image/histopathology/breastCancerCellSegmentation/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_breastCancerCellSegmentation-512x512.py
@@ -0,0 +1,18 @@
+_base_ = [
+    'mmseg::_base_/models/fcn_unet_s5-d16.py',
+    './breastCancerCellSegmentation_512x512.py',
+    'mmseg::_base_/default_runtime.py',
+    'mmseg::_base_/schedules/schedule_20k.py'
+]
+custom_imports = dict(imports='datasets.breastCancerCellSegmentation_dataset')
+img_scale = (512, 512)
+data_preprocessor = dict(size=img_scale)
+optimizer = dict(lr=0.01)
+optim_wrapper = dict(optimizer=optimizer)
+model = dict(
+    data_preprocessor=data_preprocessor,
+    decode_head=dict(num_classes=2),
+    auxiliary_head=None,
+    test_cfg=dict(mode='whole', _delete_=True))
+vis_backends = None
+visualizer = dict(vis_backends=vis_backends)
diff --git a/projects/medical/2d_image/histopathology/breastCancerCellSegmentation/datasets/breastCancerCellSegmentation_dataset.py b/projects/medical/2d_image/histopathology/breastCancerCellSegmentation/datasets/breastCancerCellSegmentation_dataset.py
new file mode 100644
index 0000000000..eeceb6318c
--- /dev/null
+++ b/projects/medical/2d_image/histopathology/breastCancerCellSegmentation/datasets/breastCancerCellSegmentation_dataset.py
@@ -0,0 +1,30 @@
+from mmseg.datasets import BaseSegDataset
+from mmseg.registry import DATASETS
+
+
+@DATASETS.register_module()
+class breastCancerCellSegmentationDataset(BaseSegDataset):
+    """breastCancerCellSegmentationDataset dataset.
+
+    In segmentation map annotation for breastCancerCellSegmentationDataset,
+    ``reduce_zero_label`` is fixed to False. The ``img_suffix``
+    is fixed to '_ccd.tif' and ``seg_map_suffix`` is fixed to '.TIF'.
+
+    Args:
+        img_suffix (str): Suffix of images. Default: '_ccd.tif'
+        seg_map_suffix (str): Suffix of segmentation maps. Default: '.TIF'
+        reduce_zero_label (bool): Whether to mark label zero as ignored.
+            Default: False.
+    """
+    METAINFO = dict(classes=('background', 'breastCancerCell'))
+
+    def __init__(self,
+                 img_suffix='_ccd.tif',
+                 seg_map_suffix='.TIF',
+                 reduce_zero_label=False,
+                 **kwargs) -> None:
+        super().__init__(
+            img_suffix=img_suffix,
+            seg_map_suffix=seg_map_suffix,
+            reduce_zero_label=reduce_zero_label,
+            **kwargs)
diff --git a/projects/medical/2d_image/histopathology/breastCancerCellSegmentation/tools/prepare_dataset.py b/projects/medical/2d_image/histopathology/breastCancerCellSegmentation/tools/prepare_dataset.py
new file mode 100644
index 0000000000..09cc689c86
--- /dev/null
+++ b/projects/medical/2d_image/histopathology/breastCancerCellSegmentation/tools/prepare_dataset.py
@@ -0,0 +1,36 @@
+import argparse
+import glob
+import os
+
+from sklearn.model_selection import train_test_split
+
+
+def save_anno(img_list, file_path, suffix):
+    # keep only the file name, without the suffix
+    img_list = [x.split('/')[-1][:-len(suffix)] for x in img_list]
+
+    with open(file_path, 'w') as file_:
+        for x in list(img_list):
+            file_.write(x + '\n')
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        '--data_root', default='data/breastCancerCellSegmentation/')
+    args = parser.parse_args()
+    data_root = args.data_root
+
+    # 1. split the data into training and validation sets
+    # 1.1 gather all image and mask paths
+    img_list = glob.glob(os.path.join(data_root, 'images', '*.tif'))
+    img_list.sort()
+    mask_list = glob.glob(os.path.join(data_root, 'masks', '*.TIF'))
+    mask_list.sort()
+    assert len(img_list) == len(mask_list)
+    # 1.2 split into training and validation sets
+    train_img_list, val_img_list, train_mask_list, val_mask_list = train_test_split(  # noqa
+        img_list, mask_list, test_size=0.2, random_state=42)
+    # 1.3 save the split results
+    save_anno(train_img_list, os.path.join(data_root, 'train.txt'), '_ccd.tif')
+    save_anno(val_img_list, os.path.join(data_root, 'val.txt'), '_ccd.tif')
diff --git a/projects/medical/2d_image/histopathology/breast_cancer_cell_seg/README.md b/projects/medical/2d_image/histopathology/breast_cancer_cell_seg/README.md
new file mode 100644
index 0000000000..b6f1ca6341
--- /dev/null
+++ b/projects/medical/2d_image/histopathology/breast_cancer_cell_seg/README.md
@@ -0,0 +1,147 @@
+# Breast Cancer Cell Segmentation
+
+## Description
+
+This project supports **`Breast Cancer Cell Segmentation`**, and the dataset used in this project can be downloaded from [here](https://tianchi.aliyun.com/dataset/dataDetail?dataId=90152).
+
+### Dataset Overview
+
+In this dataset, there are 58 H&E-stained histopathology images used in breast cancer cell detection, with associated ground truth data available. Routine histology uses the stain combination of hematoxylin and eosin, commonly referred to as H&E. These images are stained since most cells are essentially transparent, with little or no intrinsic pigment. Certain special stains, which bind selectively to particular components, are used to identify biological structures such as cells. In those images, the challenging problem is cell segmentation for subsequent classification into benign and malignant cells.
+
+### Original Statistic Information
+
+| Dataset name | Anatomical region | Task type | Modality | Num. Classes | Train/Val/Test Images | Train/Val/Test Labeled | Release Date | License |
+| ------------ | ----------------- | --------- | -------- | ------------ | --------------------- | ---------------------- | ------------ | ------- |
+| [Breast Cancer Cell Segmentation](https://tianchi.aliyun.com/dataset/dataDetail?dataId=90152) | thorax | segmentation | histopathology | 2 | 58/-/- | yes/-/- | 2021 | [CC-BY-SA-NC 4.0](http://creativecommons.org/licenses/by-sa/4.0/?spm=5176.12282016.0.0.3f5b5291ypBxb2) |
+
+| Class Name         | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test |
+| :----------------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: |
+| normal             |     58     |   98.37    |    -     |    -     |     -     |     -     |
+| breast cancer cell |     58     |    1.63    |    -     |    -     |     -     |     -     |
+
+Note:
+
+- `Pct` means the percentage of pixels in this category among all pixels.
+
+### Visualization
+
+![bac](https://raw.githubusercontent.com/uni-medical/medical-datasets-visualization/main/2d/semantic_seg/histopathology/breast_cancer_cell_seg/breast_cancer_cell_seg_dataset.png)
+
+## Dataset Citation
+
+```
+@inproceedings{gelasca2008evaluation,
+  title={Evaluation and benchmark for biological image segmentation},
+  author={Gelasca, Elisa Drelie and Byun, Jiyun and Obara, Boguslaw and Manjunath, BS},
+  booktitle={2008 15th IEEE international conference on image processing},
+  pages={1816--1819},
+  year={2008},
+  organization={IEEE}
+}
+```
+
+### Prerequisites
+
+- Python v3.8
+- PyTorch v1.10.0
+- [MIM](https://github.com/open-mmlab/mim) v0.3.4
+- [MMCV](https://github.com/open-mmlab/mmcv) v2.0.0rc4
+- [MMEngine](https://github.com/open-mmlab/mmengine) v0.2.0 or higher
+- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation) v1.0.0rc5
+
+All the commands below rely on the correct configuration of `PYTHONPATH`, which should point to the project's directory so that Python can locate the module files. In the `breast_cancer_cell_seg/` root directory, run the following line to add the current directory to `PYTHONPATH`:
+
+```shell
+export PYTHONPATH=`pwd`:$PYTHONPATH
+```
+
+### Dataset preparing
+
+- download dataset from [here](https://tianchi.aliyun.com/dataset/dataDetail?dataId=90152) and decompress data to path `'data/'`.
+- run script `"python tools/prepare_dataset.py"` to format data and change folder structure as below.
+- run script `"python ../../tools/split_seg_dataset.py"` to split dataset and generate `train.txt`, `val.txt` and `test.txt`. If the labels of the official validation and test sets cannot be obtained, we generate `train.txt` and `val.txt` from the training set randomly (a minimal equivalent is sketched after the folder structure below).
+
+```none
+  mmsegmentation
+  ├── mmseg
+  ├── projects
+  │   ├── medical
+  │   │   ├── 2d_image
+  │   │   │   ├── histopathology
+  │   │   │   │   ├── breast_cancer_cell_seg
+  │   │   │   │   │   ├── configs
+  │   │   │   │   │   ├── datasets
+  │   │   │   │   │   ├── tools
+  │   │   │   │   │   ├── data
+  │   │   │   │   │   │   ├── train.txt
+  │   │   │   │   │   │   ├── val.txt
+  │   │   │   │   │   │   ├── images
+  │   │   │   │   │   │   │   ├── train
+  │   │   │   │   │   │   │   │   ├── xxx.png
+  │   │   │   │   │   │   │   │   ├── ...
+  │   │   │   │   │   │   │   │   └── xxx.png
+  │   │   │   │   │   │   ├── masks
+  │   │   │   │   │   │   │   ├── train
+  │   │   │   │   │   │   │   │   ├── xxx.png
+  │   │   │   │   │   │   │   │   ├── ...
+  │   │   │   │   │   │   │   │   └── xxx.png
+```
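+
+For reference, the random split performed by `../../tools/split_seg_dataset.py` can be approximated by hand. The sketch below is illustrative only: the 80/20 ratio and the `train/`-prefixed, suffix-free line format are assumptions that match the folder structure above and the `ann_file`/`data_prefix` settings used by the configs in this project.
+
+```python
+import glob
+import os
+import random
+
+random.seed(42)
+# keep the 'train/' sub-folder so entries resolve under data_prefix
+names = sorted('train/' + os.path.basename(p)[:-len('.png')]
+               for p in glob.glob('data/images/train/*.png'))
+random.shuffle(names)
+split = int(len(names) * 0.8)  # assumed 80% train / 20% val
+with open('data/train.txt', 'w') as f:
+    f.write('\n'.join(names[:split]) + '\n')
+with open('data/val.txt', 'w') as f:
+    f.write('\n'.join(names[split:]) + '\n')
+```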
+
+### Divided Dataset Information
+
+***Note: The table information below is divided by ourselves.***
+
+| Class Name         | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test |
+| :----------------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: |
+| normal             |     46     |   98.36    |    12    |  98.41   |     -     |     -     |
+| breast cancer cell |     46     |    1.64    |    12    |   1.59   |     -     |     -     |
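+
+The `Pct` columns above can be recomputed from the prepared masks. A minimal sketch, assuming each line of `train.txt` is a suffix-free sample path relative to `masks/` (class 0 = normal, class 1 = breast cancer cell):
+
+```python
+import numpy as np
+from PIL import Image
+
+with open('data/train.txt') as f:
+    names = [line.strip() for line in f if line.strip()]
+
+counts = np.zeros(2, dtype=np.int64)
+for name in names:
+    mask = np.array(Image.open(f'data/masks/{name}.png'))
+    counts += np.bincount(mask.ravel(), minlength=2)[:2]
+print(counts / counts.sum() * 100)  # per-class pixel percentage
+```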
+
+### Training commands
+
+Train models on a single server with one GPU.
+
+```shell
+mim train mmseg ./configs/${CONFIG_FILE}
+```
+
+### Testing commands
+
+Test models on a single server with one GPU.
+
+```shell
+mim test mmseg ./configs/${CONFIG_FILE} --checkpoint ${CHECKPOINT_PATH}
+```
+
+## Checklist
+
+- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`.
+
+  - [x] Finish the code
+
+  - [x] Basic docstrings & proper citation
+
+  - [x] Test-time correctness
+
+  - [x] A full README
+
+- [ ] Milestone 2: Indicates a successful model implementation.
+
+  - [ ] Training-time correctness
+
+- [ ] Milestone 3: Good to be a part of our core package!
+
+  - [ ] Type hints and docstrings
+
+  - [ ] Unit tests
+
+  - [ ] Code polishing
+
+  - [ ] Metafile.yml
+
+- [ ] Move your modules into the core package following the codebase's file hierarchy structure.
+
+- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure.
diff --git a/projects/medical/2d_image/histopathology/breast_cancer_cell_seg/configs/breast-cancer-cell-seg_512x512.py b/projects/medical/2d_image/histopathology/breast_cancer_cell_seg/configs/breast-cancer-cell-seg_512x512.py
new file mode 100644
index 0000000000..ead40e4345
--- /dev/null
+++ b/projects/medical/2d_image/histopathology/breast_cancer_cell_seg/configs/breast-cancer-cell-seg_512x512.py
@@ -0,0 +1,42 @@
+dataset_type = 'BreastCancerCellSegDataset'
+data_root = 'data/'
+img_scale = (512, 512)
+train_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='LoadAnnotations'),
+    dict(type='Resize', scale=img_scale, keep_ratio=False),
+    dict(type='RandomFlip', prob=0.5),
+    dict(type='PhotoMetricDistortion'),
+    dict(type='PackSegInputs')
+]
+test_pipeline = [
+    dict(type='LoadImageFromFile'),
+    dict(type='Resize', scale=img_scale, keep_ratio=False),
+    dict(type='LoadAnnotations'),
+    dict(type='PackSegInputs')
+]
+train_dataloader = dict(
+    batch_size=16,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='InfiniteSampler', shuffle=True),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='train.txt',
+        data_prefix=dict(img_path='images/', seg_map_path='masks/'),
+        pipeline=train_pipeline))
+val_dataloader = dict(
+    batch_size=1,
+    num_workers=4,
+    persistent_workers=True,
+    sampler=dict(type='DefaultSampler', shuffle=False),
+    dataset=dict(
+        type=dataset_type,
+        data_root=data_root,
+        ann_file='val.txt',
+        data_prefix=dict(img_path='images/', seg_map_path='masks/'),
+        pipeline=test_pipeline))
+test_dataloader = val_dataloader
+val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice'])
+test_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice'])
diff --git a/projects/medical/2d_image/histopathology/breast_cancer_cell_seg/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_breast-cancer-cell-seg-512x512.py b/projects/medical/2d_image/histopathology/breast_cancer_cell_seg/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_breast-cancer-cell-seg-512x512.py
new file mode 100644
index 0000000000..691a0ff613
--- /dev/null
+++ b/projects/medical/2d_image/histopathology/breast_cancer_cell_seg/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_breast-cancer-cell-seg-512x512.py
@@ -0,0 +1,18 @@
+_base_ = [
+    './breast-cancer-cell-seg_512x512.py',
'mmseg::_base_/models/fcn_unet_s5-d16.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.breast-cancer-cell-seg_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.0001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/histopathology/breast_cancer_cell_seg/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_breast-cancer-cell-seg-512x512.py b/projects/medical/2d_image/histopathology/breast_cancer_cell_seg/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_breast-cancer-cell-seg-512x512.py new file mode 100644 index 0000000000..719b767ab1 --- /dev/null +++ b/projects/medical/2d_image/histopathology/breast_cancer_cell_seg/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_breast-cancer-cell-seg-512x512.py @@ -0,0 +1,18 @@ +_base_ = [ + './breast-cancer-cell-seg_512x512.py', + 'mmseg::_base_/models/fcn_unet_s5-d16.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.breast-cancer-cell-seg_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/histopathology/breast_cancer_cell_seg/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_breast-cancer-cell-seg-512x512.py b/projects/medical/2d_image/histopathology/breast_cancer_cell_seg/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_breast-cancer-cell-seg-512x512.py new file mode 100644 index 0000000000..9dfe70f761 --- /dev/null +++ b/projects/medical/2d_image/histopathology/breast_cancer_cell_seg/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_breast-cancer-cell-seg-512x512.py @@ -0,0 +1,18 @@ +_base_ = [ + './breast-cancer-cell-seg_512x512.py', + 'mmseg::_base_/models/fcn_unet_s5-d16.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.breast-cancer-cell-seg_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.01) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/histopathology/breast_cancer_cell_seg/datasets/breast-cancer-cell-seg_dataset.py b/projects/medical/2d_image/histopathology/breast_cancer_cell_seg/datasets/breast-cancer-cell-seg_dataset.py new file mode 100644 index 0000000000..6f27029d39 --- /dev/null +++ b/projects/medical/2d_image/histopathology/breast_cancer_cell_seg/datasets/breast-cancer-cell-seg_dataset.py @@ -0,0 +1,29 @@ +from mmseg.datasets import BaseSegDataset +from mmseg.registry import DATASETS + + +@DATASETS.register_module() +class BreastCancerCellSegDataset(BaseSegDataset): + """BreastCancerCellSegDataset dataset. 
+ + In segmentation map annotation for BreastCancerCellSegDataset, + ``reduce_zero_label`` is fixed to False. The ``img_suffix`` + is fixed to '.png' and ``seg_map_suffix`` is fixed to '.png'. + + Args: + img_suffix (str): Suffix of images. Default: '.png' + seg_map_suffix (str): Suffix of segmentation maps. Default: '.png' + reduce_zero_label (bool): Whether to mark label zero as ignored. + Default to False. + """ + METAINFO = dict(classes=('normal', 'breast cancer cell')) + + def __init__(self, + img_suffix='.png', + seg_map_suffix='.png', + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=False, + **kwargs) diff --git a/projects/medical/2d_image/histopathology/breast_cancer_cell_seg/tools/prepare_dataset.py b/projects/medical/2d_image/histopathology/breast_cancer_cell_seg/tools/prepare_dataset.py new file mode 100755 index 0000000000..775f2eed18 --- /dev/null +++ b/projects/medical/2d_image/histopathology/breast_cancer_cell_seg/tools/prepare_dataset.py @@ -0,0 +1,47 @@ +import glob +import os + +import numpy as np +from PIL import Image + +root_path = 'data/' +img_suffix = '.tif' +seg_map_suffix = '.TIF' +save_img_suffix = '.png' +save_seg_map_suffix = '.png' + +x_train = glob.glob( + os.path.join('data/Breast Cancer Cell Segmentation_datasets/Images/*' + + img_suffix)) + +os.system('mkdir -p ' + root_path + 'images/train/') +os.system('mkdir -p ' + root_path + 'masks/train/') + +D2_255_convert_dict = {0: 0, 255: 1} + + +def convert_2d(img, convert_dict=D2_255_convert_dict): + arr_2d = np.zeros((img.shape[0], img.shape[1]), dtype=np.uint8) + for c, i in convert_dict.items(): + arr_2d[img == c] = i + return arr_2d + + +part_dir_dict = {0: 'train/'} +for ith, part in enumerate([x_train]): + part_dir = part_dir_dict[ith] + for img in part: + basename = os.path.basename(img) + img_save_path = root_path + 'images/' + part_dir + basename.split( + '.')[0] + save_img_suffix + Image.open(img).save(img_save_path) + mask_path = root_path + 'Breast Cancer Cell Segmentation_datasets/Masks/' + '_'.join( # noqa + basename.split('_')[:-1]) + seg_map_suffix + label = np.array(Image.open(mask_path)) + + save_mask_path = root_path + 'masks/' + part_dir + basename.split( + '.')[0] + save_seg_map_suffix + assert len(label.shape) == 2 and 255 in label and 1 not in label + mask = convert_2d(label) + mask = Image.fromarray(mask.astype(np.uint8)) + mask.save(save_mask_path) diff --git a/projects/medical/2d_image/histopathology/conic2022_seg/README.md b/projects/medical/2d_image/histopathology/conic2022_seg/README.md new file mode 100644 index 0000000000..1f55b44ed6 --- /dev/null +++ b/projects/medical/2d_image/histopathology/conic2022_seg/README.md @@ -0,0 +1,207 @@ +# CoNIC: Colon Nuclei Identification and Counting Challenge + +## Description + +This project supports **`CoNIC: Colon Nuclei Identification and Counting Challenge`**, which can be downloaded from [here](https://drive.google.com/drive/folders/1il9jG7uA4-ebQ_lNmXbbF2eOK9uNwheb). + +### Dataset Overview + +Nuclear segmentation, classification and quantification within Haematoxylin & Eosin stained histology images enables the extraction of interpretable cell-based features that can be used in downstream explainable models in computational pathology (CPath). To help drive forward research and innovation for automatic nuclei recognition in CPath, we organise the Colon Nuclei Identification and Counting (CoNIC) Challenge. 
The challenge requires researchers to develop algorithms that perform segmentation, classification and counting of 6 different types of nuclei within the current largest known publicly available nuclei-level dataset in CPath, containing around half a million labelled nuclei.
+
+### Task Information
+
+The CoNIC challenge has 2 tasks:
+
+- Task 1: Nuclear segmentation and classification.
+
+The first task requires participants to segment nuclei within the tissue, while also classifying each nucleus into one of the following categories: epithelial, lymphocyte, plasma, eosinophil, neutrophil or connective tissue.
+
+- Task 2: Prediction of cellular composition.
+
+For the second task, we ask participants to predict how many nuclei of each class are present in each input image.
+
+The output of Task 1 can be directly used to perform Task 2, but the two can be treated as independent tasks. Therefore, if preferred, prediction of cellular composition can be treated as a standalone regression task.
+
+***NOTE: We only consider `Task 1` in the following sections.***
+
+### Original Statistic Information
+
+| Dataset name | Anatomical region | Task type | Modality | Num. Classes | Train/Val/Test Images | Train/Val/Test Labeled | Release Date | License |
+| ------------ | ----------------- | --------- | -------- | ------------ | --------------------- | ---------------------- | ------------ | ------- |
+| [CoNIC2022](https://conic-challenge.grand-challenge.org/) | abdomen | segmentation | histopathology | 7 | 4981/-/- | yes/-/- | 2022 | [Attribution-NonCommercial-ShareAlike 4.0 International](https://creativecommons.org/licenses/by-nc-sa/4.0/) |
+
+| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test |
+| :--------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: |
+| background |    4981    |   83.97    |    -     |    -     |     -     |     -     |
+| neutrophil |    1218    |    0.13    |    -     |    -     |     -     |     -     |
+| epithelial |    4256    |   10.31    |    -     |    -     |     -     |     -     |
+| lymphocyte |    4473    |    1.85    |    -     |    -     |     -     |     -     |
+|   plasma   |    3316    |    0.55    |    -     |    -     |     -     |     -     |
+| eosinophil |    1456    |    0.1     |    -     |    -     |     -     |     -     |
+| connective |    4613    |    3.08    |    -     |    -     |     -     |     -     |
+
+Note:
+
+- `Pct` means the percentage of pixels in this category among all pixels.
+
+### Visualization
+
+![bac](https://raw.githubusercontent.com/uni-medical/medical-datasets-visualization/main/2d/semantic_seg/histopathology/conic2022_seg/conic2022_seg_dataset.png)
+
+### Prerequisites
+
+- Python v3.8
+- PyTorch v1.10.0
+- pillow (PIL) v9.3.0
+- scikit-learn (sklearn) v1.2.0
+- [MIM](https://github.com/open-mmlab/mim) v0.3.4
+- [MMCV](https://github.com/open-mmlab/mmcv) v2.0.0rc4
+- [MMEngine](https://github.com/open-mmlab/mmengine) v0.2.0 or higher
+- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation) v1.0.0rc5
+
+All the commands below rely on the correct configuration of `PYTHONPATH`, which should point to the project's directory so that Python can locate the module files. In the `conic2022_seg/` root directory, run the following line to add the current directory to `PYTHONPATH`:
+
+```shell
+export PYTHONPATH=`pwd`:$PYTHONPATH
+```
+
+### Dataset preparing
+
+- download dataset from [here](https://drive.google.com/drive/folders/1il9jG7uA4-ebQ_lNmXbbF2eOK9uNwheb/) and move data to path `'data/CoNIC_Challenge'`. The directory should be like:
+
+  ```shell
+  data/CoNIC_Challenge
+  ├── README.txt
+  ├── by-nc-sa.md
+  ├── counts.csv
+  ├── images.npy
+  ├── labels.npy
+  └── patch_info.csv
+  ```
+
+- run script `"python tools/prepare_dataset.py"` to format data and change folder structure as below (an inspection sketch follows the folder structure).
+- run script `"python ../../tools/split_seg_dataset.py"` to split dataset and generate `train.txt`, `val.txt` and `test.txt`. If the labels of the official validation and test sets cannot be obtained, we generate `train.txt` and `val.txt` from the training set randomly.
+
+```none
+  mmsegmentation
+  ├── mmseg
+  ├── projects
+  │   ├── medical
+  │   │   ├── 2d_image
+  │   │   │   ├── histopathology
+  │   │   │   │   ├── conic2022_seg
+  │   │   │   │   │   ├── configs
+  │   │   │   │   │   ├── datasets
+  │   │   │   │   │   ├── tools
+  │   │   │   │   │   ├── data
+  │   │   │   │   │   │   ├── train.txt
+  │   │   │   │   │   │   ├── val.txt
+  │   │   │   │   │   │   ├── images
+  │   │   │   │   │   │   │   ├── train
+  │   │   │   │   │   │   │   │   ├── xxx.png
+  │   │   │   │   │   │   │   │   ├── ...
+  │   │   │   │   │   │   │   │   └── xxx.png
+  │   │   │   │   │   │   ├── masks
+  │   │   │   │   │   │   │   ├── train
+  │   │   │   │   │   │   │   │   ├── xxx.png
+  │   │   │   │   │   │   │   │   ├── ...
+  │   │   │   │   │   │   │   │   └── xxx.png
+```
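+
+Before running the conversion, you can quickly sanity-check the raw arrays. The sketch below mirrors `tools/prepare_dataset.py`, which reads the RGB patches from `images.npy` and the semantic class map from channel 1 of `labels.npy` (the other channel is not used by this project); shapes and class ids are printed rather than assumed.
+
+```python
+import numpy as np
+
+images = np.load('data/CoNIC_Challenge/images.npy')
+labels = np.load('data/CoNIC_Challenge/labels.npy')
+print(images.shape, labels.shape)
+# semantic classes present in the first patch (0 = background)
+print(np.unique(labels[0, :, :, 1]))
+```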
+
+### Divided Dataset Information
+
+***Note: The table information below is divided by ourselves.***
+
+| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test |
+| :--------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: |
+| background |    3984    |   84.06    |   997    |  83.65   |     -     |     -     |
+| neutrophil |    956     |    0.12    |   262    |   0.13   |     -     |     -     |
+| epithelial |    3400    |   10.26    |   856    |  10.52   |     -     |     -     |
+| lymphocyte |    3567    |    1.83    |   906    |   1.96   |     -     |     -     |
+|   plasma   |    2645    |    0.55    |   671    |   0.56   |     -     |     -     |
+| eosinophil |    1154    |    0.1     |   302    |   0.1    |     -     |     -     |
+| connective |    3680    |    3.08    |   933    |   3.08   |     -     |     -     |
+
+### Training commands
+
+Train models on a single server with one GPU.
+
+```shell
+mim train mmseg ./configs/${CONFIG_FILE}
+```
+
+### Testing commands
+
+Test models on a single server with one GPU.
+
+```shell
+mim test mmseg ./configs/${CONFIG_FILE} --checkpoint ${CHECKPOINT_PATH}
+```
+
+## Organizers
+
+- Simon Graham (TIA, PathLAKE)
+- Mostafa Jahanifar (TIA, PathLAKE)
+- Dang Vu (TIA)
+- Giorgos Hadjigeorghiou (TIA, PathLAKE)
+- Thomas Leech (TIA, PathLAKE)
+- David Snead (UHCW, PathLAKE)
+- Shan Raza (TIA, PathLAKE)
+- Fayyaz Minhas (TIA, PathLAKE)
+- Nasir Rajpoot (TIA, PathLAKE)
+
+TIA: Tissue Image Analytics Centre, Department of Computer Science, University of Warwick, United Kingdom
+
+UHCW: Department of Pathology, University Hospitals Coventry and Warwickshire, United Kingdom
+
+PathLAKE: Pathology Image Data Lake for Analytics, Knowledge & Education, University Hospitals Coventry and Warwickshire, United Kingdom
+
+## Dataset Citation
+
+If this work is helpful for your research, please consider citing the below paper.
+ +``` +@inproceedings{graham2021lizard, + title={Lizard: A large-scale dataset for colonic nuclear instance segmentation and classification}, + author={Graham, Simon and Jahanifar, Mostafa and Azam, Ayesha and Nimir, Mohammed and Tsang, Yee-Wah and Dodd, Katherine and Hero, Emily and Sahota, Harvir and Tank, Atisha and Benes, Ksenija and others}, + booktitle={Proceedings of the IEEE/CVF International Conference on Computer Vision}, + pages={684--693}, + year={2021} +} +@article{graham2021conic, + title={Conic: Colon nuclei identification and counting challenge 2022}, + author={Graham, Simon and Jahanifar, Mostafa and Vu, Quoc Dang and Hadjigeorghiou, Giorgos and Leech, Thomas and Snead, David and Raza, Shan E Ahmed and Minhas, Fayyaz and Rajpoot, Nasir}, + journal={arXiv preprint arXiv:2111.14485}, + year={2021} +} +``` + +## Checklist + +- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. + + - [x] Finish the code + + - [x] Basic docstrings & proper citation + + - [x] A full README + +- [ ] Milestone 2: Indicates a successful model implementation. + + - [ ] Training-time correctness + +- [ ] Milestone 3: Good to be a part of our core package! + + - [ ] Type hints and docstrings + + - [ ] Unit tests + + - [ ] Code polishing + + - [ ] Metafile.yml + +- [ ] Move your modules into the core package following the codebase's file hierarchy structure. + +- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure. diff --git a/projects/medical/2d_image/histopathology/conic2022_seg/configs/conic2022-seg_512x512.py b/projects/medical/2d_image/histopathology/conic2022_seg/configs/conic2022-seg_512x512.py new file mode 100644 index 0000000000..51b4e5782a --- /dev/null +++ b/projects/medical/2d_image/histopathology/conic2022_seg/configs/conic2022-seg_512x512.py @@ -0,0 +1,42 @@ +dataset_type = 'Conic2022SegDataset' +data_root = 'data/' +img_scale = (512, 512) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +train_dataloader = dict( + batch_size=16, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='train.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='val.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) +test_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) diff --git a/projects/medical/2d_image/histopathology/conic2022_seg/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_conic2022-512x512.py b/projects/medical/2d_image/histopathology/conic2022_seg/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_conic2022-512x512.py new file mode 100644 index 0000000000..3e0248c78c --- /dev/null +++ 
b/projects/medical/2d_image/histopathology/conic2022_seg/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_conic2022-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + './conic2022-seg_512x512.py', 'mmseg::_base_/models/fcn_unet_s5-d16.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.conic2022-seg_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.0001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=7), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/histopathology/conic2022_seg/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_conic2022-512x512.py b/projects/medical/2d_image/histopathology/conic2022_seg/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_conic2022-512x512.py new file mode 100644 index 0000000000..fd0e9d8d28 --- /dev/null +++ b/projects/medical/2d_image/histopathology/conic2022_seg/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_conic2022-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + './conic2022-seg_512x512.py', 'mmseg::_base_/models/fcn_unet_s5-d16.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.conic2022-seg_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=7), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/histopathology/conic2022_seg/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_conic2022-512x512.py b/projects/medical/2d_image/histopathology/conic2022_seg/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_conic2022-512x512.py new file mode 100644 index 0000000000..bb667f14fd --- /dev/null +++ b/projects/medical/2d_image/histopathology/conic2022_seg/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_conic2022-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + './conic2022-seg_512x512.py', 'mmseg::_base_/models/fcn_unet_s5-d16.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.conic2022-seg_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.01) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=7), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/histopathology/conic2022_seg/conic2022_seg_dataset.png b/projects/medical/2d_image/histopathology/conic2022_seg/conic2022_seg_dataset.png new file mode 100644 index 0000000000..65bb0bbe0a Binary files /dev/null and b/projects/medical/2d_image/histopathology/conic2022_seg/conic2022_seg_dataset.png differ diff --git a/projects/medical/2d_image/histopathology/conic2022_seg/datasets/conic2022-seg_dataset.py b/projects/medical/2d_image/histopathology/conic2022_seg/datasets/conic2022-seg_dataset.py new file mode 100644 index 0000000000..9af0958ab3 --- /dev/null +++ 
b/projects/medical/2d_image/histopathology/conic2022_seg/datasets/conic2022-seg_dataset.py @@ -0,0 +1,29 @@ +from mmseg.datasets import BaseSegDataset +from mmseg.registry import DATASETS + + +@DATASETS.register_module() +class Conic2022SegDataset(BaseSegDataset): + """Conic2022SegDataset dataset. + + In segmentation map annotation for Conic2022SegDataset, + ``reduce_zero_label`` is fixed to False. The ``img_suffix`` + is fixed to '.png' and ``seg_map_suffix`` is fixed to '.png'. + + Args: + img_suffix (str): Suffix of images. Default: '.png' + seg_map_suffix (str): Suffix of segmentation maps. Default: '.png' + """ + METAINFO = dict( + classes=('background', 'neutrophil', 'epithelial', 'lymphocyte', + 'plasma', 'eosinophil', 'connective')) + + def __init__(self, + img_suffix='.png', + seg_map_suffix='.png', + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=False, + **kwargs) diff --git a/projects/medical/2d_image/histopathology/conic2022_seg/tools/prepare_dataset.py b/projects/medical/2d_image/histopathology/conic2022_seg/tools/prepare_dataset.py new file mode 100755 index 0000000000..89cfb4aae2 --- /dev/null +++ b/projects/medical/2d_image/histopathology/conic2022_seg/tools/prepare_dataset.py @@ -0,0 +1,65 @@ +import glob +import os +import shutil + +import numpy as np +from PIL import Image + +img_save_root = 'data/' +root_path = 'data/' +img_suffix = '.png' +seg_map_suffix = '.png' +save_img_suffix = '.png' +save_seg_map_suffix = '.png' + +label_set = set() + + +def save_masks_from_npz(data, save_root, part='masks/'): + global label_set + num = data.shape[0] + for i in range(num): + # np_img = data[i, :, :, :] + np_mask = data[i, :, :, 1] + label_set = set.union(label_set, set(np.unique(np_mask))) + img = Image.fromarray(np_mask) + save_path = os.path.join(save_root, part, str(i) + save_seg_map_suffix) + img.save(save_path) + + +def save_images_from_npz(data, save_root, part='images/'): + num = data.shape[0] + for i in range(num): + np_img = data[i, :, :, :] + img = Image.fromarray(np_img) + save_path = os.path.join(save_root, part, str(i) + save_img_suffix) + img.save(save_path) + + +images_npy = np.load('data/CoNIC_Challenge/images.npy') +labels_npy = np.load('data/CoNIC_Challenge/labels.npy') + +os.system('mkdir -p ' + img_save_root + 'images_ori') +os.system('mkdir -p ' + img_save_root + 'labels') +save_images_from_npz(images_npy, img_save_root, 'images_ori') +save_masks_from_npz(labels_npy, img_save_root, 'labels') +print(label_set) + +x_train = glob.glob(os.path.join('data/images_ori/*' + img_suffix)) + +os.system('mkdir -p ' + root_path + 'images/train/') +os.system('mkdir -p ' + root_path + 'masks/train/') + +part_dir_dict = {0: 'train/', 1: 'val/'} +for ith, part in enumerate([x_train]): + part_dir = part_dir_dict[ith] + for img in part: + basename = os.path.basename(img) + shutil.copy( + img, root_path + 'images/' + part_dir + basename.split('.')[0] + + save_img_suffix) + mask_path = root_path + 'labels/' + basename.split( + '.')[0] + seg_map_suffix + save_mask_path = root_path + 'masks/' + part_dir + basename.split( + '.')[0] + save_seg_map_suffix + shutil.copy(mask_path, save_mask_path) diff --git a/projects/medical/2d_image/histopathology/consep/README.md b/projects/medical/2d_image/histopathology/consep/README.md new file mode 100644 index 0000000000..ca3d7aa108 --- /dev/null +++ b/projects/medical/2d_image/histopathology/consep/README.md @@ -0,0 +1,147 @@ +# Colorectal Nuclear Segmentation and 
Phenotypes (CoNSeP) Dataset
+
+## Description
+
+This project supports **`Colorectal Nuclear Segmentation and Phenotypes (CoNSeP) Dataset`**, which can be downloaded from [here](https://warwick.ac.uk/fac/cross_fac/tia/data/hovernet/).
+
+### Dataset Overview
+
+The CoNSeP (Colorectal Nuclear Segmentation and Phenotypes) dataset consists of 41 H&E stained image tiles, each with a size of 1,000×1,000 pixels and a magnification of 40x. These images were extracted from 16 colorectal adenocarcinoma (CRA) whole slide images (WSI), each of which belonged to a separate patient and was scanned using an Omnyx VL120 scanner at the Pathology Department of the University Hospitals Coventry and Warwickshire NHS Trust, UK. This dataset was first used in the paper "HoVer-Net: Simultaneous Segmentation and Classification of Nuclei in Multi-Tissue Histology Images".
+
+### Original Statistic Information
+
+| Dataset name | Anatomical region | Task type | Modality | Num. Classes | Train/Val/Test Images | Train/Val/Test Labeled | Release Date | License |
+| ---------------------------------------------------------------- | ----------------- | ------------ | -------------- | ------------ | --------------------- | ---------------------- | ------------ | ------- |
+| [CoNSeP](https://warwick.ac.uk/fac/cross_fac/tia/data/hovernet/) | abdomen | segmentation | histopathology | 8 | 27/14/- | yes/yes/- | 2019 | - |
+
+| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test |
+| :-----------------------------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: |
+| background | 27 | 83.61 | 14 | 80.4 | - | - |
+| other | 17 | 0.17 | 9 | 0.52 | - | - |
+| inflammatory | 25 | 2.66 | 14 | 2.14 | - | - |
+| healthy epithelial | 3 | 1.47 | 2 | 1.58 | - | - |
+| dysplastic/malignant epithelial | 10 | 7.17 | 8 | 9.16 | - | - |
+| fibroblast | 23 | 3.84 | 14 | 4.63 | - | - |
+| muscle | 8 | 1.05 | 3 | 1.42 | - | - |
+| endothelial | 7 | 0.02 | 4 | 0.15 | - | - |
+
+Note:
+
+- `Pct` means percentage of pixels in this category in all pixels.
+
+### Visualization
+
+![bac](https://raw.githubusercontent.com/uni-medical/medical-datasets-visualization/main/2d/semantic_seg/histopathology/consep/consep_dataset.png)
+
+### Prerequisites
+
+- Python v3.8
+- PyTorch v1.10.0
+- [MIM](https://github.com/open-mmlab/mim) v0.3.4
+- [MMCV](https://github.com/open-mmlab/mmcv) v2.0.0rc4
+- [MMEngine](https://github.com/open-mmlab/mmengine) v0.2.0 or higher
+- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation) v1.0.0rc5
+
+All the commands below rely on the correct configuration of `PYTHONPATH`, which should point to the project's directory so that Python can locate the module files. In the `consep/` root directory, run the following line to add the current directory to `PYTHONPATH`:
+
+```shell
+export PYTHONPATH=`pwd`:$PYTHONPATH
+```
+
+### Dataset preparing
+
+- download the dataset from [here](https://opendatalab.com/CoNSeP) and decompress the data to path `'data/'`.
+- run the script `"python tools/prepare_dataset.py"` to format the data and change the folder structure as below.
+- run the script `"python ../../tools/split_seg_dataset.py"` to split the dataset and generate `train.txt`, `val.txt` and `test.txt`. Since the labels of the official validation and test sets are not available, we randomly generate `train.txt` and `val.txt` from the training set (a minimal sketch of this step is shown after this list).
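+
+The split step can be illustrated with a minimal sketch like the one below. It is only an illustration, assuming an 80/20 random split and the `data/` layout produced by `tools/prepare_dataset.py`; the actual logic lives in `../../tools/split_seg_dataset.py` and may differ.
+
+```python
+import glob
+import os
+import random
+
+random.seed(0)  # make the illustrative split reproducible
+# stems of the prepared training images, e.g. 'train/xxx'
+stems = sorted('train/' + os.path.splitext(os.path.basename(p))[0]
+               for p in glob.glob('data/images/train/*.png'))
+random.shuffle(stems)
+split = int(len(stems) * 0.8)  # assumed 80/20 train/val ratio
+with open('data/train.txt', 'w') as f:
+    f.write('\n'.join(stems[:split]))
+with open('data/val.txt', 'w') as f:
+    f.write('\n'.join(stems[split:]))
+```
+
+The expected folder structure after the preparation steps is shown below.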
+
+```none
+ mmsegmentation
+ ├── mmseg
+ ├── projects
+ │ ├── medical
+ │ │ ├── 2d_image
+ │ │ │ ├── histopathology
+ │ │ │ │ ├── consep
+ │ │ │ │ │ ├── configs
+ │ │ │ │ │ ├── datasets
+ │ │ │ │ │ ├── tools
+ │ │ │ │ │ ├── data
+ │ │ │ │ │ │ ├── train.txt
+ │ │ │ │ │ │ ├── val.txt
+ │ │ │ │ │ │ ├── images
+ │ │ │ │ │ │ │ ├── train
+ │ │ │ │ | │ │ │ ├── xxx.png
+ │ │ │ │ | │ │ │ ├── ...
+ │ │ │ │ | │ │ │ └── xxx.png
+ │ │ │ │ │ │ ├── masks
+ │ │ │ │ │ │ │ ├── train
+ │ │ │ │ | │ │ │ ├── xxx.png
+ │ │ │ │ | │ │ │ ├── ...
+ │ │ │ │ | │ │ │ └── xxx.png
+```
+
+### Training commands
+
+Train models on a single server with one GPU.
+
+```shell
+mim train mmseg ./configs/${CONFIG_FILE}
+```
+
+### Testing commands
+
+Test models on a single server with one GPU.
+
+```shell
+mim test mmseg ./configs/${CONFIG_FILE} --checkpoint ${CHECKPOINT_PATH}
+```
+
+
+
+## Dataset Citation
+
+If this work is helpful for your research, please consider citing the paper below.
+
+```
+@article{graham2019hover,
+  title={Hover-net: Simultaneous segmentation and classification of nuclei in multi-tissue histology images},
+  author={Graham, Simon and Vu, Quoc Dang and Raza, Shan E Ahmed and Azam, Ayesha and Tsang, Yee Wah and Kwak, Jin Tae and Rajpoot, Nasir},
+  journal={Medical Image Analysis},
+  volume={58},
+  pages={101563},
+  year={2019},
+  publisher={Elsevier}
+}
+```
+
+## Checklist
+
+- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`.
+
+  - [x] Finish the code
+
+  - [x] Basic docstrings & proper citation
+
+  - [x] Test-time correctness
+
+  - [x] A full README
+
+- [ ] Milestone 2: Indicates a successful model implementation.
+
+  - [ ] Training-time correctness
+
+- [ ] Milestone 3: Good to be a part of our core package!
+
+  - [ ] Type hints and docstrings
+
+  - [ ] Unit tests
+
+  - [ ] Code polishing
+
+  - [ ] Metafile.yml
+
+- [ ] Move your modules into the core package following the codebase's file hierarchy structure.
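+
+As a side note, the `Pct` statistics in the tables above can be reproduced with a short script along the following lines (a sketch only; it assumes the single-channel PNG masks produced by the preparation steps live under `data/masks/train/`):
+
+```python
+import glob
+
+import numpy as np
+from PIL import Image
+
+NUM_CLASSES = 8  # CoNSeP: background + 7 nuclei types
+counts = np.zeros(NUM_CLASSES, dtype=np.int64)
+for path in glob.glob('data/masks/train/*.png'):
+    mask = np.array(Image.open(path))
+    counts += np.bincount(mask.ravel(), minlength=NUM_CLASSES)[:NUM_CLASSES]
+for cls_id, n in enumerate(counts):
+    print(f'class {cls_id}: {100 * n / counts.sum():.2f}%')
+```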
diff --git a/projects/medical/2d_image/histopathology/consep/configs/consep_512x512.py b/projects/medical/2d_image/histopathology/consep/configs/consep_512x512.py new file mode 100644 index 0000000000..0d9b8948b0 --- /dev/null +++ b/projects/medical/2d_image/histopathology/consep/configs/consep_512x512.py @@ -0,0 +1,42 @@ +dataset_type = 'ConsepDataset' +data_root = 'data/' +img_scale = (512, 512) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +train_dataloader = dict( + batch_size=16, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='train.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='val.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) +test_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) diff --git a/projects/medical/2d_image/histopathology/consep/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_consep-512x512.py b/projects/medical/2d_image/histopathology/consep/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_consep-512x512.py new file mode 100644 index 0000000000..cbcf5db775 --- /dev/null +++ b/projects/medical/2d_image/histopathology/consep/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_consep-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + './consep_512x512.py', 'mmseg::_base_/models/fcn_unet_s5-d16.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.consep_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.0001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=8), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/histopathology/consep/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_consep-512x512.py b/projects/medical/2d_image/histopathology/consep/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_consep-512x512.py new file mode 100644 index 0000000000..b374566e6e --- /dev/null +++ b/projects/medical/2d_image/histopathology/consep/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_consep-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + './consep_512x512.py', 'mmseg::_base_/models/fcn_unet_s5-d16.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.consep_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=8), + 
auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/histopathology/consep/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_consep-512x512.py b/projects/medical/2d_image/histopathology/consep/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_consep-512x512.py new file mode 100644 index 0000000000..35bdaa34c8 --- /dev/null +++ b/projects/medical/2d_image/histopathology/consep/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_consep-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + './consep_512x512.py', 'mmseg::_base_/models/fcn_unet_s5-d16.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.consep_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.01) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=8), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/histopathology/consep/datasets/consep_dataset.py b/projects/medical/2d_image/histopathology/consep/datasets/consep_dataset.py new file mode 100644 index 0000000000..ceb2b3ab25 --- /dev/null +++ b/projects/medical/2d_image/histopathology/consep/datasets/consep_dataset.py @@ -0,0 +1,30 @@ +from mmseg.datasets import BaseSegDataset +from mmseg.registry import DATASETS + + +@DATASETS.register_module() +class ConsepDataset(BaseSegDataset): + """ConsepDataset dataset. + + In segmentation map annotation for ConsepDataset, + ``reduce_zero_label`` is fixed to False. The ``img_suffix`` + is fixed to '.png' and ``seg_map_suffix`` is fixed to '.png'. + + Args: + img_suffix (str): Suffix of images. Default: '.png' + seg_map_suffix (str): Suffix of segmentation maps. 
Default: '.png'
+    """
+    METAINFO = dict(
+        classes=('background', 'other', 'inflammatory', 'healthy epithelial',
+                 'dysplastic/malignant epithelial', 'fibroblast', 'muscle',
+                 'endothelial'))
+
+    def __init__(self,
+                 img_suffix='.png',
+                 seg_map_suffix='.png',
+                 **kwargs) -> None:
+        super().__init__(
+            img_suffix=img_suffix,
+            seg_map_suffix=seg_map_suffix,
+            reduce_zero_label=False,
+            **kwargs)
diff --git a/projects/medical/2d_image/histopathology/consep/tools/prepare_dataset.py b/projects/medical/2d_image/histopathology/consep/tools/prepare_dataset.py
new file mode 100755
index 0000000000..83a2e18ce1
--- /dev/null
+++ b/projects/medical/2d_image/histopathology/consep/tools/prepare_dataset.py
@@ -0,0 +1,54 @@
+import glob
+import os
+import shutil
+
+import numpy as np
+from PIL import Image
+from scipy.io import loadmat
+
+root_path = 'data/'
+img_suffix = '.png'
+seg_map_suffix = '.mat'
+save_img_suffix = '.png'
+save_seg_map_suffix = '.png'
+
+x_train = glob.glob(os.path.join('data/CoNSeP/Train/Images/*' + img_suffix))
+x_test = glob.glob(os.path.join('data/CoNSeP/Test/Images/*' + img_suffix))
+
+os.system('mkdir -p ' + root_path + 'images/train/')
+os.system('mkdir -p ' + root_path + 'images/val/')
+os.system('mkdir -p ' + root_path + 'masks/train/')
+os.system('mkdir -p ' + root_path + 'masks/val/')
+D2_255_convert_dict = {0: 0, 255: 1}
+
+
+def convert_2d(img, convert_dict=D2_255_convert_dict):
+    arr_2d = np.zeros((img.shape[0], img.shape[1]), dtype=np.uint8)
+    for c, i in convert_dict.items():
+        arr_2d[img == c] = i
+    return arr_2d
+
+
+part_dir_dict = {0: 'CoNSeP/Train/', 1: 'CoNSeP/Test/'}
+save_dir_dict = {0: 'train/', 1: 'val/'}
+for ith, part in enumerate([x_train, x_test]):
+    part_dir = part_dir_dict[ith]
+    for img in part:
+        basename = os.path.basename(img)
+        shutil.copy(
+            img, root_path + 'images/' + save_dir_dict[ith] +
+            basename.split('.')[0] + save_img_suffix)
+
+        mask_path = root_path + part_dir + 'Labels/' + basename.split(
+            '.')[0] + seg_map_suffix
+        label_ = loadmat(mask_path)
+        label = label_['inst_map']
+        label_type = label_['inst_type']
+        # 'inst_map' assigns an instance id to every nucleus pixel and
+        # 'inst_type' stores the class of each instance; mapping instance
+        # ids to their types converts the instance map to a semantic mask.
+        label_dict = {i + 1: int(val) for i, val in enumerate(label_type)}
+
+        save_mask_path = root_path + 'masks/' + save_dir_dict[
+            ith] + basename.split('.')[0] + save_seg_map_suffix
+
+        res = convert_2d(label, convert_dict=label_dict)
+        res = Image.fromarray(res.astype(np.uint8))
+        res.save(save_mask_path)
diff --git a/projects/medical/2d_image/histopathology/fusc2021/README.md b/projects/medical/2d_image/histopathology/fusc2021/README.md
new file mode 100644
index 0000000000..8130d59350
--- /dev/null
+++ b/projects/medical/2d_image/histopathology/fusc2021/README.md
@@ -0,0 +1,136 @@
+# Foot Ulcer Segmentation Challenge 2021 (FUSC 2021)
+
+## Description
+
+This project supports **`Foot Ulcer Segmentation Challenge 2021 (FUSC 2021)`**, which can be downloaded from [here](https://fusc.grand-challenge.org/).
+
+### Dataset Overview
+
+This chronic wound dataset was collected over two years, from October 2019 to April 2021, at the AZH Wound and Vascular Center in Milwaukee, Wisconsin, and contains 1,210 foot ulcer images taken from 889 patients during multiple clinical visits. The raw images were taken with a Canon SX 620 HS digital camera and an iPad Pro under uncontrolled illumination conditions, with various backgrounds. The images are randomly split into 3 subsets: a training set with 810 images, a validation set with 200 images, and a testing set with 200 images. The annotations of the testing set are kept private.
The collected data were de-identified in accordance with relevant guidelines and regulations, and the requirement for informed consent was waived by the institutional review board of the University of Wisconsin-Milwaukee.
+
+### Information Statistics
+
+| Dataset Name | Anatomical Region | Task Type | Modality | Num. Classes | Train/Val/Test Images | Train/Val/Test Labeled | Release Date | License |
+| --------------------------------------------- | ----------------- | ------------ | -------------- | ------------ | --------------------- | ---------------------- | ------------ | ------------------------------------------------------------- |
+| [fusc2021](https://fusc.grand-challenge.org/) | lower limb | segmentation | histopathology | 2 | 810/200/200 | yes/yes/no | 2021 | [CC0 1.0](https://creativecommons.org/publicdomain/zero/1.0/) |
+
+| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test |
+| :--------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: |
+| background | 810 | 98.71 | 200 | 98.78 | - | - |
+| wound | 791 | 1.29 | 195 | 1.22 | - | - |
+
+Note:
+
+- `Pct` means percentage of pixels in this category in all pixels.
+
+### Visualization
+
+![fusc2021](https://raw.githubusercontent.com/uni-medical/medical-datasets-visualization/main/2d/semantic_seg/histopathology/fusc2021/fusc2021_dataset.png?raw=true)
+
+### Dataset Citation
+
+```
+@article{s41598-020-78799-w,
+  title={Fully automatic wound segmentation with deep convolutional neural networks},
+  author={Chuanbo Wang and D. M. Anisuzzaman and Victor Williamson and Mrinal Kanti Dhar and Behrouz Rostami and Jeffrey Niezgoda and Sandeep Gopalakrishnan and Zeyun Yu},
+  journal={Scientific Reports},
+  volume={10},
+  number={1},
+  pages={21897},
+  year={2020}
+}
+```
+
+### Prerequisites
+
+- Python v3.8
+- PyTorch v1.10.0
+- pillow(PIL) v9.3.0
+- scikit-learn(sklearn) v1.2.0
+- [MIM](https://github.com/open-mmlab/mim) v0.3.4
+- [MMCV](https://github.com/open-mmlab/mmcv) v2.0.0rc4
+- [MMEngine](https://github.com/open-mmlab/mmengine) v0.2.0 or higher
+- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation) v1.0.0rc5
+
+All the commands below rely on the correct configuration of `PYTHONPATH`, which should point to the project's directory so that Python can locate the module files. In the `fusc2021/` root directory, run the following line to add the current directory to `PYTHONPATH`:
+
+```shell
+export PYTHONPATH=`pwd`:$PYTHONPATH
+```
+
+### Dataset Preparing
+
+- download the dataset from [here](https://fusc.grand-challenge.org/) and decompress the data to path `'data/'`.
+- run the script `"python tools/prepare_dataset.py"` to format the data and change the folder structure as below.
+- run the script `"python ../../tools/split_seg_dataset.py"` to split the dataset and generate `train.txt`, `val.txt` and `test.txt`. Since the labels of the official validation and test sets are not available, we randomly generate `train.txt` and `val.txt` from the training set.
+
+```none
+ mmsegmentation
+ ├── mmseg
+ ├── projects
+ │ ├── medical
+ │ │ ├── 2d_image
+ │ │ │ ├── histopathology
+ │ │ │ │ ├── fusc2021
+ │ │ │ │ │ ├── configs
+ │ │ │ │ │ ├── datasets
+ │ │ │ │ │ ├── tools
+ │ │ │ │ │ ├── data
+ │ │ │ │ │ │ ├── train.txt
+ │ │ │ │ │ │ ├── val.txt
+ │ │ │ │ │ │ ├── images
+ │ │ │ │ │ │ │ ├── train
+ │ │ │ │ | │ │ │ ├── xxx.png
+ │ │ │ │ | │ │ │ ├── ...
+ │ │ │ │ | │ │ │ └── xxx.png
+ │ │ │ │ │ │ ├── masks
+ │ │ │ │ │ │ │ ├── train
+ │ │ │ │ | │ │ │ ├── xxx.png
+ │ │ │ │ | │ │ │ ├── ...
+ │ │ │ │ | │ │ │ └── xxx.png
+```
+
+### Training commands
+
+To train models on a single server with one GPU (default):
+
+```shell
+mim train mmseg ./configs/${CONFIG_FILE}
+```
+
+### Testing commands
+
+To test models on a single server with one GPU (default):
+
+```shell
+mim test mmseg ./configs/${CONFIG_FILE} --checkpoint ${CHECKPOINT_PATH}
+```
+
+
+
+## Checklist
+
+- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`.
+
+  - [x] Finish the code
+  - [x] Basic docstrings & proper citation
+  - [ ] Test-time correctness
+  - [x] A full README
+
+- [ ] Milestone 2: Indicates a successful model implementation.
+
+  - [ ] Training-time correctness
+
+- [ ] Milestone 3: Good to be a part of our core package!
+
+  - [ ] Type hints and docstrings
+  - [ ] Unit tests
+  - [ ] Code polishing
+  - [ ] Metafile.yml
+
+- [ ] Move your modules into the core package following the codebase's file hierarchy structure.
diff --git a/projects/medical/2d_image/histopathology/fusc2021/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_fusc2021-512x512.py b/projects/medical/2d_image/histopathology/fusc2021/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_fusc2021-512x512.py
new file mode 100644
index 0000000000..c3f4275112
--- /dev/null
+++ b/projects/medical/2d_image/histopathology/fusc2021/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_fusc2021-512x512.py
@@ -0,0 +1,17 @@
+_base_ = [
+    'mmseg::_base_/models/fcn_unet_s5-d16.py', './fusc2021_512x512.py',
+    'mmseg::_base_/default_runtime.py',
+    'mmseg::_base_/schedules/schedule_20k.py'
+]
+custom_imports = dict(imports='datasets.fusc2021_dataset')
+img_scale = (512, 512)
+data_preprocessor = dict(size=img_scale)
+optimizer = dict(lr=0.0001)
+optim_wrapper = dict(optimizer=optimizer)
+model = dict(
+    data_preprocessor=data_preprocessor,
+    decode_head=dict(num_classes=2),
+    auxiliary_head=None,
+    test_cfg=dict(mode='whole', _delete_=True))
+vis_backends = None
+visualizer = dict(vis_backends=vis_backends)
diff --git a/projects/medical/2d_image/histopathology/fusc2021/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_fusc2021-512x512.py b/projects/medical/2d_image/histopathology/fusc2021/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_fusc2021-512x512.py
new file mode 100644
index 0000000000..ed870303ff
--- /dev/null
+++ b/projects/medical/2d_image/histopathology/fusc2021/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_fusc2021-512x512.py
@@ -0,0 +1,17 @@
+_base_ = [
+    'mmseg::_base_/models/fcn_unet_s5-d16.py', './fusc2021_512x512.py',
+    'mmseg::_base_/default_runtime.py',
+    'mmseg::_base_/schedules/schedule_20k.py'
+]
+custom_imports = dict(imports='datasets.fusc2021_dataset')
+img_scale = (512, 512)
+data_preprocessor = dict(size=img_scale)
+optimizer = dict(lr=0.001)
+optim_wrapper = dict(optimizer=optimizer)
+model = dict(
+    data_preprocessor=data_preprocessor,
+    decode_head=dict(num_classes=2),
+    auxiliary_head=None,
+    test_cfg=dict(mode='whole', _delete_=True))
+vis_backends = None
+visualizer = dict(vis_backends=vis_backends)
diff --git a/projects/medical/2d_image/histopathology/fusc2021/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_fusc2021-512x512.py b/projects/medical/2d_image/histopathology/fusc2021/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_fusc2021-512x512.py
new file mode 100644
index 0000000000..cbc09ae6cd
--- /dev/null
+++ b/projects/medical/2d_image/histopathology/fusc2021/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_fusc2021-512x512.py
@@ -0,0
+1,17 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './fusc2021_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.fusc2021_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.01) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/histopathology/fusc2021/configs/fcn-unet-s5-d16_unet_1xb16-0.01lr-sigmoid-20k_fusc2021-512x512.py b/projects/medical/2d_image/histopathology/fusc2021/configs/fcn-unet-s5-d16_unet_1xb16-0.01lr-sigmoid-20k_fusc2021-512x512.py new file mode 100644 index 0000000000..f1477ee725 --- /dev/null +++ b/projects/medical/2d_image/histopathology/fusc2021/configs/fcn-unet-s5-d16_unet_1xb16-0.01lr-sigmoid-20k_fusc2021-512x512.py @@ -0,0 +1,18 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './fusc2021_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.fusc2021_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.01) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict( + num_classes=2, loss_decode=dict(use_sigmoid=True), out_channels=1), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/histopathology/fusc2021/configs/fusc2021_512x512.py b/projects/medical/2d_image/histopathology/fusc2021/configs/fusc2021_512x512.py new file mode 100644 index 0000000000..e650474cea --- /dev/null +++ b/projects/medical/2d_image/histopathology/fusc2021/configs/fusc2021_512x512.py @@ -0,0 +1,42 @@ +dataset_type = 'FUSC2021Dataset' +data_root = 'data/' +img_scale = (512, 512) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +train_dataloader = dict( + batch_size=16, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='train.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='val.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) +test_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) diff --git a/projects/medical/2d_image/histopathology/fusc2021/datasets/fusc2021_dataset.py b/projects/medical/2d_image/histopathology/fusc2021/datasets/fusc2021_dataset.py new file mode 100644 
index 0000000000..d331ac8c3a
--- /dev/null
+++ b/projects/medical/2d_image/histopathology/fusc2021/datasets/fusc2021_dataset.py
@@ -0,0 +1,30 @@
+from mmseg.datasets import BaseSegDataset
+from mmseg.registry import DATASETS
+
+
+@DATASETS.register_module()
+class FUSC2021Dataset(BaseSegDataset):
+    """FUSC2021Dataset dataset.
+
+    In segmentation map annotation for FUSC2021Dataset, 0 stands for
+    background, which is included in 2 categories. ``reduce_zero_label``
+    is fixed to False. The ``img_suffix`` is fixed to '.png' and
+    ``seg_map_suffix`` is fixed to '.png'.
+
+    Args:
+        img_suffix (str): Suffix of images. Default: '.png'
+        seg_map_suffix (str): Suffix of segmentation maps. Default: '.png'
+        reduce_zero_label (bool): Whether to mark label zero as ignored.
+            Default to False.
+    """
+    METAINFO = dict(classes=('background', 'wound'))
+
+    def __init__(self,
+                 img_suffix='.png',
+                 seg_map_suffix='.png',
+                 reduce_zero_label=False,
+                 **kwargs) -> None:
+        super().__init__(
+            img_suffix=img_suffix,
+            seg_map_suffix=seg_map_suffix,
+            reduce_zero_label=reduce_zero_label,
+            **kwargs)
diff --git a/projects/medical/2d_image/histopathology/fusc2021/tools/prepare_dataset.py b/projects/medical/2d_image/histopathology/fusc2021/tools/prepare_dataset.py
new file mode 100644
index 0000000000..8f2de3daa9
--- /dev/null
+++ b/projects/medical/2d_image/histopathology/fusc2021/tools/prepare_dataset.py
@@ -0,0 +1,114 @@
+import glob
+import os
+
+import numpy as np
+from PIL import Image
+
+root_path = 'data/'
+img_suffix = '.png'
+seg_map_suffix = '.png'
+save_img_suffix = '.png'
+save_seg_map_suffix = '.png'
+src_img_train_dir = os.path.join(
+    root_path, 'wound-segmentation/data/' +
+    'Foot Ulcer Segmentation Challenge/train/images')
+src_img_val_dir = os.path.join(
+    root_path, 'wound-segmentation/data/' +
+    'Foot Ulcer Segmentation Challenge/validation/images')
+src_img_test_dir = os.path.join(
+    root_path, 'wound-segmentation/data/' +
+    'Foot Ulcer Segmentation Challenge/test/images')
+src_mask_train_dir = os.path.join(
+    root_path, 'wound-segmentation/data/' +
+    'Foot Ulcer Segmentation Challenge/train/labels')
+src_mask_val_dir = os.path.join(
+    root_path, 'wound-segmentation/data/' +
+    'Foot Ulcer Segmentation Challenge/validation/labels')
+
+tgt_img_train_dir = os.path.join(root_path, 'images/train/')
+tgt_mask_train_dir = os.path.join(root_path, 'masks/train/')
+tgt_img_val_dir = os.path.join(root_path, 'images/val/')
+tgt_mask_val_dir = os.path.join(root_path, 'masks/val/')
+tgt_img_test_dir = os.path.join(root_path, 'images/test/')
+os.system('mkdir -p ' + tgt_img_train_dir)
+os.system('mkdir -p ' + tgt_img_val_dir)
+os.system('mkdir -p ' + tgt_img_test_dir)
+os.system('mkdir -p ' + tgt_mask_train_dir)
+os.system('mkdir -p ' + tgt_mask_val_dir)
+
+
+def filter_suffix_recursive(src_dir, suffix):
+    # filter out file names and paths in source directory
+    suffix = '.' + suffix if '.' not in suffix else suffix
+    file_paths = glob.glob(
+        os.path.join(src_dir, '**', '*' + suffix), recursive=True)
+    file_names = [_.split('/')[-1] for _ in file_paths]
+    return sorted(file_paths), sorted(file_names)
+
+
+def convert_label(img, convert_dict):
+    arr = np.zeros_like(img, dtype=np.uint8)
+    for c, i in convert_dict.items():
+        arr[img == c] = i
+    return arr
+
+
+def convert_pics_into_pngs(src_dir, tgt_dir, suffix, convert='RGB'):
+    if not os.path.exists(tgt_dir):
+        os.makedirs(tgt_dir)
+
+    src_paths, src_names = filter_suffix_recursive(src_dir, suffix=suffix)
+
+    for i, (src_name, src_path) in enumerate(zip(src_names, src_paths)):
+        tgt_name = src_name.replace(suffix, save_img_suffix)
+        tgt_path = os.path.join(tgt_dir, tgt_name)
+        num = len(src_paths)
+        img = np.array(Image.open(src_path))
+        if len(img.shape) == 2:
+            pil = Image.fromarray(img).convert(convert)
+        elif len(img.shape) == 3:
+            pil = Image.fromarray(img)
+        else:
+            raise ValueError('Input image not 2D/3D: ', img.shape)
+
+        pil.save(tgt_path)
+        print(f'processed {i+1}/{num}.')
+
+
+def convert_label_pics_into_pngs(src_dir,
+                                 tgt_dir,
+                                 suffix,
+                                 convert_dict={
+                                     0: 0,
+                                     255: 1
+                                 }):
+    if not os.path.exists(tgt_dir):
+        os.makedirs(tgt_dir)
+
+    src_paths, src_names = filter_suffix_recursive(src_dir, suffix=suffix)
+    num = len(src_paths)
+    for i, (src_name, src_path) in enumerate(zip(src_names, src_paths)):
+        tgt_name = src_name.replace(suffix, save_seg_map_suffix)
+        tgt_path = os.path.join(tgt_dir, tgt_name)
+
+        img = np.array(Image.open(src_path).convert('L'))
+        img = convert_label(img, convert_dict)
+        Image.fromarray(img).save(tgt_path)
+        print(f'processed {i+1}/{num}.')
+
+
+if __name__ == '__main__':
+
+    convert_pics_into_pngs(
+        src_img_train_dir, tgt_img_train_dir, suffix=img_suffix)
+
+    convert_pics_into_pngs(src_img_val_dir, tgt_img_val_dir, suffix=img_suffix)
+
+    convert_pics_into_pngs(
+        src_img_test_dir, tgt_img_test_dir, suffix=img_suffix)
+
+    convert_label_pics_into_pngs(
+        src_mask_train_dir, tgt_mask_train_dir, suffix=seg_map_suffix)
+
+    convert_label_pics_into_pngs(
+        src_mask_val_dir, tgt_mask_val_dir, suffix=seg_map_suffix)
diff --git a/projects/medical/2d_image/histopathology/pannuke/README.md b/projects/medical/2d_image/histopathology/pannuke/README.md
new file mode 100644
index 0000000000..e0cade7536
--- /dev/null
+++ b/projects/medical/2d_image/histopathology/pannuke/README.md
@@ -0,0 +1,146 @@
+# Pan-Cancer Histology Dataset for Nuclei Instance Segmentation and Classification (PanNuke)
+
+## Description
+
+This project supports **`Pan-Cancer Histology Dataset for Nuclei Instance Segmentation and Classification (PanNuke)`**, which can be downloaded from [here](https://academictorrents.com/details/99f2c7b57b95500711e33f2ee4d14c9fd7c7366c).
+
+### Dataset Overview
+
+PanNuke is a semi-automatically generated nuclei instance segmentation and classification dataset with exhaustive nuclei labels across 19 different tissue types. The dataset consists of 481 visual fields, of which 312 are randomly sampled from more than 20K whole slide images at different magnifications, from multiple data sources. In total, the dataset contains 205,343 labeled nuclei, each with an instance segmentation mask. Models trained on PanNuke can aid in whole slide image tissue type segmentation and generalise to new tissues. PanNuke is one of the first successfully semi-automatically generated datasets.
+
+### Statistic Information
+
+| Dataset Name | Anatomical Region | Task Type | Modality | Num. Classes | Train/Val/Test Images | Train/Val/Test Labeled | Release Date | License |
+| ----------------------------------------------------------------------------------------- | ----------------- | ------------ | -------------- | ------------ | --------------------- | ---------------------- | ------------ | ---------------------------------------------------------------------- |
+| [Pannuke](https://academictorrents.com/details/99f2c7b57b95500711e33f2ee4d14c9fd7c7366c) | full_body | segmentation | histopathology | 6 | 7901/-/- | yes/-/- | 2019 | [CC BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/) |
+
+| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test |
+| :-----------------------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: |
+| background | 7901 | 83.32 | - | - | - | - |
+| neoplastic | 4190 | 8.64 | - | - | - | - |
+| non-neoplastic epithelial | 4126 | 1.77 | - | - | - | - |
+| inflammatory | 6137 | 3.73 | - | - | - | - |
+| connective | 232 | 0.07 | - | - | - | - |
+| dead | 1528 | 2.47 | - | - | - | - |
+
+Note:
+
+- `Pct` means percentage of pixels in this category in all pixels.
+
+### Visualization
+
+![pannuke](https://raw.githubusercontent.com/uni-medical/medical-datasets-visualization/main/2d/semantic_seg/histopathology/pannuke/pannuke_dataset.png?raw=true)
+
+### Dataset Citation
+
+```
+@inproceedings{gamper2019pannuke,
+  title={PanNuke: an open pan-cancer histology dataset for nuclei instance segmentation and classification},
+  author={Gamper, Jevgenij and Koohbanani, Navid Alemi and Benet, Ksenija and Khuram, Ali and Rajpoot, Nasir},
+  booktitle={European Congress on Digital Pathology},
+  pages={11--19},
+  year={2019},
+}
+```
+
+### Prerequisites
+
+- Python v3.8
+- PyTorch v1.10.0
+- pillow (PIL) v9.3.0
+- scikit-learn (sklearn) v1.2.0
+- [MIM](https://github.com/open-mmlab/mim) v0.3.4
+- [MMCV](https://github.com/open-mmlab/mmcv) v2.0.0rc4
+- [MMEngine](https://github.com/open-mmlab/mmengine) v0.2.0 or higher
+- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation) v1.0.0rc5
+
+All the commands below rely on the correct configuration of `PYTHONPATH`, which should point to the project's directory so that Python can locate the module files. In the `pannuke/` root directory, run the following line to add the current directory to `PYTHONPATH`:
+
+```shell
+export PYTHONPATH=`pwd`:$PYTHONPATH
+```
+
+### Dataset Preparing
+
+- download the dataset from [here](https://academictorrents.com/details/99f2c7b57b95500711e33f2ee4d14c9fd7c7366c) and decompress the data to path `'data/'`.
+- run the script `"python tools/prepare_dataset.py"` to format the data and change the folder structure as below.
+- run the script `"python ../../tools/split_seg_dataset.py"` to split the dataset and generate `train.txt`, `val.txt` and `test.txt`. Since the labels of the official validation and test sets are not available, we randomly generate `train.txt` and `val.txt` from the training set.
+
+```none
+ mmsegmentation
+ ├── mmseg
+ ├── projects
+ │ ├── medical
+ │ │ ├── 2d_image
+ │ │ │ ├── histopathology
+ │ │ │ │ ├── pannuke
+ │ │ │ │ │ ├── configs
+ │ │ │ │ │ ├── datasets
+ │ │ │ │ │ ├── tools
+ │ │ │ │ │ ├── data
+ │ │ │ │ │ │ ├── train.txt
+ │ │ │ │ │ │ ├── val.txt
+ │ │ │ │ │ │ ├── images
+ │ │ │ │ │ │ │ ├── train
+ │ │ │ │ | │ │ │ ├── xxx.png
+ │ │ │ │ | │ │ │ ├── ...
+ │ │ │ │ | │ │ │ └── xxx.png
+ │ │ │ │ │ │ ├── masks
+ │ │ │ │ │ │ │ ├── train
+ │ │ │ │ | │ │ │ ├── xxx.png
+ │ │ │ │ | │ │ │ ├── ...
+ │ │ │ │ | │ │ │ └── xxx.png
+```
+
+### Divided Dataset Information
+
+***Note: The dataset split in the table below was made by ourselves.***
+
+| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test |
+| :-----------------------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: |
+| background | 6320 | 83.38 | 1581 | 83.1 | - | - |
+| neoplastic | 3339 | 8.55 | 851 | 9.0 | - | - |
+| non-neoplastic epithelial | 3293 | 1.77 | 833 | 1.76 | - | - |
+| inflammatory | 4914 | 3.72 | 1223 | 3.76 | - | - |
+| connective | 170 | 0.06 | 62 | 0.09 | - | - |
+| dead | 1235 | 2.51 | 293 | 2.29 | - | - |
+
+### Training commands
+
+To train models on a single server with one GPU (default):
+
+```shell
+mim train mmseg ./configs/${CONFIG_FILE}
+```
+
+### Testing commands
+
+To test models on a single server with one GPU (default):
+
+```shell
+mim test mmseg ./configs/${CONFIG_FILE} --checkpoint ${CHECKPOINT_PATH}
+```
+
+## Checklist
+
+- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`.
+
+  - [x] Finish the code
+  - [x] Basic docstrings & proper citation
+  - [ ] Test-time correctness
+  - [x] A full README
+
+- [ ] Milestone 2: Indicates a successful model implementation.
+
+  - [ ] Training-time correctness
+
+- [ ] Milestone 3: Good to be a part of our core package!
+
+  - [ ] Type hints and docstrings
+  - [ ] Unit tests
+  - [ ] Code polishing
+  - [ ] Metafile.yml
+
+- [ ] Move your modules into the core package following the codebase's file hierarchy structure.
diff --git a/projects/medical/2d_image/histopathology/pannuke/configs/fcn-unet-s5-d16_unet-{use-sigmoid}_1xb16-0.01-20k_bactteria-detection-512x512.py b/projects/medical/2d_image/histopathology/pannuke/configs/fcn-unet-s5-d16_unet-{use-sigmoid}_1xb16-0.01-20k_bactteria-detection-512x512.py
new file mode 100644
index 0000000000..92584e9a68
--- /dev/null
+++ b/projects/medical/2d_image/histopathology/pannuke/configs/fcn-unet-s5-d16_unet-{use-sigmoid}_1xb16-0.01-20k_bactteria-detection-512x512.py
@@ -0,0 +1,18 @@
+_base_ = [
+    'mmseg::_base_/models/fcn_unet_s5-d16.py',
+    './bactteria-detection_512x512.py', 'mmseg::_base_/default_runtime.py',
+    'mmseg::_base_/schedules/schedule_20k.py'
+]
+custom_imports = dict(imports='datasets.bactteria-detection_dataset')
+img_scale = (512, 512)
+data_preprocessor = dict(size=img_scale)
+optimizer = dict(lr=0.01)
+optim_wrapper = dict(optimizer=optimizer)
+model = dict(
+    data_preprocessor=data_preprocessor,
+    decode_head=dict(
+        num_classes=2, loss_decode=dict(use_sigmoid=True), out_channels=1),
+    auxiliary_head=None,
+    test_cfg=dict(mode='whole', _delete_=True))
+vis_backends = None
+visualizer = dict(vis_backends=vis_backends)
diff --git a/projects/medical/2d_image/histopathology/pannuke/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_pannuke-512x512.py b/projects/medical/2d_image/histopathology/pannuke/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_pannuke-512x512.py
new file mode 100644
index 0000000000..042a08ce00
--- /dev/null
+++ b/projects/medical/2d_image/histopathology/pannuke/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_pannuke-512x512.py
@@ -0,0 +1,17 @@
+_base_ = [
+    'mmseg::_base_/models/fcn_unet_s5-d16.py', './pannuke_512x512.py',
+    'mmseg::_base_/default_runtime.py',
+    'mmseg::_base_/schedules/schedule_20k.py'
+]
+custom_imports = dict(imports='datasets.pannuke_dataset')
+img_scale = (512, 512)
+data_preprocessor =
dict(size=img_scale) +optimizer = dict(lr=0.0001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=6), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/histopathology/pannuke/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_pannuke-512x512.py b/projects/medical/2d_image/histopathology/pannuke/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_pannuke-512x512.py new file mode 100644 index 0000000000..e92514c913 --- /dev/null +++ b/projects/medical/2d_image/histopathology/pannuke/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_pannuke-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './pannuke_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.pannuke_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=6), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/histopathology/pannuke/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_pannuke-512x512.py b/projects/medical/2d_image/histopathology/pannuke/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_pannuke-512x512.py new file mode 100644 index 0000000000..a9403c849f --- /dev/null +++ b/projects/medical/2d_image/histopathology/pannuke/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_pannuke-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './pannuke_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.pannuke_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.01) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=6), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/histopathology/pannuke/configs/pannuke_512x512.py b/projects/medical/2d_image/histopathology/pannuke/configs/pannuke_512x512.py new file mode 100644 index 0000000000..316ac1ac44 --- /dev/null +++ b/projects/medical/2d_image/histopathology/pannuke/configs/pannuke_512x512.py @@ -0,0 +1,42 @@ +dataset_type = 'PanNukeDataset' +data_root = 'data/' +img_scale = (512, 512) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +train_dataloader = dict( + batch_size=16, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='train.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=train_pipeline)) +val_dataloader = dict( + 
batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='val.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) +test_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) diff --git a/projects/medical/2d_image/histopathology/pannuke/datasets/pannuke_dataset.py b/projects/medical/2d_image/histopathology/pannuke/datasets/pannuke_dataset.py new file mode 100644 index 0000000000..4d3c687ff3 --- /dev/null +++ b/projects/medical/2d_image/histopathology/pannuke/datasets/pannuke_dataset.py @@ -0,0 +1,33 @@ +from mmseg.datasets import BaseSegDataset +from mmseg.registry import DATASETS + + +@DATASETS.register_module() +class PanNukeDataset(BaseSegDataset): + """PanNukeDataset dataset. + + In segmentation map annotation for PanNukeDataset, + 0 stands for background, which is included in 6 categories. + ``reduce_zero_label`` is fixed to False. The ``img_suffix`` + is fixed to '.png' and ``seg_map_suffix`` is fixed to '.png'. + + Args: + img_suffix (str): Suffix of images. Default: '.png' + seg_map_suffix (str): Suffix of segmentation maps. Default: '.png' + reduce_zero_label (bool): Whether to mark label zero as ignored. + Default to False. + """ + METAINFO = dict( + classes=('background', 'neoplastic', 'non-neoplastic epithelial', + 'inflammatory', 'connective', 'dead')) + + def __init__(self, + img_suffix='.png', + seg_map_suffix='.png', + reduce_zero_label=False, + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, + **kwargs) diff --git a/projects/medical/2d_image/histopathology/pannuke/tools/prepare_dataset.py b/projects/medical/2d_image/histopathology/pannuke/tools/prepare_dataset.py new file mode 100644 index 0000000000..7213b181f4 --- /dev/null +++ b/projects/medical/2d_image/histopathology/pannuke/tools/prepare_dataset.py @@ -0,0 +1,49 @@ +import os + +import numpy as np +from PIL import Image + +root_path = 'data/' + +tgt_img_dir = os.path.join(root_path, 'images/train') +tgt_mask_dir = os.path.join(root_path, 'masks/train') +os.system('mkdir -p ' + tgt_img_dir) +os.system('mkdir -p ' + tgt_mask_dir) + +fold_img_paths = sorted([ + os.path.join(root_path, 'pannuke/Fold 1/images/fold1/images.npy'), + os.path.join(root_path, 'pannuke/Fold 2/images/fold2/images.npy'), + os.path.join(root_path, 'pannuke/Fold 3/images/fold3/images.npy') +]) + +fold_mask_paths = sorted([ + os.path.join(root_path, 'pannuke/Fold 1/masks/fold1/masks.npy'), + os.path.join(root_path, 'pannuke/Fold 2/masks/fold2/masks.npy'), + os.path.join(root_path, 'pannuke/Fold 3/masks/fold3/masks.npy') +]) + +for n, (img_path, + mask_path) in enumerate(zip(fold_img_paths, fold_mask_paths)): + fold_name = str(n + 1) + imgs = np.load(img_path) + masks = np.load(mask_path) + + for i in range(imgs.shape[0]): + img = np.uint8(imgs[i]) + mask_multichannel = np.minimum(np.uint8(masks[i]), 1) + mask = np.zeros((img.shape[0], img.shape[1]), dtype=np.uint8) + for j in range(mask_multichannel.shape[-1]): + factor = (j + 1) % mask_multichannel.shape[-1] + # convert [0,1,2,3,4,5] to [1,2,3,4,5,0], + # with the last label being background + mask[mask_multichannel[..., j] == 1] = factor + + file_name = 'fold' + fold_name + '_' + str(i).rjust(4, '0') + '.png' + print('Processing: 
', file_name)
+        tgt_img_path = os.path.join(tgt_img_dir, file_name)
+        tgt_mask_path = os.path.join(tgt_mask_dir, file_name)
+        Image.fromarray(img).save(tgt_img_path)
+        Image.fromarray(mask).save(tgt_mask_path)
+
+    del imgs
+    del masks
diff --git a/projects/medical/2d_image/histopathology/pcam/README.md b/projects/medical/2d_image/histopathology/pcam/README.md
new file mode 100644
index 0000000000..5a8094950c
--- /dev/null
+++ b/projects/medical/2d_image/histopathology/pcam/README.md
@@ -0,0 +1,153 @@
+# PCam (PatchCamelyon)
+
+## Description
+
+This project supports **`Patch Camelyon (PCam)`**, which can be downloaded from [here](https://opendatalab.com/PCam).
+
+### Dataset Overview
+
+PatchCamelyon is an image classification dataset. It consists of 327,680 color images (96 x 96 px) extracted from histopathologic scans of lymph node sections. Each image is annotated with a binary label indicating the presence of metastatic tissue. PCam provides a new benchmark for machine learning models: bigger than CIFAR10, smaller than ImageNet, trainable on a single GPU.
+
+### Statistic Information
+
+| Dataset Name | Anatomical Region | Task Type | Modality | Num. Classes | Train/Val/Test images | Train/Val/Test Labeled | Release Date | License |
+| ------------------------------------ | ----------------- | ------------ | -------------- | ------------ | --------------------- | ---------------------- | ------------ | ------------------------------------------------------------- |
+| [Pcam](https://opendatalab.com/PCam) | thorax | segmentation | histopathology | 2 | 327680/-/- | yes/-/- | 2018 | [CC0 1.0](https://creativecommons.org/publicdomain/zero/1.0/) |
+
+| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test |
+| :---------------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: |
+| background | 214849 | 63.77 | - | - | - | - |
+| metastatic tissue | 131832 | 36.22 | - | - | - | - |
+
+Note:
+
+- `Pct` means percentage of pixels in this category in all pixels.
+
+### Visualization
+
+![pcam](https://raw.githubusercontent.com/uni-medical/medical-datasets-visualization/main/2d/semantic_seg/histopathology/pcam/pcam_dataset.png?raw=true)
+
+### Dataset Citation
+
+```
+@inproceedings{veeling2018rotation,
+  title={Rotation equivariant CNNs for digital pathology},
+  author={Veeling, Bastiaan S and Linmans, Jasper and Winkens, Jim and Cohen, Taco and Welling, Max},
+  booktitle={International Conference on Medical image computing and computer-assisted intervention},
+  pages={210--218},
+  year={2018},
+}
+```
+
+### Prerequisites
+
+- Python v3.8
+- PyTorch v1.10.0
+- pillow (PIL) v9.3.0
+- scikit-learn (sklearn) v1.2.0
+- [MIM](https://github.com/open-mmlab/mim) v0.3.4
+- [MMCV](https://github.com/open-mmlab/mmcv) v2.0.0rc4
+- [MMEngine](https://github.com/open-mmlab/mmengine) v0.2.0 or higher
+- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation) v1.0.0rc5
+
+All the commands below rely on the correct configuration of `PYTHONPATH`, which should point to the project's directory so that Python can locate the module files. In the `pcam/` root directory, run the following line to add the current directory to `PYTHONPATH`:
+
+```shell
+export PYTHONPATH=`pwd`:$PYTHONPATH
+```
+
+### Dataset Preparing
+
+- download the dataset from [here](https://opendatalab.com/PCam) and decompress the data to path `'data/'`.
+- run the script `"python tools/prepare_dataset.py"` to format the data and change the folder structure as below.
+- run the script `"python ../../tools/split_seg_dataset.py"` to split the dataset and generate `train.txt`, `val.txt` and `test.txt`. Since the labels of the official validation and test sets are not available, we randomly generate `train.txt` and `val.txt` from the training set.
+
+```shell
+mkdir data && cd data
+pip install opendatalab
+odl get PCam
+mv ./PCam/raw/pcamv1 ./
+rm -rf PCam
+cd ..
+python tools/prepare_dataset.py
+python ../../tools/split_seg_dataset.py
+```
+
+```none
+ mmsegmentation
+ ├── mmseg
+ ├── projects
+ │ ├── medical
+ │ │ ├── 2d_image
+ │ │ │ ├── histopathology
+ │ │ │ │ ├── pcam
+ │ │ │ │ │ ├── configs
+ │ │ │ │ │ ├── datasets
+ │ │ │ │ │ ├── tools
+ │ │ │ │ │ ├── data
+ │ │ │ │ │ │ ├── train.txt
+ │ │ │ │ │ │ ├── val.txt
+ │ │ │ │ │ │ ├── images
+ │ │ │ │ │ │ │ ├── train
+ │ │ │ │ | │ │ │ ├── xxx.png
+ │ │ │ │ | │ │ │ ├── ...
+ │ │ │ │ | │ │ │ └── xxx.png
+ │ │ │ │ │ │ ├── masks
+ │ │ │ │ │ │ │ ├── train
+ │ │ │ │ | │ │ │ ├── xxx.png
+ │ │ │ │ | │ │ │ ├── ...
+ │ │ │ │ | │ │ │ └── xxx.png
+```
+
+### Divided Dataset Information
+
+***Note: The dataset split in the table below was made by ourselves.***
+
+| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test |
+| :---------------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: |
+| background | 171948 | 63.82 | 42901 | 63.6 | - | - |
+| metastatic tissue | 105371 | 36.18 | 26461 | 36.4 | - | - |
+
+### Training commands
+
+To train models on a single server with one GPU (default):
+
+```shell
+mim train mmseg ./configs/${CONFIG_FILE}
+```
+
+### Testing commands
+
+To test models on a single server with one GPU (default):
+
+```shell
+mim test mmseg ./configs/${CONFIG_FILE} --checkpoint ${CHECKPOINT_PATH}
+```
+
+
+
+## Checklist
+
+- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`.
+
+  - [x] Finish the code
+  - [x] Basic docstrings & proper citation
+  - [ ] Test-time correctness
+  - [x] A full README
+
+- [ ] Milestone 2: Indicates a successful model implementation.
+
+  - [ ] Training-time correctness
+
+- [ ] Milestone 3: Good to be a part of our core package!
+
+  - [ ] Type hints and docstrings
+  - [ ] Unit tests
+  - [ ] Code polishing
+  - [ ] Metafile.yml
+
+- [ ] Move your modules into the core package following the codebase's file hierarchy structure.
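+
+After the preparation steps, it can be worth sanity-checking that the converted masks only contain the expected label values (0 for background, 1 for metastatic tissue). A minimal check along these lines (a sketch; it assumes the masks produced above live under `data/masks/train/`) could be:
+
+```python
+import glob
+
+import numpy as np
+from PIL import Image
+
+# collect the set of label values that occur across all training masks
+values = set()
+for path in glob.glob('data/masks/train/*.png'):
+    values |= set(np.unique(np.array(Image.open(path))).tolist())
+assert values <= {0, 1}, f'unexpected label values: {values}'
+print('mask label values:', sorted(values))
+```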
diff --git a/projects/medical/2d_image/histopathology/pcam/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_pcam-512x512.py b/projects/medical/2d_image/histopathology/pcam/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_pcam-512x512.py new file mode 100644 index 0000000000..20601f1ea5 --- /dev/null +++ b/projects/medical/2d_image/histopathology/pcam/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_pcam-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './pcam_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.pcam_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.0001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/histopathology/pcam/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_pcam-512x512.py b/projects/medical/2d_image/histopathology/pcam/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_pcam-512x512.py new file mode 100644 index 0000000000..c057535409 --- /dev/null +++ b/projects/medical/2d_image/histopathology/pcam/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_pcam-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './pcam_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.pcam_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/histopathology/pcam/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_pcam-512x512.py b/projects/medical/2d_image/histopathology/pcam/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_pcam-512x512.py new file mode 100644 index 0000000000..4c1d5fe421 --- /dev/null +++ b/projects/medical/2d_image/histopathology/pcam/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_pcam-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './pcam_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.pcam_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.01) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/histopathology/pcam/configs/fcn-unet-s5-d16_unet_1xb16-0.01lr-sigmoid-20k_pcam-512x512.py b/projects/medical/2d_image/histopathology/pcam/configs/fcn-unet-s5-d16_unet_1xb16-0.01lr-sigmoid-20k_pcam-512x512.py new file mode 100644 index 0000000000..25e3734795 --- /dev/null +++ b/projects/medical/2d_image/histopathology/pcam/configs/fcn-unet-s5-d16_unet_1xb16-0.01lr-sigmoid-20k_pcam-512x512.py @@ -0,0 +1,18 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './pcam_512x512.py', + 
'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.pcam_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.01) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict( + num_classes=2, loss_decode=dict(use_sigmoid=True), out_channels=1), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/histopathology/pcam/configs/pcam_512x512.py b/projects/medical/2d_image/histopathology/pcam/configs/pcam_512x512.py new file mode 100644 index 0000000000..04efc23eb5 --- /dev/null +++ b/projects/medical/2d_image/histopathology/pcam/configs/pcam_512x512.py @@ -0,0 +1,42 @@ +dataset_type = 'PCamDataset' +data_root = 'data/' +img_scale = (512, 512) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +train_dataloader = dict( + batch_size=16, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='train.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='val.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) +test_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) diff --git a/projects/medical/2d_image/histopathology/pcam/datasets/pcam_dataset.py b/projects/medical/2d_image/histopathology/pcam/datasets/pcam_dataset.py new file mode 100644 index 0000000000..1c27de543a --- /dev/null +++ b/projects/medical/2d_image/histopathology/pcam/datasets/pcam_dataset.py @@ -0,0 +1,31 @@ +from mmseg.datasets import BaseSegDataset +from mmseg.registry import DATASETS + + +@DATASETS.register_module() +class PCamDataset(BaseSegDataset): + """PCamDataset dataset. + + In segmentation map annotation for PCamDataset, + 0 stands for background, which is included in 2 categories. + ``reduce_zero_label`` is fixed to False. The ``img_suffix`` + is fixed to '.png' and ``seg_map_suffix`` is fixed to '.png'. + + Args: + img_suffix (str): Suffix of images. Default: '.png' + seg_map_suffix (str): Suffix of segmentation maps. Default: '.png' + reduce_zero_label (bool): Whether to mark label zero as ignored. + Default to False. 
+ """ + METAINFO = dict(classes=('background', 'metastatic tissue')) + + def __init__(self, + img_suffix='.png', + seg_map_suffix='.png', + reduce_zero_label=False, + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, + **kwargs) diff --git a/projects/medical/2d_image/histopathology/pcam/tools/prepare_dataset.py b/projects/medical/2d_image/histopathology/pcam/tools/prepare_dataset.py new file mode 100644 index 0000000000..75038e6fb4 --- /dev/null +++ b/projects/medical/2d_image/histopathology/pcam/tools/prepare_dataset.py @@ -0,0 +1,49 @@ +import os + +import h5py +import numpy as np +from PIL import Image + +root_path = 'data/' + +tgt_img_train_dir = os.path.join(root_path, 'images/train/') +tgt_mask_train_dir = os.path.join(root_path, 'masks/train/') +tgt_img_val_dir = os.path.join(root_path, 'images/val/') +tgt_img_test_dir = os.path.join(root_path, 'images/test/') + +os.system('mkdir -p ' + tgt_img_train_dir) +os.system('mkdir -p ' + tgt_mask_train_dir) +os.system('mkdir -p ' + tgt_img_val_dir) +os.system('mkdir -p ' + tgt_img_test_dir) + + +def extract_pics_from_h5(h5_path, h5_key, save_dir): + f = h5py.File(h5_path, 'r') + for i, img in enumerate(f[h5_key]): + img = img.astype(np.uint8).squeeze() + img = Image.fromarray(img) + save_image_path = os.path.join(save_dir, str(i).zfill(8) + '.png') + img.save(save_image_path) + + +if __name__ == '__main__': + + extract_pics_from_h5( + 'data/pcamv1/camelyonpatch_level_2_split_train_x.h5', + h5_key='x', + save_dir=tgt_img_train_dir) + + extract_pics_from_h5( + 'data/pcamv1/camelyonpatch_level_2_split_valid_x.h5', + h5_key='x', + save_dir=tgt_img_val_dir) + + extract_pics_from_h5( + 'data/pcamv1/camelyonpatch_level_2_split_test_x.h5', + h5_key='x', + save_dir=tgt_img_test_dir) + + extract_pics_from_h5( + 'data/pcamv1/camelyonpatch_level_2_split_train_mask.h5', + h5_key='mask', + save_dir=tgt_mask_train_dir) diff --git a/projects/medical/2d_image/infrared_reflectance_imaging/ravir/README.md b/projects/medical/2d_image/infrared_reflectance_imaging/ravir/README.md new file mode 100644 index 0000000000..ca95921ba3 --- /dev/null +++ b/projects/medical/2d_image/infrared_reflectance_imaging/ravir/README.md @@ -0,0 +1,167 @@ +# RAVIR: A Dataset and Methodology for the Semantic Segmentation and Quantitative Analysis of Retinal Arteries and Veins in Infrared Reflectance Imaging + +## Description + +This project support **`RAVIR: A Dataset and Methodology for the Semantic Segmentation and Quantitative Analysis of Retinal Arteries and Veins in Infrared Reflectance Imaging`**, and the dataset used in this project can be downloaded from [here](https://ravir.grand-challenge.org/). + +### Dataset Overview + +The retinal vasculature provides important clues in the diagnosis and monitoring of systemic diseases including hypertension and diabetes. The microvascular system is of primary involvement in such conditions, and the retina is the only anatomical site where the microvasculature can be directly observed. The objective assessment of retinal vessels has long been considered a surrogate biomarker for systemic vascular diseases, and with recent advancements in retinal imaging and computer vision technologies, this topic has become the subject of renewed attention. In this paper, we present a novel dataset, dubbed RAVIR, for the semantic segmentation of Retinal Arteries and Veins in Infrared Reflectance (IR) imaging. 
It enables the creation of deep learning-based models that distinguish extracted vessel type without extensive post-processing. + +### Original Statistic Information + +| Dataset name | Anatomical region | Task type | Modality | Num. Classes | Train/Val/Test Images | Train/Val/Test Labeled | Release Date | License | +| ------------------------------------------- | ----------------- | ------------ | ---------------------------- | ------------ | --------------------- | ---------------------- | ------------ | --------------------------------------------------------------- | +| [Ravir](https://ravir.grand-challenge.org/) | eye | segmentation | infrared reflectance imaging | 3 | 23/-/19 | yes/-/- | 2022 | [CC-BY-NC 4.0](https://creativecommons.org/licenses/by-sa/4.0/) | + +| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test | +| :--------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: | +| background | 23 | 87.22 | - | - | - | - | +| artery | 23 | 5.45 | - | - | - | - | +| vein | 23 | 7.33 | - | - | - | - | + +Note: + +- `Pct` means the percentage of pixels of this category among all pixels. + +### Visualization + +![bac](https://raw.githubusercontent.com/uni-medical/medical-datasets-visualization/main/2d/semantic_seg/infrared_reflectance_imaging/ravir/ravir_dataset.png) + +## Dataset Citation + +```bibtex +@article{hatamizadeh2022ravir, + title={RAVIR: A dataset and methodology for the semantic segmentation and quantitative analysis of retinal arteries and veins in infrared reflectance imaging}, + author={Hatamizadeh, Ali and Hosseini, Hamid and Patel, Niraj and Choi, Jinseo and Pole, Cameron C and Hoeferlin, Cory M and Schwartz, Steven D and Terzopoulos, Demetri}, + journal={IEEE Journal of Biomedical and Health Informatics}, + volume={26}, + number={7}, + pages={3272--3283}, + year={2022}, + publisher={IEEE} +} +``` + +### Prerequisites + +- Python v3.8 +- PyTorch v1.10.0 +- pillow (PIL) v9.3.0 +- scikit-learn (sklearn) v1.2.0 +- [MIM](https://github.com/open-mmlab/mim) v0.3.4 +- [MMCV](https://github.com/open-mmlab/mmcv) v2.0.0rc4 +- [MMEngine](https://github.com/open-mmlab/mmengine) v0.2.0 or higher +- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation) v1.0.0rc5 + +All the commands below rely on the correct configuration of `PYTHONPATH`, which should point to the project's directory so that Python can locate the module files. In the `ravir/` root directory, run the following command to add the current directory to `PYTHONPATH`: + +```shell +export PYTHONPATH=`pwd`:$PYTHONPATH +``` + +### Dataset Preparing + +- Download the dataset from [here](https://ravir.grand-challenge.org/) and decompress the data to the path `'data/ravir/'`. +- Run the script `"python tools/prepare_dataset.py"` to format the data and change the folder structure as below. +- Run the script `"python ../../tools/split_seg_dataset.py --data_root data/ravir"` to split the dataset and generate `train.txt`, `val.txt` and `test.txt`. Since the labels of the official validation and test sets cannot be obtained, we randomly generate `train.txt` and `val.txt` from the training set. + +```none + mmsegmentation + ├── mmseg + ├── projects + │ ├── medical + │ │ ├── 2d_image + │ │ │ ├── infrared_reflectance_imaging + │ │ │ │ ├── ravir + │ │ │ │ │ ├── configs + │ │ │ │ │ ├── datasets + │ │ │ │ │ ├── tools + │ │ │ │ │ ├── data + │ │ │ │ │ │ ├── train.txt + │ │ │ │ │ │ ├── val.txt + │ │ │ │ │ │ ├── images + │ │ │ │ │ │ │ ├── train + │ │ │ │ │ │ │ │ ├── xxx.png + │ │ │ │ │ │ │ │ ├── ...
+ │ │ │ │ │ │ │ │ └── xxx.png + │ │ │ │ │ │ │ ├── test + │ │ │ │ │ │ │ │ ├── yyy.png + │ │ │ │ │ │ │ │ ├── ... + │ │ │ │ │ │ │ │ └── yyy.png + │ │ │ │ │ │ ├── masks + │ │ │ │ │ │ │ ├── train + │ │ │ │ │ │ │ │ ├── xxx.png + │ │ │ │ │ │ │ │ ├── ... + │ │ │ │ │ │ │ │ └── xxx.png +``` + +### Divided Dataset Information + +***Note: The statistics below are computed on our own train/val split.*** + +| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test | +| :--------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: | +| background | 18 | 87.41 | 5 | 86.53 | - | - | +| artery | 18 | 5.44 | 5 | 5.50 | - | - | +| vein | 18 | 7.15 | 5 | 7.97 | - | - | + +### Training commands + +To train models on a single server with one GPU (default): + +```shell +mim train mmseg ./configs/${CONFIG_PATH} +``` + +### Testing commands + +To test models on a single server with one GPU (default): + +```shell +mim test mmseg ./configs/${CONFIG_PATH} --checkpoint ${CHECKPOINT_PATH} +``` + + + +## Results + +### Ravir + +| Method | Backbone | Crop Size | lr | config | +| :-------------: | :------: | :-------: | :----: | :------------------------------------------------------------------------: | +| fcn_unet_s5-d16 | unet | 512x512 | 0.01 | [config](./configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_ravir-512x512.py) | +| fcn_unet_s5-d16 | unet | 512x512 | 0.001 | [config](./configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_ravir-512x512.py) | +| fcn_unet_s5-d16 | unet | 512x512 | 0.0001 | [config](./configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_ravir-512x512.py) | + +## Checklist + +- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. + + - [x] Finish the code + + - [x] Basic docstrings & proper citation + + - [x] Test-time correctness + + - [x] A full README + +- [x] Milestone 2: Indicates a successful model implementation. + + - [x] Training-time correctness + +- [ ] Milestone 3: Good to be a part of our core package! + + - [ ] Type hints and docstrings + + - [ ] Unit tests + + - [ ] Code polishing + + - [ ] Metafile.yml + +- [ ] Move your modules into the core package following the codebase's file hierarchy structure. + +- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure.
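As a quick sanity check that `PYTHONPATH` is configured as described above, the custom dataset class should be resolvable through the mmseg registry. A minimal sketch, assuming it is run from the `ravir/` root with the prerequisites installed:

```python
# run from the ravir/ project root after exporting PYTHONPATH as above
from mmseg.registry import DATASETS

import datasets.ravir_dataset  # noqa: F401  # registers RAVIRDataset

# the `custom_imports` entry in each config relies on this lookup succeeding
assert DATASETS.get('RAVIRDataset') is not None
print(DATASETS.get('RAVIRDataset'))
```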
diff --git a/projects/medical/2d_image/infrared_reflectance_imaging/ravir/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_ravir-512x512.py b/projects/medical/2d_image/infrared_reflectance_imaging/ravir/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_ravir-512x512.py new file mode 100755 index 0000000000..375ad5abf2 --- /dev/null +++ b/projects/medical/2d_image/infrared_reflectance_imaging/ravir/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_ravir-512x512.py @@ -0,0 +1,19 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './ravir_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.ravir_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.0001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + type='EncoderDecoder', + data_preprocessor=data_preprocessor, + pretrained=None, + decode_head=dict(num_classes=3), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/infrared_reflectance_imaging/ravir/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_ravir-512x512.py b/projects/medical/2d_image/infrared_reflectance_imaging/ravir/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_ravir-512x512.py new file mode 100755 index 0000000000..a7ecf6dd45 --- /dev/null +++ b/projects/medical/2d_image/infrared_reflectance_imaging/ravir/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_ravir-512x512.py @@ -0,0 +1,19 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './ravir_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.ravir_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + type='EncoderDecoder', + data_preprocessor=dict(size=img_scale), + pretrained=None, + decode_head=dict(num_classes=3), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/infrared_reflectance_imaging/ravir/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_ravir-512x512.py b/projects/medical/2d_image/infrared_reflectance_imaging/ravir/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_ravir-512x512.py new file mode 100755 index 0000000000..28556df53d --- /dev/null +++ b/projects/medical/2d_image/infrared_reflectance_imaging/ravir/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_ravir-512x512.py @@ -0,0 +1,19 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './ravir_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.ravir_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.01) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + type='EncoderDecoder', + data_preprocessor=data_preprocessor, + pretrained=None, + decode_head=dict(num_classes=3), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/infrared_reflectance_imaging/ravir/configs/ravir_512x512.py b/projects/medical/2d_image/infrared_reflectance_imaging/ravir/configs/ravir_512x512.py new file mode 100755 index 0000000000..cb4c292d1f --- /dev/null +++ 
b/projects/medical/2d_image/infrared_reflectance_imaging/ravir/configs/ravir_512x512.py @@ -0,0 +1,42 @@ +dataset_type = 'RAVIRDataset' +data_root = 'data/ravir' +img_scale = (512, 512) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +train_dataloader = dict( + batch_size=16, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='train.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='val.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) +test_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) diff --git a/projects/medical/2d_image/infrared_reflectance_imaging/ravir/datasets/__init__.py b/projects/medical/2d_image/infrared_reflectance_imaging/ravir/datasets/__init__.py new file mode 100755 index 0000000000..6f1d051bcf --- /dev/null +++ b/projects/medical/2d_image/infrared_reflectance_imaging/ravir/datasets/__init__.py @@ -0,0 +1,3 @@ +from .ravir_dataset import RAVIRDataset + +__all__ = ['RAVIRDataset'] diff --git a/projects/medical/2d_image/infrared_reflectance_imaging/ravir/datasets/ravir_dataset.py b/projects/medical/2d_image/infrared_reflectance_imaging/ravir/datasets/ravir_dataset.py new file mode 100755 index 0000000000..c9e0a8ed21 --- /dev/null +++ b/projects/medical/2d_image/infrared_reflectance_imaging/ravir/datasets/ravir_dataset.py @@ -0,0 +1,28 @@ +from mmseg.datasets import BaseSegDataset +from mmseg.registry import DATASETS + + +@DATASETS.register_module() +class RAVIRDataset(BaseSegDataset): + """RAVIRDataset dataset. + + In segmentation map annotation for RAVIRDataset, 0 stands for background, + which is included in 3 categories. ``reduce_zero_label`` is fixed to + False. The ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is + fixed to '.png'. + Args: + img_suffix (str): Suffix of images. Default: '.png' + seg_map_suffix (str): Suffix of segmentation maps. 
Default: '.png' + """ + METAINFO = dict(classes=('background', 'artery', 'vein')) + + def __init__(self, + img_suffix='.png', + seg_map_suffix='.png', + reduce_zero_label=False, + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, + **kwargs) diff --git a/projects/medical/2d_image/infrared_reflectance_imaging/ravir/tools/prepare_dataset.py b/projects/medical/2d_image/infrared_reflectance_imaging/ravir/tools/prepare_dataset.py new file mode 100644 index 0000000000..068dcad814 --- /dev/null +++ b/projects/medical/2d_image/infrared_reflectance_imaging/ravir/tools/prepare_dataset.py @@ -0,0 +1,33 @@ +import glob +import os + +import numpy as np +from PIL import Image +from tqdm import tqdm + +# remap mask pixel values to class ids: {255: 2, 128: 1, 0: 0} + +os.makedirs('data/ravir/images/train', exist_ok=True) +os.makedirs('data/ravir/images/test', exist_ok=True) +os.makedirs('data/ravir/masks/train', exist_ok=True) + +os.system( + r'cp data/ravir/RAVIR\ Dataset/train/training_images/* data/ravir/images/train' # noqa +) +os.system( + r'cp data/ravir/RAVIR\ Dataset/train/training_masks/* data/ravir/masks/train' # noqa +) +os.system(r'cp data/ravir/RAVIR\ Dataset/test/* data/ravir/images/test') + +os.system(r'rm -rf data/ravir/RAVIR\ Dataset') + +imgs = glob.glob(os.path.join('data/ravir/masks/train', '*.png')) + +for im_path in tqdm(imgs): + im = Image.open(im_path) + imn = np.array(im) + imn[imn == 255] = 2 + imn[imn == 128] = 1 + imn[imn == 0] = 0  # no-op, kept to document the full mapping + new_im = Image.fromarray(imn) + new_im.save(im_path) diff --git a/projects/medical/2d_image/microscopy_images/2pm_vessel/README.md b/projects/medical/2d_image/microscopy_images/2pm_vessel/README.md new file mode 100644 index 0000000000..1feb433a31 --- /dev/null +++ b/projects/medical/2d_image/microscopy_images/2pm_vessel/README.md @@ -0,0 +1,153 @@ +# 2-PM Vessel Dataset + +## Description + +This project supports **`2-PM Vessel Dataset`**, which can be downloaded from [here](https://opendatalab.org.cn/2-PM_Vessel_Dataset). + +### Dataset Overview + +An open-source volumetric brain vasculature dataset obtained with two-photon microscopy at the Focused Ultrasound Lab, Sunnybrook Research Institute (affiliated with the University of Toronto), by Dr. Alison Burgess, Charissa Poon and Marc Santos. + +The dataset contains a total of 12 volumetric stacks consisting of images of mouse brain vasculature and tumor vasculature. + +### Information Statistics + +| Dataset Name | Anatomical Region | Task Type | Modality | Num. Classes | Train/Val/Test Images | Train/Val/Test Labeled | Release Date | License | +| ------------------------------------------------------------ | ----------------- | ------------ | ----------------- | ------------ | --------------------- | ---------------------- | ------------ | ------------------------------------------------------------- | +| [2pm_vessel](https://opendatalab.org.cn/2-PM_Vessel_Dataset) | vessel | segmentation | microscopy_images | 2 | 216/-/- | yes/-/- | 2021 | [CC0 1.0](https://creativecommons.org/publicdomain/zero/1.0/) | + +| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test | +| :--------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: | +| background | 216 | 85.78 | - | - | - | - | +| vessel | 180 | 14.22 | - | - | - | - | + +Note: + +- `Pct` means the percentage of pixels of this category among all pixels.
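The `Pct` columns above can be reproduced by counting mask pixels per class. A minimal sketch, assuming `tools/prepare_dataset.py` (shown later in this project) has already written the masks to `data/masks/train/` with class ids 0 (background) and 1 (vessel):

```python
# rough per-class pixel statistics over the prepared training masks
import glob

import numpy as np
from PIL import Image

counts = np.zeros(2, dtype=np.int64)
for path in glob.glob('data/masks/train/*.png'):
    mask = np.array(Image.open(path))
    # tally pixels per class id; the slice guards against stray values > 1
    counts += np.bincount(mask.ravel(), minlength=2)[:2]
for name, pct in zip(('background', 'vessel'), 100 * counts / counts.sum()):
    print(f'{name}: {pct:.2f}%')
```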
+ +### Visualization + +![2pmv](https://raw.githubusercontent.com/uni-medical/medical-datasets-visualization/main/2d/semantic_seg/histopathology/2pm_vessel/2pm_vessel_dataset.png?raw=true) + +### Dataset Citation + +```bibtex +@article{teikari2016deep, + title={Deep learning convolutional networks for multiphoton microscopy vasculature segmentation}, + author={Teikari, Petteri and Santos, Marc and Poon, Charissa and Hynynen, Kullervo}, + journal={arXiv preprint arXiv:1606.02382}, + year={2016} +} +``` + +### Prerequisites + +- Python v3.8 +- PyTorch v1.10.0 +- pillow (PIL) v9.3.0 +- scikit-learn (sklearn) v1.2.0 +- [MIM](https://github.com/open-mmlab/mim) v0.3.4 +- [MMCV](https://github.com/open-mmlab/mmcv) v2.0.0rc4 +- [MMEngine](https://github.com/open-mmlab/mmengine) v0.2.0 or higher +- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation) v1.0.0rc5 + +All the commands below rely on the correct configuration of `PYTHONPATH`, which should point to the project's directory so that Python can locate the module files. In the `2pm_vessel/` root directory, run the following command to add the current directory to `PYTHONPATH`: + +```shell +export PYTHONPATH=`pwd`:$PYTHONPATH +``` + +### Dataset Preparing + +- Download the dataset from [here](https://opendatalab.org.cn/2-PM_Vessel_Dataset) and decompress the data to the path `'data/'`. +- Run the script `"python tools/prepare_dataset.py"` to format the data and change the folder structure as below. +- Run the script `"python ../../tools/split_seg_dataset.py"` to split the dataset and generate `train.txt`, `val.txt` and `test.txt`. Since the labels of the official validation and test sets cannot be obtained, we randomly generate `train.txt` and `val.txt` from the training set. + +```shell +mkdir data && cd data +pip install opendatalab +odl get 2-PM_Vessel_Dataset +cd .. +python tools/prepare_dataset.py +python ../../tools/split_seg_dataset.py +``` + +```none + mmsegmentation + ├── mmseg + ├── projects + │ ├── medical + │ │ ├── 2d_image + │ │ │ ├── microscopy_images + │ │ │ │ ├── 2pm_vessel + │ │ │ │ │ ├── configs + │ │ │ │ │ ├── datasets + │ │ │ │ │ ├── tools + │ │ │ │ │ ├── data + │ │ │ │ │ │ ├── train.txt + │ │ │ │ │ │ ├── val.txt + │ │ │ │ │ │ ├── images + │ │ │ │ │ │ │ ├── train + │ │ │ │ │ │ │ │ ├── xxx.png + │ │ │ │ │ │ │ │ ├── ... + │ │ │ │ │ │ │ │ └── xxx.png + │ │ │ │ │ │ ├── masks + │ │ │ │ │ │ │ ├── train + │ │ │ │ │ │ │ │ ├── xxx.png + │ │ │ │ │ │ │ │ ├── ... + │ │ │ │ │ │ │ │ └── xxx.png + +``` + +### Divided Dataset Information + +***Note: The statistics below are computed on our own train/val split.*** + +| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test | +| :--------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: | +| background | 172 | 85.88 | 44 | 85.4 | - | - | +| vessel | 142 | 14.12 | 38 | 14.6 | - | - | + +### Training commands + +To train models on a single server with one GPU (default): + +```shell +mim train mmseg ./configs/${CONFIG_FILE} +``` + +### Testing commands + +To test models on a single server with one GPU (default): + +```shell +mim test mmseg ./configs/${CONFIG_FILE} --checkpoint ${CHECKPOINT_PATH} +``` + + + +## Checklist + +- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. + + - [x] Finish the code + - [x] Basic docstrings & proper citation + - [ ] Test-time correctness + - [x] A full README + +- [ ] Milestone 2: Indicates a successful model implementation. + + - [ ] Training-time correctness + +- [ ] Milestone 3: Good to be a part of our core package!
+ + - [ ] Type hints and docstrings + - [ ] Unit tests + - [ ] Code polishing + - [ ] Metafile.yml + +- [ ] Move your modules into the core package following the codebase's file hierarchy structure. + +- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure. diff --git a/projects/medical/2d_image/microscopy_images/2pm_vessel/configs/2pm-vessel_512x512.py b/projects/medical/2d_image/microscopy_images/2pm_vessel/configs/2pm-vessel_512x512.py new file mode 100644 index 0000000000..124403fa97 --- /dev/null +++ b/projects/medical/2d_image/microscopy_images/2pm_vessel/configs/2pm-vessel_512x512.py @@ -0,0 +1,42 @@ +dataset_type = 'TwoPMVesselDataset' +data_root = 'data/' +img_scale = (512, 512) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +train_dataloader = dict( + batch_size=16, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='train.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='val.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) +test_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) diff --git a/projects/medical/2d_image/microscopy_images/2pm_vessel/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_2pm-vessel-512x512.py b/projects/medical/2d_image/microscopy_images/2pm_vessel/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_2pm-vessel-512x512.py new file mode 100644 index 0000000000..2a429e9068 --- /dev/null +++ b/projects/medical/2d_image/microscopy_images/2pm_vessel/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_2pm-vessel-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './2pm-vessel_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.2pm-vessel_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.0001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/microscopy_images/2pm_vessel/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_2pm-vessel-512x512.py b/projects/medical/2d_image/microscopy_images/2pm_vessel/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_2pm-vessel-512x512.py new file mode 100644 index 0000000000..10d9bb82f2 --- /dev/null +++ b/projects/medical/2d_image/microscopy_images/2pm_vessel/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_2pm-vessel-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + 
'mmseg::_base_/models/fcn_unet_s5-d16.py', './2pm-vessel_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.2pm-vessel_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/microscopy_images/2pm_vessel/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_2pm-vessel-512x512.py b/projects/medical/2d_image/microscopy_images/2pm_vessel/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_2pm-vessel-512x512.py new file mode 100644 index 0000000000..65c1579ec7 --- /dev/null +++ b/projects/medical/2d_image/microscopy_images/2pm_vessel/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_2pm-vessel-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './2pm-vessel_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.2pm-vessel_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.01) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/microscopy_images/2pm_vessel/configs/fcn-unet-s5-d16_unet_1xb16-0.01lr-sigmoid-20k_bactteria-detection-512x512.py b/projects/medical/2d_image/microscopy_images/2pm_vessel/configs/fcn-unet-s5-d16_unet_1xb16-0.01lr-sigmoid-20k_bactteria-detection-512x512.py new file mode 100644 index 0000000000..91ed6ada3f --- /dev/null +++ b/projects/medical/2d_image/microscopy_images/2pm_vessel/configs/fcn-unet-s5-d16_unet_1xb16-0.01lr-sigmoid-20k_bactteria-detection-512x512.py @@ -0,0 +1,18 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './2pm-vessel_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.2pm-vessel_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.01) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict( + num_classes=2, loss_decode=dict(use_sigmoid=True), out_channels=1), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/microscopy_images/2pm_vessel/datasets/2pm-vessel_dataset.py b/projects/medical/2d_image/microscopy_images/2pm_vessel/datasets/2pm-vessel_dataset.py new file mode 100644 index 0000000000..984b5a1361 --- /dev/null +++ b/projects/medical/2d_image/microscopy_images/2pm_vessel/datasets/2pm-vessel_dataset.py @@ -0,0 +1,31 @@ +from mmseg.datasets import BaseSegDataset +from mmseg.registry import DATASETS + + +@DATASETS.register_module() +class TwoPMVesselDataset(BaseSegDataset): + """TwoPMVesselDataset dataset. + + In segmentation map annotation for TwoPMVesselDataset, + 0 stands for background, which is included in 2 categories. + ``reduce_zero_label`` is fixed to False. 
The ``img_suffix`` + is fixed to '.png' and ``seg_map_suffix`` is fixed to '.png'. + + Args: + img_suffix (str): Suffix of images. Default: '.png' + seg_map_suffix (str): Suffix of segmentation maps. Default: '.png' + reduce_zero_label (bool): Whether to mark label zero as ignored. + Default to False. + """ + METAINFO = dict(classes=('background', 'vessel')) + + def __init__(self, + img_suffix='.png', + seg_map_suffix='.png', + reduce_zero_label=False, + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, + **kwargs) diff --git a/projects/medical/2d_image/microscopy_images/2pm_vessel/tools/prepare_dataset.py b/projects/medical/2d_image/microscopy_images/2pm_vessel/tools/prepare_dataset.py new file mode 100644 index 0000000000..1b46af2cad --- /dev/null +++ b/projects/medical/2d_image/microscopy_images/2pm_vessel/tools/prepare_dataset.py @@ -0,0 +1,46 @@ +import os + +import tifffile as tiff +from PIL import Image + +root_path = 'data/' + +image_dir = os.path.join(root_path, + '2-PM_Vessel_Dataset/raw/vesselNN_dataset/denoised') +label_dir = os.path.join(root_path, + '2-PM_Vessel_Dataset/raw/vesselNN_dataset/labels') +tgt_img_train_dir = os.path.join(root_path, 'images/train/') +tgt_mask_train_dir = os.path.join(root_path, 'masks/train/') +os.system('mkdir -p ' + tgt_img_train_dir) +os.system('mkdir -p ' + tgt_mask_train_dir) + + +def filter_suffix(src_dir, suffix): + suffix = '.' + suffix if '.' not in suffix else suffix + file_names = [_ for _ in os.listdir(src_dir) if _.endswith(suffix)] + file_paths = [os.path.join(src_dir, _) for _ in file_names] + return sorted(file_paths), sorted(file_names) + + +if __name__ == '__main__': + + image_path_list, _ = filter_suffix(image_dir, suffix='tif') + label_path_list, _ = filter_suffix(label_dir, suffix='.tif') + + for img_path, label_path in zip(image_path_list, label_path_list): + labels = tiff.imread(label_path) + images = tiff.imread(img_path) + assert labels.ndim == 3 + assert images.shape == labels.shape + name = img_path.split('/')[-1].replace('.tif', '') + # a single .tif file contains multiple slices; reading it with + # the tifffile package yields a (num_slices, H, W) array. + for i in range(labels.shape[0]): + slice_name = name + '_' + str(i).rjust(3, '0') + '.png' + image = images[i] + label = labels[i] // 255  # map mask values {0, 255} to {0, 1} + + save_path_label = os.path.join(tgt_mask_train_dir, slice_name) + Image.fromarray(label).save(save_path_label) + save_path_image = os.path.join(tgt_img_train_dir, slice_name) + Image.fromarray(image).convert('RGB').save(save_path_image) diff --git a/projects/medical/2d_image/microscopy_images/bactteria_detection/README.md b/projects/medical/2d_image/microscopy_images/bactteria_detection/README.md new file mode 100644 index 0000000000..1cedda715a --- /dev/null +++ b/projects/medical/2d_image/microscopy_images/bactteria_detection/README.md @@ -0,0 +1,160 @@ +# Bactteria detection with darkfield microscopy + +## Description + +This project supports **`Bactteria detection with darkfield microscopy`**, which can be downloaded from [here](https://tianchi.aliyun.com/dataset/94411). + +### Dataset Overview + +Spirochaeta is a genus of bacteria classified within the phylum Spirochaetes. Included in this dataset are 366 darkfield microscopy images and manually annotated masks which can be used for classification and segmentation purposes. Detecting bacteria in blood could have a huge significance for research in both the medical and computer science fields.
+ +- It was gathered and annotated by students (hands-on experience). +- It has more than one targeted class (blood cells and bacteria were annotated). +- It is highly imbalanced, so naive loss functions perform poorly on it. + +### Original Statistic Information + +| Dataset name | Anatomical region | Task type | Modality | Num. Classes | Train/Val/Test Images | Train/Val/Test Labeled | Release Date | License | +| --------------------------------------------------------------- | ----------------- | ------------ | ---------- | ------------ | --------------------- | ---------------------- | ------------ | --------------------------------------------------------------- | +| [Bactteria detection](https://tianchi.aliyun.com/dataset/94411) | bacteria | segmentation | microscopy | 3 | 366/-/- | yes/-/- | 2017 | [CC-BY-NC 4.0](https://creativecommons.org/licenses/by-sa/4.0/) | + +| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test | +| :----------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: | +| background | 366 | 85.9 | - | - | - | - | +| erythrocytes | 345 | 13.03 | - | - | - | - | +| spirochaete | 288 | 1.07 | - | - | - | - | + +Note: + +- `Pct` means the percentage of pixels of this category among all pixels. + +### Visualization + +![bac](https://raw.githubusercontent.com/uni-medical/medical-datasets-visualization/main/2d/semantic_seg/microscopy_images/bactteria_detection/bactteria_detection_dataset.png) + +## Usage + +### Prerequisites + +- Python v3.8 +- PyTorch v1.10.0 +- pillow (PIL) v9.3.0 +- scikit-learn (sklearn) v1.2.0 +- [MIM](https://github.com/open-mmlab/mim) v0.3.4 +- [MMCV](https://github.com/open-mmlab/mmcv) v2.0.0rc4 +- [MMEngine](https://github.com/open-mmlab/mmengine) v0.2.0 or higher +- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation) v1.0.0rc5 + +All the commands below rely on the correct configuration of `PYTHONPATH`, which should point to the project's directory so that Python can locate the module files. In the `bactteria_detection/` root directory, run the following command to add the current directory to `PYTHONPATH`: + +```shell +export PYTHONPATH=`pwd`:$PYTHONPATH +``` + +### Dataset Preparing + +- Download the dataset from [here](https://tianchi.aliyun.com/dataset/94411) and save it to the `data/` directory. +- Decompress the data to `data/`. This will create a new folder named `data/Bacteria_detection_with_darkfield_microscopy_datasets/`, which contains the original image data. +- Run the script `python tools/prepare_dataset.py` to format the data and change the folder structure as below. +- Run the script `python ../../tools/split_seg_dataset.py` to split the dataset. Since the Bacteria_detection dataset has no separate validation or test set, we randomly sample 20% of the images as the validation set and use the remaining 80% for training, recording the two splits in the filename lists `train.txt` and `val.txt`. Because the random seed is hard-coded in the script, the split is deterministic and reproducible. + +```none + mmsegmentation + ├── mmseg + ├── projects + │ ├── medical + │ │ ├── 2d_image + │ │ │ ├── microscopy_images + │ │ │ │ ├── bactteria_detection + │ │ │ │ │ ├── configs + │ │ │ │ │ ├── datasets + │ │ │ │ │ ├── tools + │ │ │ │ │ ├── data + │ │ │ │ │ │ ├── train.txt + │ │ │ │ │ │ ├── val.txt + │ │ │ │ │ │ ├── Bacteria_detection_with_darkfield_microscopy_datasets + │ │ │ │ │ │ ├── images + │ │ │ │ │ │ │ ├── train + │ │ │ │ │ │ │ │ ├── xxx.png + │ │ │ │ │ │ │ │ ├── ...
+ │ │ │ │ │ │ │ │ └── xxx.png + │ │ │ │ │ │ ├── masks + │ │ │ │ │ │ │ ├── train + │ │ │ │ │ │ │ │ ├── xxx.png + │ │ │ │ │ │ │ │ ├── ... + │ │ │ │ │ │ │ │ └── xxx.png +``` + +### Divided Dataset Information + +***Note: The statistics below are computed on our own train/val split.*** + +| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test | +| :----------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: | +| background | 292 | 85.66 | 74 | 86.7 | - | - | +| erythrocytes | 274 | 13.25 | 71 | 12.29 | - | - | +| spirochaete | 231 | 1.09 | 57 | 1.01 | - | - | + +### Training commands + +Train models on a single server with one GPU. + +```shell +export PYTHONPATH=`pwd`:$PYTHONPATH +mim train mmseg ./configs/${CONFIG_FILE} +``` + +### Testing commands + +Test models on a single server with one GPU. + +```shell +export PYTHONPATH=`pwd`:$PYTHONPATH +mim test mmseg ./configs/${CONFIG_FILE} --checkpoint ${CHECKPOINT_PATH} +``` + + + +## Results + +### Bactteria detection with darkfield microscopy + +***Note: The following experimental results are based on the random data split described in the Dataset Preparing section above.*** + +| Method | Backbone | Crop Size | lr | mIoU | mDice | config | download | +| :-------------: | :------: | :-------: | :----: | :---: | :---: | :--------------------------------------------------------------------------------------: | :----------------------: | +| fcn_unet_s5-d16 | unet | 512x512 | 0.01 | 76.48 | 84.68 | [config](./configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_bactteria-detection-512x512.py) | [model](<>) \| [log](<>) | +| fcn_unet_s5-d16 | unet | 512x512 | 0.001 | 61.06 | 63.69 | [config](./configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_bactteria-detection-512x512.py) | [model](<>) \| [log](<>) | +| fcn_unet_s5-d16 | unet | 512x512 | 0.0001 | 58.87 | 62.42 | [config](./configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_bactteria-detection-512x512.py) | [model](<>) \| [log](<>) | + +## Checklist + +- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. + + - [x] Finish the code + + - [x] Basic docstrings & proper citation + + - [x] Test-time correctness + + - [x] A full README + +- [x] Milestone 2: Indicates a successful model implementation. + + - [x] Training-time correctness + +- [ ] Milestone 3: Good to be a part of our core package! + + - [ ] Type hints and docstrings + + - [ ] Unit tests + + - [ ] Code polishing + + - [ ] Metafile.yml + +- [ ] Move your modules into the core package following the codebase's file hierarchy structure. + +- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure.
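For example, the first row of the Results table above (lr=0.01) corresponds to the following invocation; the `--work-dir` value is only an illustrative choice:

```shell
# from the bactteria_detection/ project root
export PYTHONPATH=`pwd`:$PYTHONPATH
mim train mmseg ./configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_bactteria-detection-512x512.py --work-dir work_dirs/bactteria-unet-lr0.01
```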
diff --git a/projects/medical/2d_image/microscopy_images/bactteria_detection/configs/bactteria-detection_512x512.py b/projects/medical/2d_image/microscopy_images/bactteria_detection/configs/bactteria-detection_512x512.py new file mode 100644 index 0000000000..e3eab4e386 --- /dev/null +++ b/projects/medical/2d_image/microscopy_images/bactteria_detection/configs/bactteria-detection_512x512.py @@ -0,0 +1,42 @@ +dataset_type = 'BactteriaDetectionDataset' +data_root = 'data/' +img_scale = (512, 512) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +train_dataloader = dict( + batch_size=16, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='train.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='val.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) +test_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) diff --git a/projects/medical/2d_image/microscopy_images/bactteria_detection/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_bactteria-detection-512x512.py b/projects/medical/2d_image/microscopy_images/bactteria_detection/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_bactteria-detection-512x512.py new file mode 100644 index 0000000000..ede58d785c --- /dev/null +++ b/projects/medical/2d_image/microscopy_images/bactteria_detection/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_bactteria-detection-512x512.py @@ -0,0 +1,18 @@ +_base_ = [ + './bactteria-detection_512x512.py', + 'mmseg::_base_/models/fcn_unet_s5-d16.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.bactteria-detection_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.0001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=3), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/microscopy_images/bactteria_detection/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_bactteria-detection-512x512.py b/projects/medical/2d_image/microscopy_images/bactteria_detection/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_bactteria-detection-512x512.py new file mode 100644 index 0000000000..bde3fa14ac --- /dev/null +++ b/projects/medical/2d_image/microscopy_images/bactteria_detection/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_bactteria-detection-512x512.py @@ -0,0 +1,18 @@ +_base_ = [ + './bactteria-detection_512x512.py', + 'mmseg::_base_/models/fcn_unet_s5-d16.py', + 'mmseg::_base_/default_runtime.py', + 
'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.bactteria-detection_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=3), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/microscopy_images/bactteria_detection/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_bactteria-detection-512x512.py b/projects/medical/2d_image/microscopy_images/bactteria_detection/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_bactteria-detection-512x512.py new file mode 100644 index 0000000000..08e204f380 --- /dev/null +++ b/projects/medical/2d_image/microscopy_images/bactteria_detection/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_bactteria-detection-512x512.py @@ -0,0 +1,18 @@ +_base_ = [ + './bactteria-detection_512x512.py', + 'mmseg::_base_/models/fcn_unet_s5-d16.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.bactteria-detection_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.01) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=3), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/microscopy_images/bactteria_detection/datasets/bactteria-detection_dataset.py b/projects/medical/2d_image/microscopy_images/bactteria_detection/datasets/bactteria-detection_dataset.py new file mode 100644 index 0000000000..c95097b1ac --- /dev/null +++ b/projects/medical/2d_image/microscopy_images/bactteria_detection/datasets/bactteria-detection_dataset.py @@ -0,0 +1,27 @@ +from mmseg.datasets import BaseSegDataset +from mmseg.registry import DATASETS + + +@DATASETS.register_module() +class BactteriaDetectionDataset(BaseSegDataset): + """BactteriaDetectionDataset dataset. + + In segmentation map annotation for BactteriaDetectionDataset, + ``reduce_zero_label`` is fixed to False. The ``img_suffix`` + is fixed to '.png' and ``seg_map_suffix`` is fixed to '.png'. + + Args: + img_suffix (str): Suffix of images. Default: '.png' + seg_map_suffix (str): Suffix of segmentation maps. 
Default: '.png' + """ + METAINFO = dict(classes=('background', 'erythrocytes', 'spirochaete')) + + def __init__(self, + img_suffix='.png', + seg_map_suffix='.png', + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=False, + **kwargs) diff --git a/projects/medical/2d_image/microscopy_images/bactteria_detection/tools/prepare_dataset.py b/projects/medical/2d_image/microscopy_images/bactteria_detection/tools/prepare_dataset.py new file mode 100755 index 0000000000..8dcc719e26 --- /dev/null +++ b/projects/medical/2d_image/microscopy_images/bactteria_detection/tools/prepare_dataset.py @@ -0,0 +1,33 @@ +import glob +import os +import shutil + +from PIL import Image + +root_path = 'data/' +img_suffix = '.png' +seg_map_suffix = '.png' +save_img_suffix = '.png' +save_seg_map_suffix = '.png' + +x_train = glob.glob( + 'data/Bacteria_detection_with_darkfield_microscopy_datasets/images/*' + + img_suffix) # noqa + +os.system('mkdir -p ' + root_path + 'images/train/') +os.system('mkdir -p ' + root_path + 'masks/train/') + +part_dir_dict = {0: 'train/'} +for ith, part in enumerate([x_train]): + part_dir = part_dir_dict[ith] + for img in part: + basename = os.path.basename(img) + img_save_path = os.path.join(root_path, 'images', part_dir, + basename.split('.')[0] + save_img_suffix) + shutil.copy(img, img_save_path) + mask_path = 'data/Bacteria_detection_with_darkfield_microscopy_datasets/masks/' + basename # noqa + mask = Image.open(mask_path).convert('L') + mask_save_path = os.path.join( + root_path, 'masks', part_dir, + basename.split('.')[0] + save_seg_map_suffix) + mask.save(mask_save_path) diff --git a/projects/medical/2d_image/tools/split_seg_dataset.py b/projects/medical/2d_image/tools/split_seg_dataset.py new file mode 100644 index 0000000000..9ab2e9282f --- /dev/null +++ b/projects/medical/2d_image/tools/split_seg_dataset.py @@ -0,0 +1,42 @@ +import argparse +import glob +import os + +from sklearn.model_selection import train_test_split + + +def save_anno(img_list, file_path, remove_suffix=True): + if remove_suffix: + img_list = [ + '/'.join(img_path.split('/')[-2:]) for img_path in img_list + ] + img_list = [ + '.'.join(img_path.split('.')[:-1]) for img_path in img_list + ] + with open(file_path, 'w') as file_: + for x in list(img_list): + file_.write(x + '\n') + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument('--data_root', default='data/') + args = parser.parse_args() + data_root = args.data_root + if os.path.exists(os.path.join(data_root, 'masks/val')): + x_val = sorted(glob.glob(data_root + '/images/val/*.png')) + save_anno(x_val, data_root + '/val.txt') + if os.path.exists(os.path.join(data_root, 'masks/test')): + x_test = sorted(glob.glob(data_root + '/images/test/*.png')) + save_anno(x_test, data_root + '/test.txt') + if not os.path.exists(os.path.join( + data_root, 'masks/val')) and not os.path.exists( + os.path.join(data_root, 'masks/test')): + all_imgs = sorted(glob.glob(data_root + '/images/train/*.png')) + x_train, x_val = train_test_split( + all_imgs, test_size=0.2, random_state=0) + save_anno(x_train, data_root + '/train.txt') + save_anno(x_val, data_root + '/val.txt') + else: + x_train = sorted(glob.glob(data_root + '/images/train/*.png')) + save_anno(x_train, data_root + '/train.txt') diff --git a/projects/medical/2d_image/x_ray/chest_image_pneum/README.md b/projects/medical/2d_image/x_ray/chest_image_pneum/README.md new file mode 100644 index 0000000000..a1cd27ba45 --- 
/dev/null +++ b/projects/medical/2d_image/x_ray/chest_image_pneum/README.md @@ -0,0 +1,147 @@ +# Chest Image Dataset for Pneumothorax Segmentation + +## Description + +This project supports **`Chest Image Dataset for Pneumothorax Segmentation`**, which can be downloaded from [here](https://tianchi.aliyun.com/dataset/83075). + +### Dataset Overview + +Pneumothorax can be caused by a blunt chest injury, damage from underlying lung disease, or, most alarmingly, it may occur for no obvious reason at all. On some occasions, a collapsed lung can be a life-threatening event. +Pneumothorax is usually diagnosed by a radiologist on a chest x-ray, and can sometimes be very difficult to confirm. An accurate AI algorithm to detect pneumothorax would be useful in many clinical scenarios. AI could be used to triage chest radiographs for priority interpretation, or to provide a more confident diagnosis for non-radiologists. + +The dataset is provided by the Society for Imaging Informatics in Medicine (SIIM), the American College of Radiology (ACR), the Society of Thoracic Radiology (STR) and MD.ai. You can develop a model to classify (and if present, segment) pneumothorax from a set of chest radiographic images. If successful, you could aid in the early recognition of pneumothoraces and save lives. + +### Original Statistic Information + +| Dataset name | Anatomical region | Task type | Modality | Num. Classes | Train/Val/Test Images | Train/Val/Test Labeled | Release Date | License | +| --------------------------------------------------------------------- | ----------------- | ------------ | -------- | ------------ | --------------------- | ---------------------- | ------------ | ------------------------------------------------------------------ | +| [pneumothorax segmentation](https://tianchi.aliyun.com/dataset/83075) | thorax | segmentation | x_ray | 2 | 12089/-/3205 | yes/-/no | - | [CC-BY-SA-NC 4.0](https://creativecommons.org/licenses/by-sa/4.0/) | + +| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test | +| :---------------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: | +| normal | 12089 | 99.75 | - | - | - | - | +| pneumothorax area | 2669 | 0.25 | - | - | - | - | + +Note: + +- `Pct` means the percentage of pixels of this category among all pixels. + +### Visualization + +![bac](https://raw.githubusercontent.com/uni-medical/medical-datasets-visualization/main/2d/semantic_seg/x_ray/chest_image_pneum/chest_image_pneum_dataset.png) + +### Prerequisites + +- Python v3.8 +- PyTorch v1.10.0 +- [MIM](https://github.com/open-mmlab/mim) v0.3.4 +- [MMCV](https://github.com/open-mmlab/mmcv) v2.0.0rc4 +- [MMEngine](https://github.com/open-mmlab/mmengine) v0.2.0 or higher +- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation) v1.0.0rc5 + +All the commands below rely on the correct configuration of `PYTHONPATH`, which should point to the project's directory so that Python can locate the module files. In the `chest_image_pneum/` root directory, run the following command to add the current directory to `PYTHONPATH`: + +```shell +export PYTHONPATH=`pwd`:$PYTHONPATH +``` + +### Dataset Preparing + +- Download the dataset from [here](https://tianchi.aliyun.com/dataset/83075) and decompress the data to the path `'data/'`. +- Run the script `"python tools/prepare_dataset.py"` to format the data and change the folder structure as below. +- Run the script `"python ../../tools/split_seg_dataset.py"` to split the dataset and generate `train.txt`, `val.txt` and `test.txt`.
Since the labels of the official validation and test sets cannot be obtained, we randomly generate `train.txt` and `val.txt` from the training set. + +```none + mmsegmentation + ├── mmseg + ├── projects + │ ├── medical + │ │ ├── 2d_image + │ │ │ ├── x_ray + │ │ │ │ ├── chest_image_pneum + │ │ │ │ │ ├── configs + │ │ │ │ │ ├── datasets + │ │ │ │ │ ├── tools + │ │ │ │ │ ├── data + │ │ │ │ │ │ ├── train.txt + │ │ │ │ │ │ ├── test.txt + │ │ │ │ │ │ ├── images + │ │ │ │ │ │ │ ├── train + │ │ │ │ │ │ │ │ ├── xxx.png + │ │ │ │ │ │ │ │ ├── ... + │ │ │ │ │ │ │ │ └── xxx.png + │ │ │ │ │ │ ├── masks + │ │ │ │ │ │ │ ├── train + │ │ │ │ │ │ │ │ ├── xxx.png + │ │ │ │ │ │ │ │ ├── ... + │ │ │ │ │ │ │ │ └── xxx.png +``` + +### Divided Dataset Information + +***Note: The statistics below are computed on our own train/val split.*** + +| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test | +| :---------------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: | +| normal | 9637 | 99.75 | 2410 | 99.74 | - | - | +| pneumothorax area | 2137 | 0.25 | 532 | 0.26 | - | - | + +### Training commands + +Train models on a single server with one GPU. + +```shell +mim train mmseg ./configs/${CONFIG_FILE} +``` + +### Testing commands + +Test models on a single server with one GPU. + +```shell +mim test mmseg ./configs/${CONFIG_FILE} --checkpoint ${CHECKPOINT_PATH} +``` + + + +## Results + +### Chest Image Dataset for Pneumothorax Segmentation + +| Method | Backbone | Crop Size | lr | mIoU | mDice | config | download | +| :-------------: | :------: | :-------: | :----: | :--: | :---: | :------------------------------------------------------------------------------------: | :----------------------: | +| fcn_unet_s5-d16 | unet | 512x512 | 0.01 | - | - | [config](./configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_chest-image-pneum-512x512.py) | [model](<>) \| [log](<>) | +| fcn_unet_s5-d16 | unet | 512x512 | 0.001 | - | - | [config](./configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_chest-image-pneum-512x512.py) | [model](<>) \| [log](<>) | +| fcn_unet_s5-d16 | unet | 512x512 | 0.0001 | - | - | [config](./configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_chest-image-pneum-512x512.py) | [model](<>) \| [log](<>) | + +## Checklist + +- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. + + - [x] Finish the code + + - [x] Basic docstrings & proper citation + + - [x] Test-time correctness + + - [x] A full README + +- [x] Milestone 2: Indicates a successful model implementation. + + - [x] Training-time correctness + +- [ ] Milestone 3: Good to be a part of our core package! + + - [ ] Type hints and docstrings + + - [ ] Unit tests + + - [ ] Code polishing + + - [ ] Metafile.yml + +- [ ] Move your modules into the core package following the codebase's file hierarchy structure. + +- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure.
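Note that the official test images are unlabeled, so evaluation runs on the held-out `val.txt` split; the dataset config that follows points `test_dataloader` at the validation loader. A concrete form of the testing command above, with `${CHECKPOINT_PATH}` standing in for a checkpoint produced by `mim train`:

```shell
# from the chest_image_pneum/ project root
export PYTHONPATH=`pwd`:$PYTHONPATH
mim test mmseg ./configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_chest-image-pneum-512x512.py --checkpoint ${CHECKPOINT_PATH}
```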
diff --git a/projects/medical/2d_image/x_ray/chest_image_pneum/configs/chest-image-pneum_512x512.py b/projects/medical/2d_image/x_ray/chest_image_pneum/configs/chest-image-pneum_512x512.py new file mode 100644 index 0000000000..411229bd41 --- /dev/null +++ b/projects/medical/2d_image/x_ray/chest_image_pneum/configs/chest-image-pneum_512x512.py @@ -0,0 +1,42 @@ +dataset_type = 'ChestImagePneumDataset' +data_root = 'data/' +img_scale = (512, 512) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +train_dataloader = dict( + batch_size=16, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='train.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='val.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) +test_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) diff --git a/projects/medical/2d_image/x_ray/chest_image_pneum/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_chest-image-pneum-512x512.py b/projects/medical/2d_image/x_ray/chest_image_pneum/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_chest-image-pneum-512x512.py new file mode 100644 index 0000000000..0f26459467 --- /dev/null +++ b/projects/medical/2d_image/x_ray/chest_image_pneum/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_chest-image-pneum-512x512.py @@ -0,0 +1,18 @@ +_base_ = [ + './chest-image-pneum_512x512.py', + 'mmseg::_base_/models/fcn_unet_s5-d16.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.chest-image-pneum_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.0001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/x_ray/chest_image_pneum/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_chest-image-pneum-512x512.py b/projects/medical/2d_image/x_ray/chest_image_pneum/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_chest-image-pneum-512x512.py new file mode 100644 index 0000000000..37b91889d8 --- /dev/null +++ b/projects/medical/2d_image/x_ray/chest_image_pneum/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_chest-image-pneum-512x512.py @@ -0,0 +1,18 @@ +_base_ = [ + './chest-image-pneum_512x512.py', + 'mmseg::_base_/models/fcn_unet_s5-d16.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.chest-image-pneum_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) 
+optimizer = dict(lr=0.001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/x_ray/chest_image_pneum/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_chest-image-pneum-512x512.py b/projects/medical/2d_image/x_ray/chest_image_pneum/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_chest-image-pneum-512x512.py new file mode 100644 index 0000000000..379e8181f3 --- /dev/null +++ b/projects/medical/2d_image/x_ray/chest_image_pneum/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_chest-image-pneum-512x512.py @@ -0,0 +1,18 @@ +_base_ = [ + './chest-image-pneum_512x512.py', + 'mmseg::_base_/models/fcn_unet_s5-d16.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.chest-image-pneum_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.01) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/x_ray/chest_image_pneum/datasets/chest-image-pneum_dataset.py b/projects/medical/2d_image/x_ray/chest_image_pneum/datasets/chest-image-pneum_dataset.py new file mode 100644 index 0000000000..aeee60ae92 --- /dev/null +++ b/projects/medical/2d_image/x_ray/chest_image_pneum/datasets/chest-image-pneum_dataset.py @@ -0,0 +1,27 @@ +from mmseg.datasets import BaseSegDataset +from mmseg.registry import DATASETS + + +@DATASETS.register_module() +class ChestImagePneumDataset(BaseSegDataset): + """ChestImagePneumDataset dataset. + + In segmentation map annotation for ChestImagePneumDataset, + ``reduce_zero_label`` is fixed to False. The ``img_suffix`` + is fixed to '.png' and ``seg_map_suffix`` is fixed to '.png'. + + Args: + img_suffix (str): Suffix of images. Default: '.png' + seg_map_suffix (str): Suffix of segmentation maps. 
Default: '.png' + """ + METAINFO = dict(classes=('normal', 'pneumothorax area')) + + def __init__(self, + img_suffix='.png', + seg_map_suffix='.png', + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=False, + **kwargs) diff --git a/projects/medical/2d_image/x_ray/chest_image_pneum/tools/prepare_dataset.py b/projects/medical/2d_image/x_ray/chest_image_pneum/tools/prepare_dataset.py new file mode 100755 index 0000000000..47eddc96dc --- /dev/null +++ b/projects/medical/2d_image/x_ray/chest_image_pneum/tools/prepare_dataset.py @@ -0,0 +1,76 @@ +import os + +import numpy as np +import pandas as pd +import pydicom +from PIL import Image + +root_path = 'data/' +img_suffix = '.dcm' +seg_map_suffix = '.png' +save_img_suffix = '.png' +save_seg_map_suffix = '.png' + +x_train = [] +for fpath, dirname, fnames in os.walk('data/chestimage_train_datasets'): + for fname in fnames: + if fname.endswith('.dcm'): + x_train.append(os.path.join(fpath, fname)) +x_test = [] +for fpath, dirname, fnames in os.walk('data/chestimage_test_datasets/'): + for fname in fnames: + if fname.endswith('.dcm'): + x_test.append(os.path.join(fpath, fname)) + +os.system('mkdir -p ' + root_path + 'images/train/') +os.system('mkdir -p ' + root_path + 'images/test/') +os.system('mkdir -p ' + root_path + 'masks/train/') + + +# Decode a SIIM-style run-length encoding, where each start value is an +# offset relative to the end of the previous run, into a binary mask. +def rle_decode(rle, width, height): + mask = np.zeros(width * height, dtype=np.uint8) + array = np.asarray([int(x) for x in rle.split()]) + starts = array[0::2] + lengths = array[1::2] + + current_position = 0 + for index, start in enumerate(starts): + current_position += start + mask[current_position:current_position + lengths[index]] = 1 + current_position += lengths[index] + + # column-major reshape, matching the encoding order of the masks + return mask.reshape(width, height, order='F') + + +part_dir_dict = {0: 'train/', 1: 'test/'} +dict_from_csv = pd.read_csv( + root_path + 'chestimage_train-rle_datasets.csv', sep=',', + index_col=0).to_dict()[' EncodedPixels'] + +for ith, part in enumerate([x_train, x_test]): + part_dir = part_dir_dict[ith] + for img in part: + basename = os.path.basename(img) + img_id = '.'.join(basename.split('.')[:-1]) + if ith == 0 and (img_id not in dict_from_csv.keys()): + continue + image = pydicom.read_file(img).pixel_array + save_img_path = root_path + 'images/' + part_dir + '.'.join( + basename.split('.')[:-1]) + save_img_suffix + print(save_img_path) + img_h, img_w = image.shape[:2] + image = Image.fromarray(image) + image.save(save_img_path) + if ith == 1: + continue + if dict_from_csv[img_id] == '-1': + mask = np.zeros((img_h, img_w), dtype=np.uint8) + else: + mask = rle_decode(dict_from_csv[img_id], img_h, img_w) + save_mask_path = root_path + 'masks/' + part_dir + '.'.join( + basename.split('.')[:-1]) + save_seg_map_suffix + mask = Image.fromarray(mask) + mask.save(save_mask_path) diff --git a/projects/medical/2d_image/x_ray/chest_x_ray_images_with_pneumothorax_masks/README.md b/projects/medical/2d_image/x_ray/chest_x_ray_images_with_pneumothorax_masks/README.md new file mode 100644 index 0000000000..7cb099c8a4 --- /dev/null +++ b/projects/medical/2d_image/x_ray/chest_x_ray_images_with_pneumothorax_masks/README.md @@ -0,0 +1,119 @@ +# Chest X-ray Images with Pneumothorax Masks + +## Description + +This project supports **`Chest X-ray Images with Pneumothorax Masks`**, and the dataset used in this project can be downloaded from [here](https://www.kaggle.com/datasets/vbookshelf/pneumothorax-chest-xray-images-and-masks).
+ +### Dataset Overview + +A pneumothorax (noo-moe-THOR-aks) is a collapsed lung. A pneumothorax occurs when air leaks into the space between your lung and chest wall. This air pushes on the outside of your lung and makes it collapse. Pneumothorax can be a complete lung collapse or a collapse of only a portion of the lung. + +A pneumothorax can be caused by a blunt or penetrating chest injury, certain medical procedures, or damage from underlying lung disease. Or it may occur for no obvious reason. Symptoms usually include sudden chest pain and shortness of breath. On some occasions, a collapsed lung can be a life-threatening event. + +Treatment for a pneumothorax usually involves inserting a needle or chest tube between the ribs to remove the excess air. However, a small pneumothorax may heal on its own. + +### Statistic Information + +| Dataset Name | Anatomical Region | Task Type | Modality | Num. Classes | Train/Val/Test Images | Train/Val/Test Labeled | Release Date | License | +| --------------------------------------------------------------------------------------------------------------------------------- | ----------------- | ------------ | -------- | ------------ | --------------------- | ---------------------- | ------------ | --------------------------------------------------------------- | +| [Chest-x-ray-images-with-pneumothorax-masks](https://www.kaggle.com/datasets/vbookshelf/pneumothorax-chest-xray-images-and-masks) | thorax | segmentation | x_ray | 2 | 10675/-/1372 | yes/-/yes | 2020 | [CC-BY-NC 4.0](https://creativecommons.org/licenses/by-nc/4.0/) | + +| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test | +| :----------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: | +| background | 10675 | 99.7 | - | - | 1372 | 99.71 | +| pneumothorax | 2379 | 0.3 | - | - | 290 | 0.29 | + +### Visualization + +![chest_x_ray_images_with_pneumothorax_masks](https://raw.githubusercontent.com/uni-medical/medical-datasets-visualization/main/2d/semantic_seg/x_ray/chest_x_ray_images_with_pneumothorax_masks/chest_x_ray_images_with_pneumothorax_masks_dataset.png?raw=true) + +### Prerequisites + +- Python v3.8 +- PyTorch v1.10.0 +- pillow(PIL) v9.3.0 +- scikit-learn(sklearn) v1.2.0 +- [MIM](https://github.com/open-mmlab/mim) v0.3.4 +- [MMCV](https://github.com/open-mmlab/mmcv) v2.0.0rc4 +- [MMEngine](https://github.com/open-mmlab/mmengine) v0.2.0 or higher +- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation) v1.0.0rc5 + +All the commands below rely on the correct configuration of `PYTHONPATH`, which should point to the project's directory so that Python can locate the module files. In the `chest_x_ray_images_with_pneumothorax_masks/` root directory, run the following line to add the current directory to `PYTHONPATH`: + +```shell +export PYTHONPATH=`pwd`:$PYTHONPATH +``` + +### Dataset Preparing + +- download dataset from [here](https://www.kaggle.com/datasets/vbookshelf/pneumothorax-chest-xray-images-and-masks) and decompress data to path `'data/'`. +- run script `"python tools/prepare_dataset.py"` to format data and change folder structure as below. +- run script `"python ../../tools/split_seg_dataset.py"` to split dataset and generate `train.txt`, `val.txt` and `test.txt`. If the labels of the official validation and test sets cannot be obtained, we randomly generate `train.txt` and `val.txt` from the training set (a minimal sketch of such a split is shown below).
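+A minimal sketch of such a random split (the 80/20 ratio, the fixed seed and the file-stem output format are assumptions for illustration; `../../tools/split_seg_dataset.py` is the authoritative tool): + +```python +# Hypothetical stand-in for ../../tools/split_seg_dataset.py: randomly +# split the prepared training images into train/val lists by file stem. +import glob +import os.path as osp +import random + +data_root = 'data/' +stems = sorted( +    osp.basename(p).rsplit('.', 1)[0] +    for p in glob.glob(osp.join(data_root, 'images/train/*.png'))) +random.seed(0)  # fixed seed so the split is reproducible +random.shuffle(stems) + +split = int(0.8 * len(stems))  # assumed 80/20 train/val ratio +with open(osp.join(data_root, 'train.txt'), 'w') as f: +    f.write('\n'.join(stems[:split]) + '\n') +with open(osp.join(data_root, 'val.txt'), 'w') as f: +    f.write('\n'.join(stems[split:]) + '\n') +```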
+ +```none + mmsegmentation + ├── mmseg + ├── projects + │ ├── medical + │ │ ├── 2d_image + │ │ │ ├── x_ray + │ │ │ │ ├── chest_x_ray_images_with_pneumothorax_masks + │ │ │ │ │ ├── configs + │ │ │ │ │ ├── datasets + │ │ │ │ │ ├── tools + │ │ │ │ │ ├── data + │ │ │ │ │ │ ├── train.txt + │ │ │ │ │ │ ├── val.txt + │ │ │ │ │ │ ├── images + │ │ │ │ │ │ │ ├── train + │ │ │ │ | │ │ │ ├── xxx.png + │ │ │ │ | │ │ │ ├── ... + │ │ │ │ | │ │ │ └── xxx.png + │ │ │ │ │ │ ├── masks + │ │ │ │ │ │ │ ├── train + │ │ │ │ | │ │ │ ├── xxx.png + │ │ │ │ | │ │ │ ├── ... + │ │ │ │ | │ │ │ └── xxx.png +``` + +### Training commands + +```shell +mim train mmseg ./configs/${CONFIG_PATH} +``` + +To train on multiple GPUs, e.g. 8 GPUs, run the following command: + +```shell +mim train mmseg ./configs/${CONFIG_PATH} --launcher pytorch --gpus 8 +``` + +### Testing commands + +```shell +mim test mmseg ./configs/${CONFIG_PATH} --checkpoint ${CHECKPOINT_PATH} +``` + +## Checklist + +- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. + + - [x] Finish the code + - [x] Basic docstrings & proper citation + - [x] Test-time correctness + - [x] A full README + +- [x] Milestone 2: Indicates a successful model implementation. + + - [x] Training-time correctness + +- [ ] Milestone 3: Good to be a part of our core package! + + - [ ] Type hints and docstrings + - [ ] Unit tests + - [ ] Code polishing + - [ ] Metafile.yml + +- [ ] Move your modules into the core package following the codebase's file hierarchy structure. + +- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure. diff --git a/projects/medical/2d_image/x_ray/chest_x_ray_images_with_pneumothorax_masks/configs/chest-x-ray-images-with-pneumothorax-masks_512x512.py b/projects/medical/2d_image/x_ray/chest_x_ray_images_with_pneumothorax_masks/configs/chest-x-ray-images-with-pneumothorax-masks_512x512.py new file mode 100644 index 0000000000..96676de861 --- /dev/null +++ b/projects/medical/2d_image/x_ray/chest_x_ray_images_with_pneumothorax_masks/configs/chest-x-ray-images-with-pneumothorax-masks_512x512.py @@ -0,0 +1,42 @@ +dataset_type = 'ChestPenumoMaskDataset' +data_root = 'data/' +img_scale = (512, 512) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +train_dataloader = dict( + batch_size=16, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='train.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='val.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) +test_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) diff --git 
a/projects/medical/2d_image/x_ray/chest_x_ray_images_with_pneumothorax_masks/configs/fcn-unet-s5-d16_unet-{use-sigmoid}_1xb16-0.01-20k_chest-x-ray-images-with-pneumothorax-masks-512x512.py b/projects/medical/2d_image/x_ray/chest_x_ray_images_with_pneumothorax_masks/configs/fcn-unet-s5-d16_unet-{use-sigmoid}_1xb16-0.01-20k_chest-x-ray-images-with-pneumothorax-masks-512x512.py new file mode 100644 index 0000000000..76c214d04c --- /dev/null +++ b/projects/medical/2d_image/x_ray/chest_x_ray_images_with_pneumothorax_masks/configs/fcn-unet-s5-d16_unet-{use-sigmoid}_1xb16-0.01-20k_chest-x-ray-images-with-pneumothorax-masks-512x512.py @@ -0,0 +1,20 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', + './chest-x-ray-images-with-pneumothorax-masks_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict( + imports='datasets.chest-x-ray-images-with-pneumothorax-masks_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.01) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict( + num_classes=2, loss_decode=dict(use_sigmoid=True), out_channels=1), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/x_ray/chest_x_ray_images_with_pneumothorax_masks/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_chest-x-ray-images-with-pneumothorax-masks-512x512.py b/projects/medical/2d_image/x_ray/chest_x_ray_images_with_pneumothorax_masks/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_chest-x-ray-images-with-pneumothorax-masks-512x512.py new file mode 100644 index 0000000000..066996dda9 --- /dev/null +++ b/projects/medical/2d_image/x_ray/chest_x_ray_images_with_pneumothorax_masks/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_chest-x-ray-images-with-pneumothorax-masks-512x512.py @@ -0,0 +1,19 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', + './chest-x-ray-images-with-pneumothorax-masks_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict( + imports='datasets.chest-x-ray-images-with-pneumothorax-masks_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.0001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/x_ray/chest_x_ray_images_with_pneumothorax_masks/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_chest-x-ray-images-with-pneumothorax-masks-512x512.py b/projects/medical/2d_image/x_ray/chest_x_ray_images_with_pneumothorax_masks/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_chest-x-ray-images-with-pneumothorax-masks-512x512.py new file mode 100644 index 0000000000..a7065b8231 --- /dev/null +++ b/projects/medical/2d_image/x_ray/chest_x_ray_images_with_pneumothorax_masks/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_chest-x-ray-images-with-pneumothorax-masks-512x512.py @@ -0,0 +1,19 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', + './chest-x-ray-images-with-pneumothorax-masks_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict( + 
imports='datasets.chest-x-ray-images-with-pneumothorax-masks_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/x_ray/chest_x_ray_images_with_pneumothorax_masks/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_chest-x-ray-images-with-pneumothorax-masks-512x512.py b/projects/medical/2d_image/x_ray/chest_x_ray_images_with_pneumothorax_masks/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_chest-x-ray-images-with-pneumothorax-masks-512x512.py new file mode 100644 index 0000000000..e5682ee76b --- /dev/null +++ b/projects/medical/2d_image/x_ray/chest_x_ray_images_with_pneumothorax_masks/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_chest-x-ray-images-with-pneumothorax-masks-512x512.py @@ -0,0 +1,19 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', + './chest-x-ray-images-with-pneumothorax-masks_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict( + imports='datasets.chest-x-ray-images-with-pneumothorax-masks_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.01) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/x_ray/chest_x_ray_images_with_pneumothorax_masks/datasets/chest-x-ray-images-with-pneumothorax-masks_dataset.py b/projects/medical/2d_image/x_ray/chest_x_ray_images_with_pneumothorax_masks/datasets/chest-x-ray-images-with-pneumothorax-masks_dataset.py new file mode 100644 index 0000000000..d32f597a5a --- /dev/null +++ b/projects/medical/2d_image/x_ray/chest_x_ray_images_with_pneumothorax_masks/datasets/chest-x-ray-images-with-pneumothorax-masks_dataset.py @@ -0,0 +1,31 @@ +from mmseg.datasets import BaseSegDataset +from mmseg.registry import DATASETS + + +@DATASETS.register_module() +class ChestPenumoMaskDataset(BaseSegDataset): + """ChestPenumoMaskDataset dataset. + + In segmentation map annotation for ChestPenumoMaskDataset, + 0 stands for background, which is included in 2 categories. + ``reduce_zero_label`` is fixed to False. The ``img_suffix`` + is fixed to '.png' and ``seg_map_suffix`` is fixed to '.png'. + + Args: + img_suffix (str): Suffix of images. Default: '.png' + seg_map_suffix (str): Suffix of segmentation maps. Default: '.png' + reduce_zero_label (bool): Whether to mark label zero as ignored. + Default to False. 
+ """ + METAINFO = dict(classes=('background', 'penumothroax')) + + def __init__(self, + img_suffix='.png', + seg_map_suffix='.png', + reduce_zero_label=False, + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, + **kwargs) diff --git a/projects/medical/2d_image/x_ray/chest_x_ray_images_with_pneumothorax_masks/tools/prepare_dataset.py b/projects/medical/2d_image/x_ray/chest_x_ray_images_with_pneumothorax_masks/tools/prepare_dataset.py new file mode 100644 index 0000000000..c7de1f1904 --- /dev/null +++ b/projects/medical/2d_image/x_ray/chest_x_ray_images_with_pneumothorax_masks/tools/prepare_dataset.py @@ -0,0 +1,36 @@ +import glob +import os +import shutil + +from PIL import Image +from sklearn.model_selection import train_test_split + +root_path = 'data/' +img_suffix = '.png' +seg_map_suffix = '.png' +save_img_suffix = '.png' +save_seg_map_suffix = '.png' + +all_imgs = glob.glob('data/siim-acr-pneumothorax/png_images/*' + img_suffix) +x_train, x_test = train_test_split(all_imgs, test_size=0.2, random_state=0) + +print(len(x_train), len(x_test)) +os.system('mkdir -p ' + root_path + 'images/train/') +os.system('mkdir -p ' + root_path + 'images/val/') +os.system('mkdir -p ' + root_path + 'masks/train/') +os.system('mkdir -p ' + root_path + 'masks/val/') + +part_dir_dict = {0: 'train/', 1: 'val/'} +for ith, part in enumerate([x_train, x_test]): + part_dir = part_dir_dict[ith] + for img in part: + basename = os.path.basename(img) + img_save_path = os.path.join(root_path, 'images', part_dir, + basename.split('.')[0] + save_img_suffix) + shutil.copy(img, img_save_path) + mask_path = 'data/siim-acr-pneumothorax/png_masks/' + basename + mask = Image.open(mask_path).convert('L') + mask_save_path = os.path.join( + root_path, 'masks', part_dir, + basename.split('.')[0] + save_seg_map_suffix) + mask.save(mask_save_path) diff --git a/projects/medical/2d_image/x_ray/covid_19_ct_cxr/README.md b/projects/medical/2d_image/x_ray/covid_19_ct_cxr/README.md new file mode 100644 index 0000000000..8469219eff --- /dev/null +++ b/projects/medical/2d_image/x_ray/covid_19_ct_cxr/README.md @@ -0,0 +1,158 @@ +# Covid-19 CT Chest X-ray Dataset + +## Description + +This project supports **`Covid-19 CT Chest X-ray Dataset`**, which can be downloaded from [here](https://github.com/ieee8023/covid-chestxray-dataset). + +### Dataset Overview + +In the context of a COVID-19 pandemic, we want to improve prognostic predictions to triage and manage patient care. Data is the first step to developing any diagnostic/prognostic tool. While there exist large public datasets of more typical chest X-rays from the NIH \[Wang 2017\], Spain \[Bustos 2019\], Stanford \[Irvin 2019\], MIT \[Johnson 2019\] and Indiana University \[Demner-Fushman 2016\], there is no collection of COVID-19 chest X-rays or CT scans designed to be used for computational analysis. + +The 2019 novel coronavirus (COVID-19) presents several unique features [Fang, 2020](https://pubs.rsna.org/doi/10.1148/radiol.2020200432) and [Ai 2020](https://pubs.rsna.org/doi/10.1148/radiol.2020200642). While the diagnosis is confirmed using polymerase chain reaction (PCR), infected patients with pneumonia may present on chest X-ray and computed tomography (CT) images with a pattern that is only moderately characteristic for the human eye [Ng, 2020](https://pubs.rsna.org/doi/10.1148/ryct.2020200034). 
In late January, a Chinese team published a paper detailing the clinical and paraclinical features of COVID-19. They reported that patients present abnormalities in chest CT images with most having bilateral involvement [Huang 2020](). Bilateral multiple lobular and subsegmental areas of consolidation constitute the typical findings in chest CT images of intensive care unit (ICU) patients on admission [Huang 2020](). In comparison, non-ICU patients show bilateral ground-glass opacity and subsegmental areas of consolidation in their chest CT images [Huang 2020](). In these patients, later chest CT images display bilateral ground-glass opacity with resolved consolidation [Huang 2020](). + +### Statistic Information + +| Dataset Name | Anatomical Region | Task Type | Modality | Num. Classes | Train/Val/Test Images | Train/Val/Test Labeled | Release Date | License | +| ---------------------------------------------------------------------- | ----------------- | ------------ | -------- | ------------ | --------------------- | ---------------------- | ------------ | --------------------------------------------------------------------- | +| [Covid-19-ct-cxr](https://github.com/ieee8023/covid-chestxray-dataset) | thorax | segmentation | x_ray | 2 | 205/-/714 | yes/-/no | 2021 | [CC-BY-NC-SA 4.0](https://creativecommons.org/licenses/by-nc-sa/4.0/) | + +| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test | +| :--------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: | +| background | 205 | 72.84 | - | - | - | - | +| lung | 205 | 27.16 | - | - | - | - | + +Note: + +- `Pct` means percentage of pixels in this category in all pixels (a short sketch of how these statistics can be recomputed appears at the end of this README). + +### Visualization + +![cov19ctcxr](https://raw.githubusercontent.com/uni-medical/medical-datasets-visualization/main/2d/semantic_seg/x_ray/covid_19_ct_cxr/covid_19_ct_cxr_dataset.png?raw=true) + +### Dataset Citation + +``` +@article{cohen2020covidProspective, + title={{COVID-19} Image Data Collection: Prospective Predictions Are the Future}, + author={Joseph Paul Cohen and Paul Morrison and Lan Dao and Karsten Roth and Tim Q Duong and Marzyeh Ghassemi}, + journal={arXiv 2006.11988}, + year={2020} +} + +@article{cohen2020covid, + title={COVID-19 image data collection}, + author={Joseph Paul Cohen and Paul Morrison and Lan Dao}, + journal={arXiv 2003.11597}, + year={2020} +} +``` + +### Prerequisites + +- Python v3.8 +- PyTorch v1.10.0 +- pillow(PIL) v9.3.0 +- scikit-learn(sklearn) v1.2.0 +- [MIM](https://github.com/open-mmlab/mim) v0.3.4 +- [MMCV](https://github.com/open-mmlab/mmcv) v2.0.0rc4 +- [MMEngine](https://github.com/open-mmlab/mmengine) v0.2.0 or higher +- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation) v1.0.0rc5 + +All the commands below rely on the correct configuration of `PYTHONPATH`, which should point to the project's directory so that Python can locate the module files. In the `covid_19_ct_cxr/` root directory, run the following line to add the current directory to `PYTHONPATH`: + +```shell +export PYTHONPATH=`pwd`:$PYTHONPATH +``` + +### Dataset Preparing + +- download dataset from [here](https://github.com/ieee8023/covid-chestxray-dataset) and decompress data to path `'data/'`. +- run script `"python tools/prepare_dataset.py"` to format data and change folder structure as below. +- run script `"python ../../tools/split_seg_dataset.py"` to split dataset and generate `train.txt`, `val.txt` and `test.txt`.
If the labels of the official validation and test sets cannot be obtained, we randomly generate `train.txt` and `val.txt` from the training set. + +```shell +mkdir data && cd data +git clone git@github.com:ieee8023/covid-chestxray-dataset.git +cd .. +python tools/prepare_dataset.py +python ../../tools/split_seg_dataset.py +``` + +```none + mmsegmentation + ├── mmseg + ├── projects + │   ├── medical + │   │   ├── 2d_image + │   │   │   ├── x_ray + │   │   │   │   ├── covid_19_ct_cxr + │   │   │   │   │   ├── configs + │   │   │   │   │   ├── datasets + │   │   │   │   │   ├── tools + │   │   │   │   │   ├── data + │   │   │   │   │   │   ├── train.txt + │   │   │   │   │   │   ├── val.txt + │   │   │   │   │   │   ├── images + │   │   │   │   │   │   │   ├── train + │   │   │   │ | │   │   │   ├── xxx.png + │   │   │   │ | │   │   │   ├── ... + │   │   │   │ | │   │   │   └── xxx.png + │   │   │   │   │   │   ├── masks + │   │   │   │   │   │   │   ├── train + │   │   │   │ | │   │   │   ├── xxx.png + │   │   │   │ | │   │   │   ├── ... + │   │   │   │ | │   │   │   └── xxx.png +``` + +### Divided Dataset Information + +***Note: The dataset split in the table below was produced by ourselves.*** + +| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test | +| :--------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: | +| background | 164 | 72.88 | 41 | 72.69 | - | - | +| lung | 164 | 27.12 | 41 | 27.31 | - | - | + +### Training commands + +To train models on a single server with one GPU, run the following command (default): + +```shell +mim train mmseg ./configs/${CONFIG_FILE} +``` + +### Testing commands + +To test models on a single server with one GPU, run the following command (default): + +```shell +mim test mmseg ./configs/${CONFIG_FILE} --checkpoint ${CHECKPOINT_PATH} +``` + + + +## Checklist + +- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. + + - [x] Finish the code + - [x] Basic docstrings & proper citation + - [x] Test-time correctness + - [x] A full README + +- [x] Milestone 2: Indicates a successful model implementation. + + - [x] Training-time correctness + +- [ ] Milestone 3: Good to be a part of our core package! + + - [ ] Type hints and docstrings + - [ ] Unit tests + - [ ] Code polishing + - [ ] Metafile.yml + +- [ ] Move your modules into the core package following the codebase's file hierarchy structure. + +- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure.
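+The `Pct.` columns in the tables above are per-class pixel percentages over the prepared masks; a minimal sketch of how such statistics can be recomputed (the paths and the two-class {0: background, 1: lung} layout are assumptions for illustration): + +```python +# Hypothetical helper to recompute the `Pct.` pixel percentages from the +# prepared masks; assumes binary masks with labels 0 (background) and 1. +import glob + +import numpy as np +from PIL import Image + +counts = np.zeros(2, dtype=np.int64) +for path in glob.glob('data/masks/train/*.png'): +    mask = np.asarray(Image.open(path)) +    counts += np.bincount(mask.ravel(), minlength=2)[:2] +print(counts / counts.sum() * 100)  # e.g. roughly [72.88, 27.12] for train +```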
diff --git a/projects/medical/2d_image/x_ray/covid_19_ct_cxr/configs/covid-19-ct-cxr_512x512.py b/projects/medical/2d_image/x_ray/covid_19_ct_cxr/configs/covid-19-ct-cxr_512x512.py new file mode 100644 index 0000000000..5242d06c37 --- /dev/null +++ b/projects/medical/2d_image/x_ray/covid_19_ct_cxr/configs/covid-19-ct-cxr_512x512.py @@ -0,0 +1,42 @@ +dataset_type = 'Covid19CXRDataset' +data_root = 'data/' +img_scale = (512, 512) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +train_dataloader = dict( + batch_size=16, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='train.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='val.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) +test_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) diff --git a/projects/medical/2d_image/x_ray/covid_19_ct_cxr/configs/fcn-unet-s5-d16_unet-{use-sigmoid}_1xb16-0.01-20k_covid-19-ct-cxr-512x512.py b/projects/medical/2d_image/x_ray/covid_19_ct_cxr/configs/fcn-unet-s5-d16_unet-{use-sigmoid}_1xb16-0.01-20k_covid-19-ct-cxr-512x512.py new file mode 100644 index 0000000000..59a7bedaa0 --- /dev/null +++ b/projects/medical/2d_image/x_ray/covid_19_ct_cxr/configs/fcn-unet-s5-d16_unet-{use-sigmoid}_1xb16-0.01-20k_covid-19-ct-cxr-512x512.py @@ -0,0 +1,18 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './covid-19-ct-cxr_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.covid-19-ct-cxr_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.01) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict( + num_classes=2, loss_decode=dict(use_sigmoid=True), out_channels=1), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/x_ray/covid_19_ct_cxr/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_covid-19-ct-cxr-512x512.py b/projects/medical/2d_image/x_ray/covid_19_ct_cxr/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_covid-19-ct-cxr-512x512.py new file mode 100644 index 0000000000..83b8527d46 --- /dev/null +++ b/projects/medical/2d_image/x_ray/covid_19_ct_cxr/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_covid-19-ct-cxr-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './covid-19-ct-cxr_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.covid-19-ct-cxr_dataset') +img_scale = (512, 512) 
+data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.0001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/x_ray/covid_19_ct_cxr/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_covid-19-ct-cxr-512x512.py b/projects/medical/2d_image/x_ray/covid_19_ct_cxr/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_covid-19-ct-cxr-512x512.py new file mode 100644 index 0000000000..10cfcbda6e --- /dev/null +++ b/projects/medical/2d_image/x_ray/covid_19_ct_cxr/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_covid-19-ct-cxr-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './covid-19-ct-cxr_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.covid-19-ct-cxr_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/x_ray/covid_19_ct_cxr/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_covid-19-ct-cxr-512x512.py b/projects/medical/2d_image/x_ray/covid_19_ct_cxr/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_covid-19-ct-cxr-512x512.py new file mode 100644 index 0000000000..aaccc8fd8d --- /dev/null +++ b/projects/medical/2d_image/x_ray/covid_19_ct_cxr/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_covid-19-ct-cxr-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './covid-19-ct-cxr_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.covid-19-ct-cxr_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.01) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/x_ray/covid_19_ct_cxr/datasets/covid-19-ct-cxr_dataset.py b/projects/medical/2d_image/x_ray/covid_19_ct_cxr/datasets/covid-19-ct-cxr_dataset.py new file mode 100644 index 0000000000..68a1bb331f --- /dev/null +++ b/projects/medical/2d_image/x_ray/covid_19_ct_cxr/datasets/covid-19-ct-cxr_dataset.py @@ -0,0 +1,31 @@ +from mmseg.datasets import BaseSegDataset +from mmseg.registry import DATASETS + + +@DATASETS.register_module() +class Covid19CXRDataset(BaseSegDataset): + """Covid19CXRDataset dataset. + + In segmentation map annotation for Covid19CXRDataset, + 0 stands for background, which is included in 2 categories. + ``reduce_zero_label`` is fixed to False. The ``img_suffix`` + is fixed to '.png' and ``seg_map_suffix`` is fixed to '.png'. + + Args: + img_suffix (str): Suffix of images. Default: '.png' + seg_map_suffix (str): Suffix of segmentation maps. Default: '.png' + reduce_zero_label (bool): Whether to mark label zero as ignored. + Default to False. 
+ """ + METAINFO = dict(classes=('background', 'lung')) + + def __init__(self, + img_suffix='.png', + seg_map_suffix='.png', + reduce_zero_label=False, + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, + **kwargs) diff --git a/projects/medical/2d_image/x_ray/covid_19_ct_cxr/tools/prepare_dataset.py b/projects/medical/2d_image/x_ray/covid_19_ct_cxr/tools/prepare_dataset.py new file mode 100644 index 0000000000..72f6435389 --- /dev/null +++ b/projects/medical/2d_image/x_ray/covid_19_ct_cxr/tools/prepare_dataset.py @@ -0,0 +1,52 @@ +import os + +import numpy as np +from PIL import Image + +root_path = 'data/' +src_img_dir = os.path.join(root_path, 'covid-chestxray-dataset', 'images') +src_mask_dir = os.path.join(root_path, 'covid-chestxray-dataset', + 'annotations/lungVAE-masks') +tgt_img_train_dir = os.path.join(root_path, 'images/train/') +tgt_mask_train_dir = os.path.join(root_path, 'masks/train/') +tgt_img_test_dir = os.path.join(root_path, 'images/test/') +os.system('mkdir -p ' + tgt_img_train_dir) +os.system('mkdir -p ' + tgt_mask_train_dir) +os.system('mkdir -p ' + tgt_img_test_dir) + + +def convert_label(img, convert_dict): + arr = np.zeros_like(img, dtype=np.uint8) + for c, i in convert_dict.items(): + arr[img == c] = i + return arr + + +if __name__ == '__main__': + + all_img_names = os.listdir(src_img_dir) + all_mask_names = os.listdir(src_mask_dir) + + for img_name in all_img_names: + base_name = img_name.replace('.png', '') + base_name = base_name.replace('.jpg', '') + base_name = base_name.replace('.jpeg', '') + mask_name_orig = base_name + '_mask.png' + if mask_name_orig in all_mask_names: + mask_name = base_name + '.png' + src_img_path = os.path.join(src_img_dir, img_name) + src_mask_path = os.path.join(src_mask_dir, mask_name_orig) + tgt_img_path = os.path.join(tgt_img_train_dir, img_name) + tgt_mask_path = os.path.join(tgt_mask_train_dir, mask_name) + + img = Image.open(src_img_path).convert('RGB') + img.save(tgt_img_path) + mask = np.array(Image.open(src_mask_path)) + mask = convert_label(mask, {0: 0, 255: 1}) + mask = Image.fromarray(mask) + mask.save(tgt_mask_path) + else: + src_img_path = os.path.join(src_img_dir, img_name) + tgt_img_path = os.path.join(tgt_img_test_dir, img_name) + img = Image.open(src_img_path).convert('RGB') + img.save(tgt_img_path) diff --git a/projects/medical/2d_image/x_ray/crass/README.md b/projects/medical/2d_image/x_ray/crass/README.md new file mode 100644 index 0000000000..0621205be8 --- /dev/null +++ b/projects/medical/2d_image/x_ray/crass/README.md @@ -0,0 +1,144 @@ +# Chest Radiograph Anatomical Structure Segmentation (CRASS) + +## Description + +This project supports **`Chest Radiograph Anatomical Structure Segmentation (CRASS) `**, which can be downloaded from [here](https://crass.grand-challenge.org/). + +### Dataset Overview + +A set of consecutively obtained posterior-anterior chest radiograph were selected from a database containing images acquired at two sites in sub Saharan Africa with a high tuberculosis incidence. All subjects were 15 years or older. Images from digital chest radiography units were used (Delft Imaging Systems, The Netherlands) of varying resolutions, with a typical resolution of 1800--2000 pixels, the pixel size was 250 lm isotropic. From the total set of images, 225 were considered to be normal by an expert radiologist, while 333 of the images contained abnormalities. 
Of the abnormal images, 220 contained abnormalities in the upper area of the lung where the clavicle is located. The data was divided into a training and a test set. The training set consisted of 299 images, the test set of 249 images. +The dataset is currently incomplete; the remaining data is to be added later. + +### Statistic Information + +| Dataset Name | Anatomical Region | Task Type | Modality | Num. Classes | Train/Val/Test Images | Train/Val/Test Labeled | Release Date | License | +| ------------------------------------------- | ----------------- | ------------ | -------- | ------------ | --------------------- | ---------------------- | ------------ | ------------------------------------------------------------- | +| [crass](https://crass.grand-challenge.org/) | pulmonary | segmentation | x_ray | 2 | 299/-/234 | yes/-/no | 2021 | [CC0 1.0](https://creativecommons.org/publicdomain/zero/1.0/) | + +| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num. Test | Pct. Test | +| :--------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: | +| background | 299 | 98.38 | - | - | - | - | +| clavicles | 299 | 1.62 | - | - | - | - | + +Note: + +- `Pct` means percentage of pixels in this category in all pixels. + +### Visualization + +![crass](https://raw.githubusercontent.com/uni-medical/medical-datasets-visualization/main/2d/semantic_seg/x_ray/crass/crass_dataset.png?raw=true) + +### Dataset Citation + +``` +@article{HOGEWEG20121490, + title={Clavicle segmentation in chest radiographs}, + journal={Medical Image Analysis}, + volume={16}, + number={8}, + pages={1490-1502}, + year={2012} +} +``` + +### Prerequisites + +- Python v3.8 +- PyTorch v1.10.0 +- pillow(PIL) v9.3.0 +- scikit-learn(sklearn) v1.2.0 +- [MIM](https://github.com/open-mmlab/mim) v0.3.4 +- [MMCV](https://github.com/open-mmlab/mmcv) v2.0.0rc4 +- [MMEngine](https://github.com/open-mmlab/mmengine) v0.2.0 or higher +- [MMSegmentation](https://github.com/open-mmlab/mmsegmentation) v1.0.0rc5 + +All the commands below rely on the correct configuration of `PYTHONPATH`, which should point to the project's directory so that Python can locate the module files. In the `crass/` root directory, run the following line to add the current directory to `PYTHONPATH`: + +```shell +export PYTHONPATH=`pwd`:$PYTHONPATH +``` + +### Dataset Preparing + +- download dataset from [here](https://crass.grand-challenge.org/) and decompress data to path `'data/'`. +- run script `"python tools/prepare_dataset.py"` to format data and change folder structure as below. +- run script `"python ../../tools/split_seg_dataset.py"` to split dataset and generate `train.txt`, `val.txt` and `test.txt`. If the labels of the official validation and test sets cannot be obtained, we randomly generate `train.txt` and `val.txt` from the training set. + +```none + mmsegmentation + ├── mmseg + ├── projects + │   ├── medical + │   │   ├── 2d_image + │   │   │   ├── x_ray + │   │   │   │   ├── crass + │   │   │   │   │   ├── configs + │   │   │   │   │   ├── datasets + │   │   │   │   │   ├── tools + │   │   │   │   │   ├── data + │   │   │   │   │   │   ├── train.txt + │   │   │   │   │   │   ├── val.txt + │   │   │   │   │   │   ├── images + │   │   │   │   │   │   │   ├── train + │   │   │   │ | │   │   │   ├── xxx.png + │   │   │   │ | │   │   │   ├── ... + │   │   │   │ | │   │   │   └── xxx.png + │   │   │   │   │   │   ├── masks + │   │   │   │   │   │   │   ├── train + │   │   │   │ | │   │   │   ├── xxx.png + │   │   │   │ | │   │   │   ├── ... + │   │   │   │ | │   │   │   └── xxx.png +``` + +### Divided Dataset Information + +***Note: The dataset split in the table below was produced by ourselves.*** + +| Class Name | Num. Train | Pct. Train | Num. Val | Pct. Val | Num.
Test | Pct. Test | +| :--------: | :--------: | :--------: | :------: | :------: | :-------: | :-------: | +| background | 227 | 98.38 | 57 | 98.39 | - | - | +| clavicles | 227 | 1.62 | 57 | 1.61 | - | - | + +### Training commands + +To train models on a single server with one GPU, run the following command (default): + +```shell +mim train mmseg ./configs/${CONFIG_FILE} +``` + +### Testing commands + +To test models on a single server with one GPU, run the following command (default): + +```shell +mim test mmseg ./configs/${CONFIG_FILE} --checkpoint ${CHECKPOINT_PATH} +``` + + + +## Checklist + +- [x] Milestone 1: PR-ready, and acceptable to be one of the `projects/`. + + - [x] Finish the code + - [x] Basic docstrings & proper citation + - [ ] Test-time correctness + - [x] A full README + +- [x] Milestone 2: Indicates a successful model implementation. + + - [x] Training-time correctness + +- [ ] Milestone 3: Good to be a part of our core package! + + - [ ] Type hints and docstrings + - [ ] Unit tests + - [ ] Code polishing + - [ ] Metafile.yml + +- [ ] Move your modules into the core package following the codebase's file hierarchy structure. + +- [ ] Refactor your modules into the core package following the codebase's file hierarchy structure. diff --git a/projects/medical/2d_image/x_ray/crass/configs/crass_512x512.py b/projects/medical/2d_image/x_ray/crass/configs/crass_512x512.py new file mode 100644 index 0000000000..1425f50cc4 --- /dev/null +++ b/projects/medical/2d_image/x_ray/crass/configs/crass_512x512.py @@ -0,0 +1,42 @@ +dataset_type = 'CRASSDataset' +data_root = 'data/' +img_scale = (512, 512) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=img_scale, keep_ratio=False), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') +] +train_dataloader = dict( + batch_size=16, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='train.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='val.txt', + data_prefix=dict(img_path='images/', seg_map_path='masks/'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) +test_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU', 'mDice']) diff --git a/projects/medical/2d_image/x_ray/crass/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_crass-512x512.py b/projects/medical/2d_image/x_ray/crass/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_crass-512x512.py new file mode 100644 index 0000000000..b52bc78f79 --- /dev/null +++ b/projects/medical/2d_image/x_ray/crass/configs/fcn-unet-s5-d16_unet_1xb16-0.0001-20k_crass-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './crass_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.crass_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.0001) +optim_wrapper =
dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/x_ray/crass/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_crass-512x512.py b/projects/medical/2d_image/x_ray/crass/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_crass-512x512.py new file mode 100644 index 0000000000..45242c65b4 --- /dev/null +++ b/projects/medical/2d_image/x_ray/crass/configs/fcn-unet-s5-d16_unet_1xb16-0.001-20k_crass-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './crass_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.crass_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.001) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/x_ray/crass/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_crass-512x512.py b/projects/medical/2d_image/x_ray/crass/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_crass-512x512.py new file mode 100644 index 0000000000..bcf9d0a5ca --- /dev/null +++ b/projects/medical/2d_image/x_ray/crass/configs/fcn-unet-s5-d16_unet_1xb16-0.01-20k_crass-512x512.py @@ -0,0 +1,17 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './crass_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.crass_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.01) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict(num_classes=2), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/x_ray/crass/configs/fcn-unet-s5-d16_unet_1xb16-lr0.01-sigmoid-20k_crass-512x512.py b/projects/medical/2d_image/x_ray/crass/configs/fcn-unet-s5-d16_unet_1xb16-lr0.01-sigmoid-20k_crass-512x512.py new file mode 100644 index 0000000000..0dde736bf7 --- /dev/null +++ b/projects/medical/2d_image/x_ray/crass/configs/fcn-unet-s5-d16_unet_1xb16-lr0.01-sigmoid-20k_crass-512x512.py @@ -0,0 +1,18 @@ +_base_ = [ + 'mmseg::_base_/models/fcn_unet_s5-d16.py', './crass_512x512.py', + 'mmseg::_base_/default_runtime.py', + 'mmseg::_base_/schedules/schedule_20k.py' +] +custom_imports = dict(imports='datasets.crass_dataset') +img_scale = (512, 512) +data_preprocessor = dict(size=img_scale) +optimizer = dict(lr=0.01) +optim_wrapper = dict(optimizer=optimizer) +model = dict( + data_preprocessor=data_preprocessor, + decode_head=dict( + num_classes=2, loss_decode=dict(use_sigmoid=True), out_channels=1), + auxiliary_head=None, + test_cfg=dict(mode='whole', _delete_=True)) +vis_backends = None +visualizer = dict(vis_backends=vis_backends) diff --git a/projects/medical/2d_image/x_ray/crass/datasets/crass_dataset.py b/projects/medical/2d_image/x_ray/crass/datasets/crass_dataset.py new file mode 100644 index 0000000000..f6b5c5228b --- /dev/null +++ 
b/projects/medical/2d_image/x_ray/crass/datasets/crass_dataset.py @@ -0,0 +1,31 @@ +from mmseg.datasets import BaseSegDataset +from mmseg.registry import DATASETS + + +@DATASETS.register_module() +class CRASSDataset(BaseSegDataset): + """CRASSDataset dataset. + + In segmentation map annotation for CRASSDataset, 0 stands for background, + which is included in 2 categories. ``reduce_zero_label`` is fixed to + False. The ``img_suffix`` is fixed to '.png' and ``seg_map_suffix`` is + fixed to '.png'. + + Args: + img_suffix (str): Suffix of images. Default: '.png' + seg_map_suffix (str): Suffix of segmentation maps. Default: '.png' + reduce_zero_label (bool): Whether to mark label zero as ignored. + Default to False. + """ + METAINFO = dict(classes=('background', 'clavicles')) + + def __init__(self, + img_suffix='.png', + seg_map_suffix='.png', + reduce_zero_label=False, + **kwargs) -> None: + super().__init__( + img_suffix=img_suffix, + seg_map_suffix=seg_map_suffix, + reduce_zero_label=reduce_zero_label, + **kwargs) diff --git a/projects/medical/2d_image/x_ray/crass/tools/prepare_dataset.py b/projects/medical/2d_image/x_ray/crass/tools/prepare_dataset.py new file mode 100644 index 0000000000..bbd5d8891d --- /dev/null +++ b/projects/medical/2d_image/x_ray/crass/tools/prepare_dataset.py @@ -0,0 +1,84 @@ +import glob +import os + +import cv2 +import SimpleITK as sitk +from PIL import Image + +root_path = 'data/' +img_suffix = '.tif' +seg_map_suffix = '.png' +save_img_suffix = '.png' +save_seg_map_suffix = '.png' + +src_img_train_dir = os.path.join(root_path, 'CRASS/data_train') +src_mask_train_dir = os.path.join(root_path, 'CRASS/mask_mhd') +src_img_test_dir = os.path.join(root_path, 'CRASS/data_test') + +tgt_img_train_dir = os.path.join(root_path, 'images/train/') +tgt_mask_train_dir = os.path.join(root_path, 'masks/train/') +tgt_img_test_dir = os.path.join(root_path, 'images/test/') +os.system('mkdir -p ' + tgt_img_train_dir) +os.system('mkdir -p ' + tgt_mask_train_dir) +os.system('mkdir -p ' + tgt_img_test_dir) + + +def filter_suffix_recursive(src_dir, suffix): + suffix = '.' + suffix if '.' not in suffix else suffix + file_paths = glob.glob( + os.path.join(src_dir, '**', '*' + suffix), recursive=True) + file_names = [_.split('/')[-1] for _ in file_paths] + return sorted(file_paths), sorted(file_names) + + +def read_single_array_from_med(path): + return sitk.GetArrayFromImage(sitk.ReadImage(path)).squeeze() + + +def convert_meds_into_pngs(src_dir, + tgt_dir, + suffix='.dcm', + norm_min=0, + norm_max=255, + convert='RGB'): + if not os.path.exists(tgt_dir): + os.makedirs(tgt_dir) + + src_paths, src_names = filter_suffix_recursive(src_dir, suffix=suffix) + num = len(src_paths) + for i, (src_name, src_path) in enumerate(zip(src_names, src_paths)): + tgt_name = src_name.replace(suffix, '.png') + tgt_path = os.path.join(tgt_dir, tgt_name) + + img = read_single_array_from_med(src_path) + if norm_min is not None and norm_max is not None: + img = cv2.normalize(img, None, norm_min, norm_max, cv2.NORM_MINMAX, + cv2.CV_8U) + pil = Image.fromarray(img).convert(convert) + pil.save(tgt_path) + print(f'processed {i+1}/{num}.') + + +convert_meds_into_pngs( + src_img_train_dir, + tgt_img_train_dir, + suffix='.mhd', + norm_min=0, + norm_max=255, + convert='RGB') + +convert_meds_into_pngs( + src_img_test_dir, + tgt_img_test_dir, + suffix='.mhd', + norm_min=0, + norm_max=255, + convert='RGB') + +convert_meds_into_pngs( + src_mask_train_dir, + tgt_mask_train_dir, + suffix='.mhd', + norm_min=0, + norm_max=1, + convert='L') diff --git a/projects/nvidia_jetson/README.md b/projects/nvidia_jetson/README.md new file mode 100644 index 0000000000..6cebd9caa1 --- /dev/null +++ b/projects/nvidia_jetson/README.md @@ -0,0 +1,372 @@ +# Tutorial: Tuning and Deploying MMSeg Models on the NVIDIA Jetson Platform + +- Please read the [MMSegmentation model deployment](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/zh_cn/user_guides/5_deployment.md) documentation first. +- **mmsegmentation version used in this tutorial: v1.1.2** +- **NVIDIA Jetson device used in this tutorial: NVIDIA Jetson AGX Orin 64G** + +<div align="center">
+ Smiley face +
+ +## 1 Set up [mmsegmentation](https://github.com/open-mmlab/mmsegmentation) + +- Following the [installation and verification](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/zh_cn/get_started.md) documentation, install the environment dependencies required for developing [mmsegmentation](https://github.com/open-mmlab/mmsegmentation), such as [`pytorch`](https://pytorch.org/get-started/locally/), [`mmcv`](https://github.com/open-mmlab/mmcv) and [`mmengine`](https://github.com/open-mmlab/mmengine). +- Download [mmsegmentation](https://github.com/open-mmlab/mmsegmentation) from GitHub with the `git clone` command. If your network connection is poor, you can instead download a zip archive from the [MMSeg GitHub](https://github.com/open-mmlab/mmsegmentation) page. + ```bash + git clone https://github.com/open-mmlab/mmsegmentation.git + ``` +- Install mmsegmentation in editable mode with `pip install -v -e .`. + ```bash + cd mmsegmentation + pip install -v -e . + ``` + After the installation succeeds, `pip list` should show that mmsegmentation has been installed into your environment from the local source. + ![mmseg-install](https://github.com/AI-Tianlong/Useful-Tools/assets/50650583/a9c7bcc9-cdcc-40a4-bd7b-8153195549c8) + +## 2 Prepare your dataset + +- This tutorial uses the remote sensing semantic segmentation dataset [potsdam](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/zh_cn/user_guides/2_dataset_prepare.md#isprs-potsdam) as an example. +- Download the dataset and prepare it in the MMSeg format following the [potsdam data preparation](https://github.com/open-mmlab/mmsegmentation/blob/main/docs/zh_cn/user_guides/2_dataset_prepare.md#isprs-potsdam) documentation. +- Dataset introduction: the potsdam dataset is named after Potsdam, a typical historic city in Germany with large building blocks, narrow streets and a dense settlement structure. The potsdam dataset contains 38 images of 6000x6000 pixels with a spatial resolution of 5 cm; a sample of the dataset is shown in the figure below: + ![potsdam-img](https://github.com/AI-Tianlong/Useful-Tools/assets/50650583/3bc0a75b-1693-4ae6-aeea-ad502e955068) + +## 3 Download the model's pth weight file from the config page + +Taking the [`deeplabv3plus_r101-d8_4xb4-80k_potsdam-512x512.py`](../../configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-80k_potsdam-512x512.py) config file as an example, download the weight file from the [configs](https://github.com/open-mmlab/mmsegmentation/tree/main/configs/deeplabv3plus#potsdam) page. +![pth](https://github.com/AI-Tianlong/Useful-Tools/assets/50650583/8f747362-caf4-406c-808d-4ca72babb209) + +## 4 Convert and profile the model interactively with [OpenMMLab Deploee](https://platform.openmmlab.com/deploee) + +### 4.1 Model conversion + +For this part, the [OpenMMLab website](https://platform.openmmlab.com/deploee) provides an interactive interface for model conversion and model speed testing: without writing any code, you can convert a model to the ONNX (`xxxx.onnx`) and TensorRT (`.engine`) formats simply by selecting the corresponding options. +If your custom config file contains relative references, such as: + +```python +# xxxx.py +_base_ = [ + '../_base_/models/deeplabv3plus_r50-d8.py', + '../_base_/datasets/potsdam.py', + '../_base_/default_runtime.py', + '../_base_/schedules/schedule_80k.py' +] +``` + +you can use the following code to resolve the relative references and generate a complete config file. + +```python +import mmengine + +mmengine.Config.fromfile("configs/deeplabv3plus/deeplabv3plus_r101-d8_4xb4-80k_potsdam-512x512.py").dump("My_config.py") +``` + +After running the code above, `My_config.py` contains the complete configuration with no relative references. Then upload the model config to the corresponding place on the web page. + +#### Create a conversion task + +Following the prompts in the figure below and your own needs, create a conversion task and submit it. + +<div align="center">
+ NVIDIA-Jetson +
+ +### 4.2 Model speed test + +After the model conversion is complete, you can profile the model on a real device through the **model speed test** interface. + +#### Create a speed test task + +<div align="center">
+ NVIDIA-Jetson +
+ +
+ NVIDIA-Jetson +
+ +Once the speed test finishes, a complete profiling report is generated on the page. [See a sample speed test report](https://openmmlab-deploee.oss-cn-shanghai.aliyuncs.com/tmp/profile_speed/4352f5.txt) + +## 5 Convert the model to ONNX from the command line with OpenMMLab mmdeploy + +In this part, the mmdeploy library is used to convert a model trained with mmseg into an inference format. An example is given here; see the [mmdeploy model conversion documentation](../../docs/zh_cn/user_guides/5_deployment.md) for details. + +### 5.1 Build the mmdeploy library from source + +In the virtual environment where you installed the mmsegmentation library, clone [mmdeploy](https://github.com/open-mmlab/mmdeploy) from GitHub with the `git clone` command. + +### 5.2 Model conversion + +If your config contains relative references, they still need to be resolved, as described in [4.1 Model conversion](#41-model-conversion). Enter the mmdeploy folder and run the following command to convert the model. + +```bash +python tools/deploy.py \ + configs/mmseg/segmentation_onnxruntime_static-512x512.py \ + ../atl_config.py \ + ../deeplabv3plus_r18-d8_512x512_80k_potsdam_20211219_020601-75fd5bc3.pth \ + ../2_13_1024_5488_1536_6000.png \ + --work-dir ../atl_models \ + --device cpu \ + --show \ + --dump-info +``` + +```bash +# Usage +python ./tools/deploy.py \ + ${deploy config path} \ + ${model config path} \ + ${model checkpoint path} \ + ${input image path} \ + --work-dir ${directory to save logs and model files} \ + --device ${cpu/cuda:0} \ + --show \ # whether to show the result + --dump-info # whether to dump SDK info + +``` + +If the command runs successfully, you will see the following messages, which indicate that the conversion succeeded. + +```bash +10/08 17:40:44 - mmengine - INFO - visualize pytorch model success. +10/08 17:40:44 - mmengine - INFO - All process success. +``` + +<div align="center">
+ NVIDIA-Jetson +
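+
+Besides checking the log, you can sanity-check the exported `end2end.onnx` with onnxruntime before deploying it. This is a minimal sketch, assuming the `../atl_models` work directory created above and the 512x512 static input shape of the deploy config used here:
+
+```python
+import numpy as np
+import onnxruntime as ort
+
+# Load the exported model on CPU; the input name is read from the graph
+# rather than hard-coded.
+sess = ort.InferenceSession('../atl_models/end2end.onnx',
+                            providers=['CPUExecutionProvider'])
+input_name = sess.get_inputs()[0].name
+
+# Feed a dummy NCHW float32 tensor through the network and inspect the
+# output shapes; a segmentation model typically returns a label map.
+dummy = np.random.rand(1, 3, 512, 512).astype(np.float32)
+outputs = sess.run(None, {input_name: dummy})
+print([o.shape for o in outputs])
+```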
+
+# 6 Conversion and deployment on the Jetson platform
+
+## 6.1 Environment preparation
+
+Follow the [how to install MMDeploy on Jetson modules](https://github.com/open-mmlab/mmdeploy/blob/main/docs/zh_cn/01-how-to-build/jetsons.md) documentation to prepare the environment on the Jetson.
+**Note**: for installing PyTorch, consult the [NVIDIA Jetson PyTorch installation documentation](https://github.com/open-mmlab/mmdeploy/blob/main/docs/zh_cn/01-how-to-build/jetsons.md), but mind the version note in 6.1.2 below.
+
+### 6.1.1 Create a virtual environment
+
+```bash
+conda create -n {your_env_name} python={python_version}
+```
+
+### 6.1.2 Install PyTorch in the virtual environment
+
+Note: do not install the latest PyTorch 2.0 here. PyTorch 1.11 is the last wheel built with USE_DISTRIBUTED; with a newer wheel, mmdeploy raises `AttributeError: module 'torch.distributed' has no attribute 'ReduceOp'` during model conversion. See https://forums.developer.nvidia.com/t/module-torch-distributed-has-no-attribute-reduceop/256581/6 for reference.
+Download `torch-1.11.0-cp38-cp38-linux_aarch64.whl` and install it:
+
+```bash
+pip install torch-1.11.0-cp38-cp38-linux_aarch64.whl
+```
+
+After running the command above, you will see the following messages, which indicate a successful installation:
+
+```bash
+Processing ./torch-1.11.0-cp38-cp38-linux_aarch64.whl
+Requirement already satisfied: typing-extensions in /home/sirs/miniconda3/envs/openmmlab/lib/python3.8/site-packages (from torch==1.11.0) (4.7.1)
+Installing collected packages: torch
+Successfully installed torch-1.11.0
+```
+
+### 6.1.3 Copy the TensorRT bundled with JetPack into the virtual environment
+
+See [configure TensorRT](https://github.com/open-mmlab/mmdeploy/blob/main/docs/zh_cn/01-how-to-build/jetsons.md#%E9%85%8D%E7%BD%AE-tensorrt).
+The JetPack SDK ships with TensorRT, but to be able to import it inside the conda environment, we need to copy TensorRT into the environment created above:
+
+```bash
+export PYTHON_VERSION=`python3 --version | cut -d' ' -f 2 | cut -d'.' -f1,2`
+cp -r /usr/lib/python${PYTHON_VERSION}/dist-packages/tensorrt* ~/miniconda/envs/{your_env_name}/lib/python${PYTHON_VERSION}/site-packages/
+```
+
+### 6.1.4 Install MMCV
+
+Install it either via `mim install mmcv`:
+
+```bash
+pip install openmim
+mim install mmcv
+```
+
+or by compiling it from source:
+
+```bash
+sudo apt-get install -y libssl-dev
+git clone https://github.com/open-mmlab/mmcv.git
+cd mmcv
+pip install -e .
+```
+
+Note: mmcv must be recompiled whenever the PyTorch version changes.
+
+### 6.1.5 Install ONNX
+
+Note: choose one of the following two options.
+
+- conda
+  ```bash
+  conda install -c conda-forge onnx
+  ```
+- pip
+  ```bash
+  python3 -m pip install onnx
+  ```
+
+### 6.1.6 Install ONNX Runtime
+
+Pick a suitable ONNX Runtime version from the [ONNX Runtime](https://elinux.org/Jetson_Zoo#ONNX_Runtime) page, then download and install it.
+Example:
+
+```bash
+# Install pip wheel
+$ pip3 install onnxruntime_gpu-1.10.0-cp38-cp38-linux_aarch64.whl
+```
+
+## 6.2 Model conversion and inference on the Jetson AGX Orin
+
+### 6.2.1 ONNX model conversion
+
+As in [4.1 Model conversion](#41-model-conversion), activate the prepared virtual environment on the Jetson, enter the mmdeploy directory, and convert the model to ONNX:
+
+```bash
+python tools/deploy.py \
+    configs/mmseg/segmentation_onnxruntime_static-512x512.py \
+    ../atl_config.py \
+    ../deeplabv3plus_r18-d8_512x512_80k_potsdam_20211219_020601-75fd5bc3.pth \
+    ../2_13_3584_2560_4096_3072.png \
+    --work-dir ../atl_models \
+    --device cpu \
+    --show \
+    --dump-info
+```
+
+Note: if the following error is reported:
+
+```none
+AttributeError: module 'torch.distributed' has no attribute 'ReduceOp'
+```
+
+install PyTorch 1.11.0 as discussed in https://forums.developer.nvidia.com/t/module-torch-distributed-has-no-attribute-reduceop/256581/6.
+
+After a successful conversion, you will see the following messages and a folder containing the ONNX model:
+
+```bash
+10/09 19:58:22 - mmengine - INFO - visualize pytorch model success.
+10/09 19:58:22 - mmengine - INFO - All process success.
+```
+
+<div align=center>
+ NVIDIA-Jetson + NVIDIA-Jetson +
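+
+Before moving on to TensorRT, it can be worth confirming that the `onnxruntime_gpu` wheel installed in 6.1.6 actually exposes the CUDA provider on the Jetson. A small sketch (the model path is the one produced above):
+
+```python
+import onnxruntime as ort
+
+# Providers compiled into this onnxruntime build; a working
+# onnxruntime_gpu wheel lists 'CUDAExecutionProvider' here.
+print(ort.get_available_providers())
+
+sess = ort.InferenceSession('../atl_models/end2end.onnx',
+                            providers=['CUDAExecutionProvider',
+                                       'CPUExecutionProvider'])
+# Providers actually selected for this session, in priority order.
+print(sess.get_providers())
+```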
+
+### 6.2.2 TensorRT model conversion
+
+Switch to the TensorRT deployment config and convert the model to TensorRT:
+
+```bash
+python tools/deploy.py \
+    configs/mmseg/segmentation_tensorrt_static-512x512.py \
+    ../atl_config.py \
+    ../deeplabv3plus_r18-d8_512x512_80k_potsdam_20211219_020601-75fd5bc3.pth \
+    ../2_13_3584_2560_4096_3072.png \
+    --work-dir ../atl_trt_models \
+    --device cuda:0 \
+    --show \
+    --dump-info
+```
+
+After a successful conversion, you will see the following messages and a folder containing the TensorRT model:
+
+```bash
+10/09 20:15:50 - mmengine - INFO - visualize pytorch model success.
+10/09 20:15:50 - mmengine - INFO - All process success.
+```
+
+<div align=center>
+ NVIDIA-Jetson + NVIDIA-Jetson +
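+
+If you want to confirm that the generated `end2end.engine` matches the TensorRT version copied into the environment in 6.1.3, you can try deserializing it directly. This is only a sketch of such a check, not part of the official workflow:
+
+```python
+import tensorrt as trt
+
+logger = trt.Logger(trt.Logger.WARNING)
+with open('../atl_trt_models/end2end.engine', 'rb') as f, \
+        trt.Runtime(logger) as runtime:
+    # deserialize_cuda_engine returns None if the engine was built
+    # with an incompatible TensorRT version.
+    engine = runtime.deserialize_cuda_engine(f.read())
+
+print(engine is not None)
+```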
+
+## 6.3 Model profiling
+
+Run the following command to profile the model; see [profiler](https://github.com/open-mmlab/mmdeploy/blob/main/docs/zh_cn/02-how-to-run/useful_tools.md#profiler) for details.
+
+```bash
+python tools/profiler.py \
+    ${DEPLOY_CFG} \
+    ${MODEL_CFG} \
+    ${IMAGE_DIR} \
+    --model ${MODEL} \
+    --device ${DEVICE} \
+    --shape ${SHAPE} \
+    --num-iter ${NUM_ITER} \
+    --warmup ${WARMUP} \
+    --cfg-options ${CFG_OPTIONS} \
+    --batch-size ${BATCH_SIZE} \
+    --img-ext ${IMG_EXT}
+```
+
+Example:
+
+```bash
+python tools/profiler.py \
+    configs/mmseg/segmentation_tensorrt_static-512x512.py \
+    ../atl_config.py \
+    ../atl_demo_img \
+    --model /home/sirs/AI-Tianlong/OpenMMLab/atl_trt_models/end2end.engine \
+    --device cuda:0 \
+    --shape 512x512 \
+    --num-iter 100
+```
+
+Profiling results:
+
+![image](https://github.com/AI-Tianlong/Useful-Tools/assets/50650583/874e9742-ee10-490c-9e69-17da0096c49b)
+
+## 6.4 Model inference
+
+Run inference with the TensorRT model folder generated in [6.2.2](#622-tensorrt-model-conversion):
+
+```python
+from mmdeploy.apis.utils import build_task_processor
+from mmdeploy.utils import get_input_shape, load_config
+import torch
+
+deploy_cfg = './mmdeploy/configs/mmseg/segmentation_tensorrt_static-512x512.py'
+model_cfg = './atl_config.py'
+device = 'cuda:0'
+backend_model = ['./atl_trt_models/end2end.engine']
+image = './atl_demo_img/2_13_2048_1024_2560_1536.png'
+
+# read deploy_cfg and model_cfg
+deploy_cfg, model_cfg = load_config(deploy_cfg, model_cfg)
+
+# build task and backend model
+task_processor = build_task_processor(model_cfg, deploy_cfg, device)
+model = task_processor.build_backend_model(backend_model)
+
+# process input image
+input_shape = get_input_shape(deploy_cfg)
+model_inputs, _ = task_processor.create_input(image, input_shape)
+
+# do model inference
+with torch.no_grad():
+    result = model.test_step(model_inputs)
+
+# visualize results
+task_processor.visualize(
+    image=image,
+    model=model,
+    result=result[0],
+    window_name='visualize',
+    output_file='./output_segmentation.png')
+```
+
+This produces the inference result:
+
+<div align=center>
+ NVIDIA-Jetson + NVIDIA-Jetson +
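+
+Since the conversion above was run with `--dump-info`, the work directory is also a complete SDK model. If you have additionally built the MMDeploy SDK with its Python bindings on the Jetson, inference can alternatively go through `mmdeploy_runtime`. This is a hedged sketch reusing the paths from the example above; it assumes the SDK and its Python package are installed:
+
+```python
+import cv2
+from mmdeploy_runtime import Segmentor
+
+# The --dump-info work dir contains deploy.json, pipeline.json and the
+# engine file, which is what the SDK consumes.
+segmentor = Segmentor(
+    model_path='../atl_trt_models', device_name='cuda', device_id=0)
+
+img = cv2.imread('../atl_demo_img/2_13_2048_1024_2560_1536.png')
+seg = segmentor(img)  # per-pixel class indices with the input's H x W
+print(seg.shape)
+```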
diff --git a/projects/pp_mobileseg/README.md b/projects/pp_mobileseg/README.md
new file mode 100644
index 0000000000..c9f9c128e7
--- /dev/null
+++ b/projects/pp_mobileseg/README.md
@@ -0,0 +1,123 @@
+# PP-MobileSeg: Exploring Transformer Blocks for Efficient Mobile Segmentation
+
+## Reference
+
+> [PP-MobileSeg: Explore the Fast and Accurate Semantic Segmentation Model on Mobile Devices.](https://arxiv.org/abs/2304.05152)
+
+## Introduction
+
+Official Repo
+
+Code Snippet
+
+## Abstract
+
+With the success of transformers in computer vision, several attempts have been made to adapt them to mobile devices. However, their performance is not satisfactory for some real-world applications. We therefore propose PP-MobileSeg, a state-of-the-art semantic segmentation model for mobile devices.
+
+It is composed of three newly proposed parts: the StrideFormer backbone, the Aggregated Attention Module (AAM), and the Valid Interpolate Module (VIM):
+
+- With the four-stage MobileNetV3 block as the feature extractor, we extract rich local features over different receptive fields with little parameter overhead, and further empower the features of the last two stages with a global view using strided sea attention.
+- To fuse the features effectively, AAM filters the detail features with ensemble voting and adds the semantic feature to them to maximize the semantic information.
+- Finally, VIM upsamples the downsampled feature back to the original resolution while significantly reducing inference latency: it only interpolates the classes present in the final prediction, which is typically only around 10% of the classes in the ADE20K dataset. This is a common situation for datasets with a large number of classes, so VIM substantially reduces the latency of the final upsampling step, which accounts for the largest share of the model's overall latency.
+
+Extensive experiments show that PP-MobileSeg achieves a superior params-accuracy-latency tradeoff compared with other SOTA methods.
+
+<div align=center>
+ +
+
+## Performance
+
+### ADE20K
+
+| Model             | Backbone          | Training Iters | Batchsize | Train Resolution | mIoU(%) | latency(ms)\* | params(M) | config                                                                                                                     | Links                                                                                                                                                                                                                                |
+| ----------------- | ----------------- | -------------- | --------- | ---------------- | ------- | ------------- | --------- | -------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| PP-MobileSeg-Base | StrideFormer-Base | 80000          | 32        | 512x512          | 41.57   | 265.5         | 5.62      | [config](https://github.com/Yang-Changhui/mmsegmentation/tree/add_ppmobileseg/projects/pp_mobileseg/configs/pp_mobileseg) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pp_mobileseg/pp_mobileseg_mobilenetv3_2xb16_3rdparty-base_512x512-ade20k-f12b44f3.pth)\|[log](https://bj.bcebos.com/paddleseg/dygraph/ade20k/pp_mobileseg_base/train.log) |
+| PP-MobileSeg-Tiny | StrideFormer-Tiny | 80000          | 32        | 512x512          | 36.39   | 215.3         | 1.61      | [config](https://github.com/Yang-Changhui/mmsegmentation/tree/add_ppmobileseg/projects/pp_mobileseg/configs/pp_mobileseg) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/pp_mobileseg/pp_mobileseg_mobilenetv3_2xb16_3rdparty-tiny_512x512-ade20k-a351ebf5.pth)\|[log](https://bj.bcebos.com/paddleseg/dygraph/ade20k/pp_mobileseg_tiny/train.log) |
+
+## Usage
+
+As with other models in MMSegmentation, you can run the following command from ${MMSEG_ROOT} to test the model:
+
+```shell
+./tools/dist_test.sh projects/pp_mobileseg/configs/pp_mobileseg/pp_mobileseg_mobilenetv3_2x16_80k_ade20k_512x512_base.py checkpoints/pp_mobileseg_mobilenetv3_2xb16_3rdparty-base_512x512-ade20k-f12b44f3.pth 8
+```
+
+## Inference with ONNXRuntime
+
+### Prerequisites
+
+**1. Install the onnxruntime inference engine.**
+
+Choose one of the following ways to install onnxruntime.
+
+- CPU version
+
+```shell
+pip install onnxruntime==1.15.1
+wget https://github.com/microsoft/onnxruntime/releases/download/v1.15.1/onnxruntime-linux-x64-1.15.1.tgz
+tar -zxvf onnxruntime-linux-x64-1.15.1.tgz
+export ONNXRUNTIME_DIR=$(pwd)/onnxruntime-linux-x64-1.15.1
+export LD_LIBRARY_PATH=$ONNXRUNTIME_DIR/lib:$LD_LIBRARY_PATH
+```
+
+**2. Convert the model to an ONNX file.**
+
+- Install `mim` and `mmdeploy`.
+
+```shell
+pip install openmim
+mim install mmdeploy
+git clone https://github.com/open-mmlab/mmdeploy.git
+```
+
+- Download the pp_mobileseg checkpoint.
+
+```shell
+wget https://download.openmmlab.com/mmsegmentation/v0.5/pp_mobileseg/pp_mobileseg_mobilenetv3_2xb16_3rdparty-tiny_512x512-ade20k-a351ebf5.pth
+```
+
+- Convert the model to ONNX.
+
+```shell
+python mmdeploy/tools/deploy.py mmdeploy/configs/mmseg/segmentation_onnxruntime_dynamic.py \
+    configs/pp_mobileseg/pp_mobileseg_mobilenetv3_2x16_80k_ade20k_512x512_tiny.py \
+    pp_mobileseg_mobilenetv3_2xb16_3rdparty-tiny_512x512-ade20k-a351ebf5.pth \
+    ../../demo/demo.png \
+    --work-dir mmdeploy_model/mmseg/ort \
+    --show
+```
+
+**3. Run the demo.**
+
+```shell
+python inference_onnx.py ${ONNX_FILE_PATH} ${IMAGE_PATH} [${MODEL_INPUT_SIZE} ${DEVICE} ${OUTPUT_IMAGE_PATH}]
+```
+
+Example:
+
+```shell
+python inference_onnx.py mmdeploy_model/mmseg/ort/end2end.onnx ../../demo/demo.png
+```
+
+## Citation
+
+If you find our project useful in your research, please consider citing:
+
+```
+@misc{liu2021paddleseg,
+      title={PaddleSeg: A High-Efficient Development Toolkit for Image Segmentation},
+      author={Yi Liu and Lutao Chu and Guowei Chen and Zewu Wu and Zeyu Chen and Baohua Lai and Yuying Hao},
+      year={2021},
+      eprint={2101.06175},
+      archivePrefix={arXiv},
+      primaryClass={cs.CV}
+}
+
+@misc{paddleseg2019,
+      title={PaddleSeg, End-to-end image segmentation kit based on PaddlePaddle},
+      author={PaddlePaddle Contributors},
+      howpublished = {\url{https://github.com/PaddlePaddle/PaddleSeg}},
+      year={2019}
+}
+```
diff --git a/projects/pp_mobileseg/backbones/__init__.py b/projects/pp_mobileseg/backbones/__init__.py
new file mode 100644
index 0000000000..244b33d37a
--- /dev/null
+++ b/projects/pp_mobileseg/backbones/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .strideformer import StrideFormer
+
+__all__ = ['StrideFormer']
diff --git a/projects/pp_mobileseg/backbones/strideformer.py b/projects/pp_mobileseg/backbones/strideformer.py
new file mode 100644
index 0000000000..3f09be5225
--- /dev/null
+++ b/projects/pp_mobileseg/backbones/strideformer.py
@@ -0,0 +1,958 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import math
+import warnings
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from mmcv.cnn import ConvModule, build_activation_layer
+from mmcv.cnn.bricks.transformer import build_dropout
+from mmengine.logging import print_log
+from mmengine.model import BaseModule
+from mmengine.runner.checkpoint import CheckpointLoader, load_state_dict
+
+from mmseg.registry import MODELS
+
+
+@MODELS.register_module()
+class StrideFormer(BaseModule):
+    """The StrideFormer implementation based on torch.
+
+    The original article refers to: https://arxiv.org/abs/2304.05152
+
+    Args:
+        mobileV3_cfg (list): Each sublist describes the config of one
+            MobileNetV3 block.
+        channels (list): The input channels of each MobileNetV3 block.
+        embed_dims (list): The channels of the features fed into the sea
+            attention blocks.
+        key_dims (list, optional): The embedding dims for each head in
+            attention.
+        depths (list, optional): The depths of the attention blocks,
+            i.e., M and N.
+        num_heads (int, optional): The number of heads of the attention
+            blocks.
+        attn_ratios (int, optional): The expand ratio of V.
+        mlp_ratios (list, optional): The ratio of mlp blocks.
+        drop_path_rate (float, optional): The drop path rate in attention
+            blocks.
+        act_cfg (dict, optional): The activation layer of AAM:
+            Aggregate Attention Module.
+        inj_type (string, optional): The type of injection/AAM.
+        out_channels (int, optional): The output channels of the AAM.
+        dims (list, optional): The dimension of the fusion blocks.
+        out_feat_chs (list, optional): The input channels of the AAM.
+        stride_attention (bool, optional): Whether to use strided attention
+            in each attention layer.
+        pretrained (str, optional): The path of the pretrained model.
+ """ + + def __init__( + self, + mobileV3_cfg, + channels, + embed_dims, + key_dims=[16, 24], + depths=[2, 2], + num_heads=8, + attn_ratios=2, + mlp_ratios=[2, 4], + drop_path_rate=0.1, + act_cfg=dict(type='ReLU'), + inj_type='AAM', + out_channels=256, + dims=(128, 160), + out_feat_chs=None, + stride_attention=True, + pretrained=None, + init_cfg=None, + ): + super().__init__(init_cfg=init_cfg) + assert not (init_cfg and pretrained + ), 'init_cfg and pretrained cannot be set at the same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + elif pretrained is not None: + raise TypeError('pretrained must be a str or None') + + self.depths = depths + self.cfgs = mobileV3_cfg + self.dims = dims + for i in range(len(self.cfgs)): + smb = StackedMV3Block( + cfgs=self.cfgs[i], + stem=True if i == 0 else False, + in_channels=channels[i], + ) + setattr(self, f'smb{i + 1}', smb) + for i in range(len(depths)): + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, depths[i]) + ] + trans = BasicLayer( + block_num=depths[i], + embedding_dim=embed_dims[i], + key_dim=key_dims[i], + num_heads=num_heads, + mlp_ratio=mlp_ratios[i], + attn_ratio=attn_ratios, + drop=0, + attn_drop=0.0, + drop_path=dpr, + act_cfg=act_cfg, + stride_attention=stride_attention, + ) + setattr(self, f'trans{i + 1}', trans) + + self.inj_type = inj_type + if self.inj_type == 'AAM': + self.inj_module = InjectionMultiSumallmultiallsum( + in_channels=out_feat_chs, out_channels=out_channels) + self.feat_channels = [ + out_channels, + ] + elif self.inj_type == 'AAMSx8': + self.inj_module = InjectionMultiSumallmultiallsumSimpx8( + in_channels=out_feat_chs, out_channels=out_channels) + self.feat_channels = [ + out_channels, + ] + elif self.inj_type == 'origin': + for i in range(len(dims)): + fuse = FusionBlock( + out_feat_chs[0] if i == 0 else dims[i - 1], + out_feat_chs[i + 1], + embed_dim=dims[i], + act_cfg=None, + ) + setattr(self, f'fuse{i + 1}', fuse) + self.feat_channels = [ + dims[i], + ] + else: + raise NotImplementedError(self.inj_module + ' is not implemented') + + self.pretrained = pretrained + # self.init_weights() + + def init_weights(self): + if (isinstance(self.init_cfg, dict) + and self.init_cfg.get('type') == 'Pretrained'): + checkpoint = CheckpointLoader.load_checkpoint( + self.init_cfg['checkpoint'], logger=None, map_location='cpu') + + if 'state_dict' in checkpoint: + state_dict = checkpoint['state_dict'] + else: + state_dict = checkpoint + + if 'pos_embed' in state_dict.keys(): + if self.pos_embed.shape != state_dict['pos_embed'].shape: + print_log(msg=f'Resize the pos_embed shape from ' + f'{state_dict["pos_embed"].shape} to ' + f'{self.pos_embed.shape}') + h, w = self.img_size + pos_size = int( + math.sqrt(state_dict['pos_embed'].shape[1] - 1)) + state_dict['pos_embed'] = self.resize_pos_embed( + state_dict['pos_embed'], + (h // self.patch_size, w // self.patch_size), + (pos_size, pos_size), + self.interpolate_mode, + ) + + load_state_dict(self, state_dict, strict=False, logger=None) + + def forward(self, x): + x_hw = x.shape[2:] + outputs = [] + num_smb_stage = len(self.cfgs) + num_trans_stage = len(self.depths) + + for i in range(num_smb_stage): + smb = getattr(self, f'smb{i + 1}') + x = smb(x) + + # 1/8 shared feat + if i == 1: + outputs.append(x) + if num_trans_stage + i >= num_smb_stage: + trans = getattr( + self, f'trans{i + num_trans_stage - 
num_smb_stage + 1}') + x = trans(x) + outputs.append(x) + if self.inj_type == 'origin': + x_detail = outputs[0] + for i in range(len(self.dims)): + fuse = getattr(self, f'fuse{i + 1}') + + x_detail = fuse(x_detail, outputs[i + 1]) + output = x_detail + else: + output = self.inj_module(outputs) + + return [output, x_hw] + + +class StackedMV3Block(nn.Module): + """The MobileNetV3 block. + + Args: + cfgs (list): The MobileNetV3 config list of a stage. + stem (bool): Whether is the first stage or not. + in_channels (int, optional): The channels of input image. Default: 3. + scale: float=1.0. + The coefficient that controls the size of network parameters. + + Returns: + model: nn.Module. + A stage of specific MobileNetV3 model depends on args. + """ + + def __init__(self, + cfgs, + stem, + in_channels, + scale=1.0, + norm_cfg=dict(type='BN')): + super().__init__() + + self.scale = scale + self.stem = stem + + if self.stem: + self.conv = ConvModule( + in_channels=3, + out_channels=_make_divisible(in_channels * self.scale), + kernel_size=3, + stride=2, + padding=1, + groups=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=dict(type='HSwish'), + ) + + self.blocks = nn.ModuleList() + for i, (k, exp, c, se, act, s) in enumerate(cfgs): + self.blocks.append( + ResidualUnit( + in_channel=_make_divisible(in_channels * self.scale), + mid_channel=_make_divisible(self.scale * exp), + out_channel=_make_divisible(self.scale * c), + kernel_size=k, + stride=s, + use_se=se, + act=act, + dilation=1, + )) + in_channels = _make_divisible(self.scale * c) + + def forward(self, x): + if self.stem: + x = self.conv(x) + for i, block in enumerate(self.blocks): + x = block(x) + + return x + + +class ResidualUnit(nn.Module): + """The Residual module. + + Args: + in_channel (int, optional): The channels of input feature. + mid_channel (int, optional): The channels of middle process. + out_channel (int, optional): The channels of output feature. + kernel_size (int, optional): The size of the convolving kernel. + stride (int, optional): The stride size. + use_se (bool, optional): if to use the SEModule. + act (string, optional): activation layer. + dilation (int, optional): The dilation size. + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN', requires_grad=True). 
+ """ + + def __init__( + self, + in_channel, + mid_channel, + out_channel, + kernel_size, + stride, + use_se, + act=None, + dilation=1, + norm_cfg=dict(type='BN'), + ): + super().__init__() + self.if_shortcut = stride == 1 and in_channel == out_channel + self.if_se = use_se + self.expand_conv = ConvModule( + in_channels=in_channel, + out_channels=mid_channel, + kernel_size=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=dict(type=act) if act is not None else None, + ) + self.bottleneck_conv = ConvModule( + in_channels=mid_channel, + out_channels=mid_channel, + kernel_size=kernel_size, + stride=stride, + padding=int((kernel_size - 1) // 2) * dilation, + bias=False, + groups=mid_channel, + dilation=dilation, + norm_cfg=norm_cfg, + act_cfg=dict(type=act) if act is not None else None, + ) + if self.if_se: + self.mid_se = SEModule(mid_channel) + self.linear_conv = ConvModule( + in_channels=mid_channel, + out_channels=out_channel, + kernel_size=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None, + ) + + def forward(self, x): + identity = x + x = self.expand_conv(x) + x = self.bottleneck_conv(x) + if self.if_se: + x = self.mid_se(x) + x = self.linear_conv(x) + if self.if_shortcut: + x = torch.add(identity, x) + return x + + +class SEModule(nn.Module): + """SE Module. + + Args: + channel (int, optional): The channels of input feature. + reduction (int, optional): The channel reduction rate. + act_cfg (dict): Config dict for activation layer. + Default: dict(type='PReLU'). + """ + + def __init__(self, channel, reduction=4, act_cfg=dict(type='ReLU')): + super().__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.conv_act1 = ConvModule( + in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + norm_cfg=None, + act_cfg=act_cfg, + ) + + self.conv_act2 = ConvModule( + in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + norm_cfg=None, + act_cfg=dict(type='Hardsigmoid', slope=0.2, offset=0.5), + ) + + def forward(self, x): + identity = x + x = self.avg_pool(x) + x = self.conv_act1(x) + x = self.conv_act2(x) + return torch.mul(identity, x) + + +class BasicLayer(nn.Module): + """The transformer basic layer. + + Args: + block_num (int): the block nums of the transformer basic layer. + embedding_dim (int): The feature dimension. + key_dim (int): the key dim. + num_heads (int): Parallel attention heads. + mlp_ratio (float): the mlp ratio. + attn_ratio (float): the attention ratio. + drop (float): Probability of an element to be zeroed + after the feed forward layer.Default: 0.0. + attn_drop (float): The drop out rate for attention layer. + Default: 0.0. + drop_path (float): stochastic depth rate. Default 0.0. + act_cfg (dict): Config dict for activation layer. + Default: dict(type='PReLU'). + stride_attention (bool, optional): whether to stride attention in + each attention layer. 
+ """ + + def __init__( + self, + block_num, + embedding_dim, + key_dim, + num_heads, + mlp_ratio=4.0, + attn_ratio=2.0, + drop=0.0, + attn_drop=0.0, + drop_path=None, + act_cfg=None, + stride_attention=None, + ): + super().__init__() + self.block_num = block_num + + self.transformer_blocks = nn.ModuleList() + for i in range(self.block_num): + self.transformer_blocks.append( + Block( + embedding_dim, + key_dim=key_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + attn_ratio=attn_ratio, + drop=drop, + drop_path=drop_path[i] + if isinstance(drop_path, list) else drop_path, + act_cfg=act_cfg, + stride_attention=stride_attention, + )) + + def forward(self, x): + for i in range(self.block_num): + x = self.transformer_blocks[i](x) + return x + + +class Block(nn.Module): + """the block of the transformer basic layer. + + Args: + dim (int): The feature dimension. + key_dim (int): The key dimension. + num_heads (int): Parallel attention heads. + mlp_ratio (float): the mlp ratio. + attn_ratio (float): the attention ratio. + drop (float): Probability of an element to be zeroed + after the feed forward layer.Default: 0.0. + drop_path (float): stochastic depth rate. Default 0.0. + act_cfg (dict): Config dict for activation layer. + Default: dict(type='PReLU'). + stride_attention (bool, optional): whether to stride attention in + each attention layer. + """ + + def __init__( + self, + dim, + key_dim, + num_heads, + mlp_ratio=4.0, + attn_ratio=2.0, + drop=0.0, + drop_path=0.0, + act_cfg=None, + stride_attention=None, + ): + super().__init__() + self.dim = dim + self.num_heads = num_heads + self.mlp_ratio = mlp_ratio + self.attn = SeaAttention( + dim, + key_dim=key_dim, + num_heads=num_heads, + attn_ratio=attn_ratio, + act_cfg=act_cfg, + stride_attention=stride_attention, + ) + self.drop_path = ( + build_dropout(dict(type='DropPath', drop_prob=drop_path)) + if drop_path > 0.0 else nn.Identity()) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = MLP( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_cfg=act_cfg, + drop=drop, + ) + + def forward(self, x1): + x1 = x1 + self.drop_path(self.attn(x1)) + x1 = x1 + self.drop_path(self.mlp(x1)) + + return x1 + + +class SqueezeAxialPositionalEmbedding(nn.Module): + """the Squeeze Axial Positional Embedding. + + Args: + dim (int): The feature dimension. + shape (int): The patch size. + """ + + def __init__(self, dim, shape): + super().__init__() + self.pos_embed = nn.init.normal_( + nn.Parameter(torch.zeros(1, dim, shape))) + + def forward(self, x): + B, C, N = x.shape + x = x + F.interpolate( + self.pos_embed, size=(N, ), mode='linear', align_corners=False) + return x + + +class SeaAttention(nn.Module): + """The sea attention. + + Args: + dim (int): The feature dimension. + key_dim (int): The key dimension. + num_heads (int): number of attention heads. + attn_ratio (float): the attention ratio. + act_cfg (dict): Config dict for activation layer. + Default: dict(type='PReLU'). + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='LN') + stride_attention (bool, optional): whether to stride attention in + each attention layer. 
+ """ + + def __init__( + self, + dim, + key_dim, + num_heads, + attn_ratio=4.0, + act_cfg=None, + norm_cfg=dict(type='BN'), + stride_attention=False, + ): + + super().__init__() + self.num_heads = num_heads + self.scale = key_dim**-0.5 + self.nh_kd = nh_kd = key_dim * num_heads + self.d = int(attn_ratio * key_dim) + self.dh = int(attn_ratio * key_dim) * num_heads + self.attn_ratio = attn_ratio + + self.to_q = ConvModule( + dim, nh_kd, 1, bias=False, norm_cfg=norm_cfg, act_cfg=None) + self.to_k = ConvModule( + dim, nh_kd, 1, bias=False, norm_cfg=norm_cfg, act_cfg=None) + + self.to_v = ConvModule( + dim, self.dh, 1, bias=False, norm_cfg=norm_cfg, act_cfg=None) + self.stride_attention = stride_attention + if self.stride_attention: + self.stride_conv = ConvModule( + dim, + dim, + kernel_size=3, + stride=2, + padding=1, + bias=True, + groups=dim, + norm_cfg=norm_cfg, + act_cfg=None, + ) + + self.proj = ConvModule( + self.dh, + dim, + 1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + order=('act', 'conv', 'norm'), + ) + self.proj_encode_row = ConvModule( + self.dh, + self.dh, + 1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + order=('act', 'conv', 'norm'), + ) + self.pos_emb_rowq = SqueezeAxialPositionalEmbedding(nh_kd, 16) + self.pos_emb_rowk = SqueezeAxialPositionalEmbedding(nh_kd, 16) + self.proj_encode_column = ConvModule( + self.dh, + self.dh, + 1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + order=('act', 'conv', 'norm'), + ) + self.pos_emb_columnq = SqueezeAxialPositionalEmbedding(nh_kd, 16) + self.pos_emb_columnk = SqueezeAxialPositionalEmbedding(nh_kd, 16) + self.dwconv = ConvModule( + 2 * self.dh, + 2 * self.dh, + 3, + padding=1, + groups=2 * self.dh, + bias=False, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + ) + self.pwconv = ConvModule( + 2 * self.dh, dim, 1, bias=False, norm_cfg=norm_cfg, act_cfg=None) + self.sigmoid = build_activation_layer(dict(type='HSigmoid')) + + def forward(self, x): + B, C, H_ori, W_ori = x.shape + if self.stride_attention: + x = self.stride_conv(x) + B, C, H, W = x.shape + + q = self.to_q(x) # [B, nhead*dim, H, W] + k = self.to_k(x) + v = self.to_v(x) + + qkv = torch.cat([q, k, v], dim=1) + qkv = self.dwconv(qkv) + qkv = self.pwconv(qkv) + + qrow = (self.pos_emb_rowq(q.mean(-1)).reshape( + [B, self.num_heads, -1, H]).permute( + (0, 1, 3, 2))) # [B, nhead, H, dim] + krow = self.pos_emb_rowk(k.mean(-1)).reshape( + [B, self.num_heads, -1, H]) # [B, nhead, dim, H] + vrow = (v.mean(-1).reshape([B, self.num_heads, -1, + H]).permute([0, 1, 3, 2]) + ) # [B, nhead, H, dim*attn_ratio] + + attn_row = torch.matmul(qrow, krow) * self.scale # [B, nhead, H, H] + attn_row = nn.functional.softmax(attn_row, dim=-1) + + xx_row = torch.matmul(attn_row, vrow) # [B, nhead, H, dim*attn_ratio] + xx_row = self.proj_encode_row( + xx_row.permute([0, 1, 3, 2]).reshape([B, self.dh, H, 1])) + + # squeeze column + qcolumn = ( + self.pos_emb_columnq(q.mean(-2)).reshape( + [B, self.num_heads, -1, W]).permute([0, 1, 3, 2])) + kcolumn = self.pos_emb_columnk(k.mean(-2)).reshape( + [B, self.num_heads, -1, W]) + vcolumn = ( + torch.mean(v, -2).reshape([B, self.num_heads, -1, + W]).permute([0, 1, 3, 2])) + + attn_column = torch.matmul(qcolumn, kcolumn) * self.scale + attn_column = nn.functional.softmax(attn_column, dim=-1) + + xx_column = torch.matmul(attn_column, vcolumn) # B nH W C + xx_column = self.proj_encode_column( + xx_column.permute([0, 1, 3, 2]).reshape([B, self.dh, 1, W])) + + xx = torch.add(xx_row, xx_column) # [B, self.dh, H, W] + xx = torch.add(v, xx) + + 
xx = self.proj(xx) + xx = self.sigmoid(xx) * qkv + if self.stride_attention: + xx = F.interpolate(xx, size=(H_ori, W_ori), mode='bilinear') + + return xx + + +class MLP(nn.Module): + """the Multilayer Perceptron. + + Args: + in_features (int): the input feature. + hidden_features (int): the hidden feature. + out_features (int): the output feature. + act_cfg (dict): Config dict for activation layer. + Default: dict(type='PReLU'). + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN') + drop (float): Probability of an element to be zeroed. + Default 0.0 + """ + + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_cfg=None, + norm_cfg=dict(type='BN'), + drop=0.0, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = ConvModule( + in_features, + hidden_features, + kernel_size=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None, + ) + self.dwconv = ConvModule( + hidden_features, + hidden_features, + kernel_size=3, + padding=1, + groups=hidden_features, + norm_cfg=None, + act_cfg=act_cfg, + ) + + self.fc2 = ConvModule( + hidden_features, + out_features, + 1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None, + ) + self.drop = build_dropout(dict(type='Dropout', drop_prob=drop)) + + def forward(self, x): + x = self.fc1(x) + x = self.dwconv(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class FusionBlock(nn.Module): + """The feature fusion block. + + Args: + in_channel (int): the input channel. + out_channel (int): the output channel. + embed_dim (int): embedding dimension. + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN') + """ + + def __init__( + self, + in_channel, + out_channel, + embed_dim, + norm_cfg=dict(type='BN'), + act_cfg=dict(type='ReLU'), + ) -> None: + super().__init__() + self.local_embedding = ConvModule( + in_channels=in_channel, + out_channels=embed_dim, + kernel_size=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None, + ) + + self.global_act = ConvModule( + in_channels=out_channel, + out_channels=embed_dim, + kernel_size=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=act_cfg if act_cfg is not None else None, + ) + + def forward(self, x_l, x_g): + """ + x_g: global features + x_l: local features + """ + B, C, H, W = x_l.shape + + local_feat = self.local_embedding(x_l) + global_act = self.global_act(x_g) + sig_act = F.interpolate( + global_act, size=(H, W), mode='bilinear', align_corners=False) + + out = local_feat * sig_act + + return out + + +class InjectionMultiSumallmultiallsum(nn.Module): + """the Aggregate Attention Module. + + Args: + in_channels (tuple): the input channel. + out_channels (int): the output channel. + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + norm_cfg (dict): Config dict for normalization layer. 
+ Default: dict(type='BN') + """ + + def __init__( + self, + in_channels=(64, 128, 256, 384), + out_channels=256, + act_cfg=dict(type='Sigmoid'), + norm_cfg=dict(type='BN'), + ): + super().__init__() + self.embedding_list = nn.ModuleList() + self.act_embedding_list = nn.ModuleList() + self.act_list = nn.ModuleList() + for i in range(len(in_channels)): + self.embedding_list.append( + ConvModule( + in_channels=in_channels[i], + out_channels=out_channels, + kernel_size=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None, + )) + self.act_embedding_list.append( + ConvModule( + in_channels=in_channels[i], + out_channels=out_channels, + kernel_size=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + )) + + def forward(self, inputs): # x_x8, x_x16, x_x32, x_x64 + low_feat1 = F.interpolate(inputs[0], scale_factor=0.5, mode='bilinear') + low_feat1_act = self.act_embedding_list[0](low_feat1) + low_feat1 = self.embedding_list[0](low_feat1) + + low_feat2 = F.interpolate( + inputs[1], size=low_feat1.shape[-2:], mode='bilinear') + low_feat2_act = self.act_embedding_list[1](low_feat2) # x16 + low_feat2 = self.embedding_list[1](low_feat2) + + high_feat_act = F.interpolate( + self.act_embedding_list[2](inputs[2]), + size=low_feat2.shape[2:], + mode='bilinear', + ) + high_feat = F.interpolate( + self.embedding_list[2](inputs[2]), + size=low_feat2.shape[2:], + mode='bilinear') + + res = ( + low_feat1_act * low_feat2_act * high_feat_act * + (low_feat1 + low_feat2) + high_feat) + + return res + + +class InjectionMultiSumallmultiallsumSimpx8(nn.Module): + """the Aggregate Attention Module. + + Args: + in_channels (tuple): the input channel. + out_channels (int): the output channel. + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN') + """ + + def __init__( + self, + in_channels=(64, 128, 256, 384), + out_channels=256, + act_cfg=dict(type='Sigmoid'), + norm_cfg=dict(type='BN'), + ): + super().__init__() + self.embedding_list = nn.ModuleList() + self.act_embedding_list = nn.ModuleList() + self.act_list = nn.ModuleList() + for i in range(len(in_channels)): + if i != 1: + self.embedding_list.append( + ConvModule( + in_channels=in_channels[i], + out_channels=out_channels, + kernel_size=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=None, + )) + if i != 0: + self.act_embedding_list.append( + ConvModule( + in_channels=in_channels[i], + out_channels=out_channels, + kernel_size=1, + bias=False, + norm_cfg=norm_cfg, + act_cfg=act_cfg, + )) + + def forward(self, inputs): + # x_x8, x_x16, x_x32 + low_feat1 = self.embedding_list[0](inputs[0]) + + low_feat2 = F.interpolate( + inputs[1], size=low_feat1.shape[-2:], mode='bilinear') + low_feat2_act = self.act_embedding_list[0](low_feat2) + + high_feat_act = F.interpolate( + self.act_embedding_list[1](inputs[2]), + size=low_feat2.shape[2:], + mode='bilinear', + ) + high_feat = F.interpolate( + self.embedding_list[1](inputs[2]), + size=low_feat2.shape[2:], + mode='bilinear') + + res = low_feat2_act * high_feat_act * low_feat1 + high_feat + + return res + + +def _make_divisible(v, divisor=8, min_value=None): + if min_value is None: + min_value = divisor + new_v = max(min_value, int(v + divisor / 2) // divisor * divisor) + if new_v < 0.9 * v: + new_v += divisor + return new_v + + +@MODELS.register_module() +class Hardsigmoid(nn.Module): + """the hardsigmoid activation. + + Args: + slope (float, optional): The slope of hardsigmoid function. + Default is 0.1666667. 
+ offset (float, optional): The offset of hardsigmoid function. + Default is 0.5. + inplace (bool): can optionally do the operation in-place. + Default: ``False`` + """ + + def __init__(self, slope=0.1666667, offset=0.5, inplace=False): + super().__init__() + self.slope = slope + self.offset = offset + + def forward(self, x): + return (x * self.slope + self.offset).clamp(0, 1) diff --git a/projects/pp_mobileseg/configs/_base_/datasets/ade20k.py b/projects/pp_mobileseg/configs/_base_/datasets/ade20k.py new file mode 100644 index 0000000000..48340d11ee --- /dev/null +++ b/projects/pp_mobileseg/configs/_base_/datasets/ade20k.py @@ -0,0 +1,68 @@ +# dataset settings +dataset_type = 'ADE20KDataset' +data_root = 'data/ade/ADEChallengeData2016' +crop_size = (512, 512) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations', reduce_zero_label=True), + dict( + type='RandomResize', + scale=(2048, 512), + ratio_range=(0.5, 2.0), + keep_ratio=True), + dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=0.75), + dict(type='RandomFlip', prob=0.5), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs') +] +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(2048, 512), keep_ratio=True), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type='LoadAnnotations', reduce_zero_label=True), + dict(type='PackSegInputs') +] +img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75] +tta_pipeline = [ + dict(type='LoadImageFromFile', backend_args=None), + dict( + type='TestTimeAug', + transforms=[ + [ + dict(type='Resize', scale_factor=r, keep_ratio=True) + for r in img_ratios + ], + [ + dict(type='RandomFlip', prob=0., direction='horizontal'), + dict(type='RandomFlip', prob=1., direction='horizontal') + ], [dict(type='LoadAnnotations')], [dict(type='PackSegInputs')] + ]) +] +train_dataloader = dict( + batch_size=4, + num_workers=4, + persistent_workers=True, + sampler=dict(type='InfiniteSampler', shuffle=True), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='images/training', seg_map_path='annotations/training'), + pipeline=train_pipeline)) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + data_prefix=dict( + img_path='images/validation', + seg_map_path='annotations/validation'), + pipeline=test_pipeline)) +test_dataloader = val_dataloader + +val_evaluator = dict(type='IoUMetric', iou_metrics=['mIoU']) +test_evaluator = val_evaluator diff --git a/projects/pp_mobileseg/configs/_base_/default_runtime.py b/projects/pp_mobileseg/configs/_base_/default_runtime.py new file mode 100644 index 0000000000..272b4d2467 --- /dev/null +++ b/projects/pp_mobileseg/configs/_base_/default_runtime.py @@ -0,0 +1,15 @@ +default_scope = 'mmseg' +env_cfg = dict( + cudnn_benchmark=True, + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + dist_cfg=dict(backend='nccl'), +) +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='SegLocalVisualizer', vis_backends=vis_backends, name='visualizer') +log_processor = dict(by_epoch=False) +log_level = 'INFO' +load_from = None +resume = False + +tta_model = dict(type='SegTTAModel') diff --git a/projects/pp_mobileseg/configs/_base_/models/pp_mobile.py b/projects/pp_mobileseg/configs/_base_/models/pp_mobile.py new file mode 100644 index 
0000000000..0c7695636f --- /dev/null +++ b/projects/pp_mobileseg/configs/_base_/models/pp_mobile.py @@ -0,0 +1,47 @@ +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) +data_preprocessor = dict( + type='SegDataPreProcessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255) + +model = dict( + type='EncoderDecoder', + data_preprocessor=data_preprocessor, + # pretrained='open-mmlab://resnet50_v1c', + backbone=dict( + type='StrideFormer', + mobileV3_cfg=[ + # k t c, s + [[3, 16, 16, True, 'ReLU', 1], [3, 64, 32, False, 'ReLU', 2], + [3, 96, 32, False, 'ReLU', 1]], # cfg1 + [[5, 128, 64, True, 'HSwish', 2], [5, 240, 64, True, 'HSwish', + 1]], # cfg2 + [[5, 384, 128, True, 'HSwish', 2], + [5, 384, 128, True, 'HSwish', 1]], # cfg3 + [[5, 768, 192, True, 'HSwish', 2], + [5, 768, 192, True, 'HSwish', 1]], # cfg4 + ], + channels=[16, 32, 64, 128, 192], + depths=[3, 3], + embed_dims=[128, 192], + num_heads=8, + inj_type='AAMSx8', + out_feat_chs=[64, 128, 192], + act_cfg=dict(type='ReLU6'), + ), + decode_head=dict( + type='PPMobileSegHead', + num_classes=150, + in_channels=256, + dropout_ratio=0.1, + use_dw=True, + act_cfg=dict(type='ReLU'), + align_corners=False), + + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/projects/pp_mobileseg/configs/_base_/schedules/schedule_80k.py b/projects/pp_mobileseg/configs/_base_/schedules/schedule_80k.py new file mode 100644 index 0000000000..0dcd6c4d1b --- /dev/null +++ b/projects/pp_mobileseg/configs/_base_/schedules/schedule_80k.py @@ -0,0 +1,24 @@ +# optimizer +optimizer = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=0.0005) +optim_wrapper = dict(type='OptimWrapper', optimizer=optimizer, clip_grad=None) +# learning policy +param_scheduler = [ + dict( + type='PolyLR', + eta_min=1e-4, + power=0.9, + begin=0, + end=80000, + by_epoch=False) +] +# training schedule for 80k +train_cfg = dict(type='IterBasedTrainLoop', max_iters=80000, val_interval=8000) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=8000), + sampler_seed=dict(type='DistSamplerSeedHook'), + visualization=dict(type='SegVisualizationHook')) diff --git a/projects/pp_mobileseg/configs/pp_mobileseg/pp_mobileseg_mobilenetv3_2x16_80k_ade20k_512x512_base.py b/projects/pp_mobileseg/configs/pp_mobileseg/pp_mobileseg_mobilenetv3_2x16_80k_ade20k_512x512_base.py new file mode 100644 index 0000000000..4b68a927e2 --- /dev/null +++ b/projects/pp_mobileseg/configs/pp_mobileseg/pp_mobileseg_mobilenetv3_2x16_80k_ade20k_512x512_base.py @@ -0,0 +1,18 @@ +_base_ = [ + '../_base_/models/pp_mobile.py', '../_base_/datasets/ade20k.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_80k.py' +] +# the custom import path is determined by your workspace path (i.e., where you run the command from) # noqa +custom_imports = dict( + imports=[ + 'projects.pp_mobileseg.backbones', 'projects.pp_mobileseg.decode_head' + ], + allow_failed_imports=False) +checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pp_mobileseg/pp_mobileseg_mobilenetv3_3rdparty-base-ed0be681.pth' # noqa +crop_size = (512, 512) +data_preprocessor = dict(size=crop_size, test_cfg=dict(size_divisor=32)) +norm_cfg = 
dict(type='SyncBN', requires_grad=True) +model = dict( + data_preprocessor=data_preprocessor, + backbone=dict(init_cfg=dict(type='Pretrained', checkpoint=checkpoint)), + decode_head=dict(num_classes=150, upsample='intepolate')) diff --git a/projects/pp_mobileseg/configs/pp_mobileseg/pp_mobileseg_mobilenetv3_2x16_80k_ade20k_512x512_tiny.py b/projects/pp_mobileseg/configs/pp_mobileseg/pp_mobileseg_mobilenetv3_2x16_80k_ade20k_512x512_tiny.py new file mode 100644 index 0000000000..b78869e517 --- /dev/null +++ b/projects/pp_mobileseg/configs/pp_mobileseg/pp_mobileseg_mobilenetv3_2x16_80k_ade20k_512x512_tiny.py @@ -0,0 +1,45 @@ +_base_ = [ + '../_base_/models/pp_mobile.py', '../_base_/datasets/ade20k.py', + '../_base_/default_runtime.py', '../_base_/schedules/schedule_80k.py' +] +# the custom import path is determined by your workspace path (i.e., where you run the command from) # noqa +custom_imports = dict( + imports=[ + 'projects.pp_mobileseg.backbones', 'projects.pp_mobileseg.decode_head' + ], + allow_failed_imports=False) +checkpoint = 'https://download.openmmlab.com/mmsegmentation/v0.5/pp_mobileseg/pp_mobileseg_mobilenetv3_3rdparty-tiny-e4b35e96.pth' # noqa +crop_size = (512, 512) +data_preprocessor = dict(size=crop_size, test_cfg=dict(size_divisor=32)) +norm_cfg = dict(type='SyncBN', requires_grad=True) +model = dict( + data_preprocessor=data_preprocessor, + backbone=dict( + init_cfg=dict(type='Pretrained', checkpoint=checkpoint), + type='StrideFormer', + mobileV3_cfg=[ + # k t c, s + [[3, 16, 16, True, 'ReLU', 1], [3, 64, 32, False, 'ReLU', 2], + [3, 48, 24, False, 'ReLU', 1]], # cfg1 + [[5, 96, 32, True, 'HSwish', 2], [5, 96, 32, True, 'HSwish', + 1]], # cfg2 + [[5, 160, 64, True, 'HSwish', 2], [5, 160, 64, True, 'HSwish', + 1]], # cfg3 + [[3, 384, 128, True, 'HSwish', 2], + [3, 384, 128, True, 'HSwish', 1]], # cfg4 + ], + channels=[16, 24, 32, 64, 128], + depths=[2, 2], + embed_dims=[64, 128], + num_heads=4, + inj_type='AAM', + out_feat_chs=[32, 64, 128], + act_cfg=dict(type='ReLU6'), + ), + decode_head=dict( + num_classes=150, + in_channels=256, + use_dw=True, + act_cfg=dict(type='ReLU'), + upsample='intepolate'), +) diff --git a/projects/pp_mobileseg/decode_head/__init__.py b/projects/pp_mobileseg/decode_head/__init__.py new file mode 100644 index 0000000000..6f71b784e1 --- /dev/null +++ b/projects/pp_mobileseg/decode_head/__init__.py @@ -0,0 +1,6 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from .pp_mobileseg_head import PPMobileSegHead + +__all__ = [ + 'PPMobileSegHead', +] diff --git a/projects/pp_mobileseg/decode_head/pp_mobileseg_head.py b/projects/pp_mobileseg/decode_head/pp_mobileseg_head.py new file mode 100644 index 0000000000..243f026372 --- /dev/null +++ b/projects/pp_mobileseg/decode_head/pp_mobileseg_head.py @@ -0,0 +1,94 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import List + +import torch +import torch.nn as nn +import torch.nn.functional as F +from mmcv.cnn import ConvModule, build_conv_layer +from torch import Tensor + +from mmseg.registry import MODELS + + +@MODELS.register_module() +class PPMobileSegHead(nn.Module): + """the segmentation head. + + Args: + num_classes (int): the classes num. + in_channels (int): the input channels. + use_dw (bool): if to use deepwith convolution. + dropout_ratio (float): Probability of an element to be zeroed. + Default 0.0。 + align_corners (bool, optional): Geometrically, we consider the pixels + of the input and output as squares rather than points. + upsample (str): the upsample method. 
+ out_channels (int): the output channel. + conv_cfg (dict): Config dict for convolution layer. + act_cfg (dict): Config dict for activation layer. + Default: dict(type='ReLU'). + norm_cfg (dict): Config dict for normalization layer. + Default: dict(type='BN'). + """ + + def __init__(self, + num_classes, + in_channels, + use_dw=True, + dropout_ratio=0.1, + align_corners=False, + upsample='intepolate', + out_channels=None, + conv_cfg=dict(type='Conv'), + act_cfg=dict(type='ReLU'), + norm_cfg=dict(type='BN')): + + super().__init__() + self.align_corners = align_corners + self.last_channels = in_channels + self.upsample = upsample + self.num_classes = num_classes + self.out_channels = out_channels + self.linear_fuse = ConvModule( + in_channels=self.last_channels, + out_channels=self.last_channels, + kernel_size=1, + bias=False, + groups=self.last_channels if use_dw else 1, + norm_cfg=norm_cfg, + act_cfg=act_cfg) + self.dropout = nn.Dropout2d(dropout_ratio) + self.conv_seg = build_conv_layer( + conv_cfg, self.last_channels, self.num_classes, kernel_size=1) + + def forward(self, x): + x, x_hw = x[0], x[1] + x = self.linear_fuse(x) + x = self.dropout(x) + x = self.conv_seg(x) + if self.upsample == 'intepolate' or self.training or \ + self.num_classes < 30: + x = F.interpolate( + x, x_hw, mode='bilinear', align_corners=self.align_corners) + elif self.upsample == 'vim': + labelset = torch.unique(torch.argmax(x, 1)) + x = torch.gather(x, 1, labelset) + x = F.interpolate( + x, x_hw, mode='bilinear', align_corners=self.align_corners) + + pred = torch.argmax(x, 1) + pred_retrieve = torch.zeros(pred.shape, dtype=torch.int32) + for i, val in enumerate(labelset): + pred_retrieve[pred == i] = labelset[i].cast('int32') + + x = pred_retrieve + else: + raise NotImplementedError(self.upsample, ' is not implemented') + + return [x] + + def predict(self, inputs, batch_img_metas: List[dict], test_cfg, + **kwargs) -> List[Tensor]: + """Forward function for testing, only ``pam_cam`` is used.""" + seg_logits = self.forward(inputs)[0] + return seg_logits diff --git a/projects/pp_mobileseg/inference_onnx.py b/projects/pp_mobileseg/inference_onnx.py new file mode 100644 index 0000000000..139d1b1324 --- /dev/null +++ b/projects/pp_mobileseg/inference_onnx.py @@ -0,0 +1,203 @@ +import argparse +import time +from typing import List, Tuple + +import cv2 +import loguru +import numpy as np +import onnxruntime as ort + +logger = loguru.logger + + +def parse_args(): + parser = argparse.ArgumentParser( + description='PP_Mobileseg ONNX inference demo.') + parser.add_argument('onnx_file', help='ONNX file path') + parser.add_argument('image_file', help='Input image file path') + parser.add_argument( + '--input-size', + type=int, + nargs='+', + default=[512, 512], + help='input image size') + parser.add_argument( + '--device', help='device type for inference', default='cpu') + parser.add_argument( + '--save-path', + help='path to save the output image', + default='output.jpg') + args = parser.parse_args() + return args + + +def preprocess( + img: np.ndarray, input_size: Tuple[int, int] = (512, 512) +) -> Tuple[np.ndarray, np.ndarray]: + """Preprocess image for inference.""" + img_shape = img.shape[:2] + # Resize + resized_img = cv2.resize(img, input_size) + + # Normalize + mean = np.array([123.575, 116.28, 103.53], dtype=np.float32) + std = np.array([58.395, 57.12, 57.375], dtype=np.float32) + resized_img = (resized_img - mean) / std + + return resized_img, img_shape + + +def build_session(onnx_file: str, device: str = 'cpu') -> 
ort.InferenceSession: + """Build onnxruntime session. + + Args: + onnx_file (str): ONNX file path. + device (str): Device type for inference. + + Returns: + sess (ort.InferenceSession): ONNXRuntime session. + """ + providers = ['CPUExecutionProvider' + ] if device == 'cpu' else ['CUDAExecutionProvider'] + sess = ort.InferenceSession(path_or_bytes=onnx_file, providers=providers) + + return sess + + +def inference(sess: ort.InferenceSession, img: np.ndarray) -> np.ndarray: + """Inference RTMPose model. + + Args: + sess (ort.InferenceSession): ONNXRuntime session. + img (np.ndarray): Input image in shape. + + Returns: + outputs (np.ndarray): Output of RTMPose model. + """ + # build input + input_img = [img.transpose(2, 0, 1).astype(np.float32)] + + # build output + sess_input = {sess.get_inputs()[0].name: input_img} + sess_output = [] + for out in sess.get_outputs(): + sess_output.append(out.name) + + # inference + outputs = sess.run(output_names=sess_output, input_feed=sess_input) + + return outputs + + +def postprocess(outputs: List[np.ndarray], + origin_shape: Tuple[int, int]) -> np.ndarray: + """Postprocess outputs of PP_Mobileseg model. + + Args: + outputs (List[np.ndarray]): Outputs of PP_Mobileseg model. + origin_shape (Tuple[int, int]): Input size of PP_Mobileseg model. + + Returns: + seg_map (np.ndarray): Segmentation map. + """ + seg_map = outputs[0][0][0] + seg_map = cv2.resize(seg_map.astype(np.float32), origin_shape) + return seg_map + + +def visualize(img: np.ndarray, + seg_map: np.ndarray, + filename: str = 'output.jpg', + opacity: float = 0.8) -> np.ndarray: + assert 0.0 <= opacity <= 1.0, 'opacity should be in range [0, 1]' + palette = np.array(PALETTE) + color_seg = np.zeros((seg_map.shape[0], seg_map.shape[1], 3), + dtype=np.uint8) + for label, color in enumerate(palette): + color_seg[seg_map == label, :] = color + # convert to BGR + color_seg = color_seg[..., ::-1] + + img = img * (1 - opacity) + color_seg * opacity + cv2.imwrite(filename, img) + + return img + + +def main(): + args = parse_args() + logger.info('Start running model inference...') + + # read image from file + logger.info(f'1. Read image from file {args.image_file}...') + img = cv2.imread(args.image_file) + + # build onnx model + logger.info(f'2. Build onnx model from {args.onnx_file}...') + sess = build_session(args.onnx_file, args.device) + + # preprocess + logger.info('3. Preprocess image...') + model_input_size = tuple(args.input_size) + assert len(model_input_size) == 2 + resized_img, origin_shape = preprocess(img, model_input_size) + + # inference + logger.info('4. Inference...') + start = time.time() + outputs = inference(sess, resized_img) + logger.info(f'Inference time: {time.time() - start:.4f}s') + + # postprocess + logger.info('5. Postprocess...') + h, w = origin_shape + seg_map = postprocess(outputs, (w, h)) + + # visualize + logger.info('6. 
Visualize...') + visualize(img, seg_map, args.save_path) + + logger.info('Done...') + + +PALETTE = [[120, 120, 120], [180, 120, 120], [6, 230, 230], [80, 50, 50], + [4, 200, 3], [120, 120, 80], [140, 140, 140], [204, 5, 255], + [230, 230, 230], [4, 250, 7], [224, 5, 255], [235, 255, 7], + [150, 5, 61], [120, 120, 70], [8, 255, 51], [255, 6, 82], + [143, 255, 140], [204, 255, 4], [255, 51, 7], [204, 70, 3], + [0, 102, 200], [61, 230, 250], [255, 6, 51], [11, 102, 255], + [255, 7, 71], [255, 9, 224], [9, 7, 230], [220, 220, 220], + [255, 9, 92], [112, 9, 255], [8, 255, 214], [7, 255, 224], + [255, 184, 6], [10, 255, 71], [255, 41, 10], [7, 255, 255], + [224, 255, 8], [102, 8, 255], [255, 61, 6], [255, 194, 7], + [255, 122, 8], [0, 255, 20], [255, 8, 41], [255, 5, 153], + [6, 51, 255], [235, 12, 255], [160, 150, 20], [0, 163, 255], + [140, 140, 140], [250, 10, 15], [20, 255, 0], [31, 255, 0], + [255, 31, 0], [255, 224, 0], [153, 255, 0], [0, 0, 255], + [255, 71, 0], [0, 235, 255], [0, 173, 255], [31, 0, 255], + [11, 200, 200], [255, 82, 0], [0, 255, 245], [0, 61, 255], + [0, 255, 112], [0, 255, 133], [255, 0, 0], [255, 163, 0], + [255, 102, 0], [194, 255, 0], [0, 143, 255], [51, 255, 0], + [0, 82, 255], [0, 255, 41], [0, 255, 173], [10, 0, 255], + [173, 255, 0], [0, 255, 153], [255, 92, 0], [255, 0, 255], + [255, 0, 245], [255, 0, 102], [255, 173, 0], [255, 0, 20], + [255, 184, 184], [0, 31, 255], [0, 255, 61], [0, 71, 255], + [255, 0, 204], [0, 255, 194], [0, 255, 82], [0, 10, 255], + [0, 112, 255], [51, 0, 255], [0, 194, 255], [0, 122, 255], + [0, 255, 163], [255, 153, 0], [0, 255, 10], [255, 112, 0], + [143, 255, 0], [82, 0, 255], [163, 255, 0], [255, 235, 0], + [8, 184, 170], [133, 0, 255], [0, 255, 92], [184, 0, 255], + [255, 0, 31], [0, 184, 255], [0, 214, 255], [255, 0, 112], + [92, 255, 0], [0, 224, 255], [112, 224, 255], [70, 184, 160], + [163, 0, 255], [153, 0, 255], [71, 255, 0], [255, 0, 163], + [255, 204, 0], [255, 0, 143], [0, 255, 235], [133, 255, 0], + [255, 0, 235], [245, 0, 255], [255, 0, 122], [255, 245, 0], + [10, 190, 212], [214, 255, 0], [0, 204, 255], [20, 0, 255], + [255, 255, 0], [0, 153, 255], [0, 41, 255], [0, 255, 204], + [41, 0, 255], [41, 255, 0], [173, 0, 255], [0, 245, 255], + [71, 0, 255], [122, 0, 255], [0, 255, 184], [0, 92, 255], + [184, 255, 0], [0, 133, 255], [255, 214, 0], [25, 194, 194], + [102, 255, 0], [92, 0, 255]] + +if __name__ == '__main__': + main() diff --git a/projects/sam_inference_demo/README.md b/projects/sam_inference_demo/README.md new file mode 100644 index 0000000000..f8077b8729 --- /dev/null +++ b/projects/sam_inference_demo/README.md @@ -0,0 +1,40 @@ +# Introducing the Segment Anything Model (SAM) Inference Demo! + +Welcome to the Segment Anything (SA) Inference Demo, a user-friendly implementation based on the original Segment Anything project. Our demo allows you to experience the power and versatility of the Segment Anything Model (SAM) through an easy-to-use API. + +With this inference demo, you can explore the capabilities of the Segment Anything Model and witness its effectiveness in various tasks and image distributions. For more information on the original project, dataset, and model, please visit the official website at https://segment-anything.com. + +### Prerequisites + +- Python 3.10 +- PyTorch 1.13 +- MMEngine >= v0.7.2 +- MMCV >= v2.0.0 + +### Installation + +We assume that you have already installed PyTorch. If not, please follow the instructions on the [PyTorch website](https://pytorch.org/). + +**1. 
Install MMEngine & MMCV** + +```shell +pip install openmim +mim install mmengine +mim install 'mmcv>=2.0.0' +``` + +**2. Install MMPretrain** + +```shell +pip install git+https://github.com/open-mmlab/mmpretrain.git@dev +``` + +**3. Install MMSegmentation** + +```shell +pip install mmsegmentation +``` + +### Usage + +Open the `sam_image_demo.ipynb` notebook and follow the instructions to run the demo. diff --git a/projects/sam_inference_demo/sam/__init__.py b/projects/sam_inference_demo/sam/__init__.py new file mode 100644 index 0000000000..82b6b78469 --- /dev/null +++ b/projects/sam_inference_demo/sam/__init__.py @@ -0,0 +1,2 @@ +from .modeling import * # noqa +from .utils import * # noqa diff --git a/projects/sam_inference_demo/sam/modeling/__init__.py b/projects/sam_inference_demo/sam/modeling/__init__.py new file mode 100644 index 0000000000..9892a6b085 --- /dev/null +++ b/projects/sam_inference_demo/sam/modeling/__init__.py @@ -0,0 +1,12 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from .mask_decoder import MaskDecoder +from .prompt_encoder import PromptEncoder +from .sam import SAM +from .transformer import TwoWayTransformer + +__all__ = ['SAM', 'MaskDecoder', 'PromptEncoder', 'TwoWayTransformer'] diff --git a/projects/sam_inference_demo/sam/modeling/common.py b/projects/sam_inference_demo/sam/modeling/common.py new file mode 100644 index 0000000000..d289276112 --- /dev/null +++ b/projects/sam_inference_demo/sam/modeling/common.py @@ -0,0 +1,45 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Type + +import torch +import torch.nn as nn + + +class MLPBlock(nn.Module): + + def __init__( + self, + embedding_dim: int, + mlp_dim: int, + act: Type[nn.Module] = nn.GELU, + ) -> None: + super().__init__() + self.lin1 = nn.Linear(embedding_dim, mlp_dim) + self.lin2 = nn.Linear(mlp_dim, embedding_dim) + self.act = act() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.lin2(self.act(self.lin1(x))) + + +# From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa +# Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa +class LayerNorm2d(nn.Module): + + def __init__(self, num_channels: int, eps: float = 1e-6) -> None: + super().__init__() + self.weight = nn.Parameter(torch.ones(num_channels)) + self.bias = nn.Parameter(torch.zeros(num_channels)) + self.eps = eps + + def forward(self, x: torch.Tensor) -> torch.Tensor: + u = x.mean(1, keepdim=True) + s = (x - u).pow(2).mean(1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.eps) + x = self.weight[:, None, None] * x + self.bias[:, None, None] + return x diff --git a/projects/sam_inference_demo/sam/modeling/mask_decoder.py b/projects/sam_inference_demo/sam/modeling/mask_decoder.py new file mode 100644 index 0000000000..9ad616b589 --- /dev/null +++ b/projects/sam_inference_demo/sam/modeling/mask_decoder.py @@ -0,0 +1,196 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+
+# Borrowed from https://github.com/facebookresearch/segment-anything
+
+from typing import List, Tuple
+
+import torch
+from torch import Tensor, nn
+from torch.nn import functional as F
+
+from mmseg.registry import MODELS
+from .common import LayerNorm2d
+
+
+@MODELS.register_module()
+class MaskDecoder(nn.Module):
+
+    def __init__(
+        self,
+        *,
+        transformer_dim: int,
+        transformer: dict,
+        num_multimask_outputs: int = 3,
+        act_cfg: dict = dict(type='GELU'),
+        iou_head_depth: int = 3,
+        iou_head_hidden_dim: int = 256,
+    ) -> None:
+        """Predicts masks given an image and prompt embeddings, using a
+        transformer architecture.
+
+        Borrowed from https://github.com/facebookresearch/segment-anything
+
+        Arguments:
+          transformer_dim (int): the channel dimension of the transformer
+          transformer (dict): config of the transformer used to predict masks
+          num_multimask_outputs (int): the number of masks to predict
+            when disambiguating masks
+          act_cfg (dict): config of the activation to use when
+            upscaling masks
+          iou_head_depth (int): the depth of the MLP used to predict
+            mask quality
+          iou_head_hidden_dim (int): the hidden dimension of the MLP
+            used to predict mask quality
+        """
+        super().__init__()
+        self.transformer_dim = transformer_dim
+        self.transformer = MODELS.build(transformer)
+
+        self.num_multimask_outputs = num_multimask_outputs
+
+        self.iou_token = nn.Embedding(1, transformer_dim)
+        self.num_mask_tokens = num_multimask_outputs + 1
+        self.mask_tokens = nn.Embedding(self.num_mask_tokens, transformer_dim)
+
+        activation = MODELS.build(act_cfg)
+        self.output_upscaling = nn.Sequential(
+            nn.ConvTranspose2d(
+                transformer_dim, transformer_dim // 4, kernel_size=2,
+                stride=2),
+            LayerNorm2d(transformer_dim // 4),
+            activation,
+            nn.ConvTranspose2d(
+                transformer_dim // 4,
+                transformer_dim // 8,
+                kernel_size=2,
+                stride=2),
+            activation,
+        )
+        self.output_hypernetworks_mlps = nn.ModuleList([
+            MLP(transformer_dim, transformer_dim, transformer_dim // 8, 3)
+            for i in range(self.num_mask_tokens)
+        ])
+
+        self.iou_prediction_head = MLP(transformer_dim, iou_head_hidden_dim,
+                                       self.num_mask_tokens, iou_head_depth)
+
+    def forward(
+        self,
+        image_embeddings: Tensor,
+        image_pe: Tensor,
+        sparse_prompt_embeddings: Tensor,
+        dense_prompt_embeddings: Tensor,
+        multimask_output: bool,
+    ) -> Tuple[Tensor, Tensor]:
+        """Predict masks given image and prompt embeddings.
+
+        Borrowed from https://github.com/facebookresearch/segment-anything
+
+        Arguments:
+          image_embeddings (Tensor): the embeddings from the image encoder
+          image_pe (Tensor): positional encoding with the shape of
+            image_embeddings
+          sparse_prompt_embeddings (Tensor): the embeddings of
+            the points and boxes
+          dense_prompt_embeddings (Tensor): the embeddings of the mask inputs
+          multimask_output (bool): Whether to return multiple masks or a
+            single mask.
+ + Returns: + Tensor: batched predicted masks + Tensor: batched predictions of mask quality + """ + masks, iou_pred = self.predict_masks( + image_embeddings=image_embeddings, + image_pe=image_pe, + sparse_prompt_embeddings=sparse_prompt_embeddings, + dense_prompt_embeddings=dense_prompt_embeddings, + ) + + # Select the correct mask or masks for output + if multimask_output: + mask_slice = slice(1, None) + else: + mask_slice = slice(0, 1) + masks = masks[:, mask_slice, :, :] + iou_pred = iou_pred[:, mask_slice] + + # Prepare output + return masks, iou_pred + + def predict_masks( + self, + image_embeddings: Tensor, + image_pe: Tensor, + sparse_prompt_embeddings: Tensor, + dense_prompt_embeddings: Tensor, + ) -> Tuple[Tensor, Tensor]: + """Predicts masks. + + See 'forward' for more details. + """ + # Concatenate output tokens + output_tokens = torch.cat( + [self.iou_token.weight, self.mask_tokens.weight], dim=0) + output_tokens = output_tokens.unsqueeze(0).expand( + sparse_prompt_embeddings.size(0), -1, -1) + tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=1) + + # Expand per-image data in batch direction to be per-mask + src = torch.repeat_interleave(image_embeddings, tokens.shape[0], dim=0) + src = src + dense_prompt_embeddings + pos_src = torch.repeat_interleave(image_pe, tokens.shape[0], dim=0) + b, c, h, w = src.shape + + # Run the transformer + hs, src = self.transformer(src, pos_src, tokens) + iou_token_out = hs[:, 0, :] + mask_tokens_out = hs[:, 1:(1 + self.num_mask_tokens), :] + + # Upscale mask embeddings and predict masks using the mask tokens + src = src.transpose(1, 2).view(b, c, h, w) + upscaled_embedding = self.output_upscaling(src) + hyper_in_list: List[Tensor] = [] + for i in range(self.num_mask_tokens): + hyper_in_list.append(self.output_hypernetworks_mlps[i]( + mask_tokens_out[:, i, :])) + hyper_in = torch.stack(hyper_in_list, dim=1) + b, c, h, w = upscaled_embedding.shape + masks = (hyper_in @ upscaled_embedding.view(b, c, h * w)).view( + b, -1, h, w) + + # Generate mask quality predictions + iou_pred = self.iou_prediction_head(iou_token_out) + + return masks, iou_pred + + +# Lightly adapted from +# https://github.com/facebookresearch/MaskFormer/blob/main/mask_former/modeling/transformer/transformer_predictor.py # noqa +class MLP(nn.Module): + + def __init__( + self, + input_dim: int, + hidden_dim: int, + output_dim: int, + num_layers: int, + sigmoid_output: bool = False, + ) -> None: + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList( + nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + self.sigmoid_output = sigmoid_output + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + if self.sigmoid_output: + x = F.sigmoid(x) + return x diff --git a/projects/sam_inference_demo/sam/modeling/prompt_encoder.py b/projects/sam_inference_demo/sam/modeling/prompt_encoder.py new file mode 100644 index 0000000000..6b7c083387 --- /dev/null +++ b/projects/sam_inference_demo/sam/modeling/prompt_encoder.py @@ -0,0 +1,227 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
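+
+# Shape conventions used below: sparse prompts (points and boxes) are embedded
+# to BxNx(embed_dim); dense prompts (masks) are embedded to
+# Bx(embed_dim)x(embedding_h)x(embedding_w), matching the image embedding.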
+ +# Borrowed from https://github.com/facebookresearch/segment-anything + +from typing import Any, Optional, Tuple, Type + +import numpy as np +import torch +from torch import nn + +from mmseg.registry import MODELS +from .common import LayerNorm2d + + +@MODELS.register_module() +class PromptEncoder(nn.Module): + + def __init__( + self, + embed_dim: int, + image_embedding_size: Tuple[int, int], + input_image_size: Tuple[int, int], + mask_in_chans: int, + activation: Type[nn.Module] = nn.GELU, + ) -> None: + """Encodes prompts for input to SAM's mask decoder. + + Arguments: + embed_dim (int): The prompts' embedding dimension + image_embedding_size (tuple(int, int)): The spatial size of the + image embedding, as (H, W). + input_image_size (int): The padded size of the image as input + to the image encoder, as (H, W). + mask_in_chans (int): The number of hidden channels used for + encoding input masks. + activation (nn.Module): The activation to use when encoding + input masks. + """ + super().__init__() + self.embed_dim = embed_dim + self.input_image_size = input_image_size + self.image_embedding_size = image_embedding_size + self.pe_layer = PositionEmbeddingRandom(embed_dim // 2) + + self.num_point_embeddings: int = 4 # pos/neg point + 2 box corners + point_embeddings = [ + nn.Embedding(1, embed_dim) + for i in range(self.num_point_embeddings) + ] + self.point_embeddings = nn.ModuleList(point_embeddings) + self.not_a_point_embed = nn.Embedding(1, embed_dim) + + self.mask_input_size = (4 * image_embedding_size[0], + 4 * image_embedding_size[1]) + self.mask_downscaling = nn.Sequential( + nn.Conv2d(1, mask_in_chans // 4, kernel_size=2, stride=2), + LayerNorm2d(mask_in_chans // 4), + activation(), + nn.Conv2d( + mask_in_chans // 4, mask_in_chans, kernel_size=2, stride=2), + LayerNorm2d(mask_in_chans), + activation(), + nn.Conv2d(mask_in_chans, embed_dim, kernel_size=1), + ) + self.no_mask_embed = nn.Embedding(1, embed_dim) + + def get_dense_pe(self) -> torch.Tensor: + """Returns the positional encoding used to encode point prompts, + applied to a dense set of points the shape of the image encoding. 
+ + Returns: + torch.Tensor: Positional encoding with shape + 1x(embed_dim)x(embedding_h)x(embedding_w) + """ + return self.pe_layer(self.image_embedding_size).unsqueeze(0) + + def _embed_points( + self, + points: torch.Tensor, + labels: torch.Tensor, + pad: bool, + ) -> torch.Tensor: + """Embeds point prompts.""" + points = points + 0.5 # Shift to center of pixel + if pad: + padding_point = torch.zeros((points.shape[0], 1, 2), + device=points.device) + padding_label = -torch.ones( + (labels.shape[0], 1), device=labels.device) + points = torch.cat([points, padding_point], dim=1) + labels = torch.cat([labels, padding_label], dim=1) + point_embedding = self.pe_layer.forward_with_coords( + points, self.input_image_size) + point_embedding[labels == -1] = 0.0 + point_embedding[labels == -1] += self.not_a_point_embed.weight + point_embedding[labels == 0] += self.point_embeddings[0].weight + point_embedding[labels == 1] += self.point_embeddings[1].weight + return point_embedding + + def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor: + """Embeds box prompts.""" + boxes = boxes + 0.5 # Shift to center of pixel + coords = boxes.reshape(-1, 2, 2) + corner_embedding = self.pe_layer.forward_with_coords( + coords, self.input_image_size) + corner_embedding[:, 0, :] += self.point_embeddings[2].weight + corner_embedding[:, 1, :] += self.point_embeddings[3].weight + return corner_embedding + + def _embed_masks(self, masks: torch.Tensor) -> torch.Tensor: + """Embeds mask inputs.""" + mask_embedding = self.mask_downscaling(masks) + return mask_embedding + + def _get_batch_size( + self, + points: Optional[Tuple[torch.Tensor, torch.Tensor]], + boxes: Optional[torch.Tensor], + masks: Optional[torch.Tensor], + ) -> int: + """Gets the batch size of the output given the batch size of the input + prompts.""" + if points is not None: + return points[0].shape[0] + elif boxes is not None: + return boxes.shape[0] + elif masks is not None: + return masks.shape[0] + else: + return 1 + + def _get_device(self) -> torch.device: + return self.point_embeddings[0].weight.device + + def forward( + self, + points: Optional[Tuple[torch.Tensor, torch.Tensor]], + boxes: Optional[torch.Tensor], + masks: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Embeds different types of prompts, returning both sparse and dense + embeddings. + + Arguments: + points (tuple(torch.Tensor, torch.Tensor) or none): point coordinates + and labels to embed. + boxes (torch.Tensor or none): boxes to embed + masks (torch.Tensor or none): masks to embed + + Returns: + torch.Tensor: sparse embeddings for the points and boxes, with shape + BxNx(embed_dim), where N is determined by the number of input points + and boxes. 
+ torch.Tensor: dense embeddings for the masks, in the shape + Bx(embed_dim)x(embed_H)x(embed_W) + """ # noqa + bs = self._get_batch_size(points, boxes, masks) + sparse_embeddings = torch.empty((bs, 0, self.embed_dim), + device=self._get_device()) + if points is not None: + coords, labels = points + point_embeddings = self._embed_points( + coords, labels, pad=(boxes is None)) + sparse_embeddings = torch.cat( + [sparse_embeddings, point_embeddings], dim=1) + if boxes is not None: + box_embeddings = self._embed_boxes(boxes) + sparse_embeddings = torch.cat([sparse_embeddings, box_embeddings], + dim=1) + + if masks is not None: + dense_embeddings = self._embed_masks(masks) + else: + dense_embeddings = self.no_mask_embed.weight.reshape( + 1, -1, 1, 1).expand(bs, -1, self.image_embedding_size[0], + self.image_embedding_size[1]) + + return sparse_embeddings, dense_embeddings + + +class PositionEmbeddingRandom(nn.Module): + """Positional encoding using random spatial frequencies.""" + + def __init__(self, + num_pos_feats: int = 64, + scale: Optional[float] = None) -> None: + super().__init__() + if scale is None or scale <= 0.0: + scale = 1.0 + self.register_buffer( + 'positional_encoding_gaussian_matrix', + scale * torch.randn((2, num_pos_feats)), + ) + + def _pe_encoding(self, coords: torch.Tensor) -> torch.Tensor: + """Positionally encode points that are normalized to [0,1].""" + # assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape # noqa + coords = 2 * coords - 1 + coords = coords @ self.positional_encoding_gaussian_matrix + coords = 2 * np.pi * coords + # outputs d_1 x ... x d_n x C shape + return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1) + + def forward(self, size: Tuple[int, int]) -> torch.Tensor: + """Generate positional encoding for a grid of the specified size.""" + h, w = size + device: Any = self.positional_encoding_gaussian_matrix.device + grid = torch.ones((h, w), device=device, dtype=torch.float32) + y_embed = grid.cumsum(dim=0) - 0.5 + x_embed = grid.cumsum(dim=1) - 0.5 + y_embed = y_embed / h + x_embed = x_embed / w + + pe = self._pe_encoding(torch.stack([x_embed, y_embed], dim=-1)) + return pe.permute(2, 0, 1) # C x H x W + + def forward_with_coords(self, coords_input: torch.Tensor, + image_size: Tuple[int, int]) -> torch.Tensor: + """Positionally encode points that are not normalized to [0,1].""" + coords = coords_input.clone() + coords[:, :, 0] = coords[:, :, 0] / image_size[1] + coords[:, :, 1] = coords[:, :, 1] / image_size[0] + return self._pe_encoding(coords.to(torch.float)) # B x N x C diff --git a/projects/sam_inference_demo/sam/modeling/sam.py b/projects/sam_inference_demo/sam/modeling/sam.py new file mode 100644 index 0000000000..c61c1eca4e --- /dev/null +++ b/projects/sam_inference_demo/sam/modeling/sam.py @@ -0,0 +1,188 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
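+
+# Pipeline sketch: preprocess() normalizes and pads each image to a square,
+# the image encoder maps it to embeddings, the prompt encoder embeds any
+# points/boxes/masks, and the mask decoder predicts low-resolution masks that
+# postprocess_masks() crops and resizes back to the original image size.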
+
+# Borrowed from https://github.com/facebookresearch/segment-anything
+
+from typing import Any, Dict, List, Tuple
+
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from mmseg.registry import MODELS
+from .mask_decoder import MaskDecoder
+from .prompt_encoder import PromptEncoder
+
+
+@MODELS.register_module()
+class SAM(nn.Module):
+    mask_threshold: float = 0.0
+    image_format: str = 'RGB'
+
+    def __init__(
+        self,
+        image_encoder_cfg: dict,
+        prompt_encoder_cfg: dict,
+        mask_decoder_cfg: dict,
+        pixel_mean: List[float] = [123.675, 116.28, 103.53],
+        pixel_std: List[float] = [58.395, 57.12, 57.375],
+    ) -> None:
+        """SAM predicts object masks from an image and input prompts. Borrowed
+        from https://github.com/facebookresearch/segment-anything.
+
+        Arguments:
+          image_encoder_cfg (dict): Config of the backbone (e.g. ViTSAM) used
+            to encode the image into image embeddings that allow for
+            efficient mask prediction.
+          prompt_encoder_cfg (dict): Config of the PromptEncoder that encodes
+            various types of input prompts.
+          mask_decoder_cfg (dict): Config of the MaskDecoder that predicts
+            masks from the image embeddings and encoded prompts.
+          pixel_mean (list(float)): Mean values for normalizing pixels in the
+            input image.
+          pixel_std (list(float)): Std values for normalizing pixels in the
+            input image.
+        """
+        super().__init__()
+        self.image_encoder = MODELS.build(image_encoder_cfg)
+        self.prompt_encoder: PromptEncoder = MODELS.build(prompt_encoder_cfg)
+        self.mask_decoder: MaskDecoder = MODELS.build(mask_decoder_cfg)
+        self.register_buffer('pixel_mean',
+                             torch.Tensor(pixel_mean).view(-1, 1, 1), False)
+        self.register_buffer('pixel_std',
+                             torch.Tensor(pixel_std).view(-1, 1, 1), False)
+
+    @property
+    def device(self) -> Any:
+        return self.pixel_mean.device
+
+    @torch.no_grad()
+    def forward(
+        self,
+        batched_input: List[Dict[str, Any]],
+        multimask_output: bool,
+    ) -> List[Dict[str, torch.Tensor]]:
+        """Predicts masks end-to-end from provided images and prompts. If
+        prompts are not known in advance, using SAMInferencer is recommended
+        over calling the model directly.
+
+        Borrowed from https://github.com/facebookresearch/segment-anything
+
+        Arguments:
+          batched_input (list(dict)): A list over input images, each a
+            dictionary with the following keys. A prompt key can be
+            excluded if it is not present.
+              'image': The image as a torch tensor in 3xHxW format,
+                already transformed for input to the model.
+              'original_size': (tuple(int, int)) The original size of
+                the image before transformation, as (H, W).
+              'point_coords': (torch.Tensor) Batched point prompts for
+                this image, with shape BxNx2. Already transformed to the
+                input frame of the model.
+              'point_labels': (torch.Tensor) Batched labels for point prompts,
+                with shape BxN.
+              'boxes': (torch.Tensor) Batched box inputs, with shape Bx4.
+                Already transformed to the input frame of the model.
+              'mask_inputs': (torch.Tensor) Batched mask inputs to the model,
+                in the form Bx1xHxW.
+          multimask_output (bool): Whether the model should predict multiple
+            disambiguating masks, or return a single mask.
+
+        Returns:
+          (list(dict)): A list over input images, where each element is
+            a dictionary with the following keys.
+              'masks': (torch.Tensor) Batched binary mask predictions,
+                with shape BxCxHxW, where B is the number of input prompts,
+                C is determined by multimask_output, and (H, W) is the
+                original size of the image.
+              'iou_predictions': (torch.Tensor) The model's predictions
+                of mask quality, in shape BxC.
+ 'low_res_logits': (torch.Tensor) Low resolution logits with + shape BxCxHxW, where H=W=256. Can be passed as mask input + to subsequent iterations of prediction. + """ + input_images = torch.stack( + [self.preprocess(x['image']) for x in batched_input], dim=0) + image_embeddings = self.image_encoder(input_images) + + outputs = [] + for image_record, curr_embedding in zip(batched_input, + image_embeddings): + if 'point_coords' in image_record: + points = (image_record['point_coords'], + image_record['point_labels']) + else: + points = None + sparse_embeddings, dense_embeddings = self.prompt_encoder( + points=points, + boxes=image_record.get('boxes', None), + masks=image_record.get('mask_inputs', None), + ) + low_res_masks, iou_predictions = self.mask_decoder( + image_embeddings=curr_embedding.unsqueeze(0), + image_pe=self.prompt_encoder.get_dense_pe(), + sparse_prompt_embeddings=sparse_embeddings, + dense_prompt_embeddings=dense_embeddings, + multimask_output=multimask_output, + ) + masks = self.postprocess_masks( + low_res_masks, + input_size=image_record['image'].shape[-2:], + original_size=image_record['original_size'], + ) + masks = masks > self.mask_threshold + outputs.append({ + 'masks': masks, + 'iou_predictions': iou_predictions, + 'low_res_logits': low_res_masks, + }) + return outputs + + def postprocess_masks( + self, + masks: torch.Tensor, + input_size: Tuple[int, ...], + original_size: Tuple[int, ...], + ) -> torch.Tensor: + """Remove padding and upscale masks to the original image size. + + Borrowed from https://github.com/facebookresearch/segment-anything + + Arguments: + masks (torch.Tensor): Batched masks from the mask_decoder, + in BxCxHxW format. + input_size (tuple(int, int)): The size of the image input to the + model, in (H, W) format. Used to remove padding. + original_size (tuple(int, int)): The original size of the image + before resizing for input to the model, in (H, W) format. + + Returns: + (torch.Tensor): Batched masks in BxCxHxW format, where (H, W) + is given by original_size. + """ + masks = F.interpolate( + masks, + self.image_encoder.img_size, + mode='bilinear', + align_corners=False, + ) + masks = masks[..., :input_size[0], :input_size[1]] + masks = F.interpolate( + masks, original_size, mode='bilinear', align_corners=False) + return masks + + def preprocess(self, x: torch.Tensor) -> torch.Tensor: + """Normalize pixel values and pad to a square input.""" + # Normalize colors + x = (x - self.pixel_mean) / self.pixel_std + + # Pad + h, w = x.shape[-2:] + img_size = max(self.image_encoder.img_size) + padh = img_size - h + padw = img_size - w + x = F.pad(x, (0, padw, 0, padh)) + return x diff --git a/projects/sam_inference_demo/sam/modeling/transformer.py b/projects/sam_inference_demo/sam/modeling/transformer.py new file mode 100644 index 0000000000..c56f602487 --- /dev/null +++ b/projects/sam_inference_demo/sam/modeling/transformer.py @@ -0,0 +1,241 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
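+
+# "Two way" refers to the alternating attention directions: prompt tokens
+# (queries) attend to image tokens (keys), and image tokens attend back to
+# the prompt tokens inside every TwoWayAttentionBlock.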
+
+import math
+from typing import Tuple, Type
+
+import torch
+from torch import Tensor, nn
+
+from mmseg.registry import MODELS
+from .common import MLPBlock
+
+
+@MODELS.register_module()
+class TwoWayTransformer(nn.Module):
+
+    def __init__(
+        self,
+        depth: int,
+        embedding_dim: int,
+        num_heads: int,
+        mlp_dim: int,
+        activation: Type[nn.Module] = nn.ReLU,
+        attention_downsample_rate: int = 2,
+    ) -> None:
+        """A transformer decoder that attends to an input image using queries
+        whose positional embedding is supplied.
+
+        Args:
+          depth (int): number of layers in the transformer
+          embedding_dim (int): the channel dimension for the input embeddings
+          num_heads (int): the number of heads for multihead attention. Must
+            divide embedding_dim
+          mlp_dim (int): the channel dimension internal to the MLP block
+          activation (nn.Module): the activation to use in the MLP block
+          attention_downsample_rate (int): the factor by which the channel
+            dimension is reduced inside the attention layers
+        """
+        super().__init__()
+        self.depth = depth
+        self.embedding_dim = embedding_dim
+        self.num_heads = num_heads
+        self.mlp_dim = mlp_dim
+        self.layers = nn.ModuleList()
+
+        for i in range(depth):
+            self.layers.append(
+                TwoWayAttentionBlock(
+                    embedding_dim=embedding_dim,
+                    num_heads=num_heads,
+                    mlp_dim=mlp_dim,
+                    activation=activation,
+                    attention_downsample_rate=attention_downsample_rate,
+                    skip_first_layer_pe=(i == 0),
+                ))
+
+        self.final_attn_token_to_image = Attention(
+            embedding_dim,
+            num_heads,
+            downsample_rate=attention_downsample_rate)
+        self.norm_final_attn = nn.LayerNorm(embedding_dim)
+
+    def forward(
+        self,
+        image_embedding: Tensor,
+        image_pe: Tensor,
+        point_embedding: Tensor,
+    ) -> Tuple[Tensor, Tensor]:
+        """
+        Args:
+          image_embedding (torch.Tensor): image to attend to. Should be shape
+            B x embedding_dim x h x w for any h and w.
+          image_pe (torch.Tensor): the positional encoding to add to the image. Must
+            have the same shape as image_embedding.
+          point_embedding (torch.Tensor): the embedding to add to the query points.
+            Must have shape B x N_points x embedding_dim for any N_points.
+
+        Returns:
+          torch.Tensor: the processed point_embedding
+          torch.Tensor: the processed image_embedding
+        """  # noqa E501
+        # BxCxHxW -> BxHWxC == B x N_image_tokens x C
+        bs, c, h, w = image_embedding.shape
+        image_embedding = image_embedding.flatten(2).permute(0, 2, 1)
+        image_pe = image_pe.flatten(2).permute(0, 2, 1)
+
+        # Prepare queries
+        queries = point_embedding
+        keys = image_embedding
+
+        # Apply transformer blocks and final layernorm
+        for layer in self.layers:
+            queries, keys = layer(
+                queries=queries,
+                keys=keys,
+                query_pe=point_embedding,
+                key_pe=image_pe,
+            )
+
+        # Apply the final attention layer from the points to the image
+        q = queries + point_embedding
+        k = keys + image_pe
+        attn_out = self.final_attn_token_to_image(q=q, k=k, v=keys)
+        queries = queries + attn_out
+        queries = self.norm_final_attn(queries)
+
+        return queries, keys
+
+
+class TwoWayAttentionBlock(nn.Module):
+
+    def __init__(
+        self,
+        embedding_dim: int,
+        num_heads: int,
+        mlp_dim: int = 2048,
+        activation: Type[nn.Module] = nn.ReLU,
+        attention_downsample_rate: int = 2,
+        skip_first_layer_pe: bool = False,
+    ) -> None:
+        """A transformer block with four layers: (1) self-attention of sparse
+        inputs, (2) cross attention of sparse inputs to dense inputs, (3) mlp
+        block on sparse inputs, and (4) cross attention of dense inputs to
+        sparse inputs.
+ + Arguments: + embedding_dim (int): the channel dimension of the embeddings + num_heads (int): the number of heads in the attention layers + mlp_dim (int): the hidden dimension of the mlp block + activation (nn.Module): the activation of the mlp block + skip_first_layer_pe (bool): skip the PE on the first layer + """ + super().__init__() + self.self_attn = Attention(embedding_dim, num_heads) + self.norm1 = nn.LayerNorm(embedding_dim) + + self.cross_attn_token_to_image = Attention( + embedding_dim, + num_heads, + downsample_rate=attention_downsample_rate) + self.norm2 = nn.LayerNorm(embedding_dim) + + self.mlp = MLPBlock(embedding_dim, mlp_dim, activation) + self.norm3 = nn.LayerNorm(embedding_dim) + + self.norm4 = nn.LayerNorm(embedding_dim) + self.cross_attn_image_to_token = Attention( + embedding_dim, + num_heads, + downsample_rate=attention_downsample_rate) + + self.skip_first_layer_pe = skip_first_layer_pe + + def forward(self, queries: Tensor, keys: Tensor, query_pe: Tensor, + key_pe: Tensor) -> Tuple[Tensor, Tensor]: + # Self attention block + if self.skip_first_layer_pe: + queries = self.self_attn(q=queries, k=queries, v=queries) + else: + q = queries + query_pe + attn_out = self.self_attn(q=q, k=q, v=queries) + queries = queries + attn_out + queries = self.norm1(queries) + + # Cross attention block, tokens attending to image embedding + q = queries + query_pe + k = keys + key_pe + attn_out = self.cross_attn_token_to_image(q=q, k=k, v=keys) + queries = queries + attn_out + queries = self.norm2(queries) + + # MLP block + mlp_out = self.mlp(queries) + queries = queries + mlp_out + queries = self.norm3(queries) + + # Cross attention block, image embedding attending to tokens + q = queries + query_pe + k = keys + key_pe + attn_out = self.cross_attn_image_to_token(q=k, k=q, v=queries) + keys = keys + attn_out + keys = self.norm4(keys) + + return queries, keys + + +class Attention(nn.Module): + """An attention layer that allows for downscaling the size of the embedding + after projection to queries, keys, and values.""" + + def __init__( + self, + embedding_dim: int, + num_heads: int, + downsample_rate: int = 1, + ) -> None: + super().__init__() + self.embedding_dim = embedding_dim + self.internal_dim = embedding_dim // downsample_rate + self.num_heads = num_heads + assert self.internal_dim % num_heads == 0, 'num_heads must divide embedding_dim.' 
# noqa E501 + + self.q_proj = nn.Linear(embedding_dim, self.internal_dim) + self.k_proj = nn.Linear(embedding_dim, self.internal_dim) + self.v_proj = nn.Linear(embedding_dim, self.internal_dim) + self.out_proj = nn.Linear(self.internal_dim, embedding_dim) + + def _separate_heads(self, x: Tensor, num_heads: int) -> Tensor: + b, n, c = x.shape + x = x.reshape(b, n, num_heads, c // num_heads) + return x.transpose(1, 2) # B x N_heads x N_tokens x C_per_head + + def _recombine_heads(self, x: Tensor) -> Tensor: + b, n_heads, n_tokens, c_per_head = x.shape + x = x.transpose(1, 2) + return x.reshape(b, n_tokens, n_heads * c_per_head) # B x N_tokens x C + + def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor: + # Input projections + q = self.q_proj(q) + k = self.k_proj(k) + v = self.v_proj(v) + + # Separate into heads + q = self._separate_heads(q, self.num_heads) + k = self._separate_heads(k, self.num_heads) + v = self._separate_heads(v, self.num_heads) + + # Attention + _, _, _, c_per_head = q.shape + attn = q @ k.permute(0, 1, 3, 2) # B x N_heads x N_tokens x N_tokens + attn = attn / math.sqrt(c_per_head) + attn = torch.softmax(attn, dim=-1) + + # Get output + out = attn @ v + out = self._recombine_heads(out) + out = self.out_proj(out) + + return out diff --git a/projects/sam_inference_demo/sam/sam_inferencer.py b/projects/sam_inference_demo/sam/sam_inferencer.py new file mode 100644 index 0000000000..2da2e959c0 --- /dev/null +++ b/projects/sam_inference_demo/sam/sam_inferencer.py @@ -0,0 +1,688 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Any, Dict, List, Optional, Tuple + +import numpy as np +import torch +from mmengine.runner.checkpoint import load_checkpoint +# yapf: disable +from sam.utils import (MaskData, area_from_rle, batch_iterator, + batched_mask_to_box, box_xyxy_to_xywh, + build_all_layer_point_grids, calculate_stability_score, + coco_encode_rle, generate_crop_boxes, + is_box_near_crop_edge, mask_to_rle_pytorch, + remove_small_regions, rle_to_mask, uncrop_boxes_xyxy, + uncrop_masks, uncrop_points) +from torchvision.ops.boxes import batched_nms, box_area + +from mmseg.registry import MODELS, TRANSFORMS + +# yapf: enable + +model_zoo = { + 'base': + 'https://download.openmmlab.com/mmsegmentation/v0.5/sam/sam_vit-base-p16_3rdparty_sa1b-1024x1024_20230413-78a25eed.pth', # noqa + 'large': + 'https://download.openmmlab.com/mmsegmentation/v0.5/sam/sam_vit-large-p16_3rdparty_sa1b-1024x1024_20230413-940520da.pth', # noqa + 'huge': + 'https://download.openmmlab.com/mmsegmentation/v0.5/sam/sam_vit-huge-p16_3rdparty_sa1b-1024x1024_20230413-faaf96f6.pth', # noqa +} + + +class SAMInferencer: + + def __init__(self, arch: str = 'base') -> None: + assert arch in ['base', 'large', 'huge'] + self.model = self.init_model(arch) + self.transform = TRANSFORMS.build( + dict( + type='ResizeLongestSide', + target_length=max(self.model.image_encoder.img_size))) + + def set_image( + self, + image: np.ndarray, + image_format: str = 'RGB', + ) -> None: + """Calculates the image embeddings for the provided image, allowing + masks to be predicted with the 'predict' method. + + Arguments: + image (np.ndarray): The image for calculating masks. Expects an + image in HWC uint8 format, with pixel values in [0, 255]. + image_format (str): The color format of the image, in ['RGB', 'BGR']. + """ + assert image_format in [ + 'RGB', + 'BGR', + ], f"image_format must be in ['RGB', 'BGR'], is {image_format}." 
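+        # Flip the channel order when the provided format differs from the
+        # model's expected format (e.g. BGR input for an RGB model).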
+ if image_format != self.model.image_format: + image = image[..., ::-1] + + # Transform the image to the form expected by the model + input_image = self.transform.apply_image(image) + input_image_torch = torch.as_tensor(input_image, device=self.device) + input_image_torch = input_image_torch.permute( + 2, 0, 1).contiguous()[None, :, :, :] + + self.set_torch_image(input_image_torch, image.shape[:2]) + + @torch.no_grad() + def set_torch_image( + self, + transformed_image: torch.Tensor, + original_image_size: Tuple[int, ...], + ) -> None: + """Calculates the image embeddings for the provided image, allowing + masks to be predicted with the 'predict' method. Expects the input + image to be already transformed to the format expected by the model. + + Arguments: + transformed_image (torch.Tensor): The input image, with shape + 1x3xHxW, which has been transformed with ResizeLongestSide. + original_image_size (tuple(int, int)): The size of the image + before transformation, in (H, W) format. + """ + assert (len(transformed_image.shape) == 4 + and transformed_image.shape[1] == 3 + and max(*transformed_image.shape[2:]) == max( + self.model.image_encoder.img_size) + ), 'set_torch_image input must be BCHW with long side' + f' {self.model.image_encoder.img_size}.' + self.reset_image() + + self.original_size = original_image_size + self.input_size = tuple(transformed_image.shape[-2:]) + input_image = self.model.preprocess(transformed_image) + self.features = self.model.image_encoder(input_image)[0] + self.is_image_set = True + + def predict( + self, + point_coords: Optional[np.ndarray] = None, + point_labels: Optional[np.ndarray] = None, + box: Optional[np.ndarray] = None, + mask_input: Optional[np.ndarray] = None, + multimask_output: bool = True, + return_logits: bool = False, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """Predict masks for the given input prompts, using the currently set + image. + + Arguments: + point_coords (np.ndarray or None): A Nx2 array of point prompts to the + model. Each point is in (X,Y) in pixels. + point_labels (np.ndarray or None): A length N array of labels for the + point prompts. 1 indicates a foreground point and 0 indicates a + background point. + box (np.ndarray or None): A length 4 array given a box prompt to the + model, in XYXY format. + mask_input (np.ndarray): A low resolution mask input to the model, typically + coming from a previous prediction iteration. Has form 1xHxW, where + for SAM, H=W=256. + multimask_output (bool): If true, the model will return three masks. + For ambiguous input prompts (such as a single click), this will often + produce better masks than a single prediction. If only a single + mask is needed, the model's predicted quality score can be used + to select the best mask. For non-ambiguous prompts, such as multiple + input prompts, multimask_output=False can give better results. + return_logits (bool): If true, returns un-thresholded masks logits + instead of a binary mask. + + Returns: + (np.ndarray): The output masks in CxHxW format, where C is the + number of masks, and (H, W) is the original image size. + (np.ndarray): An array of length C containing the model's + predictions for the quality of each mask. + (np.ndarray): An array of shape CxHxW, where C is the number + of masks and H=W=256. These low resolution logits can be passed to + a subsequent iteration as mask input. + """ # noqa + if not self.is_image_set: + raise RuntimeError( + 'An image must be set with .set_image(...) 
before mask '
+                'prediction.')
+
+        # Transform input prompts
+        coords_torch = None
+        labels_torch = None
+        box_torch = None
+        mask_input_torch = None
+
+        if point_coords is not None:
+            assert (
+                point_labels is not None
+            ), 'point_labels must be supplied if point_coords is supplied.'
+            point_coords = self.transform.apply_coords(point_coords,
+                                                       self.original_size)
+            coords_torch = torch.as_tensor(
+                point_coords, dtype=torch.float, device=self.device)
+            labels_torch = torch.as_tensor(
+                point_labels, dtype=torch.int, device=self.device)
+            coords_torch, labels_torch = coords_torch[
+                None, :, :], labels_torch[None, :]
+        if box is not None:
+            box = self.transform.apply_boxes(box, self.original_size)
+            box_torch = torch.as_tensor(
+                box, dtype=torch.float, device=self.device)
+            box_torch = box_torch[None, :]
+        if mask_input is not None:
+            mask_input_torch = torch.as_tensor(
+                mask_input, dtype=torch.float, device=self.device)
+            mask_input_torch = mask_input_torch[None, :, :, :]
+
+        masks, iou_predictions, low_res_masks = self.predict_torch(
+            coords_torch,
+            labels_torch,
+            box_torch,
+            mask_input_torch,
+            multimask_output,
+            return_logits=return_logits,
+        )
+
+        masks = masks[0].detach().cpu().numpy()
+        iou_predictions = iou_predictions[0].detach().cpu().numpy()
+        low_res_masks = low_res_masks[0].detach().cpu().numpy()
+        return masks, iou_predictions, low_res_masks
+
+    @torch.no_grad()
+    def predict_torch(
+        self,
+        point_coords: Optional[torch.Tensor],
+        point_labels: Optional[torch.Tensor],
+        boxes: Optional[torch.Tensor] = None,
+        mask_input: Optional[torch.Tensor] = None,
+        multimask_output: bool = True,
+        return_logits: bool = False,
+    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
+        """Predict masks for the given input prompts, using the currently set
+        image. Input prompts are batched torch tensors and are expected to
+        already be transformed to the input frame using ResizeLongestSide.
+
+        Arguments:
+          point_coords (torch.Tensor or None): A BxNx2 array of point prompts to the
+            model. Each point is in (X,Y) in pixels.
+          point_labels (torch.Tensor or None): A BxN array of labels for the
+            point prompts. 1 indicates a foreground point and 0 indicates a
+            background point.
+          boxes (torch.Tensor or None): A Bx4 array given a box prompt to the
+            model, in XYXY format.
+          mask_input (torch.Tensor or None): A low resolution mask input to the model,
+            typically coming from a previous prediction iteration. Has form Bx1xHxW,
+            where for SAM, H=W=256. Masks returned by a previous iteration of the
+            predict method do not need further transformation.
+          multimask_output (bool): If true, the model will return three masks.
+            For ambiguous input prompts (such as a single click), this will often
+            produce better masks than a single prediction. If only a single
+            mask is needed, the model's predicted quality score can be used
+            to select the best mask. For non-ambiguous prompts, such as multiple
+            input prompts, multimask_output=False can give better results.
+          return_logits (bool): If true, returns un-thresholded mask logits
+            instead of a binary mask.
+
+        Returns:
+          (torch.Tensor): The output masks in BxCxHxW format, where C is the
+            number of masks, and (H, W) is the original image size.
+          (torch.Tensor): An array of shape BxC containing the model's
+            predictions for the quality of each mask.
+          (torch.Tensor): An array of shape BxCxHxW, where C is the number
+            of masks and H=W=256. These low res logits can be passed to
+            a subsequent iteration as mask input.
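+
+        Example (a hypothetical single foreground point; assumes an image has
+        already been set with set_image and `inferencer` is a SAMInferencer):
+            >>> coords = torch.tensor([[[500.0, 375.0]]], device=inferencer.device)
+            >>> labels = torch.ones((1, 1), dtype=torch.int, device=inferencer.device)
+            >>> masks, iou_preds, low_res = inferencer.predict_torch(coords, labels)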
+ """ # noqa + if not self.is_image_set: + raise RuntimeError( + 'An image must be set with .set_image(...) before mask ' + 'prediction.') + + if point_coords is not None: + points = (point_coords, point_labels) + else: + points = None + + # Embed prompts + sparse_embeddings, dense_embeddings = self.model.prompt_encoder( + points=points, + boxes=boxes, + masks=mask_input, + ) + + # Predict masks + low_res_masks, iou_predictions = self.model.mask_decoder( + image_embeddings=self.features, + image_pe=self.model.prompt_encoder.get_dense_pe(), + sparse_prompt_embeddings=sparse_embeddings, + dense_prompt_embeddings=dense_embeddings, + multimask_output=multimask_output, + ) + + # Upscale the masks to the original image resolution + masks = self.model.postprocess_masks(low_res_masks, self.input_size, + self.original_size) + + if not return_logits: + masks = masks > self.model.mask_threshold + + return masks, iou_predictions, low_res_masks + + def get_image_embedding(self) -> torch.Tensor: + """Returns the image embeddings for the currently set image, with shape + 1xCxHxW, where C is the embedding dimension and (H,W) are the embedding + spatial dimension of SAM (typically C=256, H=W=64).""" + if not self.is_image_set: + raise RuntimeError( + 'An image must be set with .set_image(...) to generate an ' + 'embedding.') + assert self.features is not None, 'Features must exist if an image has' + ' been set.' + return self.features + + @property + def device(self) -> torch.device: + return self.model.device + + def reset_image(self) -> None: + """Resets the currently set image.""" + self.is_image_set = False + self.features = None + self.orig_h = None + self.orig_w = None + self.input_h = None + self.input_w = None + + def init_model(self, arch: str): + model = MODELS.build( + dict( + type='SAM', + image_encoder_cfg=dict( + type='mmpretrain.ViTSAM', + arch=arch, + img_size=1024, + patch_size=16, + out_channels=256, + use_abs_pos=True, + use_rel_pos=True, + window_size=14, + ), + prompt_encoder_cfg=dict( + type='PromptEncoder', + embed_dim=256, + image_embedding_size=(64, 64), + input_image_size=(1024, 1024), + mask_in_chans=16, + ), + mask_decoder_cfg=dict( + type='MaskDecoder', + num_multimask_outputs=3, + transformer=dict( + type='TwoWayTransformer', + depth=2, + embedding_dim=256, + mlp_dim=2048, + num_heads=8, + ), + transformer_dim=256, + iou_head_depth=3, + iou_head_hidden_dim=256, + ))) + load_checkpoint(model, model_zoo.get(arch), strict=True) + if torch.cuda.is_available(): + model = model.cuda() + return model + + +class SamAutomaticMaskGenerator: + + def __init__( + self, + arch: str = 'base', + points_per_side: Optional[int] = 32, + points_per_batch: int = 64, + pred_iou_thresh: float = 0.88, + stability_score_thresh: float = 0.95, + stability_score_offset: float = 1.0, + box_nms_thresh: float = 0.7, + crop_n_layers: int = 0, + crop_nms_thresh: float = 0.7, + crop_overlap_ratio: float = 512 / 1500, + crop_n_points_downscale_factor: int = 1, + point_grids: Optional[List[np.ndarray]] = None, + min_mask_region_area: int = 0, + output_mode: str = 'binary_mask', + ) -> None: + """Using a SAM model, generates masks for the entire image. Generates a + grid of point prompts over the image, then filters low quality and + duplicate masks. The default settings are chosen for SAM with a ViT-H + backbone. + + Arguments: + arch (str): The SAM model to use for mask prediction. + points_per_side (int or None): The number of points to be sampled + along one side of the image. 
The total number of points is
+            points_per_side**2. If None, 'point_grids' must provide explicit
+            point sampling.
+          points_per_batch (int): Sets the number of points run simultaneously
+            by the model. Higher numbers may be faster but use more GPU memory.
+          pred_iou_thresh (float): A filtering threshold in [0,1], using the
+            model's predicted mask quality.
+          stability_score_thresh (float): A filtering threshold in [0,1], using
+            the stability of the mask under changes to the cutoff used to binarize
+            the model's mask predictions.
+          stability_score_offset (float): The amount to shift the cutoff when
+            calculating the stability score.
+          box_nms_thresh (float): The box IoU cutoff used by non-maximal
+            suppression to filter duplicate masks.
+          crop_n_layers (int): If >0, mask prediction will be run again on
+            crops of the image. Sets the number of layers to run, where each
+            layer has 2**i_layer number of image crops.
+          crop_nms_thresh (float): The box IoU cutoff used by non-maximal
+            suppression to filter duplicate masks between different crops.
+          crop_overlap_ratio (float): Sets the degree to which crops overlap.
+            In the first crop layer, crops will overlap by this fraction of
+            the image length. Later layers with more crops scale down this overlap.
+          crop_n_points_downscale_factor (int): The number of points-per-side
+            sampled in layer n is scaled down by crop_n_points_downscale_factor**n.
+          point_grids (list(np.ndarray) or None): A list over explicit grids
+            of points used for sampling, normalized to [0,1]. The nth grid in the
+            list is used in the nth crop layer. Exclusive with points_per_side.
+          min_mask_region_area (int): If >0, postprocessing will be applied
+            to remove disconnected regions and holes in masks with area smaller
+            than min_mask_region_area. Requires opencv.
+          output_mode (str): The form masks are returned in. Can be 'binary_mask',
+            'uncompressed_rle', or 'coco_rle'. 'coco_rle' requires pycocotools.
+            For large resolutions, 'binary_mask' may consume large amounts of
+            memory.
+        """  # noqa
+
+        assert (points_per_side is None) != (
+            point_grids is None
+        ), 'Exactly one of points_per_side or point_grid must be provided.'
+        if points_per_side is not None:
+            self.point_grids = build_all_layer_point_grids(
+                points_per_side,
+                crop_n_layers,
+                crop_n_points_downscale_factor,
+            )
+        elif point_grids is not None:
+            self.point_grids = point_grids
+        else:
+            raise ValueError(
+                "Can't have both points_per_side and point_grid be None.")
+
+        assert output_mode in [
+            'binary_mask',
+            'uncompressed_rle',
+            'coco_rle',
+        ], f'Unknown output_mode {output_mode}.'
+        if output_mode == 'coco_rle':
+            from pycocotools import \
+                mask as mask_utils  # type: ignore # noqa: F401
+
+        if min_mask_region_area > 0:
+            import cv2  # type: ignore # noqa: F401
+
+        self.predictor = SAMInferencer(arch)
+        self.points_per_batch = points_per_batch
+        self.pred_iou_thresh = pred_iou_thresh
+        self.stability_score_thresh = stability_score_thresh
+        self.stability_score_offset = stability_score_offset
+        self.box_nms_thresh = box_nms_thresh
+        self.crop_n_layers = crop_n_layers
+        self.crop_nms_thresh = crop_nms_thresh
+        self.crop_overlap_ratio = crop_overlap_ratio
+        self.crop_n_points_downscale_factor = crop_n_points_downscale_factor
+        self.min_mask_region_area = min_mask_region_area
+        self.output_mode = output_mode
+
+    @torch.no_grad()
+    def generate(self, image: np.ndarray) -> List[Dict[str, Any]]:
+        """Generates masks for the given image.
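+
+        A minimal sketch of the intended call pattern (the variable names are
+        illustrative; `image` is any HWC uint8 RGB array):
+            >>> generator = SamAutomaticMaskGenerator(arch='base')
+            >>> records = generator.generate(image)
+            >>> print(records[0]['bbox'])  # XYWH box of the first mask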
+ + Arguments: + image (np.ndarray): The image to generate masks for, in HWC uint8 format. + + Returns: + list(dict(str, any)): A list over records for masks. Each record is + a dict containing the following keys: + segmentation (dict(str, any) or np.ndarray): The mask. If + output_mode='binary_mask', is an array of shape HW. Otherwise, + is a dictionary containing the RLE. + bbox (list(float)): The box around the mask, in XYWH format. + area (int): The area in pixels of the mask. + predicted_iou (float): The model's own prediction of the mask's + quality. This is filtered by the pred_iou_thresh parameter. + point_coords (list(list(float))): The point coordinates input + to the model to generate this mask. + stability_score (float): A measure of the mask's quality. This + is filtered on using the stability_score_thresh parameter. + crop_box (list(float)): The crop of the image used to generate + the mask, given in XYWH format. + """ # noqa + + # Generate masks + mask_data = self._generate_masks(image) + + # Filter small disconnected regions and holes in masks + if self.min_mask_region_area > 0: + mask_data = self.postprocess_small_regions( + mask_data, + self.min_mask_region_area, + max(self.box_nms_thresh, self.crop_nms_thresh), + ) + + # Encode masks + if self.output_mode == 'coco_rle': + mask_data['segmentations'] = [ + coco_encode_rle(rle) for rle in mask_data['rles'] + ] + elif self.output_mode == 'binary_mask': + mask_data['segmentations'] = [ + rle_to_mask(rle) for rle in mask_data['rles'] + ] + else: + mask_data['segmentations'] = mask_data['rles'] + + # Write mask records + curr_anns = [] + for idx in range(len(mask_data['segmentations'])): + ann = { + 'segmentation': + mask_data['segmentations'][idx], + 'area': + area_from_rle(mask_data['rles'][idx]), + 'bbox': + box_xyxy_to_xywh(mask_data['boxes'][idx]).tolist(), + 'predicted_iou': + mask_data['iou_preds'][idx].item(), + 'point_coords': [mask_data['points'][idx].tolist()], + 'stability_score': + mask_data['stability_score'][idx].item(), + 'crop_box': + box_xyxy_to_xywh(mask_data['crop_boxes'][idx]).tolist(), + } + curr_anns.append(ann) + + return curr_anns + + def _generate_masks(self, image: np.ndarray) -> MaskData: + orig_size = image.shape[:2] + crop_boxes, layer_idxs = generate_crop_boxes(orig_size, + self.crop_n_layers, + self.crop_overlap_ratio) + + # Iterate over image crops + data = MaskData() + for crop_box, layer_idx in zip(crop_boxes, layer_idxs): + crop_data = self._process_crop(image, crop_box, layer_idx, + orig_size) + data.cat(crop_data) + + # Remove duplicate masks between crops + if len(crop_boxes) > 1: + # Prefer masks from smaller crops + scores = 1 / box_area(data['crop_boxes']) + scores = scores.to(data['boxes'].device) + keep_by_nms = batched_nms( + data['boxes'].float(), + scores, + torch.zeros(len(data['boxes'])), # categories + iou_threshold=self.crop_nms_thresh, + ) + data.filter(keep_by_nms) + + data.to_numpy() + return data + + def _process_crop( + self, + image: np.ndarray, + crop_box: List[int], + crop_layer_idx: int, + orig_size: Tuple[int, ...], + ) -> MaskData: + # Crop the image and calculate embeddings + x0, y0, x1, y1 = crop_box + cropped_im = image[y0:y1, x0:x1, :] + cropped_im_size = cropped_im.shape[:2] + self.predictor.set_image(cropped_im) + + # Get points for this crop + points_scale = np.array(cropped_im_size)[None, ::-1] + points_for_image = self.point_grids[crop_layer_idx] * points_scale + + # Generate masks for this crop in batches + data = MaskData() + for (points, ) in 
batch_iterator(self.points_per_batch, + points_for_image): + batch_data = self._process_batch(points, cropped_im_size, crop_box, + orig_size) + data.cat(batch_data) + del batch_data + self.predictor.reset_image() + + # Remove duplicates within this crop. + keep_by_nms = batched_nms( + data['boxes'].float(), + data['iou_preds'], + torch.zeros(len(data['boxes'])), # categories + iou_threshold=self.box_nms_thresh, + ) + data.filter(keep_by_nms) + + # Return to the original image frame + data['boxes'] = uncrop_boxes_xyxy(data['boxes'], crop_box) + data['points'] = uncrop_points(data['points'], crop_box) + data['crop_boxes'] = torch.tensor( + [crop_box for _ in range(len(data['rles']))]) + + return data + + def _process_batch( + self, + points: np.ndarray, + im_size: Tuple[int, ...], + crop_box: List[int], + orig_size: Tuple[int, ...], + ) -> MaskData: + orig_h, orig_w = orig_size + + # Run model on this batch + transformed_points = self.predictor.transform.apply_coords( + points, im_size) + in_points = torch.as_tensor( + transformed_points, device=self.predictor.device) + in_labels = torch.ones( + in_points.shape[0], dtype=torch.int, device=in_points.device) + masks, iou_preds, _ = self.predictor.predict_torch( + in_points[:, None, :], + in_labels[:, None], + multimask_output=True, + return_logits=True, + ) + + # Serialize predictions and store in MaskData + data = MaskData( + masks=masks.flatten(0, 1), + iou_preds=iou_preds.flatten(0, 1), + points=torch.as_tensor(points.repeat(masks.shape[1], axis=0)), + ) + del masks + + # Filter by predicted IoU + if self.pred_iou_thresh > 0.0: + keep_mask = data['iou_preds'] > self.pred_iou_thresh + data.filter(keep_mask) + + # Calculate stability score + data['stability_score'] = calculate_stability_score( + data['masks'], self.predictor.model.mask_threshold, + self.stability_score_offset) + if self.stability_score_thresh > 0.0: + keep_mask = data['stability_score'] >= self.stability_score_thresh + data.filter(keep_mask) + + # Threshold masks and calculate boxes + data['masks'] = data['masks'] > self.predictor.model.mask_threshold + data['boxes'] = batched_mask_to_box(data['masks']) + + # Filter boxes that touch crop boundaries + keep_mask = ~is_box_near_crop_edge(data['boxes'], crop_box, + [0, 0, orig_w, orig_h]) + if not torch.all(keep_mask): + data.filter(keep_mask) + + # Compress to RLE + data['masks'] = uncrop_masks(data['masks'], crop_box, orig_h, orig_w) + data['rles'] = mask_to_rle_pytorch(data['masks']) + del data['masks'] + + return data + + @staticmethod + def postprocess_small_regions(mask_data: MaskData, min_area: int, + nms_thresh: float) -> MaskData: + """Removes small disconnected regions and holes in masks, then reruns + box NMS to remove any new duplicates. + + Edits mask_data in place. + + Requires open-cv as a dependency. 
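+
+        The rerun NMS scores unchanged masks above repaired ones, so
+        deduplication keeps masks that did not need postprocessing.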
+ """ + if len(mask_data['rles']) == 0: + return mask_data + + # Filter small disconnected regions and holes + new_masks = [] + scores = [] + for rle in mask_data['rles']: + mask = rle_to_mask(rle) + + mask, changed = remove_small_regions(mask, min_area, mode='holes') + unchanged = not changed + mask, changed = remove_small_regions( + mask, min_area, mode='islands') + unchanged = unchanged and not changed + + new_masks.append(torch.as_tensor(mask).unsqueeze(0)) + # Give score=0 to changed masks and score=1 to unchanged masks + # so NMS will prefer ones that didn't need postprocessing + scores.append(float(unchanged)) + + # Recalculate boxes and remove any new duplicates + masks = torch.cat(new_masks, dim=0) + boxes = batched_mask_to_box(masks) + keep_by_nms = batched_nms( + boxes.float(), + torch.as_tensor(scores), + torch.zeros(len(boxes)), # categories + iou_threshold=nms_thresh, + ) + + # Only recalculate RLEs for masks that have changed + for i_mask in keep_by_nms: + if scores[i_mask] == 0.0: + mask_torch = masks[i_mask].unsqueeze(0) + mask_data['rles'][i_mask] = mask_to_rle_pytorch(mask_torch)[0] + mask_data['boxes'][i_mask] = boxes[ + i_mask] # update res directly + mask_data.filter(keep_by_nms) + + return mask_data diff --git a/projects/sam_inference_demo/sam/utils/__init__.py b/projects/sam_inference_demo/sam/utils/__init__.py new file mode 100644 index 0000000000..5d33e33aee --- /dev/null +++ b/projects/sam_inference_demo/sam/utils/__init__.py @@ -0,0 +1,2 @@ +from .amg import * # noqa: F403 F401 +from .transforms import ResizeLongestSide # noqa: F403 F401 diff --git a/projects/sam_inference_demo/sam/utils/amg.py b/projects/sam_inference_demo/sam/utils/amg.py new file mode 100644 index 0000000000..3ba359901f --- /dev/null +++ b/projects/sam_inference_demo/sam/utils/amg.py @@ -0,0 +1,355 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +# https://github.com/facebookresearch/segment-anything + +import math +from copy import deepcopy +from itertools import product +from typing import Any, Dict, Generator, ItemsView, List, Tuple + +import numpy as np +import torch + + +class MaskData: + """A structure for storing masks and their related data in batched format. + + Implements basic filtering and concatenation. + """ + + def __init__(self, **kwargs) -> None: + for v in kwargs.values(): + assert isinstance( + v, (list, np.ndarray, torch.Tensor) + ), 'MaskData only supports list, numpy arrays, and torch tensors.' + self._stats = dict(**kwargs) + + def __setitem__(self, key: str, item: Any) -> None: + assert isinstance( + item, (list, np.ndarray, torch.Tensor) + ), 'MaskData only supports list, numpy arrays, and torch tensors.' 
+ self._stats[key] = item + + def __delitem__(self, key: str) -> None: + del self._stats[key] + + def __getitem__(self, key: str) -> Any: + return self._stats[key] + + def items(self) -> ItemsView[str, Any]: + return self._stats.items() + + def filter(self, keep: torch.Tensor) -> None: + for k, v in self._stats.items(): + if v is None: + self._stats[k] = None + elif isinstance(v, torch.Tensor): + self._stats[k] = v[torch.as_tensor(keep, device=v.device)] + elif isinstance(v, np.ndarray): + self._stats[k] = v[keep.detach().cpu().numpy()] + elif isinstance(v, list) and keep.dtype == torch.bool: + self._stats[k] = [a for i, a in enumerate(v) if keep[i]] + elif isinstance(v, list): + self._stats[k] = [v[i] for i in keep] + else: + raise TypeError( + f'MaskData key {k} has an unsupported type {type(v)}.') + + def cat(self, new_stats: 'MaskData') -> None: + for k, v in new_stats.items(): + if k not in self._stats or self._stats[k] is None: + self._stats[k] = deepcopy(v) + elif isinstance(v, torch.Tensor): + self._stats[k] = torch.cat([self._stats[k], v], dim=0) + elif isinstance(v, np.ndarray): + self._stats[k] = np.concatenate([self._stats[k], v], axis=0) + elif isinstance(v, list): + self._stats[k] = self._stats[k] + deepcopy(v) + else: + raise TypeError( + f'MaskData key {k} has an unsupported type {type(v)}.') + + def to_numpy(self) -> None: + for k, v in self._stats.items(): + if isinstance(v, torch.Tensor): + self._stats[k] = v.detach().cpu().numpy() + + +def is_box_near_crop_edge(boxes: torch.Tensor, + crop_box: List[int], + orig_box: List[int], + atol: float = 20.0) -> torch.Tensor: + """Filter masks at the edge of a crop, but not at the edge of the original + image.""" + crop_box_torch = torch.as_tensor( + crop_box, dtype=torch.float, device=boxes.device) + orig_box_torch = torch.as_tensor( + orig_box, dtype=torch.float, device=boxes.device) + boxes = uncrop_boxes_xyxy(boxes, crop_box).float() + near_crop_edge = torch.isclose( + boxes, crop_box_torch[None, :], atol=atol, rtol=0) + near_image_edge = torch.isclose( + boxes, orig_box_torch[None, :], atol=atol, rtol=0) + near_crop_edge = torch.logical_and(near_crop_edge, ~near_image_edge) + return torch.any(near_crop_edge, dim=1) + + +def box_xyxy_to_xywh(box_xyxy: torch.Tensor) -> torch.Tensor: + box_xywh = deepcopy(box_xyxy) + box_xywh[2] = box_xywh[2] - box_xywh[0] + box_xywh[3] = box_xywh[3] - box_xywh[1] + return box_xywh + + +def batch_iterator(batch_size: int, *args) -> Generator[List[Any], None, None]: + assert len(args) > 0 and all( + len(a) == len(args[0]) for a in + args), 'Batched iteration must have inputs of all the same size.' 
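+    # Ceil-division: a trailing partial batch is kept rather than dropped.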
+ n_batches = len(args[0]) // batch_size + int( + len(args[0]) % batch_size != 0) + for b in range(n_batches): + yield [arg[b * batch_size:(b + 1) * batch_size] for arg in args] + + +def mask_to_rle_pytorch(tensor: torch.Tensor) -> List[Dict[str, Any]]: + """Encodes masks to an uncompressed RLE, in the format expected by pycoco + tools.""" + # Put in fortran order and flatten h,w + b, h, w = tensor.shape + tensor = tensor.permute(0, 2, 1).flatten(1) + + # Compute change indices + diff = tensor[:, 1:] ^ tensor[:, :-1] + change_indices = diff.nonzero() + + # Encode run length + out = [] + for i in range(b): + cur_idxs = change_indices[change_indices[:, 0] == i, 1] + cur_idxs = torch.cat([ + torch.tensor([0], dtype=cur_idxs.dtype, device=cur_idxs.device), + cur_idxs + 1, + torch.tensor([h * w], dtype=cur_idxs.dtype, + device=cur_idxs.device), + ]) + btw_idxs = cur_idxs[1:] - cur_idxs[:-1] + counts = [] if tensor[i, 0] == 0 else [0] + counts.extend(btw_idxs.detach().cpu().tolist()) + out.append({'size': [h, w], 'counts': counts}) + return out + + +def rle_to_mask(rle: Dict[str, Any]) -> np.ndarray: + """Compute a binary mask from an uncompressed RLE.""" + h, w = rle['size'] + mask = np.empty(h * w, dtype=bool) + idx = 0 + parity = False + for count in rle['counts']: + mask[idx:idx + count] = parity + idx += count + parity ^= True + mask = mask.reshape(w, h) + return mask.transpose() # Put in C order + + +def area_from_rle(rle: Dict[str, Any]) -> int: + return sum(rle['counts'][1::2]) + + +def calculate_stability_score(masks: torch.Tensor, mask_threshold: float, + threshold_offset: float) -> torch.Tensor: + """Computes the stability score for a batch of masks. + + The stability score is the IoU between the binary masks obtained by + thresholding the predicted mask logits at high and low values. + """ + # One mask is always contained inside the other. + # Save memory by preventing unnecessary cast to torch.int64 + intersections = ((masks > (mask_threshold + threshold_offset)).sum( + -1, dtype=torch.int16).sum(-1, dtype=torch.int32)) + unions = ((masks > (mask_threshold - threshold_offset)).sum( + -1, dtype=torch.int16).sum(-1, dtype=torch.int32)) + return intersections / unions + + +def build_point_grid(n_per_side: int) -> np.ndarray: + """Generates a 2D grid of points evenly spaced in [0,1]x[0,1].""" + offset = 1 / (2 * n_per_side) + points_one_side = np.linspace(offset, 1 - offset, n_per_side) + points_x = np.tile(points_one_side[None, :], (n_per_side, 1)) + points_y = np.tile(points_one_side[:, None], (1, n_per_side)) + points = np.stack([points_x, points_y], axis=-1).reshape(-1, 2) + return points + + +def build_all_layer_point_grids(n_per_side: int, n_layers: int, + scale_per_layer: int) -> List[np.ndarray]: + """Generates point grids for all crop layers.""" + points_by_layer = [] + for i in range(n_layers + 1): + n_points = int(n_per_side / (scale_per_layer**i)) + points_by_layer.append(build_point_grid(n_points)) + return points_by_layer + + +def generate_crop_boxes( + im_size: Tuple[int, ...], n_layers: int, + overlap_ratio: float) -> Tuple[List[List[int]], List[int]]: + """Generates a list of crop boxes of different sizes. + + Each layer has (2**i)**2 boxes for the ith layer. 
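+
+    For example, n_layers=1 yields the full-image box plus a 2x2 grid of
+    overlapping crops, five boxes in total.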
+ """ + crop_boxes, layer_idxs = [], [] + im_h, im_w = im_size + short_side = min(im_h, im_w) + + # Original image + crop_boxes.append([0, 0, im_w, im_h]) + layer_idxs.append(0) + + def crop_len(orig_len, n_crops, overlap): + return int(math.ceil((overlap * (n_crops - 1) + orig_len) / n_crops)) + + for i_layer in range(n_layers): + n_crops_per_side = 2**(i_layer + 1) + overlap = int(overlap_ratio * short_side * (2 / n_crops_per_side)) + + crop_w = crop_len(im_w, n_crops_per_side, overlap) + crop_h = crop_len(im_h, n_crops_per_side, overlap) + + crop_box_x0 = [ + int((crop_w - overlap) * i) for i in range(n_crops_per_side) + ] + crop_box_y0 = [ + int((crop_h - overlap) * i) for i in range(n_crops_per_side) + ] + + # Crops in XYWH format + for x0, y0 in product(crop_box_x0, crop_box_y0): + box = [x0, y0, min(x0 + crop_w, im_w), min(y0 + crop_h, im_h)] + crop_boxes.append(box) + layer_idxs.append(i_layer + 1) + + return crop_boxes, layer_idxs + + +def uncrop_boxes_xyxy(boxes: torch.Tensor, + crop_box: List[int]) -> torch.Tensor: + x0, y0, _, _ = crop_box + offset = torch.tensor([[x0, y0, x0, y0]], device=boxes.device) + # Check if boxes has a channel dimension + if len(boxes.shape) == 3: + offset = offset.unsqueeze(1) + return boxes + offset + + +def uncrop_points(points: torch.Tensor, crop_box: List[int]) -> torch.Tensor: + x0, y0, _, _ = crop_box + offset = torch.tensor([[x0, y0]], device=points.device) + # Check if points has a channel dimension + if len(points.shape) == 3: + offset = offset.unsqueeze(1) + return points + offset + + +def uncrop_masks(masks: torch.Tensor, crop_box: List[int], orig_h: int, + orig_w: int) -> torch.Tensor: + x0, y0, x1, y1 = crop_box + if x0 == 0 and y0 == 0 and x1 == orig_w and y1 == orig_h: + return masks + # Coordinate transform masks + pad_x, pad_y = orig_w - (x1 - x0), orig_h - (y1 - y0) + pad = (x0, pad_x - x0, y0, pad_y - y0) + return torch.nn.functional.pad(masks, pad, value=0) + + +def remove_small_regions(mask: np.ndarray, area_thresh: float, + mode: str) -> Tuple[np.ndarray, bool]: + """Removes small disconnected regions and holes in a mask. + + Returns the mask and an indicator of if the mask has been modified. + """ + import cv2 # type: ignore + + assert mode in ['holes', 'islands'] + correct_holes = mode == 'holes' + working_mask = (correct_holes ^ mask).astype(np.uint8) + n_labels, regions, stats, _ = cv2.connectedComponentsWithStats( + working_mask, 8) + sizes = stats[:, -1][1:] # Row 0 is background label + small_regions = [i + 1 for i, s in enumerate(sizes) if s < area_thresh] + if len(small_regions) == 0: + return mask, False + fill_labels = [0] + small_regions + if not correct_holes: + fill_labels = [i for i in range(n_labels) if i not in fill_labels] + # If every region is below threshold, keep largest + if len(fill_labels) == 0: + fill_labels = [int(np.argmax(sizes)) + 1] + mask = np.isin(regions, fill_labels) + return mask, True + + +def coco_encode_rle(uncompressed_rle: Dict[str, Any]) -> Dict[str, Any]: + from pycocotools import mask as mask_utils # type: ignore + + h, w = uncompressed_rle['size'] + rle = mask_utils.frPyObjects(uncompressed_rle, h, w) + rle['counts'] = rle['counts'].decode( + 'utf-8') # Necessary to serialize with json + return rle + + +def batched_mask_to_box(masks: torch.Tensor) -> torch.Tensor: + """Calculates boxes in XYXY format around masks. + + Return [0,0,0,0] for an empty mask. For input shape C1xC2x...xHxW, the + output shape is C1xC2x...x4. 
+ """ + # torch.max below raises an error on empty inputs, just skip in this case + if torch.numel(masks) == 0: + return torch.zeros(*masks.shape[:-2], 4, device=masks.device) + + # Normalize shape to CxHxW + shape = masks.shape + h, w = shape[-2:] + if len(shape) > 2: + masks = masks.flatten(0, -3) + else: + masks = masks.unsqueeze(0) + + # Get top and bottom edges + in_height, _ = torch.max(masks, dim=-1) + in_height_coords = in_height * torch.arange( + h, device=in_height.device)[None, :] + bottom_edges, _ = torch.max(in_height_coords, dim=-1) + in_height_coords = in_height_coords + h * (~in_height) + top_edges, _ = torch.min(in_height_coords, dim=-1) + + # Get left and right edges + in_width, _ = torch.max(masks, dim=-2) + in_width_coords = in_width * torch.arange( + w, device=in_width.device)[None, :] + right_edges, _ = torch.max(in_width_coords, dim=-1) + in_width_coords = in_width_coords + w * (~in_width) + left_edges, _ = torch.min(in_width_coords, dim=-1) + + # If the mask is empty the right edge will be to the left of the left edge. + # Replace these boxes with [0, 0, 0, 0] + empty_filter = (right_edges < left_edges) | (bottom_edges < top_edges) + out = torch.stack([left_edges, top_edges, right_edges, bottom_edges], + dim=-1) + out = out * (~empty_filter).unsqueeze(-1) + + # Return to original shape + if len(shape) > 2: + out = out.reshape(*shape[:-2], 4) + else: + out = out[0] + + return out diff --git a/projects/sam_inference_demo/sam/utils/transforms.py b/projects/sam_inference_demo/sam/utils/transforms.py new file mode 100644 index 0000000000..484fd6691c --- /dev/null +++ b/projects/sam_inference_demo/sam/utils/transforms.py @@ -0,0 +1,110 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from copy import deepcopy +from typing import Tuple + +import numpy as np +import torch +from torch.nn import functional as F +from torchvision.transforms.functional import resize # type: ignore +from torchvision.transforms.functional import to_pil_image + +from mmseg.registry import TRANSFORMS + + +@TRANSFORMS.register_module() +class ResizeLongestSide: + """Resizes images to longest side 'target_length', as well as provides + methods for resizing coordinates and boxes. + + Provides methods for transforming both numpy array and batched torch + tensors. + """ + + def __init__(self, target_length: int) -> None: + self.target_length = target_length + + def apply_image(self, image: np.ndarray) -> np.ndarray: + """Expects a numpy array with shape HxWxC in uint8 format.""" + target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], + self.target_length) + return np.array(resize(to_pil_image(image), target_size)) + + def apply_coords(self, coords: np.ndarray, + original_size: Tuple[int, ...]) -> np.ndarray: + """Expects a numpy array of length 2 in the final dimension. + + Requires the original image size in (H, W) format. + """ + old_h, old_w = original_size + new_h, new_w = self.get_preprocess_shape(original_size[0], + original_size[1], + self.target_length) + coords = deepcopy(coords).astype(float) + coords[..., 0] = coords[..., 0] * (new_w / old_w) + coords[..., 1] = coords[..., 1] * (new_h / old_h) + return coords + + def apply_boxes(self, boxes: np.ndarray, + original_size: Tuple[int, ...]) -> np.ndarray: + """Expects a numpy array shape Bx4. + + Requires the original image size in (H, W) format. 
+ """ + boxes = self.apply_coords(boxes.reshape(-1, 2, 2), original_size) + return boxes.reshape(-1, 4) + + def apply_image_torch(self, image: torch.Tensor) -> torch.Tensor: + """Expects batched images with shape BxCxHxW and float format. + + This transformation may not exactly match apply_image. apply_image is + the transformation expected by the model. + """ + # Expects an image in BCHW format. May not exactly match apply_image. + target_size = self.get_preprocess_shape(image.shape[0], image.shape[1], + self.target_length) + return F.interpolate( + image, + target_size, + mode='bilinear', + align_corners=False, + antialias=True) + + def apply_coords_torch(self, coords: torch.Tensor, + original_size: Tuple[int, ...]) -> torch.Tensor: + """Expects a torch tensor with length 2 in the last dimension. + + Requires the original image size in (H, W) format. + """ + old_h, old_w = original_size + new_h, new_w = self.get_preprocess_shape(original_size[0], + original_size[1], + self.target_length) + coords = deepcopy(coords).to(torch.float) + coords[..., 0] = coords[..., 0] * (new_w / old_w) + coords[..., 1] = coords[..., 1] * (new_h / old_h) + return coords + + def apply_boxes_torch(self, boxes: torch.Tensor, + original_size: Tuple[int, ...]) -> torch.Tensor: + """Expects a torch tensor with shape Bx4. + + Requires the original image size in (H, W) format. + """ + boxes = self.apply_coords_torch(boxes.reshape(-1, 2, 2), original_size) + return boxes.reshape(-1, 4) + + @staticmethod + def get_preprocess_shape(oldh: int, oldw: int, + long_side_length: int) -> Tuple[int, int]: + """Compute the output size given input size and target long side + length.""" + scale = long_side_length * 1.0 / max(oldh, oldw) + newh, neww = oldh * scale, oldw * scale + neww = int(neww + 0.5) + newh = int(newh + 0.5) + return (newh, neww) diff --git a/projects/sam_inference_demo/sam_image_demo.ipynb b/projects/sam_inference_demo/sam_image_demo.ipynb new file mode 100644 index 0000000000..1cb433fae9 --- /dev/null +++ b/projects/sam_inference_demo/sam_image_demo.ipynb @@ -0,0 +1,122 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "import matplotlib.pyplot as plt\n", + "import cv2\n", + "\n", + "import sam # noqa: F401\n", + "from sam.sam_inferencer import SAMInferencer\n", + "\n", + "\n", + "def show_mask(mask, ax, random_color=False):\n", + " if random_color:\n", + " color = np.concatenate([np.random.random(3), np.array([0.6])], axis=0)\n", + " else:\n", + " color = np.array([30/255, 144/255, 255/255, 0.6])\n", + " h, w = mask.shape[-2:]\n", + " mask_image = mask.reshape(h, w, 1) * color.reshape(1, 1, -1)\n", + " ax.imshow(mask_image)\n", + " \n", + "def show_points(coords, labels, ax, marker_size=375):\n", + " pos_points = coords[labels==1]\n", + " neg_points = coords[labels==0]\n", + " ax.scatter(pos_points[:, 0], pos_points[:, 1], color='green', marker='*', s=marker_size, edgecolor='white', linewidth=1.25)\n", + " ax.scatter(neg_points[:, 0], neg_points[:, 1], color='red', marker='*', s=marker_size, edgecolor='white', linewidth=1.25) \n", + " \n", + "def show_box(box, ax):\n", + " x0, y0 = box[0], box[1]\n", + " w, h = box[2] - box[0], box[3] - box[1]\n", + " ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor='green', facecolor=(0,0,0,0), lw=2))\n", + "\n", + "image = cv2.imread('../../demo/demo.png')\n", + "image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)\n", + "plt.figure(figsize=(10,10))\n", + "plt.imshow(image)\n", 
+ "plt.axis('on')\n", + "plt.show()\n", + "print(image.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "inferencer = SAMInferencer(arch='huge')\n", + "inferencer.set_image(image)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "input_point = np.array([[280, 230], [500, 300]])\n", + "input_label = np.array([1, 1])\n", + "plt.figure(figsize=(10,10))\n", + "plt.imshow(image)\n", + "show_points(input_point, input_label, plt.gca())\n", + "plt.axis('on')\n", + "plt.show() " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "masks, scores, logits = inferencer.predict(\n", + " point_coords=input_point,\n", + " point_labels=input_label,\n", + " multimask_output=True,\n", + ")\n", + "for i, (mask, score) in enumerate(zip(masks, scores)):\n", + " plt.figure(figsize=(10,10))\n", + " plt.imshow(image)\n", + " show_mask(mask, plt.gca(), random_color=True)\n", + " show_points(input_point, input_label, plt.gca())\n", + " plt.title(f\"Mask {i+1}, Score: {score:.3f}\", fontsize=18)\n", + " plt.axis('off')\n", + " plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pt1.13", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.9" + }, + "orig_nbformat": 4 + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/projects/van/README.md b/projects/van/README.md new file mode 100644 index 0000000000..be0ba362fa --- /dev/null +++ b/projects/van/README.md @@ -0,0 +1,101 @@ +# Visual Attention Network (VAN) for Segmentation + +This repo is a PyTorch implementation of applying **VAN** (**Visual Attention Network**) to semantic segmentation. + +The code is an integration from [VAN-Segmentation](https://github.com/Visual-Attention-Network/VAN-Segmentation/blob/main/README.md?plain=1) + +More details can be found in [**Visual Attention Network**](https://arxiv.org/abs/2202.09741). + +## Citation + +```bib +@article{guo2022visual, + title={Visual Attention Network}, + author={Guo, Meng-Hao and Lu, Cheng-Ze and Liu, Zheng-Ning and Cheng, Ming-Ming and Hu, Shi-Min}, + journal={arXiv preprint arXiv:2202.09741}, + year={2022} +} +``` + +## Results + +**Notes**: Pre-trained models can be found in [TsingHua Cloud](https://cloud.tsinghua.edu.cn/d/0100f0cea37d41ba8d08/). + +Results can be found in [VAN-Segmentation](https://github.com/Visual-Attention-Network/VAN-Segmentation/blob/main/README.md?plain=1) + +We provide evaluation results of the converted weights. 
+
+| Method | Backbone | mIoU | Download |
+| :-----: | :----------: | :---: | :--------------------------------------------------------------------------------------------------------------------------------------------: |
+| UPerNet | VAN-B2 | 49.35 | [model](https://download.openmmlab.com/mmsegmentation/v0.5/van_3rdparty/van-b2-in1kpre_upernet_3rdparty_512x512-ade20k_20230522-19c58aee.pth) |
+| UPerNet | VAN-B3 | 49.71 | [model](https://download.openmmlab.com/mmsegmentation/v0.5/van_3rdparty/van-b3-in1kpre_upernet_3rdparty_512x512-ade20k_20230522-653bd6b7.pth) |
+| UPerNet | VAN-B4 | 51.56 | [model](https://download.openmmlab.com/mmsegmentation/v0.5/van_3rdparty/van-b4-in1kpre_upernet_3rdparty_512x512-ade20k_20230522-653bd6b7.pth) |
+| UPerNet | VAN-B4-in22k | 52.61 | [model](https://download.openmmlab.com/mmsegmentation/v0.5/van_3rdparty/van-b4-in22kpre_upernet_3rdparty_512x512-ade20k_20230522-4a4d744a.pth) |
+| UPerNet | VAN-B5-in22k | 53.11 | [model](https://download.openmmlab.com/mmsegmentation/v0.5/van_3rdparty/van-b5-in22kpre_upernet_3rdparty_512x512-ade20k_20230522-5bb6f2b4.pth) |
+| UPerNet | VAN-B6-in22k | 54.25 | [model](https://download.openmmlab.com/mmsegmentation/v0.5/van_3rdparty/van-b6-in22kpre_upernet_3rdparty_512x512-ade20k_20230522-e226b363.pth) |
+| FPN | VAN-B0 | 38.65 | [model](https://download.openmmlab.com/mmsegmentation/v0.5/van_3rdparty/van-b0-in1kpre_fpn_3rdparty_512x512-ade20k_20230522-75a76298.pth) |
+| FPN | VAN-B1 | 43.22 | [model](https://download.openmmlab.com/mmsegmentation/v0.5/van_3rdparty/van-b1-in1kpre_fpn_3rdparty_512x512-ade20k_20230522-104499ff.pth) |
+| FPN | VAN-B2 | 46.84 | [model](https://download.openmmlab.com/mmsegmentation/v0.5/van_3rdparty/van-b2-in1kpre_fpn_3rdparty_512x512-ade20k_20230522-7074e6f8.pth) |
+| FPN | VAN-B3 | 48.32 | [model](https://download.openmmlab.com/mmsegmentation/v0.5/van_3rdparty/van-b3-in1kpre_fpn_3rdparty_512x512-ade20k_20230522-2c3b7f5e.pth) |
+
+## Preparation
+
+Install MMSegmentation and download the ADE20K dataset following the MMSegmentation documentation.
+
+## Requirements
+
+**Step 0.** Install [MMCV](https://github.com/open-mmlab/mmcv) using [MIM](https://github.com/open-mmlab/mim).
+
+```shell
+pip install -U openmim
+mim install mmengine
+mim install "mmcv>=2.0.0"
+```
+
+**Step 1.** Install MMSegmentation.
+
+Case a: If you develop and run mmseg directly, install it from source:
+
+```shell
+git clone -b main https://github.com/open-mmlab/mmsegmentation.git
+cd mmsegmentation
+pip install -v -e .
+```
+
+Case b: If you use mmsegmentation as a dependency or third-party package, install it with pip:
+
+```shell
+pip install "mmsegmentation>=1.0.0"
+```
+
+## Training
+
+We use 4 GPUs for training by default. Run:
+
+```bash
+bash tools/dist_train.sh projects/van/configs/van/van-b2_upernet_4xb2-160k_ade20k-512x512.py 4
+```
+
+## Evaluation
+
+To evaluate a trained model, run, for example:
+
+```bash
+bash tools/dist_test.sh projects/van/configs/van/van-b2_upernet_4xb2-160k_ade20k-512x512.py work_dirs/van-b2_upernet_4xb2-160k_ade20k-512x512/iter_160000.pth 4
+```
+
+The evaluation metric (mIoU) is taken from the evaluator defined in the config.
+
+## FLOPs
+
+To calculate FLOPs for a model, run:
+
+```bash
+python tools/analysis_tools/get_flops.py projects/van/configs/van/van-b2_upernet_4xb2-160k_ade20k-512x512.py --shape 512 512
+```
+
+## Acknowledgment
+
+Our implementation is mainly based on [mmsegmentation](https://github.com/open-mmlab/mmsegmentation/tree/v0.12.0), [Swin-Transformer](https://github.com/SwinTransformer/Swin-Transformer-Semantic-Segmentation), [PoolFormer](https://github.com/sail-sg/poolformer), [Enjoy-Hamburger](https://github.com/Gsunshine/Enjoy-Hamburger) and [VAN-Segmentation](https://github.com/Visual-Attention-Network/VAN-Segmentation/blob/main/README.md?plain=1). Thanks to their authors.
+
+## License
+
+This repo is under the Apache-2.0 license. For commercial use, please contact the authors.
diff --git a/projects/van/backbones/__init__.py b/projects/van/backbones/__init__.py
new file mode 100644
index 0000000000..071995de29
--- /dev/null
+++ b/projects/van/backbones/__init__.py
@@ -0,0 +1,4 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from .van import VAN
+
+__all__ = ['VAN']
diff --git a/projects/van/backbones/van.py b/projects/van/backbones/van.py
new file mode 100644
index 0000000000..301834a758
--- /dev/null
+++ b/projects/van/backbones/van.py
@@ -0,0 +1,124 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+import warnings
+
+import torch
+import torch.nn as nn
+from mmengine.model import BaseModule
+
+from mmseg.models.backbones.mscan import (MSCAN, MSCABlock,
+                                          MSCASpatialAttention,
+                                          OverlapPatchEmbed)
+from mmseg.registry import MODELS
+
+
+class VANAttentionModule(BaseModule):
+
+    def __init__(self, in_channels):
+        super().__init__()
+        self.conv0 = nn.Conv2d(
+            in_channels, in_channels, 5, padding=2, groups=in_channels)
+        self.conv_spatial = nn.Conv2d(
+            in_channels,
+            in_channels,
+            7,
+            stride=1,
+            padding=9,
+            groups=in_channels,
+            dilation=3)
+        self.conv1 = nn.Conv2d(in_channels, in_channels, 1)
+
+    def forward(self, x):
+        u = x.clone()
+        attn = self.conv0(x)
+        attn = self.conv_spatial(attn)
+        attn = self.conv1(attn)
+        return u * attn
+
+
+class VANSpatialAttention(MSCASpatialAttention):
+
+    def __init__(self, in_channels, act_cfg=dict(type='GELU')):
+        super().__init__(in_channels, act_cfg=act_cfg)
+        self.spatial_gating_unit = VANAttentionModule(in_channels)
+
+
+class VANBlock(MSCABlock):
+
+    def __init__(self,
+                 channels,
+                 mlp_ratio=4.,
+                 drop=0.,
+                 drop_path=0.,
+                 act_cfg=dict(type='GELU'),
+                 norm_cfg=dict(type='SyncBN', requires_grad=True)):
+        super().__init__(
+            channels,
+            mlp_ratio=mlp_ratio,
+            drop=drop,
+            drop_path=drop_path,
+            act_cfg=act_cfg,
+            norm_cfg=norm_cfg)
+        self.attn = VANSpatialAttention(channels)
+
+
+@MODELS.register_module()
+class VAN(MSCAN):
+
+    def __init__(self,
+                 in_channels=3,
+                 embed_dims=[64, 128, 256, 512],
+                 mlp_ratios=[8, 8, 4, 4],
+                 drop_rate=0.,
+                 drop_path_rate=0.,
+                 depths=[3, 4, 6, 3],
+                 num_stages=4,
+                 act_cfg=dict(type='GELU'),
+                 norm_cfg=dict(type='SyncBN', requires_grad=True),
+                 pretrained=None,
+                 init_cfg=None):
+        super(MSCAN, self).__init__(init_cfg=init_cfg)
+
+        assert not (init_cfg and pretrained), \
+            'init_cfg and pretrained cannot be set at the
same time' + if isinstance(pretrained, str): + warnings.warn('DeprecationWarning: pretrained is deprecated, ' + 'please use "init_cfg" instead') + self.init_cfg = dict(type='Pretrained', checkpoint=pretrained) + elif pretrained is not None: + raise TypeError('pretrained must be a str or None') + + self.depths = depths + self.num_stages = num_stages + + # stochastic depth decay rule + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) + ] + cur = 0 + + for i in range(num_stages): + patch_embed = OverlapPatchEmbed( + patch_size=7 if i == 0 else 3, + stride=4 if i == 0 else 2, + in_channels=in_channels if i == 0 else embed_dims[i - 1], + embed_dim=embed_dims[i], + norm_cfg=norm_cfg) + + block = nn.ModuleList([ + VANBlock( + channels=embed_dims[i], + mlp_ratio=mlp_ratios[i], + drop=drop_rate, + drop_path=dpr[cur + j], + act_cfg=act_cfg, + norm_cfg=norm_cfg) for j in range(depths[i]) + ]) + norm = nn.LayerNorm(embed_dims[i]) + cur += depths[i] + + setattr(self, f'patch_embed{i + 1}', patch_embed) + setattr(self, f'block{i + 1}', block) + setattr(self, f'norm{i + 1}', norm) + + def init_weights(self): + return super().init_weights() diff --git a/projects/van/configs/_base_/datasets/ade20k.py b/projects/van/configs/_base_/datasets/ade20k.py new file mode 100644 index 0000000000..69b3c2a73b --- /dev/null +++ b/projects/van/configs/_base_/datasets/ade20k.py @@ -0,0 +1,14 @@ +# dataset settings +_base_ = '../../../../../configs/_base_/datasets/ade20k.py' + +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='Resize', scale=(2048, 512), keep_ratio=True), + dict(type='ResizeToMultiple', size_divisor=32), + # add loading annotation after ``Resize`` because ground truth + # does not need to do resize data transform + dict(type='LoadAnnotations', reduce_zero_label=True), + dict(type='PackSegInputs') +] +val_dataloader = dict(dataset=dict(pipeline=test_pipeline)) +test_dataloader = val_dataloader diff --git a/projects/van/configs/_base_/models/van_fpn.py b/projects/van/configs/_base_/models/van_fpn.py new file mode 100644 index 0000000000..c7fd7391f7 --- /dev/null +++ b/projects/van/configs/_base_/models/van_fpn.py @@ -0,0 +1,43 @@ +# model settings +norm_cfg = dict(type='SyncBN', requires_grad=True) + +data_preprocessor = dict( + type='SegDataPreProcessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + size=(512, 512)) +model = dict( + type='EncoderDecoder', + data_preprocessor=data_preprocessor, + backbone=dict( + type='VAN', + embed_dims=[32, 64, 160, 256], + drop_rate=0.0, + drop_path_rate=0.1, + depths=[3, 3, 5, 2], + act_cfg=dict(type='GELU'), + norm_cfg=norm_cfg, + init_cfg=dict()), + neck=dict( + type='FPN', + in_channels=[32, 64, 160, 256], + out_channels=256, + num_outs=4), + decode_head=dict( + type='FPNHead', + in_channels=[256, 256, 256, 256], + in_index=[0, 1, 2, 3], + feature_strides=[4, 8, 16, 32], + channels=128, + dropout_ratio=0.1, + num_classes=150, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/projects/van/configs/_base_/models/van_upernet.py b/projects/van/configs/_base_/models/van_upernet.py new file mode 100644 index 0000000000..8f94c0d9d8 --- /dev/null +++ b/projects/van/configs/_base_/models/van_upernet.py @@ -0,0 +1,51 @@ +# model settings +norm_cfg = dict(type='SyncBN', 
requires_grad=True) + +data_preprocessor = dict( + type='SegDataPreProcessor', + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.12, 57.375], + bgr_to_rgb=True, + pad_val=0, + seg_pad_val=255, + size=(512, 512)) +model = dict( + type='EncoderDecoder', + data_preprocessor=data_preprocessor, + backbone=dict( + type='VAN', + embed_dims=[32, 64, 160, 256], + drop_rate=0.0, + drop_path_rate=0.1, + depths=[3, 3, 5, 2], + act_cfg=dict(type='GELU'), + norm_cfg=norm_cfg, + init_cfg=dict()), + decode_head=dict( + type='UPerHead', + in_channels=[32, 64, 160, 256], + in_index=[0, 1, 2, 3], + pool_scales=(1, 2, 3, 6), + channels=512, + dropout_ratio=0.1, + num_classes=150, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)), + auxiliary_head=dict( + type='FCNHead', + in_channels=160, + in_index=2, + channels=256, + num_convs=1, + concat_input=False, + dropout_ratio=0.1, + num_classes=150, + norm_cfg=norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4)), + # model training and testing settings + train_cfg=dict(), + test_cfg=dict(mode='whole')) diff --git a/projects/van/configs/van/van-b0_fpn_8xb4-40k_ade20k-512x512.py b/projects/van/configs/van/van-b0_fpn_8xb4-40k_ade20k-512x512.py new file mode 100644 index 0000000000..2faf3788a7 --- /dev/null +++ b/projects/van/configs/van/van-b0_fpn_8xb4-40k_ade20k-512x512.py @@ -0,0 +1,8 @@ +_base_ = './van-b2_fpn_8xb4-40k_ade20k-512x512.py' +ckpt_path = 'https://download.openmmlab.com/mmsegmentation/v0.5/van_3rdparty/van-b0_3rdparty_20230522-956f5e0d.pth' # noqa +model = dict( + backbone=dict( + embed_dims=[32, 64, 160, 256], + depths=[3, 3, 5, 2], + init_cfg=dict(type='Pretrained', checkpoint=ckpt_path)), + neck=dict(in_channels=[32, 64, 160, 256])) diff --git a/projects/van/configs/van/van-b1_fpn_8xb4-40k_ade20k-512x512.py b/projects/van/configs/van/van-b1_fpn_8xb4-40k_ade20k-512x512.py new file mode 100644 index 0000000000..cf64a7138b --- /dev/null +++ b/projects/van/configs/van/van-b1_fpn_8xb4-40k_ade20k-512x512.py @@ -0,0 +1,6 @@ +_base_ = './van-b2_fpn_8xb4-40k_ade20k-512x512.py' +ckpt_path = 'https://download.openmmlab.com/mmsegmentation/v0.5/van_3rdparty/van-b1_3rdparty_20230522-3adb117f.pth' # noqa +model = dict( + backbone=dict( + depths=[2, 2, 4, 2], + init_cfg=dict(type='Pretrained', checkpoint=ckpt_path))) diff --git a/projects/van/configs/van/van-b2_fpn_8xb4-40k_ade20k-512x512.py b/projects/van/configs/van/van-b2_fpn_8xb4-40k_ade20k-512x512.py new file mode 100644 index 0000000000..965fa1cd36 --- /dev/null +++ b/projects/van/configs/van/van-b2_fpn_8xb4-40k_ade20k-512x512.py @@ -0,0 +1,53 @@ +_base_ = [ + '../_base_/models/van_fpn.py', + '../_base_/datasets/ade20k.py', + '../../../../configs/_base_/default_runtime.py', +] +custom_imports = dict(imports=['projects.van.backbones']) +ckpt_path = 'https://download.openmmlab.com/mmsegmentation/v0.5/van_3rdparty/van-b2_3rdparty_20230522-636fac93.pth' # noqa +model = dict( + type='EncoderDecoder', + backbone=dict( + embed_dims=[64, 128, 320, 512], + depths=[3, 3, 12, 3], + init_cfg=dict(type='Pretrained', checkpoint=ckpt_path), + drop_path_rate=0.2), + neck=dict(in_channels=[64, 128, 320, 512]), + decode_head=dict(num_classes=150)) + +train_dataloader = dict(batch_size=4) + +# we use 8 gpu instead of 4 in mmsegmentation, so lr*2 and max_iters/2 +gpu_multiples = 2 +max_iters = 80000 // gpu_multiples +interval = 8000 // gpu_multiples +optim_wrapper = dict( + 
type='OptimWrapper', + optimizer=dict( + type='AdamW', + lr=0.0001 * gpu_multiples, + # betas=(0.9, 0.999), + weight_decay=0.0001), + clip_grad=None) +# learning policy +param_scheduler = [ + dict( + type='PolyLR', + power=0.9, + eta_min=0.0, + begin=0, + end=max_iters, + by_epoch=False, + ) +] +train_cfg = dict( + type='IterBasedTrainLoop', max_iters=max_iters, val_interval=interval) +val_cfg = dict(type='ValLoop') +test_cfg = dict(type='TestLoop') +default_hooks = dict( + timer=dict(type='IterTimerHook'), + logger=dict(type='LoggerHook', interval=50, log_metric_by_epoch=False), + param_scheduler=dict(type='ParamSchedulerHook'), + checkpoint=dict(type='CheckpointHook', by_epoch=False, interval=interval), + sampler_seed=dict(type='DistSamplerSeedHook'), + visualization=dict(type='SegVisualizationHook')) diff --git a/projects/van/configs/van/van-b2_upernet_4xb2-160k_ade20k-512x512.py b/projects/van/configs/van/van-b2_upernet_4xb2-160k_ade20k-512x512.py new file mode 100644 index 0000000000..c529606a20 --- /dev/null +++ b/projects/van/configs/van/van-b2_upernet_4xb2-160k_ade20k-512x512.py @@ -0,0 +1,46 @@ +_base_ = [ + '../_base_/models/van_upernet.py', '../_base_/datasets/ade20k.py', + '../../../../configs/_base_/default_runtime.py', + '../../../../configs/_base_/schedules/schedule_160k.py' +] +custom_imports = dict(imports=['projects.van.backbones']) +ckpt_path = 'https://download.openmmlab.com/mmsegmentation/v0.5/van_3rdparty/van-b2_3rdparty_20230522-636fac93.pth' # noqa +model = dict( + type='EncoderDecoder', + backbone=dict( + embed_dims=[64, 128, 320, 512], + depths=[3, 3, 12, 3], + init_cfg=dict(type='Pretrained', checkpoint=ckpt_path)), + decode_head=dict(in_channels=[64, 128, 320, 512], num_classes=150), + auxiliary_head=dict(in_channels=320, num_classes=150)) + +# AdamW optimizer +# no weight decay for position embedding & layer norm in backbone +optim_wrapper = dict( + _delete_=True, + type='OptimWrapper', + optimizer=dict( + type='AdamW', lr=0.00006, betas=(0.9, 0.999), weight_decay=0.01), + clip_grad=None, + paramwise_cfg=dict( + custom_keys={ + 'absolute_pos_embed': dict(decay_mult=0.), + 'relative_position_bias_table': dict(decay_mult=0.), + 'norm': dict(decay_mult=0.) 
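+            # custom_keys are matched as substrings of parameter names, so
+            # every parameter whose name contains 'norm' skips weight decay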
+        }))
+# learning policy
+param_scheduler = [
+    dict(
+        type='LinearLR', start_factor=1e-6, by_epoch=False, begin=0, end=1500),
+    dict(
+        type='PolyLR',
+        power=1.0,
+        begin=1500,
+        end=_base_.train_cfg.max_iters,
+        eta_min=0.0,
+        by_epoch=False,
+    )
+]
+
+# By default, models are trained on 4 GPUs with 2 images per GPU
+train_dataloader = dict(batch_size=2)
diff --git a/projects/van/configs/van/van-b3_fpn_8xb4-40k_ade20k-512x512.py b/projects/van/configs/van/van-b3_fpn_8xb4-40k_ade20k-512x512.py
new file mode 100644
index 0000000000..b0493fe4f9
--- /dev/null
+++ b/projects/van/configs/van/van-b3_fpn_8xb4-40k_ade20k-512x512.py
@@ -0,0 +1,11 @@
+_base_ = './van-b2_fpn_8xb4-40k_ade20k-512x512.py'
+ckpt_path = 'https://download.openmmlab.com/mmsegmentation/v0.5/van_3rdparty/van-b3_3rdparty_20230522-a184e051.pth'  # noqa
+model = dict(
+    type='EncoderDecoder',
+    backbone=dict(
+        embed_dims=[64, 128, 320, 512],
+        depths=[3, 5, 27, 3],
+        init_cfg=dict(type='Pretrained', checkpoint=ckpt_path),
+        drop_path_rate=0.3),
+    neck=dict(in_channels=[64, 128, 320, 512]))
+train_dataloader = dict(batch_size=4)
diff --git a/projects/van/configs/van/van-b3_upernet_4xb2-160k_ade20k-512x512.py b/projects/van/configs/van/van-b3_upernet_4xb2-160k_ade20k-512x512.py
new file mode 100644
index 0000000000..8201801d99
--- /dev/null
+++ b/projects/van/configs/van/van-b3_upernet_4xb2-160k_ade20k-512x512.py
@@ -0,0 +1,8 @@
+_base_ = './van-b2_upernet_4xb2-160k_ade20k-512x512.py'
+ckpt_path = 'https://download.openmmlab.com/mmsegmentation/v0.5/van_3rdparty/van-b3_3rdparty_20230522-a184e051.pth'  # noqa
+model = dict(
+    type='EncoderDecoder',
+    backbone=dict(
+        depths=[3, 5, 27, 3],
+        init_cfg=dict(type='Pretrained', checkpoint=ckpt_path),
+        drop_path_rate=0.3))
diff --git a/projects/van/configs/van/van-b4-in22kpre_upernet_4xb4-160k_ade20k-512x512.py b/projects/van/configs/van/van-b4-in22kpre_upernet_4xb4-160k_ade20k-512x512.py
new file mode 100644
index 0000000000..15c8f7ca6e
--- /dev/null
+++ b/projects/van/configs/van/van-b4-in22kpre_upernet_4xb4-160k_ade20k-512x512.py
@@ -0,0 +1,10 @@
+_base_ = './van-b2_upernet_4xb2-160k_ade20k-512x512.py'
+ckpt_path = 'https://download.openmmlab.com/mmsegmentation/v0.5/van_3rdparty/van-b4-in22k_3rdparty_20230522-5e31cafb.pth'  # noqa
+model = dict(
+    backbone=dict(
+        depths=[3, 6, 40, 3],
+        init_cfg=dict(type='Pretrained', checkpoint=ckpt_path),
+        drop_path_rate=0.4))
+
+# By default, models are trained on 4 GPUs with 4 images per GPU
+train_dataloader = dict(batch_size=4)
diff --git a/projects/van/configs/van/van-b4_upernet_4xb4-160k_ade20k-512x512.py b/projects/van/configs/van/van-b4_upernet_4xb4-160k_ade20k-512x512.py
new file mode 100644
index 0000000000..33ae049d0c
--- /dev/null
+++ b/projects/van/configs/van/van-b4_upernet_4xb4-160k_ade20k-512x512.py
@@ -0,0 +1,10 @@
+_base_ = './van-b2_upernet_4xb2-160k_ade20k-512x512.py'
+ckpt_path = 'https://download.openmmlab.com/mmsegmentation/v0.5/van_3rdparty/van-b4_3rdparty_20230522-1d71c077.pth'  # noqa
+model = dict(
+    backbone=dict(
+        depths=[3, 6, 40, 3],
+        init_cfg=dict(type='Pretrained', checkpoint=ckpt_path),
+        drop_path_rate=0.4))
+
+# By default, models are trained on 4 GPUs with 4 images per GPU
+train_dataloader = dict(batch_size=4)
diff --git a/projects/van/configs/van/van-b5-in22kpre_upernet_4xb2-160k_ade20k-512x512.py b/projects/van/configs/van/van-b5-in22kpre_upernet_4xb2-160k_ade20k-512x512.py
new file mode 100644
index 0000000000..f36c6242bd
--- /dev/null
+++
b/projects/van/configs/van/van-b5-in22kpre_upernet_4xb2-160k_ade20k-512x512.py @@ -0,0 +1,10 @@ +_base_ = './van-b2_upernet_4xb2-160k_ade20k-512x512.py' +ckpt_path = 'https://download.openmmlab.com/mmsegmentation/v0.5/van_3rdparty/van-b5-in22k_3rdparty_20230522-b26134d7.pth' # noqa +model = dict( + backbone=dict( + embed_dims=[96, 192, 480, 768], + depths=[3, 3, 24, 3], + init_cfg=dict(type='Pretrained', checkpoint=ckpt_path), + drop_path_rate=0.4), + decode_head=dict(in_channels=[96, 192, 480, 768], num_classes=150), + auxiliary_head=dict(in_channels=480, num_classes=150)) diff --git a/projects/van/configs/van/van-b6-in22kpre_upernet_4xb2-160k_ade20k-512x512.py b/projects/van/configs/van/van-b6-in22kpre_upernet_4xb2-160k_ade20k-512x512.py new file mode 100644 index 0000000000..aa529efed8 --- /dev/null +++ b/projects/van/configs/van/van-b6-in22kpre_upernet_4xb2-160k_ade20k-512x512.py @@ -0,0 +1,10 @@ +_base_ = './van-b2_upernet_4xb2-160k_ade20k-512x512.py' +ckpt_path = 'https://download.openmmlab.com/mmsegmentation/v0.5/van_3rdparty/van-b6-in22k_3rdparty_20230522-5e5172a3.pth' # noqa +model = dict( + backbone=dict( + embed_dims=[96, 192, 384, 768], + depths=[6, 6, 90, 6], + init_cfg=dict(type='Pretrained', checkpoint=ckpt_path), + drop_path_rate=0.5), + decode_head=dict(in_channels=[96, 192, 384, 768], num_classes=150), + auxiliary_head=dict(in_channels=384, num_classes=150)) diff --git a/requirements.txt b/requirements.txt index 6da5adea75..501bddc884 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,4 @@ -r requirements/optional.txt -r requirements/runtime.txt -r requirements/tests.txt +-r requirements/multimodal.txt diff --git a/requirements/albu.txt b/requirements/albu.txt new file mode 100644 index 0000000000..f421fbbdc4 --- /dev/null +++ b/requirements/albu.txt @@ -0,0 +1 @@ +albumentations>=0.3.2 --no-binary qudida,albumentations diff --git a/requirements/docs.txt b/requirements/docs.txt index 20170845c4..19632d36ab 100644 --- a/requirements/docs.txt +++ b/requirements/docs.txt @@ -1,6 +1,7 @@ docutils==0.16.0 myst-parser --e git+https://github.com/gaotongxiao/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme +-e git+https://github.com/open-mmlab/pytorch_sphinx_theme.git#egg=pytorch_sphinx_theme sphinx==4.0.2 sphinx_copybutton sphinx_markdown_tables +urllib3<2.0.0 diff --git a/requirements/mminstall.txt b/requirements/mminstall.txt index bd43faf87e..5732d345bb 100644 --- a/requirements/mminstall.txt +++ b/requirements/mminstall.txt @@ -1,2 +1,2 @@ -mmcls>=0.20.1 -mmcv-full>=1.4.4,<=1.6.0 +mmcv>=2.0.0rc4,<2.2.0 +mmengine>=0.5.0,<1.0.0 diff --git a/requirements/multimodal.txt b/requirements/multimodal.txt new file mode 100644 index 0000000000..2195d0d9ef --- /dev/null +++ b/requirements/multimodal.txt @@ -0,0 +1,2 @@ +ftfy +regex diff --git a/requirements/optional.txt b/requirements/optional.txt index 47fa593315..b0310f5296 100644 --- a/requirements/optional.txt +++ b/requirements/optional.txt @@ -1 +1,22 @@ cityscapesscripts +-e git+https://github.com/openai/CLIP.git@main#egg=clip + +# for vpd model +diffusers +einops==0.3.0 +imageio==2.9.0 +imageio-ffmpeg==0.4.2 +invisible-watermark +kornia==0.6 +-e git+https://github.com/CompVis/stable-diffusion@21f890f#egg=latent-diffusion +nibabel +omegaconf==2.1.1 +pudb==2019.2 +pytorch-lightning==1.4.2 +streamlit>=0.73.1 +-e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers +test-tube>=0.7.5 +timm +torch-fidelity==0.3.0 +torchmetrics==0.6.0 +transformers==4.19.2 diff --git 
a/requirements/readthedocs.txt b/requirements/readthedocs.txt index 22a894bd71..9627504884 100644 --- a/requirements/readthedocs.txt +++ b/requirements/readthedocs.txt @@ -1,4 +1,6 @@ -mmcv +mmcv>=2.0.0rc1,<2.1.0 +mmengine>=0.4.0,<1.0.0 prettytable +scipy torch torchvision diff --git a/requirements/runtime.txt b/requirements/runtime.txt index 520408fe8b..3e242581e9 100644 --- a/requirements/runtime.txt +++ b/requirements/runtime.txt @@ -1,5 +1,5 @@ matplotlib -mmcls>=0.20.1 numpy packaging prettytable +scipy diff --git a/requirements/tests.txt b/requirements/tests.txt index 74fc76146d..3fff2520d7 100644 --- a/requirements/tests.txt +++ b/requirements/tests.txt @@ -1,6 +1,8 @@ codecov flake8 +ftfy interrogate pytest +regex xdoctest>=0.10.0 yapf diff --git a/resources/cascade_encoder_decoder_dataflow.png b/resources/cascade_encoder_decoder_dataflow.png new file mode 100644 index 0000000000..28e33d0527 Binary files /dev/null and b/resources/cascade_encoder_decoder_dataflow.png differ diff --git a/resources/encoder_decoder_dataflow.png b/resources/encoder_decoder_dataflow.png new file mode 100644 index 0000000000..33a8a49163 Binary files /dev/null and b/resources/encoder_decoder_dataflow.png differ diff --git a/resources/miaomiao_qrcode.jpg b/resources/miaomiao_qrcode.jpg new file mode 100644 index 0000000000..d34cbae6fd Binary files /dev/null and b/resources/miaomiao_qrcode.jpg differ diff --git a/resources/test_step.png b/resources/test_step.png new file mode 100644 index 0000000000..4d52351b85 Binary files /dev/null and b/resources/test_step.png differ diff --git a/resources/train_step.png b/resources/train_step.png new file mode 100644 index 0000000000..1e06105a06 Binary files /dev/null and b/resources/train_step.png differ diff --git a/setup.cfg b/setup.cfg index 23cb09e698..2ea07600c0 100644 --- a/setup.cfg +++ b/setup.cfg @@ -16,4 +16,4 @@ default_section = THIRDPARTY skip = *.po,*.ts,*.ipynb count = quiet-level = 3 -ignore-words-list = formating,sur,hist,dota +ignore-words-list = formating,sur,hist,dota,warmup,damon diff --git a/setup.py b/setup.py index 91afefb6ed..45d923db60 100755 --- a/setup.py +++ b/setup.py @@ -18,7 +18,7 @@ def readme(): def get_version(): - with open(version_file, 'r') as f: + with open(version_file) as f: exec(compile(f.read(), version_file, 'exec')) return locals()['__version__'] @@ -74,12 +74,11 @@ def parse_line(line): yield info def parse_require_file(fpath): - with open(fpath, 'r') as f: + with open(fpath) as f: for line in f.readlines(): line = line.strip() if line and not line.startswith('#'): - for info in parse_line(line): - yield info + yield from parse_line(line) def gen_packages_items(): if exists(require_fpath): @@ -124,7 +123,7 @@ def add_mim_extension(): else: return - filenames = ['tools', 'configs', 'model-index.yml'] + filenames = ['tools', 'configs', 'model-index.yml', 'dataset-index.yml'] repo_path = osp.dirname(__file__) mim_path = osp.join(repo_path, 'mmseg', '.mim') os.makedirs(mim_path, exist_ok=True) @@ -176,7 +175,7 @@ def add_mim_extension(): author='MMSegmentation Contributors', author_email='openmmlab@gmail.com', keywords='computer vision, semantic segmentation', - url='http://github.com/open-mmlab/mmsegmentation', + url='https://github.com/open-mmlab/mmsegmentation', packages=find_packages(exclude=('configs', 'tools', 'demo')), include_package_data=True, classifiers=[ @@ -193,8 +192,9 @@ def add_mim_extension(): extras_require={ 'all': parse_requirements('requirements.txt'), 'tests': parse_requirements('requirements/tests.txt'), - 
'build': parse_requirements('requirements/build.txt'), 'optional': parse_requirements('requirements/optional.txt'), + 'mim': parse_requirements('requirements/mminstall.txt'), + 'multimodal': parse_requirements('requirements/multimodal.txt'), }, ext_modules=[], zip_safe=False) diff --git a/tests/data/biomedical.nii.gz b/tests/data/biomedical.nii.gz new file mode 100755 index 0000000000..32f3276d9e Binary files /dev/null and b/tests/data/biomedical.nii.gz differ diff --git a/tests/data/biomedical.npy b/tests/data/biomedical.npy new file mode 100644 index 0000000000..481944493d Binary files /dev/null and b/tests/data/biomedical.npy differ diff --git a/tests/data/biomedical.pkl b/tests/data/biomedical.pkl new file mode 100644 index 0000000000..48c32a7cef Binary files /dev/null and b/tests/data/biomedical.pkl differ diff --git a/tests/data/biomedical_ann.nii.gz b/tests/data/biomedical_ann.nii.gz new file mode 100755 index 0000000000..5eae8a4a49 Binary files /dev/null and b/tests/data/biomedical_ann.nii.gz differ diff --git a/tests/data/dataset.json b/tests/data/dataset.json new file mode 100755 index 0000000000..09b01235ec --- /dev/null +++ b/tests/data/dataset.json @@ -0,0 +1,30 @@ +{ + "name": "BRATS", + "description": "Gliomas segmentation tumour and oedema in on brain images", + "tensorImageSize": "4D", + "modality": { + "0": "FLAIR", + "1": "T1w", + "2": "t1gd", + "3": "T2w" + }, + "labels": { + "0": "background", + "1": "edema", + "2": "non-enhancing tumor", + "3": "enhancing tumour" + }, + "numTraining": 484, + "numTest": 266, + "training": [ + { + "image": "./imagesTr/BRATS_457.nii.gz", + "label": "./labelsTr/BRATS_457.nii.gz" + } + ], + "test": [ + "./imagesTs/BRATS_568.nii.gz", + "./imagesTs/BRATS_515.nii.gz", + "./imagesTs/BRATS_576.nii.gz" + ] +} diff --git a/tests/data/dsdl_seg/config.py b/tests/data/dsdl_seg/config.py new file mode 100755 index 0000000000..8eed751c2f --- /dev/null +++ b/tests/data/dsdl_seg/config.py @@ -0,0 +1,13 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
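+# Two interchangeable file-reader backends for the DSDL test data: a local
+# reader pointing at an example VOC2012 directory, and an Aliyun OSS reader
+# whose credential fields below are placeholders to be filled in.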
+local = dict( + type='LocalFileReader', + working_dir='/nvme/share_data/VOC2012', +) + +ali_oss = dict( + type='AliOSSFileReader', + access_key_secret='your secret key of aliyun oss', + endpoint='your endpoint of aliyun oss', + access_key_id='your access key of aliyun oss', + bucket_name='your bucket name of aliyun oss', + working_dir='the relative path of your media dir in the bucket') diff --git a/tests/data/dsdl_seg/defs/class-dom.yaml b/tests/data/dsdl_seg/defs/class-dom.yaml new file mode 100755 index 0000000000..e5dd598c4a --- /dev/null +++ b/tests/data/dsdl_seg/defs/class-dom.yaml @@ -0,0 +1,24 @@ +$dsdl-version: "0.5.0" +VOCClassDom: + $def: class_domain + classes: + - aeroplane + - bicycle + - bird + - boat + - bottle + - bus + - car + - cat + - chair + - cow + - diningtable + - dog + - horse + - motorbike + - person + - pottedplant + - sheep + - sofa + - train + - tvmonitor diff --git a/tests/data/dsdl_seg/defs/segmentation-def.yaml b/tests/data/dsdl_seg/defs/segmentation-def.yaml new file mode 100755 index 0000000000..057139ed57 --- /dev/null +++ b/tests/data/dsdl_seg/defs/segmentation-def.yaml @@ -0,0 +1,15 @@ +$dsdl-version: "0.5.0" + +ImageMedia: + $def: struct + $fields: + image: Image + image_shape: ImageShape + +SegmentationSample: + $def: struct + $params: ['cdom'] + $fields: + media: ImageMedia + label_map: LabelMap[dom=$cdom] + instance_map: InstanceMap diff --git a/tests/data/dsdl_seg/set-train/train.yaml b/tests/data/dsdl_seg/set-train/train.yaml new file mode 100755 index 0000000000..69872445a5 --- /dev/null +++ b/tests/data/dsdl_seg/set-train/train.yaml @@ -0,0 +1,15 @@ +$dsdl-version: "0.5.0" +$import: + - ../defs/segmentation-def + - ../defs/class-dom +meta: + dataset_name: "VOC2012" + sub_dataset_name: "train" + task_type: "Segmentation" + dataset_homepage: "http://host.robots.ox.ac.uk/pascal/VOC/voc2012/index.html" + dataset_publisher: "University of Leeds | ETHZ, Zurich | University of Edinburgh\ + \ |Microsoft Research Cambridge | University of Oxford" + OpenDataLab_address: "https://opendatalab.com/PASCAL_VOC2012/download" +data: + sample-type: SegmentationSample[cdom=VOCClassDom] + sample-path: train_samples.json diff --git a/tests/data/dsdl_seg/set-train/train_samples.json b/tests/data/dsdl_seg/set-train/train_samples.json new file mode 100755 index 0000000000..559f584572 --- /dev/null +++ b/tests/data/dsdl_seg/set-train/train_samples.json @@ -0,0 +1 @@ +{"samples": [{"media": {"image": "JPEGImages/2007_000032.jpg", "image_shape": [281, 500]}, "label_map": "SegmentationClass/2007_000032.png", "instance_map": "SegmentationObject/2007_000032.png"}, {"media": {"image": "JPEGImages/2007_000039.jpg", "image_shape": [375, 500]}, "label_map": "SegmentationClass/2007_000039.png", "instance_map": "SegmentationObject/2007_000039.png"}, {"media": {"image": "JPEGImages/2007_000063.jpg", "image_shape": [375, 500]}, "label_map": "SegmentationClass/2007_000063.png", "instance_map": "SegmentationObject/2007_000063.png"}]} diff --git a/tests/data/pseudo_bdd100k_dataset/images/10k/train/0004a4c0-d4dff0ad.jpg b/tests/data/pseudo_bdd100k_dataset/images/10k/train/0004a4c0-d4dff0ad.jpg new file mode 100644 index 0000000000..4724a3d930 Binary files /dev/null and b/tests/data/pseudo_bdd100k_dataset/images/10k/train/0004a4c0-d4dff0ad.jpg differ diff --git a/tests/data/pseudo_bdd100k_dataset/images/10k/train/00054602-3bf57337.jpg b/tests/data/pseudo_bdd100k_dataset/images/10k/train/00054602-3bf57337.jpg new file mode 100644 index 0000000000..5efe06b99a Binary files /dev/null and 
b/tests/data/pseudo_bdd100k_dataset/images/10k/train/00054602-3bf57337.jpg differ diff --git a/tests/data/pseudo_bdd100k_dataset/images/10k/train/00067cfb-e535423e.jpg b/tests/data/pseudo_bdd100k_dataset/images/10k/train/00067cfb-e535423e.jpg new file mode 100644 index 0000000000..2233c03c76 Binary files /dev/null and b/tests/data/pseudo_bdd100k_dataset/images/10k/train/00067cfb-e535423e.jpg differ diff --git a/tests/data/pseudo_bdd100k_dataset/images/10k/val/7d06fefd-f7be05a6.jpg b/tests/data/pseudo_bdd100k_dataset/images/10k/val/7d06fefd-f7be05a6.jpg new file mode 100644 index 0000000000..535087b5a9 Binary files /dev/null and b/tests/data/pseudo_bdd100k_dataset/images/10k/val/7d06fefd-f7be05a6.jpg differ diff --git a/tests/data/pseudo_bdd100k_dataset/images/10k/val/7d128593-0ccfea4c.jpg b/tests/data/pseudo_bdd100k_dataset/images/10k/val/7d128593-0ccfea4c.jpg new file mode 100644 index 0000000000..7f2971afde Binary files /dev/null and b/tests/data/pseudo_bdd100k_dataset/images/10k/val/7d128593-0ccfea4c.jpg differ diff --git a/tests/data/pseudo_bdd100k_dataset/images/10k/val/7d15b18b-1e0d6e3f.jpg b/tests/data/pseudo_bdd100k_dataset/images/10k/val/7d15b18b-1e0d6e3f.jpg new file mode 100644 index 0000000000..31a951d483 Binary files /dev/null and b/tests/data/pseudo_bdd100k_dataset/images/10k/val/7d15b18b-1e0d6e3f.jpg differ diff --git a/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/colormaps/train/0004a4c0-d4dff0ad.png b/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/colormaps/train/0004a4c0-d4dff0ad.png new file mode 100644 index 0000000000..086a8d5064 Binary files /dev/null and b/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/colormaps/train/0004a4c0-d4dff0ad.png differ diff --git a/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/colormaps/train/00054602-3bf57337.png b/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/colormaps/train/00054602-3bf57337.png new file mode 100644 index 0000000000..43338c283c Binary files /dev/null and b/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/colormaps/train/00054602-3bf57337.png differ diff --git a/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/colormaps/train/00067cfb-e535423e.png b/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/colormaps/train/00067cfb-e535423e.png new file mode 100644 index 0000000000..7c0ad1d5d9 Binary files /dev/null and b/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/colormaps/train/00067cfb-e535423e.png differ diff --git a/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/colormaps/val/7d128593-0ccfea4c.png b/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/colormaps/val/7d128593-0ccfea4c.png new file mode 100644 index 0000000000..43338c283c Binary files /dev/null and b/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/colormaps/val/7d128593-0ccfea4c.png differ diff --git a/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/colormaps/val/7d15b18b-1e0d6e3f.png b/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/colormaps/val/7d15b18b-1e0d6e3f.png new file mode 100644 index 0000000000..43338c283c Binary files /dev/null and b/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/colormaps/val/7d15b18b-1e0d6e3f.png differ diff --git a/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/colormaps/val/7d2f7975-e0c1c5a7.png b/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/colormaps/val/7d2f7975-e0c1c5a7.png new file mode 100644 index 0000000000..7c0ad1d5d9 Binary files /dev/null and b/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/colormaps/val/7d2f7975-e0c1c5a7.png differ diff --git 
a/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/masks/train/0004a4c0-d4dff0ad.png b/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/masks/train/0004a4c0-d4dff0ad.png new file mode 100644 index 0000000000..5c6bf5e158 Binary files /dev/null and b/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/masks/train/0004a4c0-d4dff0ad.png differ diff --git a/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/masks/train/00054602-3bf57337.png b/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/masks/train/00054602-3bf57337.png new file mode 100644 index 0000000000..c525a76888 Binary files /dev/null and b/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/masks/train/00054602-3bf57337.png differ diff --git a/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/masks/train/00067cfb-e535423e.png b/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/masks/train/00067cfb-e535423e.png new file mode 100644 index 0000000000..7dfd3af4e3 Binary files /dev/null and b/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/masks/train/00067cfb-e535423e.png differ diff --git a/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/masks/val/7d06fefd-f7be05a6.png b/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/masks/val/7d06fefd-f7be05a6.png new file mode 100644 index 0000000000..7dfd3af4e3 Binary files /dev/null and b/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/masks/val/7d06fefd-f7be05a6.png differ diff --git a/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/masks/val/7d128593-0ccfea4c.png b/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/masks/val/7d128593-0ccfea4c.png new file mode 100644 index 0000000000..c525a76888 Binary files /dev/null and b/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/masks/val/7d128593-0ccfea4c.png differ diff --git a/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/masks/val/7d15b18b-1e0d6e3f.png b/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/masks/val/7d15b18b-1e0d6e3f.png new file mode 100644 index 0000000000..c525a76888 Binary files /dev/null and b/tests/data/pseudo_bdd100k_dataset/labels/sem_seg/masks/val/7d15b18b-1e0d6e3f.png differ diff --git a/tests/data/pseudo_lip_dataset/train_images/684_2150041.jpg b/tests/data/pseudo_lip_dataset/train_images/684_2150041.jpg new file mode 100644 index 0000000000..d6ac13a992 Binary files /dev/null and b/tests/data/pseudo_lip_dataset/train_images/684_2150041.jpg differ diff --git a/tests/data/pseudo_lip_dataset/train_segmentations/684_2150041.png b/tests/data/pseudo_lip_dataset/train_segmentations/684_2150041.png new file mode 100644 index 0000000000..47271e2cab Binary files /dev/null and b/tests/data/pseudo_lip_dataset/train_segmentations/684_2150041.png differ diff --git a/tests/data/pseudo_lip_dataset/val_images/86_185913.jpg b/tests/data/pseudo_lip_dataset/val_images/86_185913.jpg new file mode 100644 index 0000000000..7f66845a7b Binary files /dev/null and b/tests/data/pseudo_lip_dataset/val_images/86_185913.jpg differ diff --git a/tests/data/pseudo_lip_dataset/val_segmentations/86_185913.png b/tests/data/pseudo_lip_dataset/val_segmentations/86_185913.png new file mode 100644 index 0000000000..0708e53902 Binary files /dev/null and b/tests/data/pseudo_lip_dataset/val_segmentations/86_185913.png differ diff --git a/tests/data/pseudo_mapillary_dataset/images/__CRyFzoDOXn6unQ6a3DnQ.jpg b/tests/data/pseudo_mapillary_dataset/images/__CRyFzoDOXn6unQ6a3DnQ.jpg new file mode 100644 index 0000000000..c3cf31a170 Binary files /dev/null and b/tests/data/pseudo_mapillary_dataset/images/__CRyFzoDOXn6unQ6a3DnQ.jpg differ diff --git 
a/tests/data/pseudo_mapillary_dataset/v1.2/__CRyFzoDOXn6unQ6a3DnQ.png b/tests/data/pseudo_mapillary_dataset/v1.2/__CRyFzoDOXn6unQ6a3DnQ.png new file mode 100644 index 0000000000..2c648b7ef8 Binary files /dev/null and b/tests/data/pseudo_mapillary_dataset/v1.2/__CRyFzoDOXn6unQ6a3DnQ.png differ diff --git a/tests/data/pseudo_mapillary_dataset/v2.0/__CRyFzoDOXn6unQ6a3DnQ.png b/tests/data/pseudo_mapillary_dataset/v2.0/__CRyFzoDOXn6unQ6a3DnQ.png new file mode 100644 index 0000000000..809256d931 Binary files /dev/null and b/tests/data/pseudo_mapillary_dataset/v2.0/__CRyFzoDOXn6unQ6a3DnQ.png differ diff --git a/tests/data/pseudo_nyu_dataset/annotations/bookstore_0001d_00001.png b/tests/data/pseudo_nyu_dataset/annotations/bookstore_0001d_00001.png new file mode 100644 index 0000000000..77e343603a Binary files /dev/null and b/tests/data/pseudo_nyu_dataset/annotations/bookstore_0001d_00001.png differ diff --git a/tests/data/pseudo_nyu_dataset/images/bookstore_0001d_00001.jpg b/tests/data/pseudo_nyu_dataset/images/bookstore_0001d_00001.jpg new file mode 100644 index 0000000000..7892ed47e7 Binary files /dev/null and b/tests/data/pseudo_nyu_dataset/images/bookstore_0001d_00001.jpg differ diff --git a/tests/data/pseudo_refuge_dataset/ann_dir/pseudo_g0001.png b/tests/data/pseudo_refuge_dataset/ann_dir/pseudo_g0001.png new file mode 100644 index 0000000000..4e69365a9c Binary files /dev/null and b/tests/data/pseudo_refuge_dataset/ann_dir/pseudo_g0001.png differ diff --git a/tests/data/pseudo_refuge_dataset/img_dir/pseudo_g0001.png b/tests/data/pseudo_refuge_dataset/img_dir/pseudo_g0001.png new file mode 100644 index 0000000000..e424c3cd21 Binary files /dev/null and b/tests/data/pseudo_refuge_dataset/img_dir/pseudo_g0001.png differ diff --git a/tests/data/pseudo_synapse_dataset/ann_dir/case0005_slice000.png b/tests/data/pseudo_synapse_dataset/ann_dir/case0005_slice000.png new file mode 100644 index 0000000000..a22059b58e Binary files /dev/null and b/tests/data/pseudo_synapse_dataset/ann_dir/case0005_slice000.png differ diff --git a/tests/data/pseudo_synapse_dataset/ann_dir/case0005_slice001.png b/tests/data/pseudo_synapse_dataset/ann_dir/case0005_slice001.png new file mode 100644 index 0000000000..a22059b58e Binary files /dev/null and b/tests/data/pseudo_synapse_dataset/ann_dir/case0005_slice001.png differ diff --git a/tests/data/pseudo_synapse_dataset/img_dir/case0005_slice000.jpg b/tests/data/pseudo_synapse_dataset/img_dir/case0005_slice000.jpg new file mode 100644 index 0000000000..51609926b4 Binary files /dev/null and b/tests/data/pseudo_synapse_dataset/img_dir/case0005_slice000.jpg differ diff --git a/tests/data/pseudo_synapse_dataset/img_dir/case0005_slice001.jpg b/tests/data/pseudo_synapse_dataset/img_dir/case0005_slice001.jpg new file mode 100644 index 0000000000..e285b8c7f0 Binary files /dev/null and b/tests/data/pseudo_synapse_dataset/img_dir/case0005_slice001.jpg differ diff --git a/tests/test_apis/test_inferencer.py b/tests/test_apis/test_inferencer.py new file mode 100644 index 0000000000..d8dbce8f38 --- /dev/null +++ b/tests/test_apis/test_inferencer.py @@ -0,0 +1,60 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
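+# Smoke test for MMSegInferencer: build a tiny example model, dump its state
+# dict to a temporary checkpoint, then run single-image and batched inference
+# on random inputs.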
+import tempfile + +import numpy as np +import torch +from mmengine import ConfigDict +from utils import * # noqa: F401, F403 + +from mmseg.apis import MMSegInferencer +from mmseg.registry import MODELS +from mmseg.utils import register_all_modules + + +def test_inferencer(): + register_all_modules() + + visualizer = dict( + type='SegLocalVisualizer', + vis_backends=[dict(type='LocalVisBackend')], + name='visualizer') + + cfg_dict = dict( + model=dict( + type='InferExampleModel', + data_preprocessor=dict(type='SegDataPreProcessor'), + backbone=dict(type='InferExampleBackbone'), + decode_head=dict(type='InferExampleHead'), + test_cfg=dict(mode='whole')), + visualizer=visualizer, + test_dataloader=dict( + dataset=dict( + type='ExampleDataset', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') + ]), )) + cfg = ConfigDict(cfg_dict) + model = MODELS.build(cfg.model) + + ckpt = model.state_dict() + ckpt_filename = tempfile.mktemp() + torch.save(ckpt, ckpt_filename) + + # test initialization + infer = MMSegInferencer(cfg, ckpt_filename) + + # test forward + img = np.random.randint(0, 256, (4, 4, 3)) + infer(img) + + imgs = [img, img] + infer(imgs) + results = infer(imgs, out_dir=tempfile.gettempdir()) + + # test results + assert 'predictions' in results + assert 'visualization' in results + assert len(results['predictions']) == 2 + assert results['predictions'][0].shape == (4, 4) diff --git a/tests/test_apis/test_rs_inferencer.py b/tests/test_apis/test_rs_inferencer.py new file mode 100644 index 0000000000..03423d9680 --- /dev/null +++ b/tests/test_apis/test_rs_inferencer.py @@ -0,0 +1,73 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +from unittest import TestCase + +import numpy as np +from mmengine import ConfigDict, init_default_scope +from utils import * # noqa: F401, F403 + +from mmseg.apis import RSImage, RSInferencer +from mmseg.registry import MODELS + + +class TestRSImage(TestCase): + + def test_read_whole_image(self): + init_default_scope('mmseg') + img_path = osp.join( + osp.dirname(__file__), + '../data/pseudo_loveda_dataset/img_dir/0.png') + rs_image = RSImage(img_path) + window_size = (16, 16) + rs_image.create_grids(window_size) + image_data = rs_image.read(rs_image.grids[0]) + self.assertIsNotNone(image_data) + + def test_write_image_data(self): + init_default_scope('mmseg') + img_path = osp.join( + osp.dirname(__file__), + '../data/pseudo_loveda_dataset/img_dir/0.png') + rs_image = RSImage(img_path) + window_size = (16, 16) + rs_image.create_grids(window_size) + data = np.random.random((16, 16)).astype(np.int8) + rs_image.write(data, rs_image.grids[0]) + + +class TestRSInferencer(TestCase): + + def test_read_and_inference(self): + init_default_scope('mmseg') + cfg_dict = dict( + model=dict( + type='InferExampleModel', + data_preprocessor=dict(type='SegDataPreProcessor'), + backbone=dict(type='InferExampleBackbone'), + decode_head=dict(type='InferExampleHead'), + test_cfg=dict(mode='whole')), + test_dataloader=dict( + dataset=dict( + type='ExampleDataset', + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') + ])), + test_pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs') + ]) + cfg = ConfigDict(cfg_dict) + model = MODELS.build(cfg.model) + model.cfg = cfg + inferencer = RSInferencer.from_model(model) + + img_path = osp.join( + osp.dirname(__file__), + 
'../data/pseudo_loveda_dataset/img_dir/0.png') + rs_image = RSImage(img_path) + window_size = (16, 16) + stride = (16, 16) + inferencer.run(rs_image, window_size, stride) diff --git a/tests/test_apis/utils.py b/tests/test_apis/utils.py new file mode 100644 index 0000000000..0a9928fccf --- /dev/null +++ b/tests/test_apis/utils.py @@ -0,0 +1,38 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch.nn as nn + +from mmseg.models import EncoderDecoder +from mmseg.models.decode_heads.decode_head import BaseDecodeHead +from mmseg.registry import MODELS + + +@MODELS.register_module(name='InferExampleHead') +class ExampleDecodeHead(BaseDecodeHead): + + def __init__(self, num_classes=19, out_channels=None): + super().__init__( + 3, 3, num_classes=num_classes, out_channels=out_channels) + + def forward(self, inputs): + return self.cls_seg(inputs[0]) + + +@MODELS.register_module(name='InferExampleBackbone') +class ExampleBackbone(nn.Module): + + def __init__(self): + super().__init__() + self.conv = nn.Conv2d(3, 3, 3) + + def init_weights(self, pretrained=None): + pass + + def forward(self, x): + return [self.conv(x)] + + +@MODELS.register_module(name='InferExampleModel') +class ExampleModel(EncoderDecoder): + + def __init__(self, **kwargs): + super().__init__(**kwargs) diff --git a/tests/test_config.py b/tests/test_config.py index cd99dad5db..cdd85ff57c 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -3,7 +3,10 @@ import os from os.path import dirname, exists, isdir, join, relpath -from mmcv import Config +import numpy as np +from mmengine import Config +from mmengine.dataset import Compose +from mmengine.registry import init_default_scope from torch import nn from mmseg.models import build_segmentor @@ -27,8 +30,9 @@ def _get_config_directory(): def test_config_build_segmentor(): """Test that all segmentation models defined in the configs can be initialized.""" + init_default_scope('mmseg') config_dpath = _get_config_directory() - print('Found config_dpath = {!r}'.format(config_dpath)) + print(f'Found config_dpath = {config_dpath!r}') config_fpaths = [] # one config each sub folder @@ -39,20 +43,20 @@ def test_config_build_segmentor(): config_fpaths = [p for p in config_fpaths if p.find('_base_') == -1] config_names = [relpath(p, config_dpath) for p in config_fpaths] - print('Using {} config files'.format(len(config_names))) + print(f'Using {len(config_names)} config files') for config_fname in config_names: config_fpath = join(config_dpath, config_fname) config_mod = Config.fromfile(config_fpath) config_mod.model - print('Building segmentor, config_fpath = {!r}'.format(config_fpath)) + print(f'Building segmentor, config_fpath = {config_fpath!r}') # Remove pretrained keys to allow for testing in an offline environment if 'pretrained' in config_mod.model: config_mod.model['pretrained'] = None - print('building {}'.format(config_fname)) + print(f'building {config_fname}') segmentor = build_segmentor(config_mod.model) assert segmentor is not None @@ -60,72 +64,79 @@ def test_config_build_segmentor(): _check_decode_head(head_config, segmentor.decode_head) -# def test_config_data_pipeline(): -# """Test whether the data pipeline is valid and can process corner cases. 
- -# CommandLine: -# xdoctest -m tests/test_config.py test_config_build_data_pipeline -# """ -# import numpy as np -# from mmcv import Config - -# from mmseg.datasets.transforms import Compose - -# config_dpath = _get_config_directory() -# print('Found config_dpath = {!r}'.format(config_dpath)) - -# import glob -# config_fpaths = list(glob.glob(join(config_dpath, '**', '*.py'))) -# config_fpaths = [p for p in config_fpaths if p.find('_base_') == -1] -# config_names = [relpath(p, config_dpath) for p in config_fpaths] - -# print('Using {} config files'.format(len(config_names))) - -# for config_fname in config_names: -# config_fpath = join(config_dpath, config_fname) -# print( -# 'Building data pipeline, config_fpath = {!r}'.format(config_fpath)) -# config_mod = Config.fromfile(config_fpath) - -# # remove loading pipeline -# load_img_pipeline = config_mod.train_pipeline.pop(0) -# to_float32 = load_img_pipeline.get('to_float32', False) -# config_mod.train_pipeline.pop(0) -# config_mod.test_pipeline.pop(0) -# # remove loading annotation in test pipeline -# config_mod.test_pipeline.pop(1) - -# train_pipeline = Compose(config_mod.train_pipeline) -# test_pipeline = Compose(config_mod.test_pipeline) - -# img = np.random.randint(0, 255, size=(1024, 2048, 3), dtype=np.uint8) -# if to_float32: -# img = img.astype(np.float32) -# seg = np.random.randint(0, 255, size=(1024, 2048, 1), dtype=np.uint8) - -# results = dict( -# filename='test_img.png', -# ori_filename='test_img.png', -# img=img, -# img_shape=img.shape, -# ori_shape=img.shape, -# gt_seg_map=seg) -# results['seg_fields'] = ['gt_seg_map'] - -# print('Test training data pipeline: \n{!r}'.format(train_pipeline)) -# output_results = train_pipeline(results) -# assert output_results is not None - -# results = dict( -# filename='test_img.png', -# ori_filename='test_img.png', -# img=img, -# img_shape=img.shape, -# ori_shape=img.shape, -# ) -# print('Test testing data pipeline: \n{!r}'.format(test_pipeline)) -# output_results = test_pipeline(results) -# assert output_results is not None +def test_config_data_pipeline(): + """Test whether the data pipeline is valid and can process corner cases. 
+ + CommandLine: + xdoctest -m tests/test_config.py test_config_build_data_pipeline + """ + + init_default_scope('mmseg') + config_dpath = _get_config_directory() + print(f'Found config_dpath = {config_dpath!r}') + + import glob + config_fpaths = list(glob.glob(join(config_dpath, '**', '*.py'))) + config_fpaths = [p for p in config_fpaths if p.find('_base_') == -1] + config_names = [relpath(p, config_dpath) for p in config_fpaths] + + print(f'Using {len(config_names)} config files') + + for config_fname in config_names: + config_fpath = join(config_dpath, config_fname) + print(f'Building data pipeline, config_fpath = {config_fpath!r}') + config_mod = Config.fromfile(config_fpath) + + # remove loading pipeline + load_img_pipeline = config_mod.train_pipeline.pop(0) + to_float32 = load_img_pipeline.get('to_float32', False) + del config_mod.train_pipeline[0] + del config_mod.test_pipeline[0] + # remove loading annotation in test pipeline + load_anno_idx = -1 + for i in range(len(config_mod.test_pipeline)): + if config_mod.test_pipeline[i].type in ('LoadAnnotations', + 'LoadDepthAnnotation'): + load_anno_idx = i + del config_mod.test_pipeline[load_anno_idx] + + train_pipeline = Compose(config_mod.train_pipeline) + test_pipeline = Compose(config_mod.test_pipeline) + + img = np.random.randint(0, 255, size=(1024, 2048, 3), dtype=np.uint8) + if to_float32: + img = img.astype(np.float32) + seg = np.random.randint(0, 255, size=(1024, 2048, 1), dtype=np.uint8) + depth = np.random.rand(1024, 2048).astype(np.float32) + + results = dict( + filename='test_img.png', + ori_filename='test_img.png', + img=img, + img_shape=img.shape, + ori_shape=img.shape, + gt_seg_map=seg, + gt_depth_map=depth) + results['seg_fields'] = ['gt_seg_map'] + _check_concat_cd_input(config_mod, results) + print(f'Test training data pipeline: \n{train_pipeline!r}') + output_results = train_pipeline(results) + assert output_results is not None + + _check_concat_cd_input(config_mod, results) + print(f'Test testing data pipeline: \n{test_pipeline!r}') + output_results = test_pipeline(results) + assert output_results is not None + + +def _check_concat_cd_input(config_mod: Config, results: dict): + keys = [] + pipeline = config_mod.train_pipeline.copy() + pipeline.extend(config_mod.test_pipeline) + for t in pipeline: + keys.append(t.type) + if 'ConcatCDInput' in keys: + results.update({'img2': results['img']}) def _check_decode_head(decode_head_cfg, decode_head): @@ -151,14 +162,14 @@ def _check_decode_head(decode_head_cfg, decode_head): elif input_transform == 'resize_concat': assert sum(in_channels) == decode_head.in_channels else: - assert isinstance(in_channels, int) assert in_channels == decode_head.in_channels - assert isinstance(decode_head.in_index, int) if decode_head_cfg['type'] == 'PointHead': assert decode_head_cfg.channels+decode_head_cfg.num_classes == \ decode_head.fc_seg.in_channels assert decode_head.fc_seg.out_channels == decode_head_cfg.num_classes + elif decode_head_cfg['type'] == 'VPDDepthHead': + assert decode_head.out_channels == 1 else: assert decode_head_cfg.channels == decode_head.conv_seg.in_channels assert decode_head.conv_seg.out_channels == decode_head_cfg.num_classes diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py index f8c7e0336b..2904e09ced 100644 --- a/tests/test_datasets/test_dataset.py +++ b/tests/test_datasets/test_dataset.py @@ -2,16 +2,24 @@ import os import os.path as osp import tempfile -from unittest.mock import MagicMock import pytest -from mmseg.datasets 
import (DATASETS, ADE20KDataset, CityscapesDataset, - COCOStuffDataset, CustomDataset, ISPRSDataset, - LoveDADataset, PascalVOCDataset, PotsdamDataset, +from mmseg.datasets import (ADE20KDataset, BaseSegDataset, BDD100KDataset, + CityscapesDataset, COCOStuffDataset, + DecathlonDataset, DSDLSegDataset, ISPRSDataset, + LIPDataset, LoveDADataset, MapillaryDataset_v1, + MapillaryDataset_v2, NYUDataset, PascalVOCDataset, + PotsdamDataset, REFUGEDataset, SynapseDataset, iSAIDDataset) +from mmseg.registry import DATASETS from mmseg.utils import get_classes, get_palette +try: + from dsdl.dataset import DSDLDataset +except ImportError: + DSDLDataset = None + def test_classes(): assert list( @@ -26,7 +34,11 @@ def test_classes(): assert list(PotsdamDataset.METAINFO['classes']) == get_classes('potsdam') assert list(ISPRSDataset.METAINFO['classes']) == get_classes('vaihingen') assert list(iSAIDDataset.METAINFO['classes']) == get_classes('isaid') - + assert list( + MapillaryDataset_v1.METAINFO['classes']) == get_classes('mapillary_v1') + assert list( + MapillaryDataset_v2.METAINFO['classes']) == get_classes('mapillary_v2') + assert list(BDD100KDataset.METAINFO['classes']) == get_classes('bdd100k') with pytest.raises(ValueError): get_classes('unsupported') @@ -79,7 +91,11 @@ def test_palette(): assert PotsdamDataset.METAINFO['palette'] == get_palette('potsdam') assert COCOStuffDataset.METAINFO['palette'] == get_palette('cocostuff') assert iSAIDDataset.METAINFO['palette'] == get_palette('isaid') - + assert list( + MapillaryDataset_v1.METAINFO['palette']) == get_palette('mapillary_v1') + assert list( + MapillaryDataset_v2.METAINFO['palette']) == get_palette('mapillary_v2') + assert list(BDD100KDataset.METAINFO['palette']) == get_palette('bdd100k') with pytest.raises(ValueError): get_palette('unsupported') @@ -87,7 +103,7 @@ def test_palette(): def test_custom_dataset(): # with 'img_path' and 'seg_map_path' in data_prefix - train_dataset = CustomDataset( + train_dataset = BaseSegDataset( data_root=osp.join(osp.dirname(__file__), '../data/pseudo_dataset'), data_prefix=dict( img_path='imgs/', @@ -98,7 +114,7 @@ def test_custom_dataset(): assert len(train_dataset) == 5 # with 'img_path' and 'seg_map_path' in data_prefix and ann_file - train_dataset = CustomDataset( + train_dataset = BaseSegDataset( data_root=osp.join(osp.dirname(__file__), '../data/pseudo_dataset'), data_prefix=dict( img_path='imgs/', @@ -110,7 +126,7 @@ def test_custom_dataset(): assert len(train_dataset) == 4 # no data_root - train_dataset = CustomDataset( + train_dataset = BaseSegDataset( data_prefix=dict( img_path=osp.join( osp.dirname(__file__), '../data/pseudo_dataset/imgs'), @@ -122,7 +138,7 @@ def test_custom_dataset(): # with data_root but 'img_path' and 'seg_map_path' in data_prefix are # abs path - train_dataset = CustomDataset( + train_dataset = BaseSegDataset( data_root=osp.join(osp.dirname(__file__), '../data/pseudo_dataset'), data_prefix=dict( img_path=osp.join( @@ -134,7 +150,7 @@ def test_custom_dataset(): assert len(train_dataset) == 5 # test_mode=True - test_dataset = CustomDataset( + test_dataset = BaseSegDataset( data_prefix=dict( img_path=osp.join( osp.dirname(__file__), '../data/pseudo_dataset/imgs')), @@ -219,6 +235,32 @@ def test_vaihingen(): assert len(test_dataset) == 1 +def test_synapse(): + test_dataset = SynapseDataset( + pipeline=[], + data_prefix=dict( + img_path=osp.join( + osp.dirname(__file__), + '../data/pseudo_synapse_dataset/img_dir'), + seg_map_path=osp.join( + osp.dirname(__file__), + 
'../data/pseudo_synapse_dataset/ann_dir'))) + assert len(test_dataset) == 2 + + +def test_refuge(): + test_dataset = REFUGEDataset( + pipeline=[], + data_prefix=dict( + img_path=osp.join( + osp.dirname(__file__), + '../data/pseudo_refuge_dataset/img_dir'), + seg_map_path=osp.join( + osp.dirname(__file__), + '../data/pseudo_refuge_dataset/ann_dir'))) + assert len(test_dataset) == 1 + + def test_isaid(): test_dataset = iSAIDDataset( pipeline=[], @@ -242,26 +284,85 @@ def test_isaid(): assert len(test_dataset) == 1 +def test_decathlon(): + data_root = osp.join(osp.dirname(__file__), '../data') + # test loading the training set + test_dataset = DecathlonDataset( + pipeline=[], data_root=data_root, ann_file='dataset.json') + assert len(test_dataset) == 1 + + # test loading the test set + test_dataset = DecathlonDataset( + pipeline=[], + data_root=data_root, + ann_file='dataset.json', + test_mode=True) + assert len(test_dataset) == 3 + + +def test_lip(): + data_root = osp.join(osp.dirname(__file__), '../data/pseudo_lip_dataset') + # test loading the training set + train_dataset = LIPDataset( + pipeline=[], + data_root=data_root, + data_prefix=dict( + img_path='train_images', seg_map_path='train_segmentations')) + assert len(train_dataset) == 1 + + # test loading the validation set + test_dataset = LIPDataset( + pipeline=[], + data_root=data_root, + data_prefix=dict( + img_path='val_images', seg_map_path='val_segmentations')) + assert len(test_dataset) == 1 + + +def test_mapillary(): + test_dataset = MapillaryDataset_v1( + pipeline=[], + data_prefix=dict( + img_path=osp.join( + osp.dirname(__file__), + '../data/pseudo_mapillary_dataset/images'), + seg_map_path=osp.join( + osp.dirname(__file__), + '../data/pseudo_mapillary_dataset/v1.2'))) + assert len(test_dataset) == 1 + + +def test_bdd100k(): + test_dataset = BDD100KDataset( + pipeline=[], + data_prefix=dict( + img_path=osp.join( + osp.dirname(__file__), + '../data/pseudo_bdd100k_dataset/images/10k/val'), + seg_map_path=osp.join( + osp.dirname(__file__), + '../data/pseudo_bdd100k_dataset/labels/sem_seg/masks/val'))) + assert len(test_dataset) == 3 + + @pytest.mark.parametrize('dataset, classes', [ ('ADE20KDataset', ('wall', 'building')), ('CityscapesDataset', ('road', 'sidewalk')), - ('CustomDataset', ('bus', 'car')), + ('BaseSegDataset', ('bus', 'car')), ('PascalVOCDataset', ('aeroplane', 'bicycle')), ]) def test_custom_classes_override_default(dataset, classes): dataset_class = DATASETS.get(dataset) - if isinstance(dataset_class, PascalVOCDataset): - tmp_file = tempfile.NamedTemporaryFile() - ann_file = f'{tmp_file.name}.txt' - else: - ann_file = MagicMock() - original_classes = dataset_class.METAINFO.get('classes', None) + tmp_file = tempfile.NamedTemporaryFile() + ann_file = tmp_file.name + img_path = tempfile.mkdtemp() + # Test setting classes as a tuple custom_dataset = dataset_class( - data_prefix=dict(img_path=MagicMock()), + data_prefix=dict(img_path=img_path), ann_file=ann_file, metainfo=dict(classes=classes), test_mode=True, @@ -269,12 +370,12 @@ def test_custom_classes_override_default(dataset, classes): assert custom_dataset.metainfo['classes'] != original_classes assert custom_dataset.metainfo['classes'] == classes - if not isinstance(custom_dataset, CustomDataset): + if not isinstance(custom_dataset, BaseSegDataset): assert isinstance(custom_dataset.label_map, dict) # Test setting classes as a list custom_dataset = dataset_class( - data_prefix=dict(img_path=MagicMock()), + data_prefix=dict(img_path=img_path), ann_file=ann_file,
metainfo=dict(classes=list(classes)), test_mode=True, @@ -282,34 +383,34 @@ assert custom_dataset.metainfo['classes'] != original_classes assert custom_dataset.metainfo['classes'] == list(classes) - if not isinstance(custom_dataset, CustomDataset): + if not isinstance(custom_dataset, BaseSegDataset): assert isinstance(custom_dataset.label_map, dict) # Test overriding not a subset custom_dataset = dataset_class( ann_file=ann_file, - data_prefix=dict(img_path=MagicMock()), + data_prefix=dict(img_path=img_path), metainfo=dict(classes=[classes[0]]), test_mode=True, lazy_init=True) assert custom_dataset.metainfo['classes'] != original_classes assert custom_dataset.metainfo['classes'] == [classes[0]] - if not isinstance(custom_dataset, CustomDataset): + if not isinstance(custom_dataset, BaseSegDataset): assert isinstance(custom_dataset.label_map, dict) # Test default behavior - if dataset_class is CustomDataset: + if dataset_class is BaseSegDataset: with pytest.raises(AssertionError): custom_dataset = dataset_class( ann_file=ann_file, - data_prefix=dict(img_path=MagicMock()), + data_prefix=dict(img_path=img_path), metainfo=None, test_mode=True, lazy_init=True) else: custom_dataset = dataset_class( - data_prefix=dict(img_path=MagicMock()), + data_prefix=dict(img_path=img_path), ann_file=ann_file, metainfo=None, test_mode=True, @@ -320,10 +421,10 @@ def test_custom_classes_override_default(dataset, classes): def test_custom_dataset_random_palette_is_generated(): - dataset = CustomDataset( + dataset = BaseSegDataset( pipeline=[], - data_prefix=dict(img_path=MagicMock()), - ann_file=MagicMock(), + data_prefix=dict(img_path=tempfile.mkdtemp()), + ann_file=tempfile.mkdtemp(), metainfo=dict(classes=('bus', 'car')), lazy_init=True, test_mode=True) @@ -334,9 +435,9 @@ def test_custom_dataset_random_palette_is_generated(): def test_custom_dataset_custom_palette(): - dataset = CustomDataset( - data_prefix=dict(img_path=MagicMock()), - ann_file=MagicMock(), + dataset = BaseSegDataset( + data_prefix=dict(img_path=tempfile.mkdtemp()), + ann_file=tempfile.mkdtemp(), metainfo=dict( classes=('bus', 'car'), palette=[[100, 100, 100], [200, 200, 200]]), @@ -346,8 +447,29 @@ def test_custom_dataset_custom_palette(): [200, 200, 200]]) # test custom class and palette don't match with pytest.raises(ValueError): - dataset = CustomDataset( - data_prefix=dict(img_path=MagicMock()), - ann_file=MagicMock(), + dataset = BaseSegDataset( + data_prefix=dict(img_path=tempfile.mkdtemp()), + ann_file=tempfile.mkdtemp(), metainfo=dict(classes=('bus', 'car'), palette=[[200, 200, 200]]), lazy_init=True) + + +def test_dsdlseg_dataset(): + if DSDLDataset is not None: + dataset = DSDLSegDataset( + data_root='tests/data/dsdl_seg', ann_file='set-train/train.yaml') + assert len(dataset) == 3 + assert len(dataset.metainfo['classes']) == 21 + else: + pytest.skip('Package `dsdl` is not installed.') + + +def test_nyu_dataset(): + dataset = NYUDataset( + data_root='tests/data/pseudo_nyu_dataset', + data_prefix=dict(img_path='images', depth_map_path='annotations'), + ) + assert len(dataset) == 1 + data = dataset[0] + assert data.get('depth_map_path', None) is not None + assert data.get('category_id', -1) == 26 diff --git a/tests/test_datasets/test_dataset_builder.py b/tests/test_datasets/test_dataset_builder.py index 7954f3a1a7..b67b1e7aaf 100644 --- a/tests/test_datasets/test_dataset_builder.py +++ b/tests/test_datasets/test_dataset_builder.py @@ -2,15 +2,16 @@ import os.path as osp
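+# The tests below drive dataset wrappers through the registry. As a rough
+# sketch (`cfg` stands for any of the BaseSegDataset configs defined in
+# this file), a repeated dataset is built with:
+#
+#     DATASETS.build(dict(type='RepeatDataset', times=5, dataset=cfg))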
from mmengine.dataset import ConcatDataset, RepeatDataset +from mmengine.registry import init_default_scope -from mmseg.datasets import DATASETS, MultiImageMixDataset -from mmseg.utils import register_all_modules +from mmseg.datasets import MultiImageMixDataset +from mmseg.registry import DATASETS -register_all_modules() +init_default_scope('mmseg') @DATASETS.register_module() -class ToyDataset(object): +class ToyDataset: def __init__(self, cnt=0): self.cnt = cnt @@ -36,7 +37,7 @@ def test_build_dataset(): # test RepeatDataset cfg = dict( - type='CustomDataset', + type='BaseSegDataset', pipeline=[], data_root=data_root, data_prefix=data_prefix, @@ -50,13 +51,13 @@ def test_build_dataset(): # We use same dir twice for simplicity # with data_prefix.seg_map_path cfg1 = dict( - type='CustomDataset', + type='BaseSegDataset', pipeline=[], data_root=data_root, data_prefix=data_prefix, serialize_data=False) cfg2 = dict( - type='CustomDataset', + type='BaseSegDataset', pipeline=[], data_root=data_root, data_prefix=data_prefix, @@ -80,14 +81,14 @@ def test_build_dataset(): # with data_prefix.seg_map_path, ann_file cfg1 = dict( - type='CustomDataset', + type='BaseSegDataset', pipeline=[], data_root=data_root, data_prefix=data_prefix, ann_file='splits/train.txt', serialize_data=False) cfg2 = dict( - type='CustomDataset', + type='BaseSegDataset', pipeline=[], data_root=data_root, data_prefix=data_prefix, @@ -102,7 +103,7 @@ def test_build_dataset(): # test mode cfg1 = dict( - type='CustomDataset', + type='BaseSegDataset', pipeline=[], data_root=data_root, data_prefix=dict(img_path='imgs/'), @@ -110,7 +111,7 @@ def test_build_dataset(): metainfo=dict(classes=('pseudo_class', )), serialize_data=False) cfg2 = dict( - type='CustomDataset', + type='BaseSegDataset', pipeline=[], data_root=data_root, data_prefix=dict(img_path='imgs/'), @@ -126,7 +127,7 @@ def test_build_dataset(): # test mode with ann_files cfg1 = dict( - type='CustomDataset', + type='BaseSegDataset', pipeline=[], data_root=data_root, data_prefix=dict(img_path='imgs/'), @@ -135,7 +136,7 @@ def test_build_dataset(): metainfo=dict(classes=('pseudo_class', )), serialize_data=False) cfg2 = dict( - type='CustomDataset', + type='BaseSegDataset', pipeline=[], data_root=data_root, data_prefix=dict(img_path='imgs/'), diff --git a/tests/test_datasets/test_formatting.py b/tests/test_datasets/test_formatting.py index 87f96037e3..d0e5820ec7 100644 --- a/tests/test_datasets/test_formatting.py +++ b/tests/test_datasets/test_formatting.py @@ -4,10 +4,11 @@ import unittest import numpy as np -from mmengine.data import BaseDataElement +import pytest +from mmengine.structures import BaseDataElement -from mmseg.data import SegDataSample from mmseg.datasets.transforms import PackSegInputs +from mmseg.structures import SegDataSample class TestPackSegInputs(unittest.TestCase): @@ -39,12 +40,20 @@ def setUp(self): def test_transform(self): transform = PackSegInputs(meta_keys=self.meta_keys) results = transform(copy.deepcopy(self.results)) - self.assertIn('data_sample', results) - self.assertIsInstance(results['data_sample'], SegDataSample) - self.assertIsInstance(results['data_sample'].gt_sem_seg, + self.assertIn('data_samples', results) + self.assertIsInstance(results['data_samples'], SegDataSample) + self.assertIsInstance(results['data_samples'].gt_sem_seg, BaseDataElement) - self.assertEqual(results['data_sample'].ori_shape, - results['data_sample'].gt_sem_seg.shape) + self.assertEqual(results['data_samples'].ori_shape, + results['data_samples'].gt_sem_seg.shape) 
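+        # For reference: besides 'data_samples' (the SegDataSample holding
+        # the annotations and meta info checked above), PackSegInputs also
+        # packs the image itself under 'inputs' as a (C, H, W) tensor.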
+ results = copy.deepcopy(self.results) + # test when the segmentation map is not 2D + results['gt_seg_map'] = np.random.rand(3, 300, 400) + msg = 'the segmentation map is 2D' + with pytest.warns(UserWarning, match=msg): + results = transform(results) + self.assertEqual(results['data_samples'].ori_shape, + results['data_samples'].gt_sem_seg.shape) def test_repr(self): transform = PackSegInputs(meta_keys=self.meta_keys) diff --git a/tests/test_datasets/test_loading.py b/tests/test_datasets/test_loading.py index 609361163a..3eea6e3f9d 100644 --- a/tests/test_datasets/test_loading.py +++ b/tests/test_datasets/test_loading.py @@ -7,10 +7,15 @@ import numpy as np from mmcv.transforms import LoadImageFromFile -from mmseg.datasets.transforms import LoadAnnotations +from mmseg.datasets.transforms import LoadAnnotations # noqa +from mmseg.datasets.transforms import (LoadBiomedicalAnnotation, + LoadBiomedicalData, + LoadBiomedicalImageFromFile, + LoadDepthAnnotation, + LoadImageFromNDArray) -class TestLoading(object): +class TestLoading: @classmethod def setup_class(cls): @@ -26,7 +31,7 @@ def test_load_img(self): assert results['ori_shape'] == results['img'].shape[:2] assert repr(transform) == transform.__class__.__name__ + \ "(ignore_empty=False, to_float32=False, color_type='color'," + \ - " imdecode_backend='cv2', file_client_args={'backend': 'disk'})" + " imdecode_backend='cv2', backend_args=None)" # to_float32 transform = LoadImageFromFile(to_float32=True) @@ -54,8 +59,8 @@ def test_load_seg(self): assert results['gt_seg_map'].shape == (288, 512) assert results['gt_seg_map'].dtype == np.uint8 assert repr(transform) == transform.__class__.__name__ + \ - "(reduce_zero_label=True,imdecode_backend='pillow')" + \ - "file_client_args={'backend': 'disk'})" + "(reduce_zero_label=True, imdecode_backend='pillow', " + \ + 'backend_args=None)' # reduce_zero_label transform = LoadAnnotations(reduce_zero_label=True) @@ -140,6 +145,43 @@ def test_load_seg_custom_classes(self): assert gt_array.dtype == np.uint8 np.testing.assert_array_equal(gt_array, true_mask) + # test with removing a class and reducing zero label simultaneously + results = dict( + img_path=img_path, + seg_map_path=gt_path, + # since reduce_zero_label is True, there are only 4 real classes. + # if the full set of classes is ["A", "B", "C", "D"], the + # following label map simulates the dataset option + # classes=["A", "C", "D"] which removes class "B".
+ label_map={ + 0: 0, + 1: 255, # simulate removing class 1 + 2: 1, + 3: 2 + }, + reduce_zero_label=True, # reduce zero label + seg_fields=[]) + + load_imgs = LoadImageFromFile() + results = load_imgs(copy.deepcopy(results)) + + # reduce zero label + load_anns = LoadAnnotations() + results = load_anns(copy.deepcopy(results)) + + gt_array = results['gt_seg_map'] + + true_mask = np.ones_like(gt_array) * 255 # all zeros get mapped to 255 + true_mask[2:4, 2:4] = 0 # 1s are reduced to class 0 mapped to class 0 + true_mask[2:4, 6:8] = 255 # 2s are reduced to class 1 which is removed + true_mask[6:8, 2:4] = 1 # 3s are reduced to class 2 mapped to class 1 + true_mask[6:8, 6:8] = 2 # 4s are reduced to class 3 mapped to class 2 + + assert results['seg_fields'] == ['gt_seg_map'] + assert gt_array.shape == (10, 10) + assert gt_array.dtype == np.uint8 + np.testing.assert_array_equal(gt_array, true_mask) + # test no custom classes results = dict( img_path=img_path, @@ -161,3 +203,93 @@ def test_load_seg_custom_classes(self): np.testing.assert_array_equal(gt_array, test_gt) tmp_dir.cleanup() + + def test_load_image_from_ndarray(self): + results = {'img': np.zeros((256, 256, 3), dtype=np.uint8)} + transform = LoadImageFromNDArray() + results = transform(results) + + assert results['img'].shape == (256, 256, 3) + assert results['img'].dtype == np.uint8 + assert results['img_shape'] == (256, 256) + assert results['ori_shape'] == (256, 256) + + # to_float32 + transform = LoadImageFromNDArray(to_float32=True) + results = transform(copy.deepcopy(results)) + assert results['img'].dtype == np.float32 + + # test repr + transform = LoadImageFromNDArray() + assert repr(transform) == ('LoadImageFromNDArray(' + 'ignore_empty=False, ' + 'to_float32=False, ' + "color_type='color', " + "imdecode_backend='cv2', " + 'backend_args=None)') + + def test_load_biomedical_img(self): + results = dict( + img_path=osp.join(self.data_prefix, 'biomedical.nii.gz')) + transform = LoadBiomedicalImageFromFile() + results = transform(copy.deepcopy(results)) + assert results['img_path'] == osp.join(self.data_prefix, + 'biomedical.nii.gz') + assert len(results['img'].shape) == 4 + assert results['img'].dtype == np.float32 + assert results['ori_shape'] == results['img'].shape[1:] + assert repr(transform) == ('LoadBiomedicalImageFromFile(' + "decode_backend='nifti', " + 'to_xyz=False, ' + 'to_float32=True, ' + 'backend_args=None)') + + def test_load_biomedical_annotation(self): + results = dict( + seg_map_path=osp.join(self.data_prefix, 'biomedical_ann.nii.gz')) + transform = LoadBiomedicalAnnotation() + results = transform(copy.deepcopy(results)) + assert len(results['gt_seg_map'].shape) == 3 + assert results['gt_seg_map'].dtype == np.float32 + + def test_load_biomedical_data(self): + input_results = dict( + img_path=osp.join(self.data_prefix, 'biomedical.npy')) + transform = LoadBiomedicalData(with_seg=True) + results = transform(copy.deepcopy(input_results)) + assert results['img_path'] == osp.join(self.data_prefix, + 'biomedical.npy') + assert results['img'][0].shape == results['gt_seg_map'].shape + assert results['img'].dtype == np.float32 + assert results['ori_shape'] == results['img'].shape[1:] + assert repr(transform) == ('LoadBiomedicalData(' + 'with_seg=True, ' + "decode_backend='numpy', " + 'to_xyz=False, ' + 'backend_args=None)') + + transform = LoadBiomedicalData(with_seg=False) + results = transform(copy.deepcopy(input_results)) + assert len(results['img'].shape) == 4 + assert results.get('gt_seg_map') is None + assert 
repr(transform) == ('LoadBiomedicalData(' + 'with_seg=False, ' + "decode_backend='numpy', " + 'to_xyz=False, ' + 'backend_args=None)') + + def test_load_depth_annotation(self): + input_results = dict( + img_path='tests/data/pseudo_nyu_dataset/images/' + 'bookstore_0001d_00001.jpg', + depth_map_path='tests/data/pseudo_nyu_dataset/' + 'annotations/bookstore_0001d_00001.png', + category_id=-1, + seg_fields=[]) + transform = LoadDepthAnnotation(depth_rescale_factor=0.001) + results = transform(input_results) + assert 'gt_depth_map' in results + assert results['gt_depth_map'].shape[:2] == mmcv.imread( + input_results['depth_map_path']).shape[:2] + assert results['gt_depth_map'].dtype == np.float32 + assert 'gt_depth_map' in results['seg_fields'] diff --git a/tests/test_datasets/test_transform.py b/tests/test_datasets/test_transform.py index 727ef8feda..e73e558ee8 100644 --- a/tests/test_datasets/test_transform.py +++ b/tests/test_datasets/test_transform.py @@ -1,15 +1,23 @@ # Copyright (c) OpenMMLab. All rights reserved. import copy import os.path as osp +from unittest import TestCase import mmcv import numpy as np import pytest +from mmengine.registry import init_default_scope from PIL import Image -from mmseg.datasets.transforms import PhotoMetricDistortion, RandomCrop +from mmseg.datasets.transforms import * # noqa +from mmseg.datasets.transforms import (LoadBiomedicalData, + LoadBiomedicalImageFromFile, + PhotoMetricDistortion, RandomCrop, + RandomDepthMix) from mmseg.registry import TRANSFORMS +init_default_scope('mmseg') + def test_resize(): # Test `Resize`, `RandomResize` and `RandomChoiceResize` from @@ -70,6 +78,34 @@ def test_resize(): resized_results = resize_module(results.copy()) assert max(resized_results['img_shape'][:2]) <= 1333 * 1.1 + # test RandomChoiceResize, which `resize_type` is `ResizeShortestEdge` + transform = dict( + type='RandomChoiceResize', + scales=[128, 256, 512], + resize_type='ResizeShortestEdge', + max_size=1333) + resize_module = TRANSFORMS.build(transform) + resized_results = resize_module(results.copy()) + assert resized_results['img_shape'][0] in [128, 256, 512] + + transform = dict( + type='RandomChoiceResize', + scales=[512], + resize_type='ResizeShortestEdge', + max_size=512) + resize_module = TRANSFORMS.build(transform) + resized_results = resize_module(results.copy()) + assert resized_results['img_shape'][1] == 512 + + transform = dict( + type='RandomChoiceResize', + scales=[(128, 256), (256, 512), (512, 1024)], + resize_type='ResizeShortestEdge', + max_size=1333) + resize_module = TRANSFORMS.build(transform) + resized_results = resize_module(results.copy()) + assert resized_results['img_shape'][0] in [128, 256, 512] + # test scale=None and scale_factor is tuple. 
# img shape: (288, 512, 3) transform = dict( @@ -150,6 +186,76 @@ def test_flip(): assert np.equal(original_img, results['img']).all() assert np.equal(original_seg, results['gt_semantic_seg']).all() + results['gt_depth_map'] = seg + results['seg_fields'] = ['gt_depth_map'] + results = flip_module(results) + flip_module = TRANSFORMS.build(transform) + results = flip_module(results) + assert np.equal(original_img, results['img']).all() + assert np.equal(original_seg, results['gt_depth_map']).all() + + +def test_random_rotate_flip(): + with pytest.raises(AssertionError): + transform = dict(type='RandomRotFlip', flip_prob=1.5) + TRANSFORMS.build(transform) + + with pytest.raises(AssertionError): + transform = dict(type='RandomRotFlip', rotate_prob=1.5) + TRANSFORMS.build(transform) + + with pytest.raises(AssertionError): + transform = dict(type='RandomRotFlip', degree=[20, 20, 20]) + TRANSFORMS.build(transform) + + with pytest.raises(AssertionError): + transform = dict(type='RandomRotFlip', degree=-20) + TRANSFORMS.build(transform) + + transform = dict( + type='RandomRotFlip', flip_prob=1.0, rotate_prob=0, degree=20) + rot_flip_module = TRANSFORMS.build(transform) + + results = dict() + img = mmcv.imread( + osp.join( + osp.dirname(__file__), + '../data/pseudo_synapse_dataset/img_dir/case0005_slice000.jpg'), + 'color') + original_img = copy.deepcopy(img) + seg = np.array( + Image.open( + osp.join( + osp.dirname(__file__), + '../data/pseudo_synapse_dataset/ann_dir/case0005_slice000.png') + )) + original_seg = copy.deepcopy(seg) + results['img'] = img + results['gt_semantic_seg'] = seg + results['seg_fields'] = ['gt_semantic_seg'] + results['img_shape'] = img.shape + results['ori_shape'] = img.shape + # Set initial values for default meta_keys + results['pad_shape'] = img.shape + results['scale_factor'] = 1.0 + + result_flip = rot_flip_module(results) + assert original_img.shape == result_flip['img'].shape + assert original_seg.shape == result_flip['gt_semantic_seg'].shape + + transform = dict( + type='RandomRotFlip', flip_prob=0, rotate_prob=1.0, degree=20) + rot_flip_module = TRANSFORMS.build(transform) + + result_rotate = rot_flip_module(results) + assert original_img.shape == result_rotate['img'].shape + assert original_seg.shape == result_rotate['gt_semantic_seg'].shape + + assert str(transform) == "{'type': 'RandomRotFlip'," \ + " 'flip_prob': 0," \ + " 'rotate_prob': 1.0," \ + " 'degree': 20}" + def test_pad(): # test assertion if both size_divisor and size is None @@ -226,10 +332,76 @@ def test_random_crop(): results = pipeline(results) assert results['img'].shape[:2] == (h - 20, w - 20) - assert results['img_shape'][:2] == (h - 20, w - 20) + assert results['img_shape'] == (h - 20, w - 20) assert results['gt_semantic_seg'].shape[:2] == (h - 20, w - 20) +def test_rgb2gray(): + # test assertion out_channels should be greater than 0 + with pytest.raises(AssertionError): + transform = dict(type='RGB2Gray', out_channels=-1) + TRANSFORMS.build(transform) + # test assertion weights should be tuple[float] + with pytest.raises(AssertionError): + transform = dict(type='RGB2Gray', out_channels=1, weights=1.1) + TRANSFORMS.build(transform) + + # test out_channels is None + transform = dict(type='RGB2Gray') + transform = TRANSFORMS.build(transform) + + assert str(transform) == f'RGB2Gray(' \ + f'out_channels={None}, ' \ + f'weights={(0.299, 0.587, 0.114)})' + + results = dict() + img = mmcv.imread( + osp.join(osp.dirname(__file__), '../data/color.jpg'), 'color') + h, w, c = img.shape + seg = np.array( 
+ Image.open(osp.join(osp.dirname(__file__), '../data/seg.png'))) + results['img'] = img + results['gt_semantic_seg'] = seg + results['seg_fields'] = ['gt_semantic_seg'] + results['img_shape'] = img.shape + results['ori_shape'] = img.shape + # Set initial values for default meta_keys + results['pad_shape'] = img.shape + results['scale_factor'] = 1.0 + + results = transform(results) + assert results['img'].shape == (h, w, c) + assert results['img_shape'] == (h, w, c) + assert results['ori_shape'] == (h, w, c) + + # test out_channels = 2 + transform = dict(type='RGB2Gray', out_channels=2) + transform = TRANSFORMS.build(transform) + + assert str(transform) == f'RGB2Gray(' \ + f'out_channels={2}, ' \ + f'weights={(0.299, 0.587, 0.114)})' + + results = dict() + img = mmcv.imread( + osp.join(osp.dirname(__file__), '../data/color.jpg'), 'color') + h, w, c = img.shape + seg = np.array( + Image.open(osp.join(osp.dirname(__file__), '../data/seg.png'))) + results['img'] = img + results['gt_semantic_seg'] = seg + results['seg_fields'] = ['gt_semantic_seg'] + results['img_shape'] = img.shape + results['ori_shape'] = img.shape + # Set initial values for default meta_keys + results['pad_shape'] = img.shape + results['scale_factor'] = 1.0 + + results = transform(results) + assert results['img'].shape == (h, w, 2) + assert results['img_shape'] == (h, w, 2) + + def test_photo_metric_distortion(): results = dict() @@ -249,3 +421,853 @@ def test_photo_metric_distortion(): assert (results['gt_semantic_seg'] == seg).all() assert results['img_shape'] == img.shape + + +def test_rerange(): + # test assertion if min_value or max_value is illegal + with pytest.raises(AssertionError): + transform = dict(type='Rerange', min_value=[0], max_value=[255]) + TRANSFORMS.build(transform) + + # test assertion if min_value >= max_value + with pytest.raises(AssertionError): + transform = dict(type='Rerange', min_value=1, max_value=1) + TRANSFORMS.build(transform) + + # test assertion if img_min_value == img_max_value + with pytest.raises(AssertionError): + transform = dict(type='Rerange', min_value=0, max_value=1) + transform = TRANSFORMS.build(transform) + results = dict() + results['img'] = np.array([[1, 1], [1, 1]]) + transform(results) + + img_rerange_cfg = dict() + transform = dict(type='Rerange', **img_rerange_cfg) + transform = TRANSFORMS.build(transform) + results = dict() + img = mmcv.imread( + osp.join(osp.dirname(__file__), '../data/color.jpg'), 'color') + original_img = copy.deepcopy(img) + results['img'] = img + results['img_shape'] = img.shape + results['ori_shape'] = img.shape + # Set initial values for default meta_keys + results['pad_shape'] = img.shape + results['scale_factor'] = 1.0 + + results = transform(results) + + min_value = np.min(original_img) + max_value = np.max(original_img) + converted_img = (original_img - min_value) / (max_value - min_value) * 255 + + assert np.allclose(results['img'], converted_img) + assert str(transform) == f'Rerange(min_value={0}, max_value={255})' + + +def test_CLAHE(): + # test assertion if clip_limit is None + with pytest.raises(AssertionError): + transform = dict(type='CLAHE', clip_limit=None) + TRANSFORMS.build(transform) + + # test assertion if tile_grid_size is illegal + with pytest.raises(AssertionError): + transform = dict(type='CLAHE', tile_grid_size=(8.0, 8.0)) + TRANSFORMS.build(transform) + + # test assertion if tile_grid_size is illegal + with pytest.raises(AssertionError): + transform = dict(type='CLAHE', tile_grid_size=(9, 9, 9)) + TRANSFORMS.build(transform) 
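+    # CLAHE (contrast limited adaptive histogram equalization) is applied
+    # channel by channel via `mmcv.clahe(channel, clip_limit, tile_grid_size)`;
+    # the reference loop further below reproduces that computation.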
+ + transform = dict(type='CLAHE', clip_limit=2) + transform = TRANSFORMS.build(transform) + results = dict() + img = mmcv.imread( + osp.join(osp.dirname(__file__), '../data/color.jpg'), 'color') + original_img = copy.deepcopy(img) + results['img'] = img + results['img_shape'] = img.shape + results['ori_shape'] = img.shape + # Set initial values for default meta_keys + results['pad_shape'] = img.shape + results['scale_factor'] = 1.0 + + results = transform(results) + + converted_img = np.empty(original_img.shape) + for i in range(original_img.shape[2]): + converted_img[:, :, i] = mmcv.clahe( + np.array(original_img[:, :, i], dtype=np.uint8), 2, (8, 8)) + + assert np.allclose(results['img'], converted_img) + assert str(transform) == f'CLAHE(clip_limit={2}, tile_grid_size={(8, 8)})' + + +def test_adjust_gamma(): + # test assertion if gamma <= 0 + with pytest.raises(AssertionError): + transform = dict(type='AdjustGamma', gamma=0) + TRANSFORMS.build(transform) + + # test assertion if gamma is list + with pytest.raises(AssertionError): + transform = dict(type='AdjustGamma', gamma=[1.2]) + TRANSFORMS.build(transform) + + # test with gamma = 1.2 + transform = dict(type='AdjustGamma', gamma=1.2) + transform = TRANSFORMS.build(transform) + results = dict() + img = mmcv.imread( + osp.join(osp.dirname(__file__), '../data/color.jpg'), 'color') + original_img = copy.deepcopy(img) + results['img'] = img + results['img_shape'] = img.shape + results['ori_shape'] = img.shape + # Set initial values for default meta_keys + results['pad_shape'] = img.shape + results['scale_factor'] = 1.0 + + results = transform(results) + + inv_gamma = 1.0 / 1.2 + table = np.array([((i / 255.0)**inv_gamma) * 255 + for i in np.arange(0, 256)]).astype('uint8') + converted_img = mmcv.lut_transform( + np.array(original_img, dtype=np.uint8), table) + assert np.allclose(results['img'], converted_img) + assert str(transform) == f'AdjustGamma(gamma={1.2})' + + +def test_rotate(): + # test assertion degree should be tuple[float] or float + with pytest.raises(AssertionError): + transform = dict(type='RandomRotate', prob=0.5, degree=-10) + TRANSFORMS.build(transform) + # test assertion degree should be tuple[float] or float + with pytest.raises(AssertionError): + transform = dict(type='RandomRotate', prob=0.5, degree=(10., 20., 30.)) + TRANSFORMS.build(transform) + + transform = dict(type='RandomRotate', degree=10., prob=1.) + transform = TRANSFORMS.build(transform) + + assert str(transform) == f'RandomRotate(' \ + f'prob={1.}, ' \ + f'degree=({-10.}, {10.}), ' \ + f'pad_val={0}, ' \ + f'seg_pad_val={255}, ' \ + f'center={None}, ' \ + f'auto_bound={False})' + + results = dict() + img = mmcv.imread( + osp.join(osp.dirname(__file__), '../data/color.jpg'), 'color') + h, w, _ = img.shape + seg = np.array( + Image.open(osp.join(osp.dirname(__file__), '../data/seg.png'))) + results['img'] = img + results['gt_semantic_seg'] = seg + results['seg_fields'] = ['gt_semantic_seg'] + results['img_shape'] = img.shape + results['ori_shape'] = img.shape + # Set initial values for default meta_keys + results['pad_shape'] = img.shape + results['scale_factor'] = 1.0 + + results = transform(results) + assert results['img'].shape[:2] == (h, w) + + +def test_seg_rescale(): + results = dict() + seg = np.array( + Image.open(osp.join(osp.dirname(__file__), '../data/seg.png'))) + results['gt_semantic_seg'] = seg + results['seg_fields'] = ['gt_semantic_seg'] + h, w = seg.shape + + transform = dict(type='SegRescale', scale_factor=1. 
/ 2) + rescale_module = TRANSFORMS.build(transform) + rescale_results = rescale_module(results.copy()) + assert rescale_results['gt_semantic_seg'].shape == (h // 2, w // 2) + + transform = dict(type='SegRescale', scale_factor=1) + rescale_module = TRANSFORMS.build(transform) + rescale_results = rescale_module(results.copy()) + assert rescale_results['gt_semantic_seg'].shape == (h, w) + + +def test_mosaic(): + # test prob + with pytest.raises(AssertionError): + transform = dict(type='RandomMosaic', prob=1.5) + TRANSFORMS.build(transform) + # test assertion for invalid img_scale + with pytest.raises(AssertionError): + transform = dict(type='RandomMosaic', prob=1, img_scale=640) + TRANSFORMS.build(transform) + + results = dict() + img = mmcv.imread( + osp.join(osp.dirname(__file__), '../data/color.jpg'), 'color') + seg = np.array( + Image.open(osp.join(osp.dirname(__file__), '../data/seg.png'))) + + results['img'] = img + results['gt_semantic_seg'] = seg + results['seg_fields'] = ['gt_semantic_seg'] + + transform = dict(type='RandomMosaic', prob=1, img_scale=(10, 12)) + mosaic_module = TRANSFORMS.build(transform) + assert 'Mosaic' in repr(mosaic_module) + + # test assertion for invalid mix_results + with pytest.raises(AssertionError): + mosaic_module(results) + + results['mix_results'] = [copy.deepcopy(results)] * 3 + results = mosaic_module(results) + assert results['img'].shape[:2] == (20, 24) + + results = dict() + results['img'] = img[:, :, 0] + results['gt_semantic_seg'] = seg + results['seg_fields'] = ['gt_semantic_seg'] + + transform = dict(type='RandomMosaic', prob=0, img_scale=(10, 12)) + mosaic_module = TRANSFORMS.build(transform) + results['mix_results'] = [copy.deepcopy(results)] * 3 + results = mosaic_module(results) + assert results['img'].shape[:2] == img.shape[:2] + + transform = dict(type='RandomMosaic', prob=1, img_scale=(10, 12)) + mosaic_module = TRANSFORMS.build(transform) + results = mosaic_module(results) + assert results['img'].shape[:2] == (20, 24) + + results = dict() + results['img'] = np.concatenate((img, img), axis=2) + results['gt_semantic_seg'] = seg + results['seg_fields'] = ['gt_semantic_seg'] + + transform = dict(type='RandomMosaic', prob=1, img_scale=(10, 12)) + mosaic_module = TRANSFORMS.build(transform) + results['mix_results'] = [copy.deepcopy(results)] * 3 + results = mosaic_module(results) + assert results['img'].shape[2] == 6 + + +def test_cutout(): + # test prob + with pytest.raises(AssertionError): + transform = dict(type='RandomCutOut', prob=1.5, n_holes=1) + TRANSFORMS.build(transform) + # test n_holes + with pytest.raises(AssertionError): + transform = dict( + type='RandomCutOut', prob=0.5, n_holes=(5, 3), cutout_shape=(8, 8)) + TRANSFORMS.build(transform) + with pytest.raises(AssertionError): + transform = dict( + type='RandomCutOut', + prob=0.5, + n_holes=(3, 4, 5), + cutout_shape=(8, 8)) + TRANSFORMS.build(transform) + # test cutout_shape and cutout_ratio + with pytest.raises(AssertionError): + transform = dict( + type='RandomCutOut', prob=0.5, n_holes=1, cutout_shape=8) + TRANSFORMS.build(transform) + with pytest.raises(AssertionError): + transform = dict( + type='RandomCutOut', prob=0.5, n_holes=1, cutout_ratio=0.2) + TRANSFORMS.build(transform) + # either of cutout_shape and cutout_ratio should be given + with pytest.raises(AssertionError): + transform = dict(type='RandomCutOut', prob=0.5, n_holes=1) + TRANSFORMS.build(transform) + with pytest.raises(AssertionError): + transform = dict( + type='RandomCutOut', + prob=0.5, + n_holes=1, + 
cutout_shape=(2, 2), + cutout_ratio=(0.4, 0.4)) + TRANSFORMS.build(transform) + # test seg_fill_in + with pytest.raises(AssertionError): + transform = dict( + type='RandomCutOut', + prob=0.5, + n_holes=1, + cutout_shape=(8, 8), + seg_fill_in='a') + TRANSFORMS.build(transform) + with pytest.raises(AssertionError): + transform = dict( + type='RandomCutOut', + prob=0.5, + n_holes=1, + cutout_shape=(8, 8), + seg_fill_in=256) + TRANSFORMS.build(transform) + + results = dict() + img = mmcv.imread( + osp.join(osp.dirname(__file__), '../data/color.jpg'), 'color') + + seg = np.array( + Image.open(osp.join(osp.dirname(__file__), '../data/seg.png'))) + + results['img'] = img + results['gt_semantic_seg'] = seg + results['seg_fields'] = ['gt_semantic_seg'] + results['img_shape'] = img.shape + results['ori_shape'] = img.shape + results['pad_shape'] = img.shape + results['img_fields'] = ['img'] + + transform = dict( + type='RandomCutOut', prob=1, n_holes=1, cutout_shape=(10, 10)) + cutout_module = TRANSFORMS.build(transform) + assert 'cutout_shape' in repr(cutout_module) + cutout_result = cutout_module(copy.deepcopy(results)) + assert cutout_result['img'].sum() < img.sum() + + transform = dict( + type='RandomCutOut', prob=1, n_holes=1, cutout_ratio=(0.8, 0.8)) + cutout_module = TRANSFORMS.build(transform) + assert 'cutout_ratio' in repr(cutout_module) + cutout_result = cutout_module(copy.deepcopy(results)) + assert cutout_result['img'].sum() < img.sum() + + transform = dict( + type='RandomCutOut', prob=0, n_holes=1, cutout_ratio=(0.8, 0.8)) + cutout_module = TRANSFORMS.build(transform) + cutout_result = cutout_module(copy.deepcopy(results)) + assert cutout_result['img'].sum() == img.sum() + assert cutout_result['gt_semantic_seg'].sum() == seg.sum() + + transform = dict( + type='RandomCutOut', + prob=1, + n_holes=(2, 4), + cutout_shape=[(10, 10), (15, 15)], + fill_in=(255, 255, 255), + seg_fill_in=None) + cutout_module = TRANSFORMS.build(transform) + cutout_result = cutout_module(copy.deepcopy(results)) + assert cutout_result['img'].sum() > img.sum() + assert cutout_result['gt_semantic_seg'].sum() == seg.sum() + + transform = dict( + type='RandomCutOut', + prob=1, + n_holes=1, + cutout_ratio=(0.8, 0.8), + fill_in=(255, 255, 255), + seg_fill_in=255) + cutout_module = TRANSFORMS.build(transform) + cutout_result = cutout_module(copy.deepcopy(results)) + assert cutout_result['img'].sum() > img.sum() + assert cutout_result['gt_semantic_seg'].sum() > seg.sum() + + +def test_resize_to_multiple(): + transform = dict(type='ResizeToMultiple', size_divisor=32) + transform = TRANSFORMS.build(transform) + + img = np.random.randn(213, 232, 3) + seg = np.random.randint(0, 19, (213, 232)) + results = dict() + results['img'] = img + results['gt_semantic_seg'] = seg + results['seg_fields'] = ['gt_semantic_seg'] + results['img_shape'] = img.shape + results['pad_shape'] = img.shape + + results = transform(results) + assert results['img'].shape == (224, 256, 3) + assert results['gt_semantic_seg'].shape == (224, 256) + assert results['img_shape'] == (224, 256) + + +def test_generate_edge(): + transform = dict(type='GenerateEdge', edge_width=1) + transform = TRANSFORMS.build(transform) + + seg_map = np.array([ + [1, 1, 1, 1, 1], + [1, 1, 1, 1, 2], + [1, 1, 1, 2, 2], + [1, 1, 2, 2, 2], + [1, 2, 2, 2, 2], + [2, 2, 2, 2, 2], + ]) + results = dict() + results['gt_seg_map'] = seg_map + results['img_shape'] = seg_map.shape + + results = transform(results) + assert np.all(results['gt_edge_map'] == np.array([ + [0, 0, 0, 1, 0], + [0, 
0, 1, 1, 1], + [0, 1, 1, 1, 0], + [1, 1, 1, 0, 0], + [1, 1, 0, 0, 0], + [1, 0, 0, 0, 0], + ])) + + +def test_biomedical3d_random_crop(): + # test assertion for invalid random crop + with pytest.raises(AssertionError): + transform = dict(type='BioMedical3DRandomCrop', crop_shape=(-2, -1, 0)) + transform = TRANSFORMS.build(transform) + + from mmseg.datasets.transforms import (LoadBiomedicalAnnotation, + LoadBiomedicalImageFromFile) + results = dict() + results['img_path'] = osp.join( + osp.dirname(__file__), '../data', 'biomedical.nii.gz') + transform = LoadBiomedicalImageFromFile() + results = transform(copy.deepcopy(results)) + + results['seg_map_path'] = osp.join( + osp.dirname(__file__), '../data', 'biomedical_ann.nii.gz') + transform = LoadBiomedicalAnnotation() + results = transform(copy.deepcopy(results)) + + d, h, w = results['img_shape'] + transform = dict( + type='BioMedical3DRandomCrop', + crop_shape=(d - 20, h - 20, w - 20), + keep_foreground=True) + transform = TRANSFORMS.build(transform) + crop_results = transform(results) + assert crop_results['img'].shape[1:] == (d - 20, h - 20, w - 20) + assert crop_results['img_shape'] == (d - 20, h - 20, w - 20) + assert crop_results['gt_seg_map'].shape == (d - 20, h - 20, w - 20) + + transform = dict( + type='BioMedical3DRandomCrop', + crop_shape=(d - 20, h - 20, w - 20), + keep_foreground=False) + transform = TRANSFORMS.build(transform) + crop_results = transform(results) + assert crop_results['img'].shape[1:] == (d - 20, h - 20, w - 20) + assert crop_results['img_shape'] == (d - 20, h - 20, w - 20) + assert crop_results['gt_seg_map'].shape == (d - 20, h - 20, w - 20) + + +def test_biomedical_gaussian_noise(): + # test assertion for invalid prob + with pytest.raises(AssertionError): + transform = dict(type='BioMedicalGaussianNoise', prob=1.5) + TRANSFORMS.build(transform) + + # test assertion for invalid std + with pytest.raises(AssertionError): + transform = dict( + type='BioMedicalGaussianNoise', prob=0.2, mean=0.5, std=-0.5) + TRANSFORMS.build(transform) + + transform = dict(type='BioMedicalGaussianNoise', prob=1.0) + noise_module = TRANSFORMS.build(transform) + assert str(noise_module) == 'BioMedicalGaussianNoise'\ + '(prob=1.0, ' \ + 'mean=0.0, ' \ + 'std=0.1)' + + transform = dict(type='BioMedicalGaussianNoise', prob=1.0) + noise_module = TRANSFORMS.build(transform) + results = dict( + img_path=osp.join(osp.dirname(__file__), '../data/biomedical.nii.gz')) + from mmseg.datasets.transforms import LoadBiomedicalImageFromFile + transform = LoadBiomedicalImageFromFile() + results = transform(copy.deepcopy(results)) + original_img = copy.deepcopy(results['img']) + results = noise_module(results) + assert original_img.shape == results['img'].shape + + +def test_biomedical_gaussian_blur(): + # test assertion for invalid prob + with pytest.raises(AssertionError): + transform = dict(type='BioMedicalGaussianBlur', prob=-1.5) + TRANSFORMS.build(transform) + with pytest.raises(AssertionError): + transform = dict( + type='BioMedicalGaussianBlur', prob=1.0, sigma_range=0.6) + smooth_module = TRANSFORMS.build(transform) + + with pytest.raises(AssertionError): + transform = dict( + type='BioMedicalGaussianBlur', prob=1.0, sigma_range=(0.6)) + smooth_module = TRANSFORMS.build(transform) + + with pytest.raises(AssertionError): + transform = dict( + type='BioMedicalGaussianBlur', prob=1.0, sigma_range=(15, 8, 9)) + TRANSFORMS.build(transform) + + with pytest.raises(AssertionError): + transform = dict( + type='BioMedicalGaussianBlur', prob=1.0, 
sigma_range='0.16') + TRANSFORMS.build(transform) + + transform = dict( + type='BioMedicalGaussianBlur', prob=1.0, sigma_range=(0.7, 0.8)) + smooth_module = TRANSFORMS.build(transform) + assert str( + smooth_module + ) == 'BioMedicalGaussianBlur(prob=1.0, ' \ + 'prob_per_channel=0.5, '\ + 'sigma_range=(0.7, 0.8), ' \ + 'different_sigma_per_channel=True, '\ + 'different_sigma_per_axis=True)' + + transform = dict(type='BioMedicalGaussianBlur', prob=1.0) + smooth_module = TRANSFORMS.build(transform) + assert str( + smooth_module + ) == 'BioMedicalGaussianBlur(prob=1.0, ' \ + 'prob_per_channel=0.5, '\ + 'sigma_range=(0.5, 1.0), ' \ + 'different_sigma_per_channel=True, '\ + 'different_sigma_per_axis=True)' + + results = dict( + img_path=osp.join(osp.dirname(__file__), '../data/biomedical.nii.gz')) + from mmseg.datasets.transforms import LoadBiomedicalImageFromFile + transform = LoadBiomedicalImageFromFile() + results = transform(copy.deepcopy(results)) + original_img = copy.deepcopy(results['img']) + results = smooth_module(results) + assert original_img.shape == results['img'].shape + # smoothing cannot increase the max value, so it should be no greater than the original one + assert original_img.max() >= results['img'].max() + assert original_img.min() <= results['img'].min() + + transform = dict( + type='BioMedicalGaussianBlur', + prob=1.0, + different_sigma_per_axis=False) + smooth_module = TRANSFORMS.build(transform) + + results = dict( + img_path=osp.join(osp.dirname(__file__), '../data/biomedical.nii.gz')) + from mmseg.datasets.transforms import LoadBiomedicalImageFromFile + transform = LoadBiomedicalImageFromFile() + results = transform(copy.deepcopy(results)) + original_img = copy.deepcopy(results['img']) + results = smooth_module(results) + assert original_img.shape == results['img'].shape + # smoothing cannot increase the max value, so it should be no greater than the original one + assert original_img.max() >= results['img'].max() + assert original_img.min() <= results['img'].min() + + +def test_BioMedicalRandomGamma(): + + with pytest.raises(AssertionError): + transform = dict( + type='BioMedicalRandomGamma', prob=-1, gamma_range=(0.7, 1.2)) + TRANSFORMS.build(transform) + + with pytest.raises(AssertionError): + transform = dict( + type='BioMedicalRandomGamma', prob=1.2, gamma_range=(0.7, 1.2)) + TRANSFORMS.build(transform) + + with pytest.raises(AssertionError): + transform = dict( + type='BioMedicalRandomGamma', prob=1.0, gamma_range=(0.7)) + TRANSFORMS.build(transform) + + with pytest.raises(AssertionError): + transform = dict( + type='BioMedicalRandomGamma', + prob=1.0, + gamma_range=(0.7, 0.2, 0.3)) + TRANSFORMS.build(transform) + + with pytest.raises(AssertionError): + transform = dict( + type='BioMedicalRandomGamma', + prob=1.0, + gamma_range=(0.7, 2), + invert_image=1) + TRANSFORMS.build(transform) + + with pytest.raises(AssertionError): + transform = dict( + type='BioMedicalRandomGamma', + prob=1.0, + gamma_range=(0.7, 2), + per_channel=1) + TRANSFORMS.build(transform) + + with pytest.raises(AssertionError): + transform = dict( + type='BioMedicalRandomGamma', + prob=1.0, + gamma_range=(0.7, 2), + retain_stats=1) + TRANSFORMS.build(transform) + + test_img = 'tests/data/biomedical.nii.gz' + results = dict(img_path=test_img) + transform = LoadBiomedicalImageFromFile() + results = transform(copy.deepcopy(results)) + origin_img = results['img'] + transform2 = dict( + type='BioMedicalRandomGamma', + prob=1.0, + gamma_range=(0.7, 2), + ) + transform2 = TRANSFORMS.build(transform2) + results = transform2(results)
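+    # With prob=1.0, a gamma value is drawn from gamma_range=(0.7, 2) and a
+    # power-law mapping is applied to the intensities; the array shape is
+    # unchanged, which is what the check below asserts.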
+ transformed_img = results['img'] + assert origin_img.shape == transformed_img.shape + + +def test_BioMedical3DPad(): + # test assertion. + with pytest.raises(AssertionError): + transform = dict(type='BioMedical3DPad', pad_shape=None) + TRANSFORMS.build(transform) + + with pytest.raises(AssertionError): + transform = dict(type='BioMedical3DPad', pad_shape=[256, 256]) + TRANSFORMS.build(transform) + + data_info1 = dict(img=np.random.random((8, 6, 4, 4))) + + transform = dict(type='BioMedical3DPad', pad_shape=(6, 6, 6)) + transform = TRANSFORMS.build(transform) + results = transform(copy.deepcopy(data_info1)) + assert results['img'].shape[1:] == (6, 6, 6) + assert results['pad_shape'] == (6, 6, 6) + + transform = dict(type='BioMedical3DPad', pad_shape=(4, 6, 6)) + transform = TRANSFORMS.build(transform) + results = transform(copy.deepcopy(data_info1)) + assert results['img'].shape[1:] == (6, 6, 6) + assert results['pad_shape'] == (6, 6, 6) + + data_info2 = dict( + img=np.random.random((8, 6, 4, 4)), + gt_seg_map=np.random.randint(0, 2, (6, 4, 4))) + + transform = dict(type='BioMedical3DPad', pad_shape=(6, 6, 6)) + transform = TRANSFORMS.build(transform) + results = transform(copy.deepcopy(data_info2)) + assert results['img'].shape[1:] == (6, 6, 6) + assert results['gt_seg_map'].shape[1:] == (6, 6, 6) + assert results['pad_shape'] == (6, 6, 6) + + transform = dict(type='BioMedical3DPad', pad_shape=(4, 6, 6)) + transform = TRANSFORMS.build(transform) + results = transform(copy.deepcopy(data_info2)) + assert results['img'].shape[1:] == (6, 6, 6) + assert results['gt_seg_map'].shape[1:] == (6, 6, 6) + assert results['pad_shape'] == (6, 6, 6) + + +def test_biomedical_3d_flip(): + # test assertion for invalid prob + with pytest.raises(AssertionError): + transform = dict(type='BioMedical3DRandomFlip', prob=1.5, axes=(0, 1)) + transform = TRANSFORMS.build(transform) + + # test assertion for invalid direction + with pytest.raises(AssertionError): + transform = dict(type='BioMedical3DRandomFlip', prob=1, axes=(0, 1, 3)) + transform = TRANSFORMS.build(transform) + + # test flip axes are (0, 1, 2) + transform = dict(type='BioMedical3DRandomFlip', prob=1, axes=(0, 1, 2)) + transform = TRANSFORMS.build(transform) + + # test with random 3d data + results = dict() + results['img_path'] = 'Null' + results['img_shape'] = (1, 16, 16, 16) + results['img'] = np.random.randn(1, 16, 16, 16) + results['gt_seg_map'] = np.random.randint(0, 4, (16, 16, 16)) + + original_img = results['img'].copy() + original_seg = results['gt_seg_map'].copy() + + # flip first time + results = transform(results) + with pytest.raises(AssertionError): + assert np.equal(original_img, results['img']).all() + with pytest.raises(AssertionError): + assert np.equal(original_seg, results['gt_seg_map']).all() + + # flip second time + results = transform(results) + assert np.equal(original_img, results['img']).all() + assert np.equal(original_seg, results['gt_seg_map']).all() + + # test with actual data and flip axes are (0, 1) + # load biomedical 3d img and seg + data_prefix = osp.join(osp.dirname(__file__), '../data') + input_results = dict(img_path=osp.join(data_prefix, 'biomedical.npy')) + biomedical_loader = LoadBiomedicalData(with_seg=True) + data = biomedical_loader(copy.deepcopy(input_results)) + results = data.copy() + + original_img = data['img'].copy() + original_seg = data['gt_seg_map'].copy() + + # test flip axes are (0, 1) + transform = dict(type='BioMedical3DRandomFlip', prob=1, axes=(0, 1)) + transform = 
TRANSFORMS.build(transform) + + # flip first time + results = transform(results) + with pytest.raises(AssertionError): + assert np.equal(original_img, results['img']).all() + with pytest.raises(AssertionError): + assert np.equal(original_seg, results['gt_seg_map']).all() + + # flip second time: flipping twice restores the original data + results = transform(results) + assert np.equal(original_img, results['img']).all() + assert np.equal(original_seg, results['gt_seg_map']).all() + + # test transform with flip axes = (1, ) + transform = dict(type='BioMedical3DRandomFlip', prob=1, axes=(1, )) + transform = TRANSFORMS.build(transform) + results = data.copy() + results = transform(results) + results = transform(results) + assert np.equal(original_img, results['img']).all() + assert np.equal(original_seg, results['gt_seg_map']).all() + + # test transform with swap_label_pairs + transform = dict( + type='BioMedical3DRandomFlip', + prob=1, + axes=(1, 2), + swap_label_pairs=[(0, 1)]) + transform = TRANSFORMS.build(transform) + results = data.copy() + results = transform(results) + + with pytest.raises(AssertionError): + assert np.equal(original_seg, results['gt_seg_map']).all() + + # swap twice: a second pass restores the original labels + results = transform(results) + assert np.equal(original_img, results['img']).all() + assert np.equal(original_seg, results['gt_seg_map']).all() + + +def test_albu_transform(): + results = dict( + img_path=osp.join(osp.dirname(__file__), '../data/color.jpg')) + + # Define simple pipeline + load = dict(type='LoadImageFromFile') + load = TRANSFORMS.build(load) + + albu_transform = dict( + type='Albu', transforms=[dict(type='ChannelShuffle', p=1)]) + albu_transform = TRANSFORMS.build(albu_transform) + + normalize = dict(type='Normalize', mean=[0] * 3, std=[1] * 3, to_rgb=True) + normalize = TRANSFORMS.build(normalize) + + # Execute transforms + results = load(results) + results = albu_transform(results) + results = normalize(results) + + assert results['img'].dtype == np.float32 + + +def test_albu_channel_order(): + results = dict( + img_path=osp.join(osp.dirname(__file__), '../data/color.jpg')) + + # Define simple pipeline + load = dict(type='LoadImageFromFile') + load = TRANSFORMS.build(load) + + # The transform modifies only the blue channel + albu_transform = dict( + type='Albu', + transforms=[ + dict( + type='RGBShift', + r_shift_limit=0, + g_shift_limit=0, + b_shift_limit=200, + p=1) + ]) + albu_transform = TRANSFORMS.build(albu_transform) + + # Execute transforms + results_load = load(results) + results_albu = albu_transform(results_load) + + # assert the green and red channels are not modified + np.testing.assert_array_equal(results_albu['img'][..., 1:], + results_load['img'][..., 1:]) + + # assert the blue channel is modified + with pytest.raises(AssertionError): + np.testing.assert_array_equal(results_albu['img'][..., 0], + results_load['img'][..., 0]) + + +class TestRandomDepthMix(TestCase): + + def setUp(self): + self.transform = RandomDepthMix(prob=1.0) + + def test_transform_shape(self): + # Create a dummy result dict + results = { + 'img_shape': (10, 10), + 'img': np.random.rand(10, 10, 3), + 'gt_depth_map': np.random.rand(10, 10) + } + transformed = self.transform.transform(results) + + # Check if the shape remains the same + self.assertEqual(results['img'].shape, transformed['img'].shape) + + def test_transform_values(self): + # Create a dummy result dict + results = { + 'img_shape': (10, 10), + 'img': np.zeros((10, 10, 3)), + 'gt_depth_map': np.ones((10, 10)) + } + transformed = self.transform.transform(results) + + # Assuming the transformation 
modifies a portion of the image, + # it shouldn't remain all zeros + self.assertFalse(np.all(transformed['img'] == 0)) + + def test_invalid_image_dimension(self): + # Create a dummy result dict with invalid image dimension + results = { + 'img_shape': (10, 10), + 'img': np.random.rand(10, 10, 3, 3), + 'gt_depth_map': np.random.rand(10, 10) + } + + # Check if a ValueError is raised for invalid dimension + with self.assertRaises(ValueError): + self.transform.transform(results) diff --git a/tests/test_datasets/test_tta.py b/tests/test_datasets/test_tta.py index 6fd4857280..25b1ecdb53 100644 --- a/tests/test_datasets/test_tta.py +++ b/tests/test_datasets/test_tta.py @@ -9,44 +9,20 @@ def test_multi_scale_flip_aug(): - # test assertion if scales=None, scale_factor=1 (not float). - with pytest.raises(AssertionError): + # test exception + with pytest.raises(TypeError): tta_transform = dict( - type='MultiScaleFlipAug', - scales=None, - scale_factor=1, + type='TestTimeAug', transforms=[dict(type='Resize', keep_ratio=False)], ) TRANSFORMS.build(tta_transform) - # test assertion if scales=None, scale_factor=None. - with pytest.raises(AssertionError): - tta_transform = dict( - type='MultiScaleFlipAug', - scales=None, - scale_factor=None, - transforms=[dict(type='Resize', keep_ratio=False)], - ) - TRANSFORMS.build(tta_transform) - - # test assertion if scales=(512, 512), scale_factor=1 (not float). - with pytest.raises(AssertionError): - tta_transform = dict( - type='MultiScaleFlipAug', - scales=(512, 512), - scale_factor=1, - transforms=[dict(type='Resize', keep_ratio=False)], - ) - TRANSFORMS.build(tta_transform) - meta_keys = ('img', 'ori_shape', 'ori_height', 'ori_width', 'pad_shape', - 'scale_factor', 'scale', 'flip') tta_transform = dict( - type='MultiScaleFlipAug', - scales=[(256, 256), (512, 512), (1024, 1024)], - allow_flip=False, - resize_cfg=dict(type='Resize', keep_ratio=False), - transforms=[dict(type='mmseg.PackSegInputs', meta_keys=meta_keys)], - ) + type='TestTimeAug', + transforms=[[ + dict(type='Resize', scale=scale, keep_ratio=False) + for scale in [(256, 256), (512, 512), (1024, 1024)] + ], [dict(type='mmseg.PackSegInputs')]]) tta_module = TRANSFORMS.build(tta_transform) results = dict() @@ -62,90 +38,94 @@ def test_multi_scale_flip_aug(): results['scale_factor'] = 1.0 tta_results = tta_module(results.copy()) - assert [data_sample.scale - for data_sample in tta_results['data_sample']] == [(256, 256), - (512, 512), - (1024, 1024)] - assert [data_sample.flip for data_sample in tta_results['data_sample'] - ] == [False, False, False] + assert [img.shape for img in tta_results['inputs']] == [(3, 256, 256), + (3, 512, 512), + (3, 1024, 1024)] tta_transform = dict( - type='MultiScaleFlipAug', - scales=[(256, 256), (512, 512), (1024, 1024)], - allow_flip=True, - resize_cfg=dict(type='Resize', keep_ratio=False), - transforms=[dict(type='mmseg.PackSegInputs', meta_keys=meta_keys)], - ) + type='TestTimeAug', + transforms=[ + [ + dict(type='Resize', scale=scale, keep_ratio=False) + for scale in [(256, 256), (512, 512), (1024, 1024)] + ], + [ + dict(type='RandomFlip', prob=0., direction='horizontal'), + dict(type='RandomFlip', prob=1., direction='horizontal') + ], [dict(type='mmseg.PackSegInputs')] + ]) tta_module = TRANSFORMS.build(tta_transform) - tta_results = tta_module(results.copy()) - assert [data_sample.scale - for data_sample in tta_results['data_sample']] == [(256, 256), - (256, 256), - (512, 512), - (512, 512), - (1024, 1024), - (1024, 1024)] - assert [data_sample.flip for data_sample 
in tta_results['data_sample'] - ] == [False, True, False, True, False, True] + tta_results: dict = tta_module(results.copy()) + assert [img.shape for img in tta_results['inputs']] == [(3, 256, 256), + (3, 256, 256), + (3, 512, 512), + (3, 512, 512), + (3, 1024, 1024), + (3, 1024, 1024)] + assert [ + data_sample.metainfo['flip'] + for data_sample in tta_results['data_samples'] + ] == [False, True, False, True, False, True] tta_transform = dict( - type='MultiScaleFlipAug', - scales=[(512, 512)], - allow_flip=False, - resize_cfg=dict(type='Resize', keep_ratio=False), - transforms=[dict(type='mmseg.PackSegInputs', meta_keys=meta_keys)], - ) + type='TestTimeAug', + transforms=[[dict(type='Resize', scale=(512, 512), keep_ratio=False)], + [dict(type='mmseg.PackSegInputs')]]) tta_module = TRANSFORMS.build(tta_transform) tta_results = tta_module(results.copy()) - assert [tta_results['data_sample'][0].scale] == [(512, 512)] - assert [tta_results['data_sample'][0].flip] == [False] + assert [tta_results['inputs'][0].shape] == [(3, 512, 512)] tta_transform = dict( - type='MultiScaleFlipAug', - scales=[(512, 512)], - allow_flip=True, - resize_cfg=dict(type='Resize', keep_ratio=False), - transforms=[dict(type='mmseg.PackSegInputs', meta_keys=meta_keys)], - ) + type='TestTimeAug', + transforms=[ + [dict(type='Resize', scale=(512, 512), keep_ratio=False)], + [ + dict(type='RandomFlip', prob=0., direction='horizontal'), + dict(type='RandomFlip', prob=1., direction='horizontal') + ], [dict(type='mmseg.PackSegInputs')] + ]) tta_module = TRANSFORMS.build(tta_transform) tta_results = tta_module(results.copy()) - assert [data_sample.scale - for data_sample in tta_results['data_sample']] == [(512, 512), - (512, 512)] - assert [data_sample.flip - for data_sample in tta_results['data_sample']] == [False, True] + assert [img.shape for img in tta_results['inputs']] == [(3, 512, 512), + (3, 512, 512)] + assert [ + data_sample.metainfo['flip'] + for data_sample in tta_results['data_samples'] + ] == [False, True] tta_transform = dict( - type='MultiScaleFlipAug', - scale_factor=[0.5, 1.0, 2.0], - allow_flip=False, - resize_cfg=dict(type='Resize', keep_ratio=False), - transforms=[dict(type='mmseg.PackSegInputs', meta_keys=meta_keys)], - ) + type='TestTimeAug', + transforms=[[ + dict(type='Resize', scale_factor=r, keep_ratio=False) + for r in [0.5, 1.0, 2.0] + ], [dict(type='mmseg.PackSegInputs')]]) tta_module = TRANSFORMS.build(tta_transform) tta_results = tta_module(results.copy()) - assert [data_sample.scale - for data_sample in tta_results['data_sample']] == [(256, 144), - (512, 288), - (1024, 576)] - assert [data_sample.flip for data_sample in tta_results['data_sample'] - ] == [False, False, False] + assert [img.shape for img in tta_results['inputs']] == [(3, 144, 256), + (3, 288, 512), + (3, 576, 1024)] tta_transform = dict( - type='MultiScaleFlipAug', - scale_factor=[0.5, 1.0, 2.0], - allow_flip=True, - resize_cfg=dict(type='Resize', keep_ratio=False), - transforms=[dict(type='mmseg.PackSegInputs', meta_keys=meta_keys)], - ) + type='TestTimeAug', + transforms=[ + [ + dict(type='Resize', scale_factor=r, keep_ratio=True) + for r in [0.5, 1.0, 2.0] + ], + [ + dict(type='RandomFlip', prob=0., direction='horizontal'), + dict(type='RandomFlip', prob=1., direction='horizontal') + ], [dict(type='mmseg.PackSegInputs')] + ]) tta_module = TRANSFORMS.build(tta_transform) tta_results = tta_module(results.copy()) - assert [data_sample.scale - for data_sample in tta_results['data_sample']] == [(256, 144), - (256, 144), - (512, 
288), - (512, 288), - (1024, 576), - (1024, 576)] - assert [data_sample.flip for data_sample in tta_results['data_sample'] - ] == [False, True, False, True, False, True] + assert [img.shape for img in tta_results['inputs']] == [(3, 144, 256), + (3, 144, 256), + (3, 288, 512), + (3, 288, 512), + (3, 576, 1024), + (3, 576, 1024)] + assert [ + data_sample.metainfo['flip'] + for data_sample in tta_results['data_samples'] + ] == [False, True, False, True, False, True] diff --git a/tests/test_engine/test_layer_decay_optimizer_constructor.py b/tests/test_engine/test_layer_decay_optimizer_constructor.py index 72dc6c5123..e7d13db1d6 100644 --- a/tests/test_engine/test_layer_decay_optimizer_constructor.py +++ b/tests/test_engine/test_layer_decay_optimizer_constructor.py @@ -5,12 +5,12 @@ import torch.nn as nn from mmcv.cnn import ConvModule from mmengine.optim.optimizer import build_optim_wrapper +from mmengine.registry import init_default_scope from mmseg.engine.optimizers.layer_decay_optimizer_constructor import \ LearningRateDecayOptimizerConstructor -from mmseg.utils import register_all_modules -register_all_modules() +init_default_scope('mmseg') base_lr = 1 decay_rate = 2 diff --git a/tests/test_engine/test_visualization_hook.py b/tests/test_engine/test_visualization_hook.py new file mode 100644 index 0000000000..274b0e547f --- /dev/null +++ b/tests/test_engine/test_visualization_hook.py @@ -0,0 +1,63 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase +from unittest.mock import Mock + +import torch +from mmengine.structures import PixelData + +from mmseg.engine.hooks import SegVisualizationHook +from mmseg.structures import SegDataSample +from mmseg.visualization import SegLocalVisualizer + + +class TestVisualizationHook(TestCase): + + def setUp(self) -> None: + + h = 288 + w = 512 + num_class = 2 + + SegLocalVisualizer.get_instance('visualizer') + SegLocalVisualizer.dataset_meta = dict( + classes=('background', 'foreground'), + palette=[[120, 120, 120], [6, 230, 230]]) + + data_sample = SegDataSample() + data_sample.set_metainfo({'img_path': 'tests/data/color.jpg'}) + self.data_batch = [{'data_sample': data_sample}] * 2 + + pred_sem_seg_data = dict(data=torch.randint(0, num_class, (1, h, w))) + pred_sem_seg = PixelData(**pred_sem_seg_data) + pred_seg_data_sample = SegDataSample() + pred_seg_data_sample.set_metainfo({'img_path': 'tests/data/color.jpg'}) + pred_seg_data_sample.pred_sem_seg = pred_sem_seg + self.outputs = [pred_seg_data_sample] * 2 + + def test_after_iter(self): + runner = Mock() + runner.iter = 1 + hook = SegVisualizationHook(draw=True, interval=1) + hook._after_iter( + runner, 1, self.data_batch, self.outputs, mode='train') + hook._after_iter(runner, 1, self.data_batch, self.outputs, mode='val') + hook._after_iter(runner, 1, self.data_batch, self.outputs, mode='test') + + def test_after_val_iter(self): + runner = Mock() + runner.iter = 2 + hook = SegVisualizationHook(interval=1) + hook.after_val_iter(runner, 1, self.data_batch, self.outputs) + + hook = SegVisualizationHook(draw=True, interval=1) + hook.after_val_iter(runner, 1, self.data_batch, self.outputs) + + hook = SegVisualizationHook( + draw=True, interval=1, show=True, wait_time=1) + hook.after_val_iter(runner, 1, self.data_batch, self.outputs) + + def test_after_test_iter(self): + runner = Mock() + runner.iter = 3 + hook = SegVisualizationHook(draw=True, interval=1) + hook.after_test_iter(runner, 1, self.data_batch, self.outputs) diff --git 
a/tests/test_evaluation/test_metrics/test_citys_metric.py b/tests/test_evaluation/test_metrics/test_citys_metric.py new file mode 100644 index 0000000000..06f956f54a --- /dev/null +++ b/tests/test_evaluation/test_metrics/test_citys_metric.py @@ -0,0 +1,119 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import shutil +from unittest import TestCase + +import numpy as np +import pytest +import torch +from mmengine.structures import PixelData + +from mmseg.evaluation import CityscapesMetric +from mmseg.structures import SegDataSample + + +class TestCityscapesMetric(TestCase): + + def _demo_mm_inputs(self, + batch_size=1, + image_shapes=(3, 128, 256), + num_classes=5): + """Create a superset of inputs needed to run test or train batches. + + Args: + batch_size (int): batch size. Defaults to 1. + image_shapes (List[tuple], Optional): image shape. + Defaults to (3, 128, 256). + num_classes (int): number of different classes. + Defaults to 5. + """ + if isinstance(image_shapes, list): + assert len(image_shapes) == batch_size + else: + image_shapes = [image_shapes] * batch_size + + packed_inputs = [] + for idx in range(batch_size): + image_shape = image_shapes[idx] + _, h, w = image_shape + + data_sample = SegDataSample() + gt_semantic_seg = np.random.randint( + 0, num_classes, (1, h, w), dtype=np.uint8) + gt_semantic_seg = torch.LongTensor(gt_semantic_seg) + gt_sem_seg_data = dict(data=gt_semantic_seg) + data_sample.gt_sem_seg = PixelData(**gt_sem_seg_data) + data_sample = data_sample.to_dict() + data_sample[ + 'seg_map_path'] = 'tests/data/pseudo_cityscapes_dataset/gtFine/val/frankfurt/frankfurt_000000_000294_gtFine_labelTrainIds.png' # noqa + packed_inputs.append(data_sample) + + return packed_inputs + + def _demo_mm_model_output(self, + batch_size=1, + image_shapes=(3, 128, 256), + num_classes=5): + """Create a superset of fake model predictions to test the metric. + + Args: + batch_size (int): batch size. Defaults to 1. + image_shapes (List[tuple], Optional): image shape. + Defaults to (3, 128, 256). + num_classes (int): number of different classes. + Defaults to 5. 
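+ + Returns: + list[dict]: fake predictions, one ``SegDataSample``-style dict + per image, each carrying the ``img_path`` of the test image.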
+ """ + results_dict = dict() + _, h, w = image_shapes + seg_logit = torch.randn(batch_size, num_classes, h, w) + results_dict['seg_logits'] = seg_logit + seg_pred = np.random.randint( + 0, num_classes, (batch_size, h, w), dtype=np.uint8) + seg_pred = torch.LongTensor(seg_pred) + results_dict['pred_sem_seg'] = seg_pred + + batch_datasampes = [ + SegDataSample() + for _ in range(results_dict['pred_sem_seg'].shape[0]) + ] + for key, value in results_dict.items(): + for i in range(value.shape[0]): + setattr(batch_datasampes[i], key, PixelData(data=value[i])) + + _predictions = [] + for pred in batch_datasampes: + test_data = pred.to_dict() + test_data[ + 'img_path'] = 'tests/data/pseudo_cityscapes_dataset/leftImg8bit/val/frankfurt/frankfurt_000000_000294_leftImg8bit.png' # noqa + _predictions.append(test_data) + + return _predictions + + def test_evaluate(self): + """Test using the metric in the same way as Evalutor.""" + + data_batch = self._demo_mm_inputs(2) + predictions = self._demo_mm_model_output(2) + data_samples = [ + dict(**data, **result) + for data, result in zip(data_batch, predictions) + ] + # test keep_results should be True when format_only is True + with pytest.raises(AssertionError): + CityscapesMetric( + output_dir='tmp', format_only=True, keep_results=False) + + # test evaluate with cityscape metric + metric = CityscapesMetric(output_dir='tmp') + metric.process(data_batch, data_samples) + res = metric.evaluate(2) + self.assertIsInstance(res, dict) + + # test format_only + metric = CityscapesMetric( + output_dir='tmp', format_only=True, keep_results=True) + metric.process(data_batch, data_samples) + metric.evaluate(2) + assert osp.exists('tmp') + assert osp.isfile('tmp/frankfurt_000000_000294_leftImg8bit.png') + shutil.rmtree('tmp') diff --git a/tests/test_evaluation/test_metrics/test_depth_metric.py b/tests/test_evaluation/test_metrics/test_depth_metric.py new file mode 100644 index 0000000000..a172db8fa2 --- /dev/null +++ b/tests/test_evaluation/test_metrics/test_depth_metric.py @@ -0,0 +1,85 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import shutil +from unittest import TestCase + +import torch +from mmengine.structures import PixelData + +from mmseg.evaluation import DepthMetric +from mmseg.structures import SegDataSample + + +class TestDepthMetric(TestCase): + + def _demo_mm_inputs(self, + batch_size=2, + image_shapes=(3, 64, 64), + num_classes=5): + """Create a superset of inputs needed to run test or train batches. + + Args: + batch_size (int): batch size. Default to 2. + image_shapes (List[tuple], Optional): image shape. + Default to (3, 64, 64) + num_classes (int): number of different classes. + Default to 5. 
+ """ + if isinstance(image_shapes, list): + assert len(image_shapes) == batch_size + else: + image_shapes = [image_shapes] * batch_size + + data_samples = [] + for idx in range(batch_size): + image_shape = image_shapes[idx] + _, h, w = image_shape + + data_sample = SegDataSample() + gt_depth_map = torch.rand((1, h, w)) * 10 + data_sample.gt_depth_map = PixelData(data=gt_depth_map) + + data_samples.append(data_sample.to_dict()) + + return data_samples + + def _demo_mm_model_output(self, + data_samples, + batch_size=2, + image_shapes=(3, 64, 64), + num_classes=5): + + _, h, w = image_shapes + + for data_sample in data_samples: + data_sample['pred_depth_map'] = dict(data=torch.randn(1, h, w)) + + data_sample[ + 'img_path'] = 'tests/data/pseudo_dataset/imgs/00000_img.jpg' + return data_samples + + def test_evaluate(self): + """Test using the metric in the same way as Evalutor.""" + + data_samples = self._demo_mm_inputs() + data_samples = self._demo_mm_model_output(data_samples) + + depth_metric = DepthMetric() + depth_metric.process([0] * len(data_samples), data_samples) + res = depth_metric.compute_metrics(depth_metric.results) + self.assertIsInstance(res, dict) + + # test save depth map file in output_dir + depth_metric = DepthMetric(output_dir='tmp') + depth_metric.process([0] * len(data_samples), data_samples) + assert osp.exists('tmp') + assert osp.isfile('tmp/00000_img.png') + shutil.rmtree('tmp') + + # test format_only + depth_metric = DepthMetric(output_dir='tmp', format_only=True) + depth_metric.process([0] * len(data_samples), data_samples) + assert depth_metric.results == [] + assert osp.exists('tmp') + assert osp.isfile('tmp/00000_img.png') + shutil.rmtree('tmp') diff --git a/tests/test_evaluation/test_metrics/test_iou_metric.py b/tests/test_evaluation/test_metrics/test_iou_metric.py new file mode 100644 index 0000000000..7a0e9d53e3 --- /dev/null +++ b/tests/test_evaluation/test_metrics/test_iou_metric.py @@ -0,0 +1,104 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os.path as osp +import shutil +from unittest import TestCase + +import numpy as np +import torch +from mmengine.structures import PixelData + +from mmseg.evaluation import IoUMetric +from mmseg.structures import SegDataSample + + +class TestIoUMetric(TestCase): + + def _demo_mm_inputs(self, + batch_size=2, + image_shapes=(3, 64, 64), + num_classes=5): + """Create a superset of inputs needed to run test or train batches. + + Args: + batch_size (int): batch size. Default to 2. + image_shapes (List[tuple], Optional): image shape. + Default to (3, 64, 64) + num_classes (int): number of different classes. + Default to 5. 
+ """ + if isinstance(image_shapes, list): + assert len(image_shapes) == batch_size + else: + image_shapes = [image_shapes] * batch_size + + data_samples = [] + for idx in range(batch_size): + image_shape = image_shapes[idx] + _, h, w = image_shape + + data_sample = SegDataSample() + gt_semantic_seg = np.random.randint( + 0, num_classes, (1, h, w), dtype=np.uint8) + gt_semantic_seg = torch.LongTensor(gt_semantic_seg) + gt_sem_seg_data = dict(data=gt_semantic_seg) + data_sample.gt_sem_seg = PixelData(**gt_sem_seg_data) + + data_samples.append(data_sample.to_dict()) + + return data_samples + + def _demo_mm_model_output(self, + data_samples, + batch_size=2, + image_shapes=(3, 64, 64), + num_classes=5): + + _, h, w = image_shapes + + for data_sample in data_samples: + data_sample['seg_logits'] = dict( + data=torch.randn(num_classes, h, w)) + data_sample['pred_sem_seg'] = dict( + data=torch.randint(0, num_classes, (1, h, w))) + data_sample[ + 'img_path'] = 'tests/data/pseudo_dataset/imgs/00000_img.jpg' + return data_samples + + def test_evaluate(self): + """Test using the metric in the same way as Evalutor.""" + + data_samples = self._demo_mm_inputs() + data_samples = self._demo_mm_model_output(data_samples) + + iou_metric = IoUMetric(iou_metrics=['mIoU']) + iou_metric.dataset_meta = dict( + classes=['wall', 'building', 'sky', 'floor', 'tree'], + label_map=dict(), + reduce_zero_label=False) + iou_metric.process([0] * len(data_samples), data_samples) + res = iou_metric.evaluate(2) + self.assertIsInstance(res, dict) + + # test save segment file in output_dir + iou_metric = IoUMetric(iou_metrics=['mIoU'], output_dir='tmp') + iou_metric.dataset_meta = dict( + classes=['wall', 'building', 'sky', 'floor', 'tree'], + label_map=dict(), + reduce_zero_label=False) + iou_metric.process([0] * len(data_samples), data_samples) + assert osp.exists('tmp') + assert osp.isfile('tmp/00000_img.png') + shutil.rmtree('tmp') + + # test format_only + iou_metric = IoUMetric( + iou_metrics=['mIoU'], output_dir='tmp', format_only=True) + iou_metric.dataset_meta = dict( + classes=['wall', 'building', 'sky', 'floor', 'tree'], + label_map=dict(), + reduce_zero_label=False) + iou_metric.process([0] * len(data_samples), data_samples) + assert iou_metric.results == [] + assert osp.exists('tmp') + assert osp.isfile('tmp/00000_img.png') + shutil.rmtree('tmp') diff --git a/tests/test_metrics/test_citys_metric.py b/tests/test_metrics/test_citys_metric.py deleted file mode 100644 index 5a67bc07c4..0000000000 --- a/tests/test_metrics/test_citys_metric.py +++ /dev/null @@ -1,112 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from unittest import TestCase - -import numpy as np -import torch -from mmengine.data import BaseDataElement, PixelData - -from mmseg.data import SegDataSample -from mmseg.metrics import CitysMetric - - -class TestCitysMetric(TestCase): - - def _demo_mm_inputs(self, - batch_size=1, - image_shapes=(3, 128, 256), - num_classes=5): - """Create a superset of inputs needed to run test or train batches. - - Args: - batch_size (int): batch size. Default to 2. - image_shapes (List[tuple], Optional): image shape. - Default to (3, 64, 64) - num_classes (int): number of different classes. - Default to 5. 
- """ - if isinstance(image_shapes, list): - assert len(image_shapes) == batch_size - else: - image_shapes = [image_shapes] * batch_size - - packed_inputs = [] - for idx in range(batch_size): - image_shape = image_shapes[idx] - _, h, w = image_shape - - mm_inputs = dict() - data_sample = SegDataSample() - gt_semantic_seg = np.random.randint( - 0, num_classes, (1, h, w), dtype=np.uint8) - gt_semantic_seg = torch.LongTensor(gt_semantic_seg) - gt_sem_seg_data = dict(data=gt_semantic_seg) - data_sample.gt_sem_seg = PixelData(**gt_sem_seg_data) - mm_inputs['data_sample'] = data_sample.to_dict() - mm_inputs['data_sample']['seg_map_path'] = \ - 'tests/data/pseudo_cityscapes_dataset/gtFine/val/\ - frankfurt/frankfurt_000000_000294_gtFine_labelTrainIds.png' - - packed_inputs.append(mm_inputs) - - return packed_inputs - - def _demo_mm_model_output(self, - batch_size=1, - image_shapes=(3, 128, 256), - num_classes=5): - """Create a superset of inputs needed to run test or train batches. - - Args: - batch_size (int): batch size. Default to 2. - image_shapes (List[tuple], Optional): image shape. - Default to (3, 64, 64) - num_classes (int): number of different classes. - Default to 5. - """ - results_dict = dict() - _, h, w = image_shapes - seg_logit = torch.randn(batch_size, num_classes, h, w) - results_dict['seg_logits'] = seg_logit - seg_pred = np.random.randint( - 0, num_classes, (batch_size, h, w), dtype=np.uint8) - seg_pred = torch.LongTensor(seg_pred) - results_dict['pred_sem_seg'] = seg_pred - - batch_datasampes = [ - SegDataSample() - for _ in range(results_dict['pred_sem_seg'].shape[0]) - ] - for key, value in results_dict.items(): - for i in range(value.shape[0]): - setattr(batch_datasampes[i], key, PixelData(data=value[i])) - - _predictions = [] - for pred in batch_datasampes: - if isinstance(pred, BaseDataElement): - test_data = pred.to_dict() - test_data['img_path'] = \ - 'tests/data/pseudo_cityscapes_dataset/leftImg8bit/val/\ - frankfurt/frankfurt_000000_000294_leftImg8bit.png' - - _predictions.append(test_data) - else: - _predictions.append(pred) - return _predictions - - def test_evaluate(self): - """Test using the metric in the same way as Evalutor.""" - - data_batch = self._demo_mm_inputs() - predictions = self._demo_mm_model_output() - iou_metric = CitysMetric(citys_metrics=['cityscapes']) - iou_metric.process(data_batch, predictions) - res = iou_metric.evaluate(6) - self.assertIsInstance(res, dict) - # test to_label_id = True - iou_metric = CitysMetric( - citys_metrics=['cityscapes'], to_label_id=True) - iou_metric.process(data_batch, predictions) - res = iou_metric.evaluate(6) - self.assertIsInstance(res, dict) - import shutil - shutil.rmtree('.format_cityscapes') diff --git a/tests/test_metrics/test_iou_metric.py b/tests/test_metrics/test_iou_metric.py deleted file mode 100644 index 5f4a7522d2..0000000000 --- a/tests/test_metrics/test_iou_metric.py +++ /dev/null @@ -1,100 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from unittest import TestCase - -import numpy as np -import torch -from mmengine.data import BaseDataElement, PixelData - -from mmseg.data import SegDataSample -from mmseg.metrics import IoUMetric - - -class TestIoUMetric(TestCase): - - def _demo_mm_inputs(self, - batch_size=2, - image_shapes=(3, 64, 64), - num_classes=5): - """Create a superset of inputs needed to run test or train batches. - - Args: - batch_size (int): batch size. Default to 2. - image_shapes (List[tuple], Optional): image shape. 
- Default to (3, 64, 64) - num_classes (int): number of different classes. - Default to 5. - """ - if isinstance(image_shapes, list): - assert len(image_shapes) == batch_size - else: - image_shapes = [image_shapes] * batch_size - - packed_inputs = [] - for idx in range(batch_size): - image_shape = image_shapes[idx] - _, h, w = image_shape - - mm_inputs = dict() - data_sample = SegDataSample() - gt_semantic_seg = np.random.randint( - 0, num_classes, (1, h, w), dtype=np.uint8) - gt_semantic_seg = torch.LongTensor(gt_semantic_seg) - gt_sem_seg_data = dict(data=gt_semantic_seg) - data_sample.gt_sem_seg = PixelData(**gt_sem_seg_data) - mm_inputs['data_sample'] = data_sample.to_dict() - packed_inputs.append(mm_inputs) - - return packed_inputs - - def _demo_mm_model_output(self, - batch_size=2, - image_shapes=(3, 64, 64), - num_classes=5): - """Create a superset of inputs needed to run test or train batches. - - Args: - batch_size (int): batch size. Default to 2. - image_shapes (List[tuple], Optional): image shape. - Default to (3, 64, 64) - num_classes (int): number of different classes. - Default to 5. - """ - results_dict = dict() - _, h, w = image_shapes - seg_logit = torch.randn(batch_size, num_classes, h, w) - results_dict['seg_logits'] = seg_logit - seg_pred = np.random.randint( - 0, num_classes, (batch_size, h, w), dtype=np.uint8) - seg_pred = torch.LongTensor(seg_pred) - results_dict['pred_sem_seg'] = seg_pred - - batch_datasampes = [ - SegDataSample() - for _ in range(results_dict['pred_sem_seg'].shape[0]) - ] - for key, value in results_dict.items(): - for i in range(value.shape[0]): - setattr(batch_datasampes[i], key, PixelData(data=value[i])) - - _predictions = [] - for pred in batch_datasampes: - if isinstance(pred, BaseDataElement): - _predictions.append(pred.to_dict()) - else: - _predictions.append(pred) - return _predictions - - def test_evaluate(self): - """Test using the metric in the same way as Evalutor.""" - - data_batch = self._demo_mm_inputs() - predictions = self._demo_mm_model_output() - - iou_metric = IoUMetric(iou_metrics=['mIoU']) - iou_metric.dataset_meta = dict( - classes=['wall', 'building', 'sky', 'floor', 'tree'], - label_map=dict(), - reduce_zero_label=False) - iou_metric.process(data_batch, predictions) - res = iou_metric.evaluate(6) - self.assertIsInstance(res, dict) diff --git a/tests/test_models/test_assigners/test_hungarian_assigner.py b/tests/test_models/test_assigners/test_hungarian_assigner.py new file mode 100644 index 0000000000..2cdb1de839 --- /dev/null +++ b/tests/test_models/test_assigners/test_hungarian_assigner.py @@ -0,0 +1,77 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
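+# The cases below build HungarianAssigner with different match costs and
+# check that ``assign`` returns a one-to-one matching: matched query
+# indices are unique and the matched labels cover every ground truth.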
+from unittest import TestCase + +import torch +from mmengine.structures import InstanceData + +from mmseg.models.assigners import HungarianAssigner + + +class TestHungarianAssigner(TestCase): + + def test_init(self): + with self.assertRaises(AssertionError): + HungarianAssigner([]) + + def test_hungarian_match_assigner(self): + assigner = HungarianAssigner([ + dict(type='ClassificationCost', weight=2.0), + dict(type='CrossEntropyLossCost', weight=5.0, use_sigmoid=True), + dict(type='DiceCost', weight=5.0, pred_act=True, eps=1.0) + ]) + num_classes = 3 + num_masks = 10 + num_points = 20 + gt_instances = InstanceData() + gt_instances.labels = torch.randint(0, num_classes, (num_classes, )) + gt_instances.masks = torch.randint(0, 2, (num_classes, num_points)) + pred_instances = InstanceData() + pred_instances.scores = torch.rand((num_masks, num_classes)) + pred_instances.masks = torch.rand((num_masks, num_points)) + + matched_query_inds, matched_label_inds = \ + assigner.assign(pred_instances, gt_instances) + unique_query_inds = torch.unique(matched_query_inds) + unique_label_inds = torch.unique(matched_label_inds) + self.assertTrue(len(unique_query_inds) == len(matched_query_inds)) + self.assertTrue( + torch.equal(unique_label_inds, torch.arange(0, num_classes))) + + def test_cls_match_cost(self): + num_classes = 3 + num_masks = 10 + gt_instances = InstanceData() + gt_instances.labels = torch.randint(0, num_classes, (num_classes, )) + pred_instances = InstanceData() + pred_instances.scores = torch.rand((num_masks, num_classes)) + + # test ClassificationCost + assigner = HungarianAssigner(dict(type='ClassificationCost')) + matched_query_inds, matched_label_inds = \ + assigner.assign(pred_instances, gt_instances) + unique_query_inds = torch.unique(matched_query_inds) + unique_label_inds = torch.unique(matched_label_inds) + self.assertTrue(len(unique_query_inds) == len(matched_query_inds)) + self.assertTrue( + torch.equal(unique_label_inds, torch.arange(0, num_classes))) + + def test_mask_match_cost(self): + num_classes = 3 + num_masks = 10 + num_points = 20 + gt_instances = InstanceData() + gt_instances.masks = torch.randint(0, 2, (num_classes, num_points)) + pred_instances = InstanceData() + pred_instances.masks = torch.rand((num_masks, num_points)) + + # test DiceCost + assigner = HungarianAssigner( + dict(type='DiceCost', pred_act=True, eps=1.0)) + assign_result = assigner.assign(pred_instances, gt_instances) + self.assertTrue(len(assign_result[0]) == len(assign_result[1])) + + # test CrossEntropyLossCost + assigner = HungarianAssigner( + dict(type='CrossEntropyLossCost', use_sigmoid=True)) + assign_result = assigner.assign(pred_instances, gt_instances) + self.assertTrue(len(assign_result[0]) == len(assign_result[1])) diff --git a/tests/test_models/test_backbones/__init__.py b/tests/test_models/test_backbones/__init__.py index 8b673fa5c3..ef101fec61 100644 --- a/tests/test_models/test_backbones/__init__.py +++ b/tests/test_models/test_backbones/__init__.py @@ -1,4 +1 @@ # Copyright (c) OpenMMLab. All rights reserved. 
-from .utils import all_zeros, check_norm_state, is_block, is_norm - -__all__ = ['is_norm', 'is_block', 'all_zeros', 'check_norm_state'] diff --git a/tests/test_models/test_backbones/test_beit.py b/tests/test_models/test_backbones/test_beit.py index cf3960894d..59a12c5d09 100644 --- a/tests/test_models/test_backbones/test_beit.py +++ b/tests/test_models/test_backbones/test_beit.py @@ -140,8 +140,11 @@ def test_beit_init(): } } model = BEiT(img_size=(512, 512)) - with pytest.raises(AttributeError): - model.resize_rel_pos_embed(ckpt) + # If scipy is installed, this AttributeError would not be raised. + from mmengine.utils import is_installed + if not is_installed('scipy'): + with pytest.raises(AttributeError): + model.resize_rel_pos_embed(ckpt) # pretrained=None # init_cfg=123, whose type is unsupported diff --git a/tests/test_models/test_backbones/test_blocks.py b/tests/test_models/test_backbones/test_blocks.py index 77c8564a47..7a65d272cf 100644 --- a/tests/test_models/test_backbones/test_blocks.py +++ b/tests/test_models/test_backbones/test_blocks.py @@ -2,7 +2,8 @@ import mmcv import pytest import torch -from mmcv.utils import TORCH_VERSION, digit_version +from mmengine.utils import digit_version +from mmengine.utils.dl_utils import TORCH_VERSION from mmseg.models.utils import (InvertedResidual, InvertedResidualV3, SELayer, make_divisible) diff --git a/tests/test_models/test_backbones/test_clip_text_encoder.py b/tests/test_models/test_backbones/test_clip_text_encoder.py new file mode 100644 index 0000000000..ea06c5b5b3 --- /dev/null +++ b/tests/test_models/test_backbones/test_clip_text_encoder.py @@ -0,0 +1,43 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmengine import Config +from mmengine.registry import init_default_scope + +from mmseg.models.text_encoder import CLIPTextEncoder +from mmseg.utils import get_classes + + +def test_clip_text_encoder(): + init_default_scope('mmseg') + # test vocabulary + output_dims = 8 + embed_dims = 32 + vocabulary = ['cat', 'dog', 'bird', 'car', 'bike'] + cfg = dict( + vocabulary=vocabulary, + templates=['a photo of a {}.'], + embed_dims=embed_dims, + output_dims=output_dims) + cfg = Config(cfg) + + text_encoder = CLIPTextEncoder(**cfg) + if torch.cuda.is_available(): + text_encoder = text_encoder.cuda() + + with torch.no_grad(): + class_embeds = text_encoder() + assert class_embeds.shape == (len(vocabulary) + 1, output_dims) + + # test dataset name + cfg = dict( + dataset_name='vaihingen', + templates=['a photo of a {}.'], + embed_dims=embed_dims, + output_dims=output_dims) + cfg = Config(cfg) + + text_encoder = CLIPTextEncoder(**cfg) + with torch.no_grad(): + class_embeds = text_encoder() + class_nums = len(get_classes('vaihingen')) + assert class_embeds.shape == (class_nums + 1, output_dims) diff --git a/tests/test_models/test_backbones/test_hrnet.py b/tests/test_models/test_backbones/test_hrnet.py index 8329c84312..3e35515390 100644 --- a/tests/test_models/test_backbones/test_hrnet.py +++ b/tests/test_models/test_backbones/test_hrnet.py @@ -1,7 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import pytest import torch -from mmcv.utils.parrots_wrapper import _BatchNorm +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm from mmseg.models.backbones.hrnet import HRModule, HRNet from mmseg.models.backbones.resnet import BasicBlock, Bottleneck diff --git a/tests/test_models/test_backbones/test_mae.py b/tests/test_models/test_backbones/test_mae.py index 562d067a7c..16f52b54b4 100644 --- a/tests/test_models/test_backbones/test_mae.py +++ b/tests/test_models/test_backbones/test_mae.py @@ -138,8 +138,11 @@ def test_mae_init(): } } model = MAE(img_size=(512, 512)) - with pytest.raises(AttributeError): - model.resize_rel_pos_embed(ckpt) + # If scipy is installed, this AttributeError would not be raised. + from mmengine.utils import is_installed + if not is_installed('scipy'): + with pytest.raises(AttributeError): + model.resize_rel_pos_embed(ckpt) # test resize abs pos embed ckpt = model.resize_abs_pos_embed(ckpt['state_dict']) diff --git a/tests/test_models/test_backbones/test_mscan.py b/tests/test_models/test_backbones/test_mscan.py new file mode 100644 index 0000000000..84dfb8e450 --- /dev/null +++ b/tests/test_models/test_backbones/test_mscan.py @@ -0,0 +1,69 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmseg.models.backbones import MSCAN +from mmseg.models.backbones.mscan import (MSCAAttention, MSCASpatialAttention, + OverlapPatchEmbed, StemConv) + + +def test_mscan_backbone(): + # Test MSCAN Standard Forward + model = MSCAN( + embed_dims=[8, 16, 32, 64], + norm_cfg=dict(type='BN', requires_grad=True)) + model.init_weights() + model.train() + batch_size = 2 + imgs = torch.randn(batch_size, 3, 64, 128) + feat = model(imgs) + + assert len(feat) == 4 + # outputs for the segmentation head + assert feat[0].shape == torch.Size([batch_size, 8, 16, 32]) + assert feat[1].shape == torch.Size([batch_size, 16, 8, 16]) + assert feat[2].shape == torch.Size([batch_size, 32, 4, 8]) + assert feat[3].shape == torch.Size([batch_size, 64, 2, 4]) + + # Test an input whose shape is not divisible by the downsampling strides + batch_size = 2 + imgs = torch.randn(batch_size, 3, 95, 27) + feat = model(imgs) + assert len(feat) == 4 + + +def test_mscan_overlap_patch_embed_module(): + x_overlap_patch_embed = OverlapPatchEmbed( + norm_cfg=dict(type='BN', requires_grad=True)) + assert x_overlap_patch_embed.proj.in_channels == 3 + assert x_overlap_patch_embed.norm.weight.shape == torch.Size([768]) + x = torch.randn(2, 3, 16, 32) + x_out, H, W = x_overlap_patch_embed(x) + assert x_out.shape == torch.Size([2, 32, 768]) + + +def test_mscan_spatial_attention_module(): + x_spatial_attention = MSCASpatialAttention(8) + assert x_spatial_attention.proj_1.kernel_size == (1, 1) + assert x_spatial_attention.proj_2.stride == (1, 1) + x = torch.randn(2, 8, 16, 32) + x_out = x_spatial_attention(x) + assert x_out.shape == torch.Size([2, 8, 16, 32]) + + +def test_mscan_attention_module(): + x_attention = MSCAAttention(8) + assert x_attention.conv0.weight.shape[0] == 8 + assert x_attention.conv3.kernel_size == (1, 1) + x = torch.randn(2, 8, 16, 32) + x_out = x_attention(x) + assert x_out.shape == torch.Size([2, 8, 16, 32]) + + +def test_mscan_stem_module(): + x_stem = StemConv(8, 8, norm_cfg=dict(type='BN', requires_grad=True)) + assert x_stem.proj[0].weight.shape[0] == 4 + assert x_stem.proj[-1].weight.shape[0] == 8 + x = torch.randn(2, 8, 16, 32) + x_out, H, W = x_stem(x) + assert x_out.shape == torch.Size([2, 32, 8]) + assert (H, W) == (4, 8) diff --git a/tests/test_models/test_backbones/test_pidnet.py 
b/tests/test_models/test_backbones/test_pidnet.py new file mode 100644 index 0000000000..208dfc7814 --- /dev/null +++ b/tests/test_models/test_backbones/test_pidnet.py @@ -0,0 +1,87 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import tempfile + +import torch +from mmengine.registry import init_default_scope + +from mmseg.registry import MODELS + +init_default_scope('mmseg') + + +def test_pidnet_backbone(): + # Test PIDNet Standard Forward + norm_cfg = dict(type='BN', requires_grad=True) + backbone_cfg = dict( + type='PIDNet', + in_channels=3, + channels=32, + ppm_channels=96, + num_stem_blocks=2, + num_branch_blocks=3, + align_corners=False, + norm_cfg=norm_cfg, + act_cfg=dict(type='ReLU', inplace=True)) + model = MODELS.build(backbone_cfg) + model.init_weights() + + # Test init weights + temp_file = tempfile.NamedTemporaryFile() + temp_file.close() + torch.save(model.state_dict(), temp_file.name) + backbone_cfg.update( + init_cfg=dict(type='Pretrained', checkpoint=temp_file.name)) + model = MODELS.build(backbone_cfg) + model.init_weights() + os.remove(temp_file.name) + + # Test eval mode + model.eval() + batch_size = 1 + imgs = torch.randn(batch_size, 3, 64, 128) + feats = model(imgs) + + assert type(feats) == torch.Tensor + assert feats.shape == torch.Size([batch_size, 128, 8, 16]) + + # Test train mode + model.train() + batch_size = 2 + imgs = torch.randn(batch_size, 3, 64, 128) + feats = model(imgs) + + assert len(feats) == 3 + # test output for P branch + assert feats[0].shape == torch.Size([batch_size, 64, 8, 16]) + # test output for I branch + assert feats[1].shape == torch.Size([batch_size, 128, 8, 16]) + # test output for D branch + assert feats[2].shape == torch.Size([batch_size, 64, 8, 16]) + + # Test pidnet-m + backbone_cfg.update(channels=64) + model = MODELS.build(backbone_cfg) + feats = model(imgs) + + assert len(feats) == 3 + # test output for P branch + assert feats[0].shape == torch.Size([batch_size, 128, 8, 16]) + # test output for I branch + assert feats[1].shape == torch.Size([batch_size, 256, 8, 16]) + # test output for D branch + assert feats[2].shape == torch.Size([batch_size, 128, 8, 16]) + + # Test pidnet-l + backbone_cfg.update( + channels=64, ppm_channels=112, num_stem_blocks=3, num_branch_blocks=4) + model = MODELS.build(backbone_cfg) + feats = model(imgs) + + assert len(feats) == 3 + # test output for P branch + assert feats[0].shape == torch.Size([batch_size, 128, 8, 16]) + # test output for I branch + assert feats[1].shape == torch.Size([batch_size, 256, 8, 16]) + # test output for D branch + assert feats[2].shape == torch.Size([batch_size, 128, 8, 16]) diff --git a/tests/test_models/test_backbones/test_resnet.py b/tests/test_models/test_backbones/test_resnet.py index fa632f5d83..f2f24ba568 100644 --- a/tests/test_models/test_backbones/test_resnet.py +++ b/tests/test_models/test_backbones/test_resnet.py @@ -2,7 +2,7 @@ import pytest import torch from mmcv.ops import DeformConv2dPack -from mmcv.utils.parrots_wrapper import _BatchNorm +from mmengine.utils.dl_utils.parrots_wrapper import _BatchNorm from torch.nn.modules import AvgPool2d, GroupNorm from mmseg.models.backbones import ResNet, ResNetV1d @@ -331,7 +331,7 @@ def test_resnet_backbone(): for param in layer.parameters(): assert param.requires_grad is False for i in range(1, frozen_stages + 1): - layer = getattr(model, 'layer{}'.format(i)) + layer = getattr(model, f'layer{i}') for mod in layer.modules(): if isinstance(mod, _BatchNorm): assert mod.training is False @@ -347,7 +347,7 @@ 
def test_resnet_backbone(): for param in model.stem.parameters(): assert param.requires_grad is False for i in range(1, frozen_stages + 1): - layer = getattr(model, 'layer{}'.format(i)) + layer = getattr(model, f'layer{i}') for mod in layer.modules(): if isinstance(mod, _BatchNorm): assert mod.training is False diff --git a/tests/test_models/test_backbones/test_timm_backbone.py b/tests/test_models/test_backbones/test_timm_backbone.py index 85ef9aa56f..d9a50cf526 100644 --- a/tests/test_models/test_backbones/test_timm_backbone.py +++ b/tests/test_models/test_backbones/test_timm_backbone.py @@ -27,7 +27,7 @@ def test_timm_backbone(): features_only=True, pretrained=False, output_stride=32, - norm_layer='SyncBN') + norm_layer='SyncBN2d') # Test resnet18 from timm, features_only=True, output_stride=32 model = TIMMBackbone( diff --git a/tests/test_models/test_backbones/test_unet.py b/tests/test_models/test_backbones/test_unet.py index 9beb7279a0..4d3faf68cc 100644 --- a/tests/test_models/test_backbones/test_unet.py +++ b/tests/test_models/test_backbones/test_unet.py @@ -2,12 +2,15 @@ import pytest import torch from mmcv.cnn import ConvModule +from mmengine.registry import init_default_scope from mmseg.models.backbones.unet import (BasicConvBlock, DeconvModule, InterpConv, UNet, UpConvBlock) -from mmseg.ops import Upsample +from mmseg.models.utils import Upsample from .utils import check_norm_state +init_default_scope('mmseg') + def test_unet_basic_conv_block(): with pytest.raises(AssertionError): diff --git a/tests/test_models/test_backbones/test_vpd.py b/tests/test_models/test_backbones/test_vpd.py new file mode 100644 index 0000000000..a268159155 --- /dev/null +++ b/tests/test_models/test_backbones/test_vpd.py @@ -0,0 +1,51 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
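+# Smoke tests for the VPD backbone: build it from the stable-diffusion base
+# config, then check the four multi-scale feature maps it returns, both
+# with and without a conditioning class id.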
+from os.path import dirname, join +from unittest import TestCase + +import torch +from mmengine import Config + +import mmseg +from mmseg.models.backbones import VPD + + +class TestVPD(TestCase): + + def setUp(self) -> None: + + repo_dpath = dirname(dirname(mmseg.__file__)) + config_dpath = join(repo_dpath, 'configs/_base_/models/vpd_sd.py') + vpd_cfg = Config.fromfile(config_dpath).stable_diffusion_cfg + vpd_cfg.pop('checkpoint') + + self.vpd_model = VPD( + diffusion_cfg=vpd_cfg, + class_embed_path='https://download.openmmlab.com/mmsegmentation/' + 'v0.5/vpd/nyu_class_embeddings.pth', + class_embed_select=True, + pad_shape=64, + unet_cfg=dict(use_attn=False), + ) + + def test_forward(self): + # test forward without class_id + x = torch.randn(1, 3, 60, 60) + with torch.no_grad(): + out = self.vpd_model(x) + + self.assertEqual(len(out), 4) + self.assertListEqual(list(out[0].shape), [1, 320, 8, 8]) + self.assertListEqual(list(out[1].shape), [1, 640, 4, 4]) + self.assertListEqual(list(out[2].shape), [1, 1280, 2, 2]) + self.assertListEqual(list(out[3].shape), [1, 1280, 1, 1]) + + # test forward with class_id + x = torch.randn(1, 3, 60, 60) + with torch.no_grad(): + out = self.vpd_model((x, torch.tensor([2]))) + + self.assertEqual(len(out), 4) + self.assertListEqual(list(out[0].shape), [1, 320, 8, 8]) + self.assertListEqual(list(out[1].shape), [1, 640, 4, 4]) + self.assertListEqual(list(out[2].shape), [1, 1280, 2, 2]) + self.assertListEqual(list(out[3].shape), [1, 1280, 1, 1]) diff --git a/tests/test_models/test_data_preprocessor.py b/tests/test_models/test_data_preprocessor.py index 4472e43673..d05eef1c7d 100644 --- a/tests/test_models/test_data_preprocessor.py +++ b/tests/test_models/test_data_preprocessor.py @@ -2,10 +2,10 @@ from unittest import TestCase import torch -from mmengine.data import PixelData +from mmengine.structures import PixelData -from mmseg.data import SegDataSample from mmseg.models import SegDataPreProcessor +from mmseg.structures import SegDataSample class TestSegDataPreProcessor(TestCase): @@ -37,9 +37,28 @@ def test_forward(self): processor = SegDataPreProcessor( mean=[0, 0, 0], std=[1, 1, 1], size=(20, 20)) data = { - 'inputs': torch.randint(0, 256, (3, 11, 10)), - 'data_sample': data_sample + 'inputs': [ + torch.randint(0, 256, (3, 11, 10)), + torch.randint(0, 256, (3, 11, 10)) + ], + 'data_samples': [data_sample, data_sample] } - inputs, data_samples = processor([data, data], training=True) - self.assertEqual(inputs.shape, (2, 3, 20, 20)) - self.assertEqual(len(data_samples), 2) + out = processor(data, training=True) + self.assertEqual(out['inputs'].shape, (2, 3, 20, 20)) + self.assertEqual(len(out['data_samples']), 2) + + # test predict with padding + processor = SegDataPreProcessor( + mean=[0, 0, 0], + std=[1, 1, 1], + size=(20, 20), + test_cfg=dict(size_divisor=15)) + data = { + 'inputs': [ + torch.randint(0, 256, (3, 11, 10)), + ], + 'data_samples': [data_sample] + } + out = processor(data, training=False) + self.assertEqual(out['inputs'].shape[2] % 15, 0) + self.assertEqual(out['inputs'].shape[3] % 15, 0) diff --git a/tests/test_models/test_forward.py b/tests/test_models/test_forward.py index 57b9d31b9f..bb3967f8dd 100644 --- a/tests/test_models/test_forward.py +++ b/tests/test_models/test_forward.py @@ -8,15 +8,15 @@ import pytest import torch import torch.nn as nn -from mmcv import is_list_of, is_tuple_of -from mmcv.cnn.utils import revert_sync_batchnorm -from mmengine.data import PixelData +from mmengine.model.utils import revert_sync_batchnorm +from 
mmengine.registry import init_default_scope +from mmengine.structures import PixelData +from mmengine.utils import is_list_of, is_tuple_of from torch import Tensor -from mmseg.data import SegDataSample -from mmseg.utils import register_all_modules +from mmseg.structures import SegDataSample -register_all_modules() +init_default_scope('mmseg') def _demo_mm_inputs(batch_size=2, image_shapes=(3, 32, 32), num_classes=5): @@ -34,14 +34,14 @@ def _demo_mm_inputs(batch_size=2, image_shapes=(3, 32, 32), num_classes=5): else: image_shapes = [image_shapes] * batch_size - packed_inputs = [] + inputs = [] + data_samples = [] for idx in range(batch_size): image_shape = image_shapes[idx] c, h, w = image_shape image = np.random.randint(0, 255, size=image_shape, dtype=np.uint8) - mm_inputs = dict() - mm_inputs['inputs'] = torch.from_numpy(image) + mm_input = torch.from_numpy(image) img_meta = { 'img_id': idx, @@ -62,10 +62,9 @@ def _demo_mm_inputs(batch_size=2, image_shapes=(3, 32, 32), num_classes=5): gt_semantic_seg = torch.LongTensor(gt_semantic_seg) gt_sem_seg_data = dict(data=gt_semantic_seg) data_sample.gt_sem_seg = PixelData(**gt_sem_seg_data) - mm_inputs['data_sample'] = data_sample - packed_inputs.append(mm_inputs) - - return packed_inputs + inputs.append(mm_input) + data_samples.append(data_sample) + return dict(inputs=inputs, data_samples=data_samples) def _get_config_directory(): @@ -85,7 +84,7 @@ def _get_config_directory(): def _get_config_module(fname): """Load a configuration as a python module.""" - from mmcv import Config + from mmengine import Config config_dpath = _get_config_directory() config_fpath = join(config_dpath, fname) config_mod = Config.fromfile(config_fpath) @@ -105,90 +104,59 @@ def _get_segmentor_cfg(fname): def test_pspnet_forward(): _test_encoder_decoder_forward( - 'pspnet/pspnet_r18-d8_512x1024_80k_cityscapes.py') + 'pspnet/pspnet_r18-d8_4xb2-80k_cityscapes-512x1024.py') def test_fcn_forward(): - _test_encoder_decoder_forward('fcn/fcn_r18-d8_512x1024_80k_cityscapes.py') + _test_encoder_decoder_forward( + 'fcn/fcn_r18-d8_4xb2-80k_cityscapes-512x1024.py') def test_deeplabv3_forward(): _test_encoder_decoder_forward( - 'deeplabv3/deeplabv3_r18-d8_512x1024_80k_cityscapes.py') + 'deeplabv3/deeplabv3_r18-d8_4xb2-80k_cityscapes-512x1024.py') def test_deeplabv3plus_forward(): _test_encoder_decoder_forward( - 'deeplabv3plus/deeplabv3plus_r18-d8_512x1024_80k_cityscapes.py') + 'deeplabv3plus/deeplabv3plus_r18-d8_4xb2-80k_cityscapes-512x1024.py') def test_gcnet_forward(): _test_encoder_decoder_forward( - 'gcnet/gcnet_r50-d8_512x1024_40k_cityscapes.py') - - -def test_ann_forward(): - _test_encoder_decoder_forward('ann/ann_r50-d8_512x1024_40k_cityscapes.py') + 'gcnet/gcnet_r50-d8_4xb2-40k_cityscapes-512x1024.py') def test_ccnet_forward(): if not torch.cuda.is_available(): pytest.skip('CCNet requires CUDA') _test_encoder_decoder_forward( - 'ccnet/ccnet_r50-d8_512x1024_40k_cityscapes.py') - - -def test_danet_forward(): - _test_encoder_decoder_forward( - 'danet/danet_r50-d8_512x1024_40k_cityscapes.py') - - -def test_nonlocal_net_forward(): - _test_encoder_decoder_forward( - 'nonlocal_net/nonlocal_r50-d8_512x1024_40k_cityscapes.py') + 'ccnet/ccnet_r50-d8_4xb2-40k_cityscapes-512x1024.py') def test_upernet_forward(): _test_encoder_decoder_forward( - 'upernet/upernet_r50_512x1024_40k_cityscapes.py') + 'upernet/upernet_r50_4xb2-40k_cityscapes-512x1024.py') def test_hrnet_forward(): - _test_encoder_decoder_forward('hrnet/fcn_hr18s_512x1024_40k_cityscapes.py') - - -def test_ocrnet_forward(): 
_test_encoder_decoder_forward( - 'ocrnet/ocrnet_hr18s_512x1024_40k_cityscapes.py') + 'hrnet/fcn_hr18s_4xb2-40k_cityscapes-512x1024.py') -def test_psanet_forward(): +def test_ocrnet_forward(): _test_encoder_decoder_forward( - 'psanet/psanet_r50-d8_512x1024_40k_cityscapes.py') + 'ocrnet/ocrnet_hr18s_4xb2-40k_cityscapes-512x1024.py') def test_sem_fpn_forward(): - _test_encoder_decoder_forward('sem_fpn/fpn_r50_512x1024_80k_cityscapes.py') - - -def test_mobilenet_v2_forward(): - _test_encoder_decoder_forward( - 'mobilenet_v2/pspnet_m-v2-d8_512x1024_80k_cityscapes.py') - - -def test_dnlnet_forward(): _test_encoder_decoder_forward( - 'dnlnet/dnl_r50-d8_512x1024_40k_cityscapes.py') + 'sem_fpn/fpn_r50_4xb2-80k_cityscapes-512x1024.py') -def test_emanet_forward(): - _test_encoder_decoder_forward( - 'emanet/emanet_r50-d8_512x1024_80k_cityscapes.py') - - -def test_isanet_forward(): +def test_mobilenet_v2_forward(): _test_encoder_decoder_forward( - 'isanet/isanet_r50-d8_512x1024_40k_cityscapes.py') + 'mobilenet_v2/mobilenet-v2-d8_pspnet_4xb2-80k_cityscapes-512x1024.py') def get_world_size(process_group): @@ -218,7 +186,7 @@ def _test_encoder_decoder_forward(cfg_file): num_classes = segmentor.decode_head.num_classes # batch_size=2 for BatchNorm packed_inputs = _demo_mm_inputs( - batch_size=2, image_shapes=(3, 32, 32), num_classes=num_classes) + batch_size=2, image_shapes=(3, 4, 4), num_classes=num_classes) # convert to cuda Tensor if applicable if torch.cuda.is_available(): segmentor = segmentor.cuda() @@ -226,27 +194,36 @@ def _test_encoder_decoder_forward(cfg_file): segmentor = revert_sync_batchnorm(segmentor) # Test forward train - batch_inputs, data_samples = segmentor.data_preprocessor( - packed_inputs, True) - losses = segmentor.forward(batch_inputs, data_samples, mode='loss') + data = segmentor.data_preprocessor(packed_inputs, True) + losses = segmentor.forward(**data, mode='loss') assert isinstance(losses, dict) packed_inputs = _demo_mm_inputs( batch_size=1, image_shapes=(3, 32, 32), num_classes=num_classes) - batch_inputs, data_samples = segmentor.data_preprocessor( - packed_inputs, False) + data = segmentor.data_preprocessor(packed_inputs, False) with torch.no_grad(): segmentor.eval() # Test forward predict - batch_results = segmentor.forward( - batch_inputs, data_samples, mode='predict') + batch_results = segmentor.forward(**data, mode='predict') assert len(batch_results) == 1 assert is_list_of(batch_results, SegDataSample) assert batch_results[0].pred_sem_seg.shape == (32, 32) assert batch_results[0].seg_logits.data.shape == (num_classes, 32, 32) + assert batch_results[0].gt_sem_seg.shape == (32, 32) # Test forward tensor - batch_results = segmentor.forward( - batch_inputs, data_samples, mode='tensor') + batch_results = segmentor.forward(**data, mode='tensor') + assert isinstance(batch_results, Tensor) or is_tuple_of( + batch_results, Tensor) + + # Test forward predict without ground truth + data.pop('data_samples') + batch_results = segmentor.forward(**data, mode='predict') + assert len(batch_results) == 1 + assert is_list_of(batch_results, SegDataSample) + assert batch_results[0].pred_sem_seg.shape == (32, 32) + + # Test forward tensor without ground truth + batch_results = segmentor.forward(**data, mode='tensor') assert isinstance(batch_results, Tensor) or is_tuple_of( batch_results, Tensor) diff --git a/tests/test_models/test_heads/test_decode_head.py b/tests/test_models/test_heads/test_decode_head.py new file mode 100644 index 0000000000..88e6bed10f --- /dev/null +++ 
b/tests/test_models/test_heads/test_decode_head.py @@ -0,0 +1,193 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest.mock import patch + +import pytest +import torch +from mmengine.structures import PixelData + +from mmseg.models.decode_heads.decode_head import BaseDecodeHead +from mmseg.structures import SegDataSample +from .utils import to_cuda + + +@patch.multiple(BaseDecodeHead, __abstractmethods__=set()) +def test_decode_head(): + + with pytest.raises(AssertionError): + # default input_transform doesn't accept multiple inputs + BaseDecodeHead([32, 16], 16, num_classes=19) + + with pytest.raises(AssertionError): + # default input_transform doesn't accept multiple in_index + BaseDecodeHead(32, 16, num_classes=19, in_index=[-1, -2]) + + with pytest.raises(AssertionError): + # input_transform only supports 'resize_concat' and 'multiple_select' + BaseDecodeHead(32, 16, num_classes=19, input_transform='concat') + + with pytest.raises(AssertionError): + # in_channels should be list|tuple + BaseDecodeHead(32, 16, num_classes=19, input_transform='resize_concat') + + with pytest.raises(AssertionError): + # in_index should be list|tuple + BaseDecodeHead([32], + 16, + in_index=-1, + num_classes=19, + input_transform='resize_concat') + + with pytest.raises(AssertionError): + # len(in_index) should equal len(in_channels) + BaseDecodeHead([32, 16], + 16, + num_classes=19, + in_index=[-1], + input_transform='resize_concat') + + with pytest.raises(ValueError): + # out_channels should be equal to num_classes + BaseDecodeHead(32, 16, num_classes=19, out_channels=18) + + # test out_channels + head = BaseDecodeHead(32, 16, num_classes=2) + assert head.out_channels == 2 + + # test out_channels == 1 and num_classes == 2 + head = BaseDecodeHead(32, 16, num_classes=2, out_channels=1) + assert head.out_channels == 1 and head.num_classes == 2 + + # test default dropout + head = BaseDecodeHead(32, 16, num_classes=19) + assert hasattr(head, 'dropout') and head.dropout.p == 0.1 + + # test set dropout + head = BaseDecodeHead(32, 16, num_classes=19, dropout_ratio=0.2) + assert hasattr(head, 'dropout') and head.dropout.p == 0.2 + + # test no input_transform + inputs = [torch.randn(1, 32, 45, 45)] + head = BaseDecodeHead(32, 16, num_classes=19) + if torch.cuda.is_available(): + head, inputs = to_cuda(head, inputs) + assert head.in_channels == 32 + assert head.input_transform is None + transformed_inputs = head._transform_inputs(inputs) + assert transformed_inputs.shape == (1, 32, 45, 45) + + # test input_transform = resize_concat + inputs = [torch.randn(1, 32, 45, 45), torch.randn(1, 16, 21, 21)] + head = BaseDecodeHead([32, 16], + 16, + num_classes=19, + in_index=[0, 1], + input_transform='resize_concat') + if torch.cuda.is_available(): + head, inputs = to_cuda(head, inputs) + assert head.in_channels == 48 + assert head.input_transform == 'resize_concat' + transformed_inputs = head._transform_inputs(inputs) + assert transformed_inputs.shape == (1, 48, 45, 45) + + # test multi-loss, loss_decode is dict + with pytest.raises(TypeError): + # loss_decode must be a dict or sequence of dict. 
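+ # (a list of bare strings is neither, so construction raises TypeError)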
+ BaseDecodeHead(3, 16, num_classes=19, loss_decode=['CrossEntropyLoss']) + + inputs = torch.randn(2, 19, 8, 8).float() + data_samples = [ + SegDataSample(gt_sem_seg=PixelData(data=torch.ones(64, 64).long())) + for _ in range(2) + ] + + head = BaseDecodeHead( + 3, + 16, + num_classes=19, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0)) + if torch.cuda.is_available(): + head, inputs = to_cuda(head, inputs) + loss = head.loss_by_feat( + seg_logits=inputs, batch_data_samples=data_samples) + assert 'loss_ce' in loss + + # test multi-loss, loss_decode is list of dict + inputs = torch.randn(2, 19, 8, 8).float() + data_samples = [ + SegDataSample(gt_sem_seg=PixelData(data=torch.ones(64, 64).long())) + for _ in range(2) + ] + head = BaseDecodeHead( + 3, + 16, + num_classes=19, + loss_decode=[ + dict(type='CrossEntropyLoss', loss_name='loss_1'), + dict(type='CrossEntropyLoss', loss_name='loss_2') + ]) + if torch.cuda.is_available(): + head, inputs = to_cuda(head, inputs) + + loss = head.loss_by_feat( + seg_logits=inputs, batch_data_samples=data_samples) + assert 'loss_1' in loss + assert 'loss_2' in loss + + # 'loss_decode' must be a dict or sequence of dict + with pytest.raises(TypeError): + BaseDecodeHead(3, 16, num_classes=19, loss_decode=['CrossEntropyLoss']) + with pytest.raises(TypeError): + BaseDecodeHead(3, 16, num_classes=19, loss_decode=0) + + # test multi-loss, loss_decode is list of dict + inputs = torch.randn(2, 19, 8, 8).float() + data_samples = [ + SegDataSample(gt_sem_seg=PixelData(data=torch.ones(64, 64).long())) + for _ in range(2) + ] + head = BaseDecodeHead( + 3, + 16, + num_classes=19, + loss_decode=(dict(type='CrossEntropyLoss', loss_name='loss_1'), + dict(type='CrossEntropyLoss', loss_name='loss_2'), + dict(type='CrossEntropyLoss', loss_name='loss_3'))) + if torch.cuda.is_available(): + head, inputs = to_cuda(head, inputs) + loss = head.loss_by_feat( + seg_logits=inputs, batch_data_samples=data_samples) + assert 'loss_1' in loss + assert 'loss_2' in loss + assert 'loss_3' in loss + + # test multi-loss, loss_decode is list of dict, names of them are identical + inputs = torch.randn(2, 19, 8, 8).float() + data_samples = [ + SegDataSample(gt_sem_seg=PixelData(data=torch.ones(64, 64).long())) + for _ in range(2) + ] + head = BaseDecodeHead( + 3, + 16, + num_classes=19, + loss_decode=(dict(type='CrossEntropyLoss', loss_name='loss_ce'), + dict(type='CrossEntropyLoss', loss_name='loss_ce'), + dict(type='CrossEntropyLoss', loss_name='loss_ce'))) + if torch.cuda.is_available(): + head, inputs = to_cuda(head, inputs) + loss_3 = head.loss_by_feat( + seg_logits=inputs, batch_data_samples=data_samples) + + head = BaseDecodeHead( + 3, + 16, + num_classes=19, + loss_decode=(dict(type='CrossEntropyLoss', loss_name='loss_ce'))) + if torch.cuda.is_available(): + head, inputs = to_cuda(head, inputs) + loss = head.loss_by_feat( + seg_logits=inputs, batch_data_samples=data_samples) + assert 'loss_ce' in loss + assert 'loss_ce' in loss_3 + assert loss_3['loss_ce'] == 3 * loss['loss_ce'] diff --git a/tests/test_models/test_heads/test_fcn_head.py b/tests/test_models/test_heads/test_fcn_head.py index 4e633fba48..664b543e07 100644 --- a/tests/test_models/test_heads/test_fcn_head.py +++ b/tests/test_models/test_heads/test_fcn_head.py @@ -2,7 +2,7 @@ import pytest import torch from mmcv.cnn import ConvModule, DepthwiseSeparableConvModule -from mmcv.utils.parrots_wrapper import SyncBatchNorm +from mmengine.utils.dl_utils.parrots_wrapper import SyncBatchNorm from 
mmseg.models.decode_heads import DepthwiseSeparableFCNHead, FCNHead from .utils import to_cuda diff --git a/tests/test_models/test_heads/test_ham_head.py b/tests/test_models/test_heads/test_ham_head.py new file mode 100644 index 0000000000..f802d2d8db --- /dev/null +++ b/tests/test_models/test_heads/test_ham_head.py @@ -0,0 +1,44 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmseg.models.decode_heads import LightHamHead +from .utils import _conv_has_norm, to_cuda + +ham_norm_cfg = dict(type='GN', num_groups=32, requires_grad=True) + + +def test_ham_head(): + + # test without sync_bn + head = LightHamHead( + in_channels=[16, 32, 64], + in_index=[1, 2, 3], + channels=64, + ham_channels=64, + dropout_ratio=0.1, + num_classes=19, + norm_cfg=ham_norm_cfg, + align_corners=False, + loss_decode=dict( + type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0), + ham_kwargs=dict( + MD_S=1, + MD_R=64, + train_steps=6, + eval_steps=7, + inv_t=100, + rand_init=True)) + assert not _conv_has_norm(head, sync_bn=False) + + inputs = [ + torch.randn(1, 8, 32, 32), + torch.randn(1, 16, 16, 16), + torch.randn(1, 32, 8, 8), + torch.randn(1, 64, 4, 4) + ] + if torch.cuda.is_available(): + head, inputs = to_cuda(head, inputs) + assert head.in_channels == [16, 32, 64] + assert head.hamburger.ham_in.in_channels == 64 + outputs = head(inputs) + assert outputs.shape == (1, head.num_classes, 16, 16) diff --git a/tests/test_models/test_heads/test_mask2former_head.py b/tests/test_models/test_heads/test_mask2former_head.py new file mode 100644 index 0000000000..45b353d441 --- /dev/null +++ b/tests/test_models/test_heads/test_mask2former_head.py @@ -0,0 +1,153 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmengine import Config +from mmengine.structures import PixelData + +from mmseg.models.decode_heads import Mask2FormerHead +from mmseg.structures import SegDataSample +from mmseg.utils import SampleList +from .utils import to_cuda + + +def test_mask2former_head(): + num_classes = 19 + cfg = dict( + in_channels=[96, 192, 384, 768], + strides=[4, 8, 16, 32], + feat_channels=256, + out_channels=256, + num_classes=num_classes, + num_queries=100, + num_transformer_feat_level=3, + align_corners=False, + pixel_decoder=dict( + type='mmdet.MSDeformAttnPixelDecoder', + num_outs=3, + norm_cfg=dict(type='GN', num_groups=32), + act_cfg=dict(type='ReLU'), + encoder=dict( # DeformableDetrTransformerEncoder + num_layers=6, + layer_cfg=dict( # DeformableDetrTransformerEncoderLayer + self_attn_cfg=dict( # MultiScaleDeformableAttention + embed_dims=256, + num_heads=8, + num_levels=3, + num_points=4, + im2col_step=64, + dropout=0.0, + batch_first=True, + norm_cfg=None, + init_cfg=None), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=1024, + num_fcs=2, + ffn_drop=0.0, + act_cfg=dict(type='ReLU', inplace=True))), + init_cfg=None), + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True), + init_cfg=None), + enforce_decoder_input_project=False, + positional_encoding=dict( # SinePositionalEncoding + num_feats=128, normalize=True), + transformer_decoder=dict( # Mask2FormerTransformerDecoder + return_intermediate=True, + num_layers=9, + layer_cfg=dict( # Mask2FormerTransformerDecoderLayer + self_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + dropout_layer=None, + batch_first=True), + cross_attn_cfg=dict( # MultiheadAttention + embed_dims=256, + num_heads=8, + attn_drop=0.0, + proj_drop=0.0, + 
dropout_layer=None, + batch_first=True), + ffn_cfg=dict( + embed_dims=256, + feedforward_channels=2048, + num_fcs=2, + act_cfg=dict(type='ReLU', inplace=True), + ffn_drop=0.0, + dropout_layer=None, + add_identity=True)), + init_cfg=None), + loss_cls=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=False, + loss_weight=2.0, + reduction='mean', + class_weight=[1.0] * num_classes + [0.1]), + loss_mask=dict( + type='mmdet.CrossEntropyLoss', + use_sigmoid=True, + reduction='mean', + loss_weight=5.0), + loss_dice=dict( + type='mmdet.DiceLoss', + use_sigmoid=True, + activate=True, + reduction='mean', + naive_dice=True, + eps=1.0, + loss_weight=5.0), + train_cfg=dict( + num_points=12544, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type='mmdet.HungarianAssigner', + match_costs=[ + dict(type='mmdet.ClassificationCost', weight=2.0), + dict( + type='mmdet.CrossEntropyLossCost', + weight=5.0, + use_sigmoid=True), + dict( + type='mmdet.DiceCost', + weight=5.0, + pred_act=True, + eps=1.0) + ]), + sampler=dict(type='mmdet.MaskPseudoSampler'))) + cfg = Config(cfg) + head = Mask2FormerHead(**cfg) + + inputs = [ + torch.rand((2, 96, 8, 8)), + torch.rand((2, 192, 4, 4)), + torch.rand((2, 384, 2, 2)), + torch.rand((2, 768, 1, 1)) + ] + + data_samples: SampleList = [] + for i in range(2): + data_sample = SegDataSample() + img_meta = {} + img_meta['img_shape'] = (32, 32) + img_meta['ori_shape'] = (32, 32) + data_sample.gt_sem_seg = PixelData( + data=torch.randint(0, num_classes, (1, 32, 32))) + data_sample.set_metainfo(img_meta) + data_samples.append(data_sample) + + if torch.cuda.is_available(): + head, inputs = to_cuda(head, inputs) + for data_sample in data_samples: + data_sample.gt_sem_seg.data = data_sample.gt_sem_seg.data.cuda() + + loss_dict = head.loss(inputs, data_samples, None) + assert isinstance(loss_dict, dict) + + batch_img_metas = [] + for data_sample in data_samples: + batch_img_metas.append(data_sample.metainfo) + + seg_logits = head.predict(inputs, batch_img_metas, None) + assert seg_logits.shape == torch.Size((2, num_classes, 32, 32)) diff --git a/tests/test_models/test_heads/test_maskformer_head.py b/tests/test_models/test_heads/test_maskformer_head.py new file mode 100644 index 0000000000..6a47239b03 --- /dev/null +++ b/tests/test_models/test_heads/test_maskformer_head.py @@ -0,0 +1,54 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
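The Mask2Former head test above, like most of the new head tests in this patch, fabricates dense ground truth by pairing a SegDataSample with PixelData and the image meta information. A minimal sketch of that recurring pattern, assuming only the APIs these tests already import (make_data_samples is a hypothetical helper, not part of the patch):

import torch
from mmengine.structures import PixelData
from mmseg.structures import SegDataSample


def make_data_samples(batch_size, num_classes, shape=(32, 32)):
    samples = []
    for _ in range(batch_size):
        sample = SegDataSample()
        # random dense labels in [0, num_classes), shaped (1, H, W)
        sample.gt_sem_seg = PixelData(
            data=torch.randint(0, num_classes, (1, *shape)))
        sample.set_metainfo(dict(img_shape=shape, ori_shape=shape))
        samples.append(sample)
    return samples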
+from os.path import dirname, join
+
+import torch
+from mmengine import Config
+from mmengine.registry import init_default_scope
+from mmengine.structures import PixelData
+
+from mmseg.registry import MODELS
+from mmseg.structures import SegDataSample
+
+
+def test_maskformer_head():
+    init_default_scope('mmseg')
+    repo_dpath = dirname(dirname(__file__))
+    cfg = Config.fromfile(
+        join(
+            repo_dpath,
+            '../../configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py'  # noqa
+        ))
+    cfg.model.train_cfg = None
+    decode_head = MODELS.build(cfg.model.decode_head)
+    inputs = (torch.randn(1, 256, 32, 32), torch.randn(1, 512, 16, 16),
+              torch.randn(1, 1024, 8, 8), torch.randn(1, 2048, 4, 4))
+    # test inference
+    batch_img_metas = [
+        dict(
+            scale_factor=(1.0, 1.0),
+            img_shape=(512, 683),
+            ori_shape=(512, 683))
+    ]
+    test_cfg = dict(mode='whole')
+    output = decode_head.predict(inputs, batch_img_metas, test_cfg)
+    assert output.shape == (1, 150, 512, 683)
+
+    # test training
+    inputs = (torch.randn(2, 256, 32, 32), torch.randn(2, 512, 16, 16),
+              torch.randn(2, 1024, 8, 8), torch.randn(2, 2048, 4, 4))
+    batch_data_samples = []
+    img_meta = {
+        'img_shape': (512, 512),
+        'ori_shape': (480, 640),
+        'pad_shape': (512, 512),
+        'scale_factor': (1.425, 1.425),
+    }
+    for _ in range(2):
+        data_sample = SegDataSample(
+            gt_sem_seg=PixelData(data=torch.ones(512, 512).long()))
+        data_sample.set_metainfo(img_meta)
+        batch_data_samples.append(data_sample)
+    train_cfg = {}
+    losses = decode_head.loss(inputs, batch_data_samples, train_cfg)
+    assert all(loss in losses.keys()
+               for loss in ('loss_cls', 'loss_mask', 'loss_dice'))
diff --git a/tests/test_models/test_heads/test_pidnet_head.py b/tests/test_models/test_heads/test_pidnet_head.py
new file mode 100644
index 0000000000..a6247371c5
--- /dev/null
+++ b/tests/test_models/test_heads/test_pidnet_head.py
@@ -0,0 +1,89 @@
+# Copyright (c) OpenMMLab. All rights reserved.
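The MaskFormer head test above loads a full model config from the repository and builds only its decode head through the registry. A hedged sketch of that build path in isolation (the helper name and the way it is called are illustrative, not part of the patch):

from mmengine import Config
from mmengine.registry import init_default_scope

from mmseg.registry import MODELS


def build_head_from_config(config_path):
    # resolve registry type names such as 'MaskFormerHead' in the mmseg scope
    init_default_scope('mmseg')
    cfg = Config.fromfile(config_path)
    # head unit tests run without a Runner, so drop the training config
    cfg.model.train_cfg = None
    return MODELS.build(cfg.model.decode_head)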
+import torch +from mmengine.registry import init_default_scope + +from mmseg.registry import MODELS + + +def test_pidnet_head(): + init_default_scope('mmseg') + + # Test PIDNet decode head Standard Forward + norm_cfg = dict(type='BN', requires_grad=True) + backbone_cfg = dict( + type='PIDNet', + in_channels=3, + channels=32, + ppm_channels=96, + num_stem_blocks=2, + num_branch_blocks=3, + align_corners=False, + norm_cfg=norm_cfg, + act_cfg=dict(type='ReLU', inplace=True)) + decode_head_cfg = dict( + type='PIDHead', + in_channels=128, + channels=128, + num_classes=19, + norm_cfg=norm_cfg, + act_cfg=dict(type='ReLU', inplace=True), + align_corners=True, + loss_decode=[ + dict( + type='CrossEntropyLoss', + use_sigmoid=False, + class_weight=[ + 0.8373, 0.918, 0.866, 1.0345, 1.0166, 0.9969, 0.9754, + 1.0489, 0.8786, 1.0023, 0.9539, 0.9843, 1.1116, 0.9037, + 1.0865, 1.0955, 1.0865, 1.1529, 1.0507 + ], + loss_weight=0.4), + dict( + type='OhemCrossEntropy', + thres=0.9, + min_kept=131072, + class_weight=[ + 0.8373, 0.918, 0.866, 1.0345, 1.0166, 0.9969, 0.9754, + 1.0489, 0.8786, 1.0023, 0.9539, 0.9843, 1.1116, 0.9037, + 1.0865, 1.0955, 1.0865, 1.1529, 1.0507 + ], + loss_weight=1.0), + dict(type='BoundaryLoss', loss_weight=20.0), + dict( + type='OhemCrossEntropy', + thres=0.9, + min_kept=131072, + class_weight=[ + 0.8373, 0.918, 0.866, 1.0345, 1.0166, 0.9969, 0.9754, + 1.0489, 0.8786, 1.0023, 0.9539, 0.9843, 1.1116, 0.9037, + 1.0865, 1.0955, 1.0865, 1.1529, 1.0507 + ], + loss_weight=1.0) + ]) + backbone = MODELS.build(backbone_cfg) + head = MODELS.build(decode_head_cfg) + + # Test train mode + backbone.train() + head.train() + batch_size = 2 + imgs = torch.randn(batch_size, 3, 64, 128) + feats = backbone(imgs) + seg_logit = head(feats) + + assert isinstance(seg_logit, tuple) + assert len(seg_logit) == 3 + + p_logits, i_logits, d_logits = seg_logit + assert p_logits.shape == (batch_size, 19, 8, 16) + assert i_logits.shape == (batch_size, 19, 8, 16) + assert d_logits.shape == (batch_size, 1, 8, 16) + + # Test eval mode + backbone.eval() + head.eval() + feats = backbone(imgs) + seg_logit = head(feats) + + assert isinstance(seg_logit, torch.Tensor) + assert seg_logit.shape == (batch_size, 19, 8, 16) diff --git a/tests/test_models/test_heads/test_san_head.py b/tests/test_models/test_heads/test_san_head.py new file mode 100644 index 0000000000..af85a6e2ca --- /dev/null +++ b/tests/test_models/test_heads/test_san_head.py @@ -0,0 +1,126 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
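The PIDNet assertions below encode two contracts: logits come out at 1/8 of the input resolution (64x128 in, 8x16 out), and the head returns a (P, I, D) tuple in train mode but a single tensor in eval mode. A small sketch of how those checks could be factored into a helper (hypothetical, assuming exactly the behavior asserted in the test):

def check_pid_outputs(seg_logit, batch_size, num_classes, h, w, training):
    if training:
        # three branch outputs at 1/8 resolution
        p_logits, i_logits, d_logits = seg_logit
        assert p_logits.shape == (batch_size, num_classes, h // 8, w // 8)
        assert i_logits.shape == (batch_size, num_classes, h // 8, w // 8)
        # the D branch predicts a single-channel boundary map
        assert d_logits.shape == (batch_size, 1, h // 8, w // 8)
    else:
        # eval mode collapses to one semantic prediction
        assert seg_logit.shape == (batch_size, num_classes, h // 8, w // 8)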
+import torch +from mmengine import Config +from mmengine.structures import PixelData + +from mmseg.models.decode_heads import SideAdapterCLIPHead +from mmseg.structures import SegDataSample +from .utils import list_to_cuda + + +def test_san_head(): + H, W = (64, 64) + clip_channels = 64 + img_channels = 4 + num_queries = 40 + out_dims = 64 + num_classes = 19 + cfg = dict( + num_classes=num_classes, + deep_supervision_idxs=[4], + san_cfg=dict( + in_channels=img_channels, + embed_dims=128, + clip_channels=clip_channels, + num_queries=num_queries, + cfg_encoder=dict(num_encode_layer=4, mlp_ratio=2, num_heads=2), + cfg_decoder=dict( + num_heads=4, + num_layers=1, + embed_channels=32, + mlp_channels=32, + num_mlp=2, + rescale=True)), + maskgen_cfg=dict( + sos_token_num=num_queries, + embed_dims=clip_channels, + out_dims=out_dims, + num_heads=4, + mlp_ratio=2), + train_cfg=dict( + num_points=100, + oversample_ratio=3.0, + importance_sample_ratio=0.75, + assigner=dict( + type='HungarianAssigner', + match_costs=[ + dict(type='ClassificationCost', weight=2.0), + dict( + type='CrossEntropyLossCost', + weight=5.0, + use_sigmoid=True), + dict(type='DiceCost', weight=5.0, pred_act=True, eps=1.0) + ])), + loss_decode=[ + dict( + type='CrossEntropyLoss', + loss_name='loss_cls_ce', + loss_weight=2.0, + class_weight=[1.0] * num_classes + [0.1]), + dict( + type='CrossEntropyLoss', + use_sigmoid=True, + loss_name='loss_mask_ce', + loss_weight=5.0), + dict( + type='DiceLoss', + ignore_index=None, + naive_dice=True, + eps=1, + loss_name='loss_mask_dice', + loss_weight=5.0) + ]) + + cfg = Config(cfg) + head = SideAdapterCLIPHead(**cfg) + + inputs = torch.rand((2, img_channels, H, W)) + clip_feature = [[ + torch.rand((2, clip_channels, H // 2, W // 2)), + torch.rand((2, clip_channels)) + ], + [ + torch.rand((2, clip_channels, H // 2, W // 2)), + torch.rand((2, clip_channels)) + ], + [ + torch.rand((2, clip_channels, H // 2, W // 2)), + torch.rand((2, clip_channels)) + ], + [ + torch.rand((2, clip_channels, H // 2, W // 2)), + torch.rand((2, clip_channels)) + ]] + class_embed = torch.rand((num_classes + 1, out_dims)) + + data_samples = [] + for i in range(2): + data_sample = SegDataSample() + img_meta = {} + img_meta['img_shape'] = (H, W) + img_meta['ori_shape'] = (H, W) + data_sample.gt_sem_seg = PixelData( + data=torch.randint(0, num_classes, (1, H, W))) + data_sample.set_metainfo(img_meta) + data_samples.append(data_sample) + + batch_img_metas = [] + for data_sample in data_samples: + batch_img_metas.append(data_sample.metainfo) + + if torch.cuda.is_available(): + head = head.cuda() + data = list_to_cuda([inputs, clip_feature, class_embed]) + for data_sample in data_samples: + data_sample.gt_sem_seg.data = data_sample.gt_sem_seg.data.cuda() + else: + data = [inputs, clip_feature, class_embed] + + # loss test + loss_dict = head.loss(data, data_samples, None) + assert isinstance(loss_dict, dict) + + # prediction test + with torch.no_grad(): + seg_logits = head.predict(data, batch_img_metas, None) + assert seg_logits.shape == torch.Size((2, num_classes, H, W)) diff --git a/tests/test_models/test_heads/test_vpd_depth_head.py b/tests/test_models/test_heads/test_vpd_depth_head.py new file mode 100644 index 0000000000..e3a4f7558e --- /dev/null +++ b/tests/test_models/test_heads/test_vpd_depth_head.py @@ -0,0 +1,50 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
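The nested clip_feature list in the SAN test above is why this patch later adds a recursive list_to_cuda to the head-test utilities. A device-agnostic sketch of the same idea (move_to_device is a hypothetical generalization, not the patch's helper):

import torch


def move_to_device(data, device):
    # walk arbitrarily nested lists and move each tensor leaf
    if isinstance(data, list):
        return [move_to_device(item, device) for item in data]
    return data.to(device)

A call like move_to_device([inputs, clip_feature, class_embed], torch.device('cuda')) would mirror what the test does with list_to_cuda.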
+from unittest import TestCase + +import torch +from mmengine.structures import PixelData + +from mmseg.models.decode_heads import VPDDepthHead +from mmseg.structures import SegDataSample + + +class TestVPDDepthHead(TestCase): + + def setUp(self): + """Set up common resources.""" + self.in_channels = [320, 640, 1280, 1280] + self.max_depth = 10.0 + self.loss_decode = dict( + type='SiLogLoss' + ) # Replace with your actual loss type and parameters + self.vpd_depth_head = VPDDepthHead( + max_depth=self.max_depth, + in_channels=self.in_channels, + loss_decode=self.loss_decode) + + def test_forward(self): + """Test the forward method.""" + # Create a mock input tensor. Replace shape as per your needs. + x = [ + torch.randn(1, 320, 32, 32), + torch.randn(1, 640, 16, 16), + torch.randn(1, 1280, 8, 8), + torch.randn(1, 1280, 4, 4) + ] + + output = self.vpd_depth_head.forward(x) + print(output.shape) + + self.assertEqual(output.shape, (1, 1, 256, 256)) + + def test_loss_by_feat(self): + """Test the loss_by_feat method.""" + # Create mock data for `pred_depth_map` and `batch_data_samples`. + pred_depth_map = torch.randn(1, 1, 32, 32) + gt_depth_map = PixelData(data=torch.rand(1, 32, 32)) + batch_data_samples = [SegDataSample(gt_depth_map=gt_depth_map)] + + loss = self.vpd_depth_head.loss_by_feat(pred_depth_map, + batch_data_samples) + + self.assertIsNotNone(loss) diff --git a/tests/test_models/test_heads/utils.py b/tests/test_models/test_heads/utils.py index 675241c21c..7282340155 100644 --- a/tests/test_models/test_heads/utils.py +++ b/tests/test_models/test_heads/utils.py @@ -1,6 +1,6 @@ # Copyright (c) OpenMMLab. All rights reserved. from mmcv.cnn import ConvModule -from mmcv.utils.parrots_wrapper import SyncBatchNorm +from mmengine.utils.dl_utils.parrots_wrapper import SyncBatchNorm def _conv_has_norm(module, sync_bn): @@ -20,3 +20,12 @@ def to_cuda(module, data): for i in range(len(data)): data[i] = data[i].cuda() return module, data + + +def list_to_cuda(data): + if isinstance(data, list): + for i in range(len(data)): + data[i] = list_to_cuda(data[i]) + return data + else: + return data.cuda() diff --git a/tests/test_models/test_losses/test_cross_entropy_loss.py b/tests/test_models/test_losses/test_cross_entropy_loss.py new file mode 100644 index 0000000000..8c6b86d014 --- /dev/null +++ b/tests/test_models/test_losses/test_cross_entropy_loss.py @@ -0,0 +1,28 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +import torch.nn.functional as F + +from mmseg.models.losses import CrossEntropyLoss, weight_reduce_loss + + +def test_cross_entropy_loss_class_weights(): + loss_class = CrossEntropyLoss + pred = torch.rand((1, 10, 4, 4)) + target = torch.randint(0, 10, (1, 4, 4)) + class_weight = torch.ones(10) + avg_factor = target.numel() + + cross_entropy_loss = F.cross_entropy( + pred, target, weight=class_weight, reduction='none', ignore_index=-100) + + expected_loss = weight_reduce_loss( + cross_entropy_loss, + weight=None, + reduction='mean', + avg_factor=avg_factor) + + # Test loss forward + loss = loss_class(class_weight=class_weight.tolist())(pred, target) + + assert isinstance(loss, torch.Tensor) + assert expected_loss == loss diff --git a/tests/test_models/test_losses/test_dice_loss.py b/tests/test_models/test_losses/test_dice_loss.py new file mode 100644 index 0000000000..34253dae12 --- /dev/null +++ b/tests/test_models/test_losses/test_dice_loss.py @@ -0,0 +1,96 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
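test_cross_entropy_loss_class_weights above checks the mmseg wrapper against F.cross_entropy followed by weight_reduce_loss. A simplified re-derivation of the reduction semantics that comparison relies on (a sketch of the idea, not mmseg's implementation):

import torch


def weight_reduce_loss_sketch(loss, weight=None, reduction='mean',
                              avg_factor=None):
    if weight is not None:
        loss = loss * weight  # element-wise re-weighting
    if avg_factor is None:
        if reduction == 'mean':
            return loss.mean()
        if reduction == 'sum':
            return loss.sum()
        return loss  # 'none'
    # with avg_factor, 'mean' divides by the supplied count instead of numel,
    # which is why the test passes avg_factor=target.numel() to match .mean()
    if reduction == 'mean':
        return loss.sum() / avg_factor
    raise ValueError('avg_factor cannot be used with reduction="sum"')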
+import pytest
+import torch
+
+from mmseg.models.losses import DiceLoss
+
+
+@pytest.mark.parametrize('naive_dice', [True, False])
+def test_dice_loss(naive_dice):
+    loss_class = DiceLoss
+    pred = torch.rand((1, 10, 4, 4))
+    target = torch.randint(0, 10, (1, 4, 4))
+    weight = torch.rand(1)
+    # Test loss forward
+    loss = loss_class(naive_dice=naive_dice)(pred, target)
+    assert isinstance(loss, torch.Tensor)
+
+    # Test loss forward with weight
+    loss = loss_class(naive_dice=naive_dice)(pred, target, weight)
+    assert isinstance(loss, torch.Tensor)
+
+    # Test loss forward with reduction_override
+    loss = loss_class(naive_dice=naive_dice)(
+        pred, target, reduction_override='mean')
+    assert isinstance(loss, torch.Tensor)
+
+    # Test loss forward with avg_factor
+    loss = loss_class(naive_dice=naive_dice)(pred, target, avg_factor=10)
+    assert isinstance(loss, torch.Tensor)
+
+    with pytest.raises(ValueError):
+        # loss can evaluate with avg_factor only if
+        # reduction is None, 'none' or 'mean'.
+        reduction_override = 'sum'
+        loss_class(naive_dice=naive_dice)(
+            pred, target, avg_factor=10, reduction_override=reduction_override)
+
+    # Test loss forward with avg_factor and reduction
+    for reduction_override in [None, 'none', 'mean']:
+        loss = loss_class(naive_dice=naive_dice)(
+            pred, target, avg_factor=10, reduction_override=reduction_override)
+        assert isinstance(loss, torch.Tensor)
+
+    # Test loss forward with activate=True and use_sigmoid in (True, False)
+    for use_sigmoid in [True, False]:
+        loss = loss_class(
+            use_sigmoid=use_sigmoid, activate=True,
+            naive_dice=naive_dice)(pred, target)
+        assert isinstance(loss, torch.Tensor)
+
+    # Test loss forward with weight.ndim != loss.ndim
+    with pytest.raises(AssertionError):
+        weight = torch.rand((2, 8))
+        loss_class(naive_dice=naive_dice)(pred, target, weight)
+
+    # Test loss forward with len(weight) != len(pred)
+    with pytest.raises(AssertionError):
+        weight = torch.rand(8)
+        loss_class(naive_dice=naive_dice)(pred, target, weight)
+
+    # Test _expand_onehot_labels_dice
+    pred = torch.tensor([[[[1, 1], [1, 0]], [[0, 1], [1, 1]]]]).float()
+    target = torch.tensor([[[0, 0], [0, 1]]])
+    target_onehot = torch.tensor([[[[1, 1], [1, 0]], [[0, 0], [0, 1]]]])
+    weight = torch.rand(1)
+    loss = loss_class(naive_dice=naive_dice)(pred, target, weight)
+    loss_onehot = loss_class(naive_dice=naive_dice)(pred, target_onehot,
+                                                    weight)
+    assert torch.equal(loss, loss_onehot)
+
+    # Test whether loss is 0 when pred == target, eps == 0 and naive_dice=False
+    target = torch.randint(0, 2, (1, 10, 4, 4))
+    pred = target.float()
+    target = target.sigmoid()
+    weight = torch.rand(1)
+    loss = loss_class(
+        naive_dice=False, use_sigmoid=True, eps=0)(pred, target, weight)
+    assert loss.item() == 0
+
+    # Test ignore_index when ignore_index is the only class
+    with pytest.raises(AssertionError):
+        pred = torch.ones((1, 1, 4, 4))
+        target = torch.randint(0, 1, (1, 4, 4))
+        weight = torch.rand(1)
+        loss = loss_class(
+            naive_dice=naive_dice, use_sigmoid=False, ignore_index=0,
+            eps=0)(pred, target, weight)
+
+    # Test ignore_index with naive_dice = False
+    pred = torch.tensor([[[[1, 1], [1, 0]], [[0, 1], [1, 1]]]]).float()
+    target = torch.tensor([[[[1, 1], [1, 0]], [[1, 0], [0, 1]]]]).sigmoid()
+    weight = torch.rand(1)
+    loss = loss_class(
+        naive_dice=False, use_sigmoid=True, ignore_index=1,
+        eps=0)(pred, target, weight)
+    assert loss.item() == 0
diff --git a/tests/test_models/test_losses/test_huasdorff_distance_loss.py b/tests/test_models/test_losses/test_huasdorff_distance_loss.py
new file mode 100644
index 0000000000..29c2732d3f --- /dev/null +++ b/tests/test_models/test_losses/test_huasdorff_distance_loss.py @@ -0,0 +1,29 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import pytest +import torch + +from mmseg.models.losses import HuasdorffDisstanceLoss + + +def test_huasdorff_distance_loss(): + loss_class = HuasdorffDisstanceLoss + pred = torch.rand((10, 8, 6, 6)) + target = torch.rand((10, 6, 6)) + class_weight = torch.rand(8) + + # Test loss forward + loss = loss_class()(pred, target) + assert isinstance(loss, torch.Tensor) + + # Test loss forward with avg_factor + loss = loss_class()(pred, target, avg_factor=10) + assert isinstance(loss, torch.Tensor) + + # Test loss forward with avg_factor and reduction is None, 'sum' and 'mean' + for reduction in [None, 'sum', 'mean']: + loss = loss_class()(pred, target, avg_factor=10, reduction=reduction) + assert isinstance(loss, torch.Tensor) + + # Test loss forward with class_weight + with pytest.raises(AssertionError): + loss_class(class_weight=class_weight)(pred, target) diff --git a/tests/test_models/test_losses/test_kldiv_loss.py b/tests/test_models/test_losses/test_kldiv_loss.py new file mode 100644 index 0000000000..48bcc4bfd9 --- /dev/null +++ b/tests/test_models/test_losses/test_kldiv_loss.py @@ -0,0 +1,40 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch + +from mmseg.models.losses.kldiv_loss import KLDivLoss + + +def test_kldiv_loss_with_none_reduction(): + loss_class = KLDivLoss + pred = torch.rand((8, 5, 5)) + target = torch.rand((8, 5, 5)) + reduction = 'none' + + # Test loss forward + loss = loss_class(reduction=reduction)(pred, target) + assert isinstance(loss, torch.Tensor) + assert loss.shape == (8, 5, 5), f'{loss.shape}' + + +def test_kldiv_loss_with_mean_reduction(): + loss_class = KLDivLoss + pred = torch.rand((8, 5, 5)) + target = torch.rand((8, 5, 5)) + reduction = 'mean' + + # Test loss forward + loss = loss_class(reduction=reduction)(pred, target) + assert isinstance(loss, torch.Tensor) + assert loss.shape == (8, ), f'{loss.shape}' + + +def test_kldiv_loss_with_sum_reduction(): + loss_class = KLDivLoss + pred = torch.rand((8, 5, 5)) + target = torch.rand((8, 5, 5)) + reduction = 'sum' + + # Test loss forward + loss = loss_class(reduction=reduction)(pred, target) + assert isinstance(loss, torch.Tensor) + assert loss.shape == (8, ), f'{loss.shape}' diff --git a/tests/test_models/test_losses/test_silog_loss.py b/tests/test_models/test_losses/test_silog_loss.py new file mode 100644 index 0000000000..022434bcc1 --- /dev/null +++ b/tests/test_models/test_losses/test_silog_loss.py @@ -0,0 +1,20 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from unittest import TestCase + +import torch + +from mmseg.models.losses import SiLogLoss + + +class TestSiLogLoss(TestCase): + + def test_SiLogLoss_forward(self): + pred = torch.tensor([[1.0, 2.0], [3.5, 4.0]], dtype=torch.float32) + target = torch.tensor([[0.0, 2.0], [3.0, 4.0]], dtype=torch.float32) + weight = torch.tensor([1.0, 0.5], dtype=torch.float32) + + loss_module = SiLogLoss() + loss = loss_module.forward(pred, target, weight) + + expected_loss = 0.02 + self.assertAlmostEqual(loss.item(), expected_loss, places=2) diff --git a/tests/test_models/test_losses/test_tversky_loss.py b/tests/test_models/test_losses/test_tversky_loss.py new file mode 100644 index 0000000000..c5c581d8b4 --- /dev/null +++ b/tests/test_models/test_losses/test_tversky_loss.py @@ -0,0 +1,77 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
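The first Tversky case below expects an AssertionError when alpha + beta != 1. A hedged sketch of the index behind that constraint (simplified to binary masks; not the registered TverskyLoss implementation):

import torch


def tversky_index(pred, target, alpha=0.3, beta=0.7, smooth=1.0):
    # TI = TP / (TP + alpha * FP + beta * FN); with alpha + beta == 1 this
    # generalizes the Dice coefficient (alpha == beta == 0.5)
    assert alpha + beta == 1.0, 'Expect alpha + beta == 1.0'
    tp = (pred * target).sum()
    fp = (pred * (1 - target)).sum()
    fn = ((1 - pred) * target).sum()
    return (tp + smooth) / (tp + alpha * fp + beta * fn + smooth)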
+import pytest
+import torch
+
+
+def test_tversky_loss():
+    from mmseg.models import build_loss
+
+    # test alpha + beta != 1
+    with pytest.raises(AssertionError):
+        loss_cfg = dict(
+            type='TverskyLoss',
+            class_weight=[1.0, 2.0, 3.0],
+            loss_weight=1.0,
+            alpha=0.4,
+            beta=0.7,
+            loss_name='loss_tversky')
+        tversky_loss = build_loss(loss_cfg)
+        logits = torch.rand(8, 3, 4, 4)
+        labels = (torch.rand(8, 4, 4) * 3).long()
+        tversky_loss(logits, labels, ignore_index=1)
+
+    # test tversky loss
+    loss_cfg = dict(
+        type='TverskyLoss',
+        class_weight=[1.0, 2.0, 3.0],
+        loss_weight=1.0,
+        ignore_index=1,
+        loss_name='loss_tversky')
+    tversky_loss = build_loss(loss_cfg)
+    logits = torch.rand(8, 3, 4, 4)
+    labels = (torch.rand(8, 4, 4) * 3).long()
+    tversky_loss(logits, labels)
+
+    # test loss with class weights from file
+    import os
+    import tempfile
+
+    import mmengine
+    import numpy as np
+    tmp_file = tempfile.NamedTemporaryFile()
+
+    mmengine.dump([1.0, 2.0, 3.0], f'{tmp_file.name}.pkl',
+                  'pkl')  # from pkl file
+    loss_cfg = dict(
+        type='TverskyLoss',
+        class_weight=f'{tmp_file.name}.pkl',
+        loss_weight=1.0,
+        ignore_index=1,
+        loss_name='loss_tversky')
+    tversky_loss = build_loss(loss_cfg)
+    tversky_loss(logits, labels)
+
+    np.save(f'{tmp_file.name}.npy', np.array([1.0, 2.0, 3.0]))  # from npy file
+    loss_cfg = dict(
+        type='TverskyLoss',
+        class_weight=f'{tmp_file.name}.npy',
+        loss_weight=1.0,
+        ignore_index=1,
+        loss_name='loss_tversky')
+    tversky_loss = build_loss(loss_cfg)
+    tversky_loss(logits, labels)
+    tmp_file.close()
+    os.remove(f'{tmp_file.name}.pkl')
+    os.remove(f'{tmp_file.name}.npy')
+
+    # test tversky loss has name `loss_tversky`
+    loss_cfg = dict(
+        type='TverskyLoss',
+        smooth=2,
+        loss_weight=1.0,
+        ignore_index=1,
+        alpha=0.3,
+        beta=0.7,
+        loss_name='loss_tversky')
+    tversky_loss = build_loss(loss_cfg)
+    assert tversky_loss.loss_name == 'loss_tversky'
diff --git a/tests/test_models/test_segmentors/__init__.py b/tests/test_models/test_segmentors/__init__.py
new file mode 100644
index 0000000000..ef101fec61
--- /dev/null
+++ b/tests/test_models/test_segmentors/__init__.py
@@ -0,0 +1 @@
+# Copyright (c) OpenMMLab. All rights reserved.
diff --git a/tests/test_models/test_segmentors/test_cascade_encoder_decoder.py b/tests/test_models/test_segmentors/test_cascade_encoder_decoder.py
new file mode 100644
index 0000000000..941816d253
--- /dev/null
+++ b/tests/test_models/test_segmentors/test_cascade_encoder_decoder.py
@@ -0,0 +1,57 @@
+# Copyright (c) OpenMMLab. All rights reserved.
+from mmengine import ConfigDict
+
+from mmseg.models import build_segmentor
+from .utils import _segmentor_forward_train_test
+
+
+def test_cascade_encoder_decoder():
+
+    # test 1 decode head, w.o.
aux head + cfg = ConfigDict( + type='CascadeEncoderDecoder', + num_stages=2, + backbone=dict(type='ExampleBackbone'), + decode_head=[ + dict(type='ExampleDecodeHead'), + dict(type='ExampleCascadeDecodeHead') + ]) + cfg.test_cfg = ConfigDict(mode='whole') + segmentor = build_segmentor(cfg) + _segmentor_forward_train_test(segmentor) + + # test slide mode + cfg.test_cfg = ConfigDict(mode='slide', crop_size=(3, 3), stride=(2, 2)) + segmentor = build_segmentor(cfg) + _segmentor_forward_train_test(segmentor) + + # test 1 decode head, 1 aux head + cfg = ConfigDict( + type='CascadeEncoderDecoder', + num_stages=2, + backbone=dict(type='ExampleBackbone'), + decode_head=[ + dict(type='ExampleDecodeHead'), + dict(type='ExampleCascadeDecodeHead') + ], + auxiliary_head=dict(type='ExampleDecodeHead')) + cfg.test_cfg = ConfigDict(mode='whole') + segmentor = build_segmentor(cfg) + _segmentor_forward_train_test(segmentor) + + # test 1 decode head, 2 aux head + cfg = ConfigDict( + type='CascadeEncoderDecoder', + num_stages=2, + backbone=dict(type='ExampleBackbone'), + decode_head=[ + dict(type='ExampleDecodeHead'), + dict(type='ExampleCascadeDecodeHead') + ], + auxiliary_head=[ + dict(type='ExampleDecodeHead'), + dict(type='ExampleDecodeHead') + ]) + cfg.test_cfg = ConfigDict(mode='whole') + segmentor = build_segmentor(cfg) + _segmentor_forward_train_test(segmentor) diff --git a/tests/test_models/test_segmentors/test_depth_estimator.py b/tests/test_models/test_segmentors/test_depth_estimator.py new file mode 100644 index 0000000000..e819c9e763 --- /dev/null +++ b/tests/test_models/test_segmentors/test_depth_estimator.py @@ -0,0 +1,64 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from copy import deepcopy +from os.path import dirname, join +from unittest import TestCase + +import torch +from mmengine import Config, ConfigDict +from mmengine.structures import PixelData + +import mmseg +from mmseg.models.segmentors import DepthEstimator +from mmseg.structures import SegDataSample + + +class TestDepthEstimator(TestCase): + + def setUp(self) -> None: + repo_dpath = dirname(dirname(mmseg.__file__)) + config_dpath = join(repo_dpath, 'configs/_base_/models/vpd_sd.py') + vpd_cfg = Config.fromfile(config_dpath).stable_diffusion_cfg + vpd_cfg.pop('checkpoint') + + backbone_cfg = dict( + type='VPD', + diffusion_cfg=vpd_cfg, + class_embed_path='https://download.openmmlab.com/mmsegmentation/' + 'v0.5/vpd/nyu_class_embeddings.pth', + class_embed_select=True, + pad_shape=64, + unet_cfg=dict(use_attn=False), + ) + + head_cfg = dict( + type='VPDDepthHead', + max_depth=10, + ) + + self.model = DepthEstimator( + backbone=backbone_cfg, decode_head=head_cfg) + + inputs = torch.randn(1, 3, 64, 80) + data_sample = SegDataSample() + data_sample.gt_depth_map = PixelData(data=torch.rand(1, 64, 80)) + data_sample.set_metainfo(dict(img_shape=(64, 80), ori_shape=(64, 80))) + self.data = dict(inputs=inputs, data_samples=[data_sample]) + + def test_slide_flip_inference(self): + + self.model.test_cfg = ConfigDict( + dict(mode='slide_flip', crop_size=(64, 64), stride=(16, 16))) + + with torch.no_grad(): + out = self.model.predict(**deepcopy(self.data)) + + self.assertEqual(len(out), 1) + self.assertIn('pred_depth_map', out[0].keys()) + self.assertListEqual(list(out[0].pred_depth_map.shape), [64, 80]) + + def test__forward(self): + data = deepcopy(self.data) + data['inputs'] = data['inputs'][:, :, :64, :64] + with torch.no_grad(): + out = self.model._forward(**data) + self.assertListEqual(list(out.shape), [1, 1, 64, 64]) diff --git 
a/tests/test_models/test_segmentors/test_encoder_decoder.py b/tests/test_models/test_segmentors/test_encoder_decoder.py new file mode 100644 index 0000000000..5795f513d3 --- /dev/null +++ b/tests/test_models/test_segmentors/test_encoder_decoder.py @@ -0,0 +1,100 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from mmengine import ConfigDict +from mmengine.structures import PixelData + +from mmseg.models import build_segmentor +from mmseg.structures import SegDataSample +from .utils import _segmentor_forward_train_test + + +def test_encoder_decoder(): + + # test 1 decode head, w.o. aux head + + cfg = ConfigDict( + type='EncoderDecoder', + backbone=dict(type='ExampleBackbone'), + decode_head=dict(type='ExampleDecodeHead'), + train_cfg=None, + test_cfg=dict(mode='whole')) + segmentor = build_segmentor(cfg) + _segmentor_forward_train_test(segmentor) + + # test out_channels == 1 + cfg = ConfigDict( + type='EncoderDecoder', + backbone=dict(type='ExampleBackbone'), + decode_head=dict( + type='ExampleDecodeHead', num_classes=2, out_channels=1), + train_cfg=None, + test_cfg=dict(mode='whole')) + segmentor = build_segmentor(cfg) + _segmentor_forward_train_test(segmentor) + + # test slide mode + cfg.test_cfg = ConfigDict(mode='slide', crop_size=(3, 3), stride=(2, 2)) + segmentor = build_segmentor(cfg) + _segmentor_forward_train_test(segmentor) + + # test 1 decode head, 1 aux head + cfg = ConfigDict( + type='EncoderDecoder', + backbone=dict(type='ExampleBackbone'), + decode_head=dict(type='ExampleDecodeHead'), + auxiliary_head=dict(type='ExampleDecodeHead')) + cfg.test_cfg = ConfigDict(mode='whole') + segmentor = build_segmentor(cfg) + _segmentor_forward_train_test(segmentor) + + # test 1 decode head, 2 aux head + cfg = ConfigDict( + type='EncoderDecoder', + backbone=dict(type='ExampleBackbone'), + decode_head=dict(type='ExampleDecodeHead'), + auxiliary_head=[ + dict(type='ExampleDecodeHead'), + dict(type='ExampleDecodeHead') + ]) + cfg.test_cfg = ConfigDict(mode='whole') + segmentor = build_segmentor(cfg) + _segmentor_forward_train_test(segmentor) + + +def test_postprocess_result(): + cfg = ConfigDict( + type='EncoderDecoder', + backbone=dict(type='ExampleBackbone'), + decode_head=dict(type='ExampleDecodeHead'), + train_cfg=None, + test_cfg=dict(mode='whole')) + model = build_segmentor(cfg) + + # test postprocess + data_sample = SegDataSample() + data_sample.gt_sem_seg = PixelData( + **{'data': torch.randint(0, 10, (1, 8, 8))}) + data_sample.set_metainfo({ + 'padding_size': (0, 2, 0, 2), + 'ori_shape': (8, 8) + }) + seg_logits = torch.zeros((1, 2, 10, 10)) + seg_logits[:, :, :8, :8] = 1 + data_samples = [data_sample] + + outputs = model.postprocess_result(seg_logits, data_samples) + assert outputs[0].seg_logits.data.shape == torch.Size((2, 8, 8)) + assert torch.allclose(outputs[0].seg_logits.data, torch.ones((2, 8, 8))) + + data_sample = SegDataSample() + data_sample.gt_sem_seg = PixelData( + **{'data': torch.randint(0, 10, (1, 8, 8))}) + data_sample.set_metainfo({ + 'img_padding_size': (0, 2, 0, 2), + 'ori_shape': (8, 8) + }) + + data_samples = [data_sample] + outputs = model.postprocess_result(seg_logits, data_samples) + assert outputs[0].seg_logits.data.shape == torch.Size((2, 8, 8)) + assert torch.allclose(outputs[0].seg_logits.data, torch.ones((2, 8, 8))) diff --git a/tests/test_models/test_segmentors/test_multimodal_encoder_decoder.py b/tests/test_models/test_segmentors/test_multimodal_encoder_decoder.py new file mode 100644 index 0000000000..75258d89a7 --- /dev/null +++ 
b/tests/test_models/test_segmentors/test_multimodal_encoder_decoder.py @@ -0,0 +1,24 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from mmengine import ConfigDict + +from mmseg.models import build_segmentor +from tests.test_models.test_segmentors.utils import \ + _segmentor_forward_train_test + + +def test_multimodal_encoder_decoder(): + + cfg = ConfigDict( + type='MultimodalEncoderDecoder', + asymetric_input=False, + image_encoder=dict(type='ExampleBackbone', out_indices=[1, 2, 3, 4]), + text_encoder=dict( + type='ExampleTextEncoder', + vocabulary=['A', 'B', 'C'], + output_dims=3), + decode_head=dict( + type='ExampleDecodeHead', out_channels=1, num_classes=2), + train_cfg=None, + test_cfg=dict(mode='whole')) + segmentor = build_segmentor(cfg) + _segmentor_forward_train_test(segmentor) diff --git a/tests/test_models/test_segmentors/test_seg_tta_model.py b/tests/test_models/test_segmentors/test_seg_tta_model.py new file mode 100644 index 0000000000..1e152ed056 --- /dev/null +++ b/tests/test_models/test_segmentors/test_seg_tta_model.py @@ -0,0 +1,63 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import tempfile + +import torch +from mmengine import ConfigDict +from mmengine.model import BaseTTAModel +from mmengine.registry import init_default_scope +from mmengine.structures import PixelData + +from mmseg.registry import MODELS +from mmseg.structures import SegDataSample +from .utils import * # noqa: F401,F403 + +init_default_scope('mmseg') + + +def test_encoder_decoder_tta(): + + segmentor_cfg = ConfigDict( + type='EncoderDecoder', + backbone=dict(type='ExampleBackbone'), + decode_head=dict(type='ExampleDecodeHead'), + train_cfg=None, + test_cfg=dict(mode='whole')) + + cfg = ConfigDict(type='SegTTAModel', module=segmentor_cfg) + + model: BaseTTAModel = MODELS.build(cfg) + + imgs = [] + data_samples = [] + directions = ['horizontal', 'vertical'] + for i in range(12): + flip_direction = directions[0] if i % 3 == 0 else directions[1] + imgs.append(torch.randn(1, 3, 10 + i, 10 + i)) + data_samples.append([ + SegDataSample( + metainfo=dict( + ori_shape=(10, 10), + img_shape=(10 + i, 10 + i), + flip=(i % 2 == 0), + flip_direction=flip_direction, + img_path=tempfile.mktemp()), + gt_sem_seg=PixelData(data=torch.randint(0, 19, (1, 10, 10)))) + ]) + + model.test_step(dict(inputs=imgs, data_samples=data_samples)) + + # test out_channels == 1 + segmentor_cfg = ConfigDict( + type='EncoderDecoder', + backbone=dict(type='ExampleBackbone'), + decode_head=dict( + type='ExampleDecodeHead', + num_classes=2, + out_channels=1, + threshold=0.4), + train_cfg=None, + test_cfg=dict(mode='whole')) + model.module = MODELS.build(segmentor_cfg) + for data_sample in data_samples: + data_sample[0].gt_sem_seg.data = torch.randint(0, 2, (1, 10, 10)) + model.test_step(dict(inputs=imgs, data_samples=data_samples)) diff --git a/tests/test_models/test_segmentors/utils.py b/tests/test_models/test_segmentors/utils.py new file mode 100644 index 0000000000..ac31e2b277 --- /dev/null +++ b/tests/test_models/test_segmentors/utils.py @@ -0,0 +1,182 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
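The SegTTAModel test above feeds each sample a set of flipped, differently sized views. A hedged sketch of the merge step such a wrapper typically performs, undoing the flip and averaging class probabilities (simplified; not the actual SegTTAModel logic, and the interpolation settings are assumptions):

import torch
import torch.nn.functional as F


def merge_aug_logits(logits_list, metas, ori_shape):
    merged = []
    for logits, meta in zip(logits_list, metas):
        if meta.get('flip', False):
            # undo the test-time flip recorded in the meta info
            dim = -1 if meta['flip_direction'] == 'horizontal' else -2
            logits = logits.flip(dims=(dim, ))
        # bring every augmented view back to the original resolution
        logits = F.interpolate(
            logits, size=ori_shape, mode='bilinear', align_corners=False)
        merged.append(logits.softmax(dim=1))
    return torch.stack(merged).mean(dim=0)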
+import torch
+from mmengine.optim import OptimWrapper
+from mmengine.structures import PixelData
+from torch import nn
+from torch.optim import SGD
+
+from mmseg.models import SegDataPreProcessor
+from mmseg.models.decode_heads.cascade_decode_head import BaseCascadeDecodeHead
+from mmseg.models.decode_heads.decode_head import BaseDecodeHead
+from mmseg.registry import MODELS
+from mmseg.structures import SegDataSample
+
+
+def _demo_mm_inputs(input_shape=(1, 3, 8, 16), num_classes=10):
+    """Create a superset of inputs needed to run test or train batches.
+
+    Args:
+        input_shape (tuple):
+            input batch dimensions
+
+        num_classes (int):
+            number of semantic classes
+    """
+    (N, C, H, W) = input_shape
+
+    imgs = torch.randn(*input_shape)
+    segs = torch.randint(
+        low=0, high=num_classes - 1, size=(N, H, W), dtype=torch.long)
+
+    img_metas = [{
+        'img_shape': (H, W),
+        'ori_shape': (H, W),
+        'pad_shape': (H, W, C),
+        'filename': '.png',
+        'scale_factor': 1.0,
+        'flip': False,
+        'flip_direction': 'horizontal'
+    } for _ in range(N)]
+
+    data_samples = [
+        SegDataSample(
+            gt_sem_seg=PixelData(data=segs[i]), metainfo=img_metas[i])
+        for i in range(N)
+    ]
+
+    mm_inputs = {
+        'inputs': torch.FloatTensor(imgs),
+        'data_samples': data_samples
+    }
+
+    return mm_inputs
+
+
+@MODELS.register_module()
+class ExampleBackbone(nn.Module):
+
+    def __init__(self, out_indices=None):
+        super().__init__()
+        self.conv = nn.Conv2d(3, 3, 3)
+        self.out_indices = out_indices
+
+    def init_weights(self, pretrained=None):
+        pass
+
+    def forward(self, x):
+        if self.out_indices is None:
+            return [self.conv(x)]
+        else:
+            outs = []
+            for i in self.out_indices:
+                outs.append(self.conv(x))
+            return outs
+
+
+@MODELS.register_module()
+class ExampleDecodeHead(BaseDecodeHead):
+
+    def __init__(self, num_classes=19, out_channels=None, **kwargs):
+        super().__init__(
+            3, 3, num_classes=num_classes, out_channels=out_channels, **kwargs)
+
+    def forward(self, inputs):
+        return self.cls_seg(inputs[0])
+
+
+@MODELS.register_module()
+class ExampleTextEncoder(nn.Module):
+
+    def __init__(self, vocabulary=None, output_dims=None):
+        super().__init__()
+        self.vocabulary = vocabulary
+        self.output_dims = output_dims
+
+    def forward(self):
+        return torch.randn((len(self.vocabulary), self.output_dims))
+
+
+@MODELS.register_module()
+class ExampleCascadeDecodeHead(BaseCascadeDecodeHead):
+
+    def __init__(self):
+        super().__init__(3, 3, num_classes=19)
+
+    def forward(self, inputs, prev_out):
+        return self.cls_seg(inputs[0])
+
+
+def _segmentor_forward_train_test(segmentor):
+    if isinstance(segmentor.decode_head, nn.ModuleList):
+        num_classes = segmentor.decode_head[-1].num_classes
+    else:
+        num_classes = segmentor.decode_head.num_classes
+    # batch_size=2 for BatchNorm
+    mm_inputs = _demo_mm_inputs(num_classes=num_classes)
+
+    # convert to cuda Tensor if applicable
+    if torch.cuda.is_available():
+        segmentor = segmentor.cuda()
+
+    # check data preprocessor
+    if not hasattr(segmentor,
+                   'data_preprocessor') or segmentor.data_preprocessor is None:
+        segmentor.data_preprocessor = SegDataPreProcessor()
+
+    mm_inputs = segmentor.data_preprocessor(mm_inputs, True)
+    imgs = mm_inputs.pop('inputs')
+    data_samples = mm_inputs.pop('data_samples')
+
+    # create optimizer wrapper
+    optimizer = SGD(segmentor.parameters(), lr=0.1)
+    optim_wrapper = OptimWrapper(optimizer)
+
+    # Test forward train
+    losses = segmentor.forward(imgs, data_samples, mode='loss')
+    assert isinstance(losses, dict)
+
+    # Test train_step
+    data_batch = dict(inputs=imgs, data_samples=data_samples)
+    outputs = segmentor.train_step(data_batch, optim_wrapper)
+    assert isinstance(outputs, dict)
+    assert 'loss' in outputs
+
+    # Test val_step
+    with torch.no_grad():
+        segmentor.eval()
+        data_batch = dict(inputs=imgs, data_samples=data_samples)
+        outputs = segmentor.val_step(data_batch)
+        assert isinstance(outputs, list)
+
+    # Test forward simple test
+    with torch.no_grad():
+        segmentor.eval()
+        data_batch = dict(inputs=imgs, data_samples=data_samples)
+        results = segmentor.forward(imgs, data_samples, mode='tensor')
+        assert isinstance(results, torch.Tensor)
+
+
+def _segmentor_predict(segmentor):
+    if isinstance(segmentor.decode_head, nn.ModuleList):
+        num_classes = segmentor.decode_head[-1].num_classes
+    else:
+        num_classes = segmentor.decode_head.num_classes
+    # batch_size=2 for BatchNorm
+    mm_inputs = _demo_mm_inputs(num_classes=num_classes)
+
+    # convert to cuda Tensor if applicable
+    if torch.cuda.is_available():
+        segmentor = segmentor.cuda()
+
+    # check data preprocessor
+    if not hasattr(segmentor,
+                   'data_preprocessor') or segmentor.data_preprocessor is None:
+        segmentor.data_preprocessor = SegDataPreProcessor()
+
+    mm_inputs = segmentor.data_preprocessor(mm_inputs, True)
+    imgs = mm_inputs.pop('inputs')
+    data_samples = mm_inputs.pop('data_samples')
+
+    # Test predict
+    with torch.no_grad():
+        segmentor.eval()
+        data_batch = dict(inputs=imgs, data_samples=data_samples)
+        outputs = segmentor.predict(**data_batch)
+        assert isinstance(outputs, list)
diff --git a/tests/test_sampler.py b/tests/test_sampler.py
index 12490ef3c9..322be9579b 100644
--- a/tests/test_sampler.py
+++ b/tests/test_sampler.py
@@ -2,8 +2,8 @@
 import pytest
 import torch
 
-from mmseg.data import OHEMPixelSampler
 from mmseg.models.decode_heads import FCNHead
+from mmseg.structures import OHEMPixelSampler
 
 
 def _context_for_ohem():
diff --git a/tests/test_data/test_seg_data_sample.py b/tests/test_structures/test_seg_data_sample.py
similarity index 96%
rename from tests/test_data/test_seg_data_sample.py
rename to tests/test_structures/test_seg_data_sample.py
index 9bf5b476d7..37796b611d 100644
--- a/tests/test_data/test_seg_data_sample.py
+++ b/tests/test_structures/test_seg_data_sample.py
@@ -4,9 +4,9 @@
 import numpy as np
 import pytest
 import torch
-from mmengine.data import PixelData
+from mmengine.structures import PixelData
 
-from mmseg.data import SegDataSample
+from mmseg.structures import SegDataSample
 
 
 def _equal(a, b):
diff --git a/tests/test_utils/test_io.py b/tests/test_utils/test_io.py
new file mode 100644
index 0000000000..05abd275f8
--- /dev/null
+++ b/tests/test_utils/test_io.py
@@ -0,0 +1,33 @@
+# Copyright (c) OpenMMLab. All rights reserved.
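The test_sampler change above only moves the OHEMPixelSampler import to mmseg.structures. As a reminder of what that sampler computes, a hedged sketch of probability-threshold online hard example mining over pixels (simplified; thresh and min_kept echo the OhemCrossEntropy settings in the PIDNet config earlier in this patch, and the function is not the registry implementation):

import torch


def ohem_pixel_mask(seg_logit, seg_label, thresh=0.7, min_kept=10000):
    # "hard" pixels are those where the predicted probability of the
    # ground-truth class is low
    with torch.no_grad():
        prob = seg_logit.softmax(dim=1)
        gt_prob = prob.gather(1, seg_label.unsqueeze(1)).squeeze(1)
        sort_prob, _ = gt_prob.flatten().sort()
        # raise the cut-off when `thresh` alone would keep < min_kept pixels
        kth = sort_prob[min(min_kept, sort_prob.numel()) - 1]
        threshold = max(kth.item(), thresh)
        return gt_prob < threshold  # True marks pixels that keep their loss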
+import os.path as osp + +import numpy as np +import pytest +from mmengine import FileClient + +from mmseg.utils import datafrombytes + + +@pytest.mark.parametrize( + ['backend', 'suffix'], + [['nifti', '.nii.gz'], ['numpy', '.npy'], ['pickle', '.pkl']]) +def test_datafrombytes(backend, suffix): + + file_client = FileClient('disk') + file_path = osp.join(osp.dirname(__file__), '../data/biomedical' + suffix) + bytes = file_client.get(file_path) + data = datafrombytes(bytes, backend) + + if backend == 'pickle': + # test pickle loading + assert isinstance(data, dict) + else: + assert isinstance(data, np.ndarray) + if backend == 'nifti': + # test nifti file loading + assert len(data.shape) == 3 + else: + # test npy file loading + # testing data biomedical.npy includes data and label + assert len(data.shape) == 4 + assert data.shape[0] == 2 diff --git a/tests/test_visualization/test_local_visualizer.py b/tests/test_visualization/test_local_visualizer.py new file mode 100644 index 0000000000..e3b2a88cfb --- /dev/null +++ b/tests/test_visualization/test_local_visualizer.py @@ -0,0 +1,213 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import os +import os.path as osp +import tempfile +from unittest import TestCase + +import cv2 +import mmcv +import numpy as np +import torch +from mmengine.structures import PixelData + +from mmseg.structures import SegDataSample +from mmseg.visualization import SegLocalVisualizer + + +class TestSegLocalVisualizer(TestCase): + + def test_add_datasample(self): + h = 10 + w = 12 + num_class = 2 + out_file = 'out_file' + + image = np.random.randint(0, 256, size=(h, w, 3)).astype('uint8') + + # test gt_sem_seg + gt_sem_seg_data = dict(data=torch.randint(0, num_class, (1, h, w))) + gt_sem_seg = PixelData(**gt_sem_seg_data) + + def test_add_datasample_forward(gt_sem_seg): + data_sample = SegDataSample() + data_sample.gt_sem_seg = gt_sem_seg + + with tempfile.TemporaryDirectory() as tmp_dir: + seg_local_visualizer = SegLocalVisualizer( + vis_backends=[dict(type='LocalVisBackend')], + save_dir=tmp_dir) + seg_local_visualizer.dataset_meta = dict( + classes=('background', 'foreground'), + palette=[[120, 120, 120], [6, 230, 230]]) + + # test out_file + seg_local_visualizer.add_datasample(out_file, image, + data_sample) + + assert os.path.exists( + osp.join(tmp_dir, 'vis_data', 'vis_image', + out_file + '_0.png')) + drawn_img = cv2.imread( + osp.join(tmp_dir, 'vis_data', 'vis_image', + out_file + '_0.png')) + assert drawn_img.shape == (h, w, 3) + + # test gt_instances and pred_instances + pred_sem_seg_data = dict( + data=torch.randint(0, num_class, (1, h, w))) + pred_sem_seg = PixelData(**pred_sem_seg_data) + + data_sample.pred_sem_seg = pred_sem_seg + + seg_local_visualizer.add_datasample(out_file, image, + data_sample) + self._assert_image_and_shape( + osp.join(tmp_dir, 'vis_data', 'vis_image', + out_file + '_0.png'), (h, w * 2, 3)) + + seg_local_visualizer.add_datasample( + out_file, image, data_sample, draw_gt=False) + self._assert_image_and_shape( + osp.join(tmp_dir, 'vis_data', 'vis_image', + out_file + '_0.png'), (h, w, 3)) + + if torch.cuda.is_available(): + test_add_datasample_forward(gt_sem_seg.cuda()) + test_add_datasample_forward(gt_sem_seg) + + def test_cityscapes_add_datasample(self): + h = 128 + w = 256 + num_class = 19 + out_file = 'out_file_cityscapes' + + image = mmcv.imread( + osp.join( + osp.dirname(__file__), + '../data/pseudo_cityscapes_dataset/leftImg8bit/val/frankfurt/frankfurt_000000_000294_leftImg8bit.png' # noqa + ), + 'color') + sem_seg = 
mmcv.imread( + osp.join( + osp.dirname(__file__), + '../data/pseudo_cityscapes_dataset/gtFine/val/frankfurt/frankfurt_000000_000294_gtFine_labelTrainIds.png' # noqa + ), + 'unchanged') + sem_seg = torch.unsqueeze(torch.from_numpy(sem_seg), 0) + gt_sem_seg_data = dict(data=sem_seg) + gt_sem_seg = PixelData(**gt_sem_seg_data) + + def test_cityscapes_add_datasample_forward(gt_sem_seg): + data_sample = SegDataSample() + data_sample.gt_sem_seg = gt_sem_seg + + with tempfile.TemporaryDirectory() as tmp_dir: + seg_local_visualizer = SegLocalVisualizer( + vis_backends=[dict(type='LocalVisBackend')], + save_dir=tmp_dir) + seg_local_visualizer.dataset_meta = dict( + classes=('road', 'sidewalk', 'building', 'wall', 'fence', + 'pole', 'traffic light', 'traffic sign', + 'vegetation', 'terrain', 'sky', 'person', 'rider', + 'car', 'truck', 'bus', 'train', 'motorcycle', + 'bicycle'), + palette=[[128, 64, 128], [244, 35, 232], [70, 70, 70], + [102, 102, 156], [190, 153, 153], [153, 153, 153], + [250, 170, 30], [220, 220, 0], [107, 142, 35], + [152, 251, 152], [70, 130, 180], [220, 20, 60], + [255, 0, 0], [0, 0, 142], [0, 0, 70], + [0, 60, 100], [0, 80, 100], [0, 0, 230], + [119, 11, 32]]) + # test out_file + seg_local_visualizer.add_datasample( + out_file, + image, + data_sample, + out_file=osp.join(tmp_dir, 'test.png')) + self._assert_image_and_shape( + osp.join(tmp_dir, 'test.png'), (h, w, 3)) + + # test gt_instances and pred_instances + pred_sem_seg_data = dict( + data=torch.randint(0, num_class, (1, h, w))) + pred_sem_seg = PixelData(**pred_sem_seg_data) + + data_sample.pred_sem_seg = pred_sem_seg + + # test draw prediction with gt + seg_local_visualizer.add_datasample(out_file, image, + data_sample) + self._assert_image_and_shape( + osp.join(tmp_dir, 'vis_data', 'vis_image', + out_file + '_0.png'), (h, w * 2, 3)) + # test draw prediction without gt + seg_local_visualizer.add_datasample( + out_file, image, data_sample, draw_gt=False) + self._assert_image_and_shape( + osp.join(tmp_dir, 'vis_data', 'vis_image', + out_file + '_0.png'), (h, w, 3)) + + if torch.cuda.is_available(): + test_cityscapes_add_datasample_forward(gt_sem_seg.cuda()) + test_cityscapes_add_datasample_forward(gt_sem_seg) + + def _assert_image_and_shape(self, out_file, out_shape): + assert os.path.exists(out_file) + drawn_img = cv2.imread(out_file) + assert drawn_img.shape == out_shape + + def test_add_datasample_depth(self): + h = 10 + w = 12 + out_file = 'out_file' + + image = np.random.randint(0, 256, size=(h, w, 3)).astype('uint8') + + # test gt_depth_map + gt_depth_map = PixelData(data=torch.rand(1, h, w)) + + def test_add_datasample_forward_depth(gt_depth_map): + data_sample = SegDataSample() + data_sample.gt_depth_map = gt_depth_map + + with tempfile.TemporaryDirectory() as tmp_dir: + seg_local_visualizer = SegLocalVisualizer( + vis_backends=[dict(type='LocalVisBackend')], + save_dir=tmp_dir) + seg_local_visualizer.dataset_meta = dict( + classes=('background', 'foreground'), + palette=[[120, 120, 120], [6, 230, 230]]) + + # test out_file + seg_local_visualizer.add_datasample(out_file, image, + data_sample) + + assert os.path.exists( + osp.join(tmp_dir, 'vis_data', 'vis_image', + out_file + '_0.png')) + drawn_img = cv2.imread( + osp.join(tmp_dir, 'vis_data', 'vis_image', + out_file + '_0.png')) + assert drawn_img.shape == (h * 2, w, 3) + + # test gt_instances and pred_instances + + pred_depth_map = PixelData(data=torch.rand(1, h, w)) + + data_sample.pred_depth_map = pred_depth_map + + seg_local_visualizer.add_datasample(out_file, 
image, + data_sample) + self._assert_image_and_shape( + osp.join(tmp_dir, 'vis_data', 'vis_image', + out_file + '_0.png'), (h * 2, w * 2, 3)) + + seg_local_visualizer.add_datasample( + out_file, image, data_sample, draw_gt=False) + self._assert_image_and_shape( + osp.join(tmp_dir, 'vis_data', 'vis_image', + out_file + '_0.png'), (h * 2, w, 3)) + + if torch.cuda.is_available(): + test_add_datasample_forward_depth(gt_depth_map.cuda()) + test_add_datasample_forward_depth(gt_depth_map) diff --git a/tools/analyze_logs.py b/tools/analysis_tools/analyze_logs.py similarity index 98% rename from tools/analyze_logs.py rename to tools/analysis_tools/analyze_logs.py index abf9982974..7464d23162 100644 --- a/tools/analyze_logs.py +++ b/tools/analysis_tools/analyze_logs.py @@ -101,7 +101,7 @@ def load_json_logs(json_logs): log_dicts = [dict() for _ in json_logs] prev_step = 0 for json_log, log_dict in zip(json_logs, log_dicts): - with open(json_log, 'r') as log_file: + with open(json_log) as log_file: for line in log_file: log = json.loads(line.strip()) # the final step in json file is 0. @@ -110,7 +110,7 @@ def load_json_logs(json_logs): prev_step = step else: step = prev_step - if step not in log: + if step not in log_dict: log_dict[step] = defaultdict(list) for k, v in log.items(): log_dict[step][k].append(v) diff --git a/tools/benchmark.py b/tools/analysis_tools/benchmark.py similarity index 82% rename from tools/benchmark.py rename to tools/analysis_tools/benchmark.py index c11ca4a759..afaeabac85 100644 --- a/tools/benchmark.py +++ b/tools/analysis_tools/benchmark.py @@ -3,16 +3,16 @@ import os.path as osp import time -import mmcv import numpy as np import torch -from mmcv import Config -from mmcv.runner import load_checkpoint, wrap_fp16_model -from mmengine.runner import Runner -from mmengine.utils import revert_sync_batchnorm +from mmengine import Config +from mmengine.fileio import dump +from mmengine.model.utils import revert_sync_batchnorm +from mmengine.registry import init_default_scope +from mmengine.runner import Runner, load_checkpoint +from mmengine.utils import mkdir_or_exist from mmseg.registry import MODELS -from mmseg.utils import register_all_modules def parse_args(): @@ -32,17 +32,19 @@ def parse_args(): def main(): args = parse_args() - register_all_modules() cfg = Config.fromfile(args.config) + + init_default_scope(cfg.get('default_scope', 'mmseg')) + timestamp = time.strftime('%Y%m%d_%H%M%S', time.localtime()) if args.work_dir is not None: - mmcv.mkdir_or_exist(osp.abspath(args.work_dir)) + mkdir_or_exist(osp.abspath(args.work_dir)) json_file = osp.join(args.work_dir, f'fps_{timestamp}.json') else: # use config filename as default work_dir if cfg.work_dir is None work_dir = osp.join('./work_dirs', osp.splitext(osp.basename(args.config))[0]) - mmcv.mkdir_or_exist(osp.abspath(work_dir)) + mkdir_or_exist(osp.abspath(work_dir)) json_file = osp.join(work_dir, f'fps_{timestamp}.json') repeat_times = args.repeat_times @@ -52,6 +54,7 @@ def main(): benchmark_dict = dict(config=args.config, unit='img / s') overall_fps_list = [] + cfg.test_dataloader.batch_size = 1 for time_index in range(repeat_times): print(f'Run {time_index + 1}:') # build the dataloader @@ -60,16 +63,14 @@ def main(): # build the model and load checkpoint cfg.model.train_cfg = None model = MODELS.build(cfg.model) - fp16_cfg = cfg.get('fp16', None) - if fp16_cfg is not None: - wrap_fp16_model(model) + if 'checkpoint' in args and osp.exists(args.checkpoint): load_checkpoint(model, args.checkpoint, map_location='cpu') if 
torch.cuda.is_available(): model = model.cuda() - else: - model = revert_sync_batchnorm(model) + + model = revert_sync_batchnorm(model) model.eval() @@ -80,14 +81,15 @@ def main(): # benchmark with 200 batches and take the average for i, data in enumerate(data_loader): - batch_inputs, data_samples = model.data_preprocessor(data, True) - + data = model.data_preprocessor(data, True) + inputs = data['inputs'] + data_samples = data['data_samples'] if torch.cuda.is_available(): torch.cuda.synchronize() start_time = time.perf_counter() with torch.no_grad(): - model(batch_inputs, data_samples, mode='predict') + model(inputs, data_samples, mode='predict') if torch.cuda.is_available(): torch.cuda.synchronize() @@ -112,7 +114,7 @@ def main(): f'{benchmark_dict["average_fps"]}') print(f'The variance of {repeat_times} evaluations: ' f'{benchmark_dict["fps_variance"]}') - mmcv.dump(benchmark_dict, json_file, indent=4) + dump(benchmark_dict, json_file, indent=4) if __name__ == '__main__': diff --git a/tools/analysis_tools/browse_dataset.py b/tools/analysis_tools/browse_dataset.py new file mode 100644 index 0000000000..925c14a8ab --- /dev/null +++ b/tools/analysis_tools/browse_dataset.py @@ -0,0 +1,77 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os.path as osp + +from mmengine.config import Config, DictAction +from mmengine.utils import ProgressBar + +from mmseg.registry import DATASETS, VISUALIZERS +from mmseg.utils import register_all_modules + + +def parse_args(): + parser = argparse.ArgumentParser(description='Browse a dataset') + parser.add_argument('config', help='train config file path') + parser.add_argument( + '--output-dir', + default=None, + type=str, + help='If there is no display interface, you can save it') + parser.add_argument('--not-show', default=False, action='store_true') + parser.add_argument( + '--show-interval', + type=float, + default=2, + help='the interval of show (s)') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. 
key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + # register all modules in mmdet into the registries + register_all_modules() + + dataset = DATASETS.build(cfg.train_dataloader.dataset) + visualizer = VISUALIZERS.build(cfg.visualizer) + visualizer.dataset_meta = dataset.metainfo + + progress_bar = ProgressBar(len(dataset)) + for item in dataset: + img = item['inputs'].permute(1, 2, 0).numpy() + img = img[..., [2, 1, 0]] # bgr to rgb + data_sample = item['data_samples'].numpy() + img_path = osp.basename(item['data_samples'].img_path) + + out_file = osp.join( + args.output_dir, + osp.basename(img_path)) if args.output_dir is not None else None + + visualizer.add_datasample( + name=osp.basename(img_path), + image=img, + data_sample=data_sample, + draw_gt=True, + draw_pred=False, + wait_time=args.show_interval, + out_file=out_file, + show=not args.not_show) + progress_bar.update() + + +if __name__ == '__main__': + main() diff --git a/tools/confusion_matrix.py b/tools/analysis_tools/confusion_matrix.py similarity index 75% rename from tools/confusion_matrix.py rename to tools/analysis_tools/confusion_matrix.py index 2c5b64cf4e..39756cdfdd 100644 --- a/tools/confusion_matrix.py +++ b/tools/analysis_tools/confusion_matrix.py @@ -3,12 +3,16 @@ import os import matplotlib.pyplot as plt -import mmcv import numpy as np from matplotlib.ticker import MultipleLocator -from mmcv import Config, DictAction +from mmengine.config import Config, DictAction +from mmengine.registry import init_default_scope +from mmengine.utils import mkdir_or_exist, progressbar +from PIL import Image -from mmseg.datasets import build_dataset +from mmseg.registry import DATASETS + +init_default_scope('mmseg') def parse_args(): @@ -16,7 +20,7 @@ def parse_args(): description='Generate confusion matrix from segmentation results') parser.add_argument('config', help='test config file path') parser.add_argument( - 'prediction_path', help='prediction path where test .pkl result') + 'prediction_path', help='prediction path where test folder result') parser.add_argument( 'save_dir', help='directory where confusion matrix will be saved') parser.add_argument( @@ -50,15 +54,23 @@ def calculate_confusion_matrix(dataset, results): dataset (Dataset): Test or val dataset. results (list[ndarray]): A list of segmentation results in each image. 
""" - n = len(dataset.CLASSES) + n = len(dataset.METAINFO['classes']) confusion_matrix = np.zeros(shape=[n, n]) assert len(dataset) == len(results) - prog_bar = mmcv.ProgressBar(len(results)) + ignore_index = dataset.ignore_index + reduce_zero_label = dataset.reduce_zero_label + prog_bar = progressbar.ProgressBar(len(results)) for idx, per_img_res in enumerate(results): res_segm = per_img_res - gt_segm = dataset.get_gt_seg_map_by_idx(idx) + gt_segm = dataset[idx]['data_samples'] \ + .gt_sem_seg.data.squeeze().numpy().astype(np.uint8) + gt_segm, res_segm = gt_segm.flatten(), res_segm.flatten() + if reduce_zero_label: + gt_segm = gt_segm - 1 + to_ignore = gt_segm == ignore_index + + gt_segm, res_segm = gt_segm[~to_ignore], res_segm[~to_ignore] inds = n * gt_segm + res_segm - inds = inds.flatten() mat = np.bincount(inds, minlength=n**2).reshape(n, n) confusion_matrix += mat prog_bar.update() @@ -70,7 +82,7 @@ def plot_confusion_matrix(confusion_matrix, save_dir=None, show=True, title='Normalized Confusion Matrix', - color_theme='winter'): + color_theme='OrRd'): """Draw confusion matrix with matplotlib. Args: @@ -89,14 +101,15 @@ def plot_confusion_matrix(confusion_matrix, num_classes = len(labels) fig, ax = plt.subplots( - figsize=(2 * num_classes, 2 * num_classes * 0.8), dpi=180) + figsize=(2 * num_classes, 2 * num_classes * 0.8), dpi=300) cmap = plt.get_cmap(color_theme) im = ax.imshow(confusion_matrix, cmap=cmap) - plt.colorbar(mappable=im, ax=ax) + colorbar = plt.colorbar(mappable=im, ax=ax) + colorbar.ax.tick_params(labelsize=20) # 设置 colorbar 标签的字体大小 - title_font = {'weight': 'bold', 'size': 12} + title_font = {'weight': 'bold', 'size': 20} ax.set_title(title, fontdict=title_font) - label_font = {'size': 10} + label_font = {'size': 40} plt.ylabel('Ground Truth Label', fontdict=label_font) plt.xlabel('Prediction Label', fontdict=label_font) @@ -116,8 +129,8 @@ def plot_confusion_matrix(confusion_matrix, # draw label ax.set_xticks(np.arange(num_classes)) ax.set_yticks(np.arange(num_classes)) - ax.set_xticklabels(labels) - ax.set_yticklabels(labels) + ax.set_xticklabels(labels, fontsize=20) + ax.set_yticklabels(labels, fontsize=20) ax.tick_params( axis='x', bottom=False, top=True, labelbottom=False, labeltop=True) @@ -135,13 +148,14 @@ def plot_confusion_matrix(confusion_matrix, ) if not np.isnan(confusion_matrix[i, j]) else -1), ha='center', va='center', - color='w', - size=7) + color='k', + size=20) ax.set_ylim(len(confusion_matrix) - 0.5, -0.5) # matplotlib>3.1.1 fig.tight_layout() if save_dir is not None: + mkdir_or_exist(save_dir) plt.savefig( os.path.join(save_dir, 'confusion_matrix.png'), format='png') if show: @@ -155,7 +169,12 @@ def main(): if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) - results = mmcv.load(args.prediction_path) + results = [] + for img in sorted(os.listdir(args.prediction_path)): + img = os.path.join(args.prediction_path, img) + image = Image.open(img) + image = np.copy(image) + results.append(image) assert isinstance(results, list) if isinstance(results[0], np.ndarray): @@ -163,17 +182,11 @@ def main(): else: raise TypeError('invalid type of prediction results') - if isinstance(cfg.data.test, dict): - cfg.data.test.test_mode = True - elif isinstance(cfg.data.test, list): - for ds_cfg in cfg.data.test: - ds_cfg.test_mode = True - - dataset = build_dataset(cfg.data.test) + dataset = DATASETS.build(cfg.test_dataloader.dataset) confusion_matrix = calculate_confusion_matrix(dataset, results) plot_confusion_matrix( confusion_matrix, - 
@@ -70,7 +82,7 @@ def plot_confusion_matrix(confusion_matrix, save_dir=None, show=True, title='Normalized Confusion Matrix', - color_theme='winter'): + color_theme='OrRd'): """Draw confusion matrix with matplotlib. Args: @@ -89,14 +101,15 @@ def plot_confusion_matrix(confusion_matrix, num_classes = len(labels) fig, ax = plt.subplots( - figsize=(2 * num_classes, 2 * num_classes * 0.8), dpi=180) + figsize=(2 * num_classes, 2 * num_classes * 0.8), dpi=300) cmap = plt.get_cmap(color_theme) im = ax.imshow(confusion_matrix, cmap=cmap) - plt.colorbar(mappable=im, ax=ax) + colorbar = plt.colorbar(mappable=im, ax=ax) + colorbar.ax.tick_params(labelsize=20) # set the font size of the colorbar tick labels - title_font = {'weight': 'bold', 'size': 12} + title_font = {'weight': 'bold', 'size': 20} ax.set_title(title, fontdict=title_font) - label_font = {'size': 10} + label_font = {'size': 40} plt.ylabel('Ground Truth Label', fontdict=label_font) plt.xlabel('Prediction Label', fontdict=label_font) @@ -116,8 +129,8 @@ def plot_confusion_matrix(confusion_matrix, # draw label ax.set_xticks(np.arange(num_classes)) ax.set_yticks(np.arange(num_classes)) - ax.set_xticklabels(labels) - ax.set_yticklabels(labels) + ax.set_xticklabels(labels, fontsize=20) + ax.set_yticklabels(labels, fontsize=20) ax.tick_params( axis='x', bottom=False, top=True, labelbottom=False, labeltop=True) @@ -135,13 +148,14 @@ def plot_confusion_matrix(confusion_matrix, ) if not np.isnan(confusion_matrix[i, j]) else -1), ha='center', va='center', - color='w', - size=7) + color='k', + size=20) ax.set_ylim(len(confusion_matrix) - 0.5, -0.5) # matplotlib>3.1.1 fig.tight_layout() if save_dir is not None: + mkdir_or_exist(save_dir) plt.savefig( os.path.join(save_dir, 'confusion_matrix.png'), format='png') if show: @@ -155,7 +169,12 @@ def main(): if args.cfg_options is not None: cfg.merge_from_dict(args.cfg_options) - results = mmcv.load(args.prediction_path) + results = [] + for img in sorted(os.listdir(args.prediction_path)): + img = os.path.join(args.prediction_path, img) + image = Image.open(img) + image = np.copy(image) + results.append(image) assert isinstance(results, list) if isinstance(results[0], np.ndarray): @@ -163,17 +182,11 @@ def main(): else: raise TypeError('invalid type of prediction results') - if isinstance(cfg.data.test, dict): - cfg.data.test.test_mode = True - elif isinstance(cfg.data.test, list): - for ds_cfg in cfg.data.test: - ds_cfg.test_mode = True - - dataset = build_dataset(cfg.data.test) + dataset = DATASETS.build(cfg.test_dataloader.dataset) confusion_matrix = calculate_confusion_matrix(dataset, results) plot_confusion_matrix( confusion_matrix, - dataset.CLASSES, + dataset.METAINFO['classes'], save_dir=args.save_dir, show=args.show, title=args.title, diff --git a/tools/analysis_tools/get_flops.py b/tools/analysis_tools/get_flops.py new file mode 100644 index 0000000000..66b2d52fcd --- /dev/null +++ b/tools/analysis_tools/get_flops.py @@ -0,0 +1,124 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import tempfile +from pathlib import Path + +import torch +from mmengine import Config, DictAction +from mmengine.logging import MMLogger +from mmengine.model import revert_sync_batchnorm +from mmengine.registry import init_default_scope + +from mmseg.models import BaseSegmentor +from mmseg.registry import MODELS +from mmseg.structures import SegDataSample + +try: + from mmengine.analysis import get_model_complexity_info + from mmengine.analysis.print_helper import _format_size +except ImportError: + raise ImportError('Please upgrade mmengine >= 0.6.0 to use this script.') + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Get the FLOPs of a segmentor') + parser.add_argument('config', help='train config file path') + parser.add_argument( + '--shape', + type=int, + nargs='+', + default=[2048, 1024], + help='input image size') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + args = parser.parse_args() + return args + + +def inference(args: argparse.Namespace, logger: MMLogger) -> dict: + config_name = Path(args.config) + + if not config_name.exists(): + logger.error(f'Config file {config_name} does not exist') + + cfg: Config = Config.fromfile(config_name) + cfg.work_dir = tempfile.TemporaryDirectory().name + cfg.log_level = 'WARN' + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + init_default_scope(cfg.get('scope', 'mmseg')) + + if len(args.shape) == 1: + input_shape = (3, args.shape[0], args.shape[0]) + elif len(args.shape) == 2: + input_shape = (3, ) + tuple(args.shape) + else: + raise ValueError('invalid input shape') + result = {} + + model: BaseSegmentor = MODELS.build(cfg.model) + if hasattr(model, 'auxiliary_head'): + model.auxiliary_head = None + if torch.cuda.is_available(): + model.cuda() + model = revert_sync_batchnorm(model) + result['ori_shape'] = input_shape[-2:] + result['pad_shape'] = input_shape[-2:] + data_batch = { + 'inputs': [torch.rand(input_shape)], + 'data_samples': [SegDataSample(metainfo=result)] + } + data = model.data_preprocessor(data_batch) + model.eval() + if cfg.model.decode_head.type in ['MaskFormerHead', 'Mask2FormerHead']: + # TODO: Support MaskFormer and Mask2Former + raise NotImplementedError('MaskFormer and Mask2Former are not ' + 'supported yet.') + outputs = get_model_complexity_info( + model, + input_shape, + inputs=data['inputs'], + show_table=False, + show_arch=False) + result['flops'] = _format_size(outputs['flops']) + result['params'] = _format_size(outputs['params']) + result['compute_type'] = 'direct: randomly generated input image' + return result
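For intuition, the same mmengine utility can be exercised on a toy module in isolation; a sketch assuming mmengine >= 0.6.0 (matching the import guard above) and a plain torch module instead of a full segmentor:

from torch import nn
from mmengine.analysis import get_model_complexity_info

toy = nn.Conv2d(3, 8, kernel_size=3, padding=1)  # hypothetical stand-in model
outputs = get_model_complexity_info(
    toy, input_shape=(3, 64, 64), show_table=False, show_arch=False)
# raw counts; the script above prettifies them with _format_size()
print(outputs['flops'], outputs['params'])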
+ + +def main(): + + args = parse_args() + logger = MMLogger.get_instance(name='MMLogger') + + result = inference(args, logger) + split_line = '=' * 30 + ori_shape = result['ori_shape'] + pad_shape = result['pad_shape'] + flops = result['flops'] + params = result['params'] + compute_type = result['compute_type'] + + if pad_shape != ori_shape: + print(f'{split_line}\nInput shape was padded by the size divisor ' + f'from {ori_shape} to {pad_shape}') + print(f'{split_line}\nCompute type: {compute_type}\n' + f'Input shape: {pad_shape}\nFlops: {flops}\n' + f'Params: {params}\n{split_line}') + print('!!!Please be cautious if you use the results in papers. ' + 'You may need to check if all ops are supported and verify ' + 'that the flops computation is correct.') + + +if __name__ == '__main__': + main() diff --git a/tools/analysis_tools/visualization_cam.py b/tools/analysis_tools/visualization_cam.py new file mode 100644 index 0000000000..00cdb3e04a --- /dev/null +++ b/tools/analysis_tools/visualization_cam.py @@ -0,0 +1,127 @@ +# Copyright (c) OpenMMLab. All rights reserved. +"""Use the pytorch-grad-cam tool to visualize Class Activation Maps (CAM). + +requirement: pip install grad-cam +""" + +from argparse import ArgumentParser + +import numpy as np +import torch +import torch.nn.functional as F +from mmengine import Config +from mmengine.model import revert_sync_batchnorm +from PIL import Image +from pytorch_grad_cam import GradCAM +from pytorch_grad_cam.utils.image import preprocess_image, show_cam_on_image + +from mmseg.apis import inference_model, init_model, show_result_pyplot +from mmseg.utils import register_all_modules + + +class SemanticSegmentationTarget: + """Wrap the model. + + requirement: pip install grad-cam + + Args: + category (int): Visualization class. + mask (ndarray): Mask of class. + size (tuple): Image size. + """ + + def __init__(self, category, mask, size): + self.category = category + self.mask = torch.from_numpy(mask) + self.size = size + if torch.cuda.is_available(): + self.mask = self.mask.cuda() + + def __call__(self, model_output): + model_output = torch.unsqueeze(model_output, dim=0) + model_output = F.interpolate( + model_output, size=self.size, mode='bilinear') + model_output = torch.squeeze(model_output, dim=0) + + return (model_output[self.category, :, :] * self.mask).sum() + + +def main(): + parser = ArgumentParser() + parser.add_argument('img', help='Image file') + parser.add_argument('config', help='Config file') + parser.add_argument('checkpoint', help='Checkpoint file') + parser.add_argument( + '--out-file', + default='prediction.png', + help='Path to output prediction file') + parser.add_argument( + '--cam-file', default='vis_cam.png', help='Path to output cam file') + parser.add_argument( + '--target-layers', + default='backbone.layer4[2]', + help='Target layers to visualize CAM') + parser.add_argument( + '--category-index', default='7', help='Category to visualize CAM') + parser.add_argument( + '--device', default='cuda:0', help='Device used for inference') + args = parser.parse_args() + + # build the model from a config file and a checkpoint file + register_all_modules() + model = init_model(args.config, args.checkpoint, device=args.device) + if args.device == 'cpu': + model = revert_sync_batchnorm(model) + + # test a single image + result = inference_model(model, args.img) + + # show the results + show_result_pyplot( + model, + args.img, + result, + draw_gt=False, + show=False if args.out_file is not None else True, + out_file=args.out_file) + + # result data conversion + prediction_data = result.pred_sem_seg.data + pre_np_data = prediction_data.cpu().numpy().squeeze(0) + + target_layers = args.target_layers +
target_layers = [eval(f'model.{target_layers}')] + + category = int(args.category_index) + mask_float = np.float32(pre_np_data == category) + + # data processing + image = np.array(Image.open(args.img).convert('RGB')) + height, width = image.shape[0], image.shape[1] + rgb_img = np.float32(image) / 255 + config = Config.fromfile(args.config) + image_mean = config.data_preprocessor['mean'] + image_std = config.data_preprocessor['std'] + input_tensor = preprocess_image( + rgb_img, + mean=[x / 255 for x in image_mean], + std=[x / 255 for x in image_std]) + + # Grad CAM(Class Activation Maps) + # Can also be LayerCAM, XGradCAM, GradCAMPlusPlus, EigenCAM, EigenGradCAM + targets = [ + SemanticSegmentationTarget(category, mask_float, (height, width)) + ] + with GradCAM( + model=model, + target_layers=target_layers, + use_cuda=torch.cuda.is_available()) as cam: + grayscale_cam = cam(input_tensor=input_tensor, targets=targets)[0, :] + cam_image = show_cam_on_image(rgb_img, grayscale_cam, use_rgb=True) + + # save cam file + Image.fromarray(cam_image).save(args.cam_file) + + +if __name__ == '__main__': + main() diff --git a/tools/browse_dataset.py b/tools/browse_dataset.py deleted file mode 100644 index 64fe695859..0000000000 --- a/tools/browse_dataset.py +++ /dev/null @@ -1,182 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import argparse -import os -import warnings -from pathlib import Path - -import mmcv -import numpy as np -from mmcv import Config, DictAction - -from mmseg.datasets import DATASETS - - -def parse_args(): - parser = argparse.ArgumentParser(description='Browse a dataset') - parser.add_argument('config', help='train config file path') - parser.add_argument( - '--show-origin', - default=False, - action='store_true', - help='if True, omit all augmentation in pipeline,' - ' show origin image and seg map') - parser.add_argument( - '--skip-type', - type=str, - nargs='+', - default=['DefaultFormatBundle', 'Normalize', 'Collect'], - help='skip some useless pipeline,if `show-origin` is true, ' - 'all pipeline except `Load` will be skipped') - parser.add_argument( - '--output-dir', - default='./output', - type=str, - help='If there is no display interface, you can save it') - parser.add_argument('--show', default=False, action='store_true') - parser.add_argument( - '--show-interval', - type=int, - default=999, - help='the interval of show (ms)') - parser.add_argument( - '--opacity', - type=float, - default=0.5, - help='the opacity of semantic map') - parser.add_argument( - '--cfg-options', - nargs='+', - action=DictAction, - help='override some settings in the used config, the key-value pair ' - 'in xxx=yyy format will be merged into config file. If the value to ' - 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' - 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' - 'Note that the quotation marks are necessary and that no white space ' - 'is allowed.') - args = parser.parse_args() - return args - - -def imshow_semantic(img, - seg, - class_names, - palette=None, - win_name='', - show=False, - wait_time=0, - out_file=None, - opacity=0.5): - """Draw `result` over `img`. - - Args: - img (str or Tensor): The image to be displayed. - seg (Tensor): The semantic segmentation results to draw over - `img`. - class_names (list[str]): Names of each classes. - palette (list[list[int]]] | np.ndarray | None): The palette of - segmentation map. If None is given, random palette will be - generated. Default: None - win_name (str): The window name. 
- wait_time (int): Value of waitKey param. - Default: 0. - show (bool): Whether to show the image. - Default: False. - out_file (str or None): The filename to write the image. - Default: None. - opacity(float): Opacity of painted segmentation map. - Default 0.5. - Must be in (0, 1] range. - Returns: - img (Tensor): Only if not `show` or `out_file` - """ - img = mmcv.imread(img) - img = img.copy() - if palette is None: - palette = np.random.randint(0, 255, size=(len(class_names), 3)) - palette = np.array(palette) - assert palette.shape[0] == len(class_names) - assert palette.shape[1] == 3 - assert len(palette.shape) == 2 - assert 0 < opacity <= 1.0 - color_seg = np.zeros((seg.shape[0], seg.shape[1], 3), dtype=np.uint8) - for label, color in enumerate(palette): - color_seg[seg == label, :] = color - # convert to BGR - color_seg = color_seg[..., ::-1] - - img = img * (1 - opacity) + color_seg * opacity - img = img.astype(np.uint8) - # if out_file specified, do not show image in window - if out_file is not None: - show = False - - if show: - mmcv.imshow(img, win_name, wait_time) - if out_file is not None: - mmcv.imwrite(img, out_file) - - if not (show or out_file): - warnings.warn('show==False and out_file is not specified, only ' - 'result image will be returned') - return img - - -def _retrieve_data_cfg(_data_cfg, skip_type, show_origin): - if show_origin is True: - # only keep pipeline of Loading data and ann - _data_cfg['pipeline'] = [ - x for x in _data_cfg.pipeline if 'Load' in x['type'] - ] - else: - _data_cfg['pipeline'] = [ - x for x in _data_cfg.pipeline if x['type'] not in skip_type - ] - - -def retrieve_data_cfg(config_path, skip_type, cfg_options, show_origin=False): - cfg = Config.fromfile(config_path) - if cfg_options is not None: - cfg.merge_from_dict(cfg_options) - train_data_cfg = cfg.data.train - if isinstance(train_data_cfg, list): - for _data_cfg in train_data_cfg: - while 'dataset' in _data_cfg and _data_cfg[ - 'type'] != 'MultiImageMixDataset': - _data_cfg = _data_cfg['dataset'] - if 'pipeline' in _data_cfg: - _retrieve_data_cfg(_data_cfg, skip_type, show_origin) - else: - raise ValueError - else: - while 'dataset' in train_data_cfg and train_data_cfg[ - 'type'] != 'MultiImageMixDataset': - train_data_cfg = train_data_cfg['dataset'] - _retrieve_data_cfg(train_data_cfg, skip_type, show_origin) - return cfg - - -def main(): - args = parse_args() - cfg = retrieve_data_cfg(args.config, args.skip_type, args.cfg_options, - args.show_origin) - dataset = DATASETS.build(cfg.data.train) - progress_bar = mmcv.ProgressBar(len(dataset)) - for item in dataset: - filename = os.path.join(args.output_dir, - Path(item['filename']).name - ) if args.output_dir is not None else None - imshow_semantic( - item['img'], - item['gt_semantic_seg'], - dataset.CLASSES, - dataset.PALETTE, - show=args.show, - wait_time=args.show_interval, - out_file=filename, - opacity=args.opacity, - ) - progress_bar.update() - - -if __name__ == '__main__': - main() diff --git a/tools/convert_datasets/cityscapes.py b/tools/convert_datasets/cityscapes.py deleted file mode 100644 index 17b6168478..0000000000 --- a/tools/convert_datasets/cityscapes.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import argparse -import os.path as osp - -import mmcv -from cityscapesscripts.preparation.json2labelImg import json2labelImg - - -def convert_json_to_label(json_file): - label_file = json_file.replace('_polygons.json', '_labelTrainIds.png') - json2labelImg(json_file, label_file, 'trainIds') - - -def parse_args(): - parser = argparse.ArgumentParser( - description='Convert Cityscapes annotations to TrainIds') - parser.add_argument('cityscapes_path', help='cityscapes data path') - parser.add_argument('--gt-dir', default='gtFine', type=str) - parser.add_argument('-o', '--out-dir', help='output path') - parser.add_argument( - '--nproc', default=1, type=int, help='number of process') - args = parser.parse_args() - return args - - -def main(): - args = parse_args() - cityscapes_path = args.cityscapes_path - out_dir = args.out_dir if args.out_dir else cityscapes_path - mmcv.mkdir_or_exist(out_dir) - - gt_dir = osp.join(cityscapes_path, args.gt_dir) - - poly_files = [] - for poly in mmcv.scandir(gt_dir, '_polygons.json', recursive=True): - poly_file = osp.join(gt_dir, poly) - poly_files.append(poly_file) - if args.nproc > 1: - mmcv.track_parallel_progress(convert_json_to_label, poly_files, - args.nproc) - else: - mmcv.track_progress(convert_json_to_label, poly_files) - - split_names = ['train', 'val', 'test'] - - for split in split_names: - filenames = [] - for poly in mmcv.scandir( - osp.join(gt_dir, split), '_polygons.json', recursive=True): - filenames.append(poly.replace('_gtFine_polygons.json', '')) - with open(osp.join(out_dir, f'{split}.txt'), 'w') as f: - f.writelines(f + '\n' for f in filenames) - - -if __name__ == '__main__': - main() diff --git a/tools/convert_datasets/chase_db1.py b/tools/dataset_converters/chase_db1.py similarity index 84% rename from tools/convert_datasets/chase_db1.py rename to tools/dataset_converters/chase_db1.py index 580e6e7ec5..f4fefbd774 100644 --- a/tools/convert_datasets/chase_db1.py +++ b/tools/dataset_converters/chase_db1.py @@ -6,6 +6,7 @@ import zipfile import mmcv +from mmengine.utils import mkdir_or_exist CHASE_DB1_LEN = 28 * 3 TRAINING_LEN = 60 @@ -30,13 +31,13 @@ def main(): out_dir = args.out_dir print('Making directories...') - mmcv.mkdir_or_exist(out_dir) - mmcv.mkdir_or_exist(osp.join(out_dir, 'images')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'images', 'training')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'images', 'validation')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'annotations')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'annotations', 'training')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'annotations', 'validation')) + mkdir_or_exist(out_dir) + mkdir_or_exist(osp.join(out_dir, 'images')) + mkdir_or_exist(osp.join(out_dir, 'images', 'training')) + mkdir_or_exist(osp.join(out_dir, 'images', 'validation')) + mkdir_or_exist(osp.join(out_dir, 'annotations')) + mkdir_or_exist(osp.join(out_dir, 'annotations', 'training')) + mkdir_or_exist(osp.join(out_dir, 'annotations', 'validation')) with tempfile.TemporaryDirectory(dir=args.tmp_dir) as tmp_dir: print('Extracting CHASEDB1.zip...') @@ -46,7 +47,7 @@ def main(): print('Generating training dataset...') assert len(os.listdir(tmp_dir)) == CHASE_DB1_LEN, \ - 'len(os.listdir(tmp_dir)) != {}'.format(CHASE_DB1_LEN) + f'len(os.listdir(tmp_dir)) != {CHASE_DB1_LEN}' for img_name in sorted(os.listdir(tmp_dir))[:TRAINING_LEN]: img = mmcv.imread(osp.join(tmp_dir, img_name)) diff --git a/tools/dataset_converters/cityscapes.py b/tools/dataset_converters/cityscapes.py new file mode 100644 index 
0000000000..0d6a80135d --- /dev/null +++ b/tools/dataset_converters/cityscapes.py @@ -0,0 +1,56 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os.path as osp + +from cityscapesscripts.preparation.json2labelImg import json2labelImg +from mmengine.utils import (mkdir_or_exist, scandir, track_parallel_progress, + track_progress) + + +def convert_json_to_label(json_file): + label_file = json_file.replace('_polygons.json', '_labelTrainIds.png') + json2labelImg(json_file, label_file, 'trainIds') + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Convert Cityscapes annotations to TrainIds') + parser.add_argument('cityscapes_path', help='cityscapes data path') + parser.add_argument('--gt-dir', default='gtFine', type=str) + parser.add_argument('-o', '--out-dir', help='output path') + parser.add_argument( + '--nproc', default=1, type=int, help='number of process') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + cityscapes_path = args.cityscapes_path + out_dir = args.out_dir if args.out_dir else cityscapes_path + mkdir_or_exist(out_dir) + + gt_dir = osp.join(cityscapes_path, args.gt_dir) + + poly_files = [] + for poly in scandir(gt_dir, '_polygons.json', recursive=True): + poly_file = osp.join(gt_dir, poly) + poly_files.append(poly_file) + if args.nproc > 1: + track_parallel_progress(convert_json_to_label, poly_files, args.nproc) + else: + track_progress(convert_json_to_label, poly_files) + + split_names = ['train', 'val', 'test'] + + for split in split_names: + filenames = [] + for poly in scandir( + osp.join(gt_dir, split), '_polygons.json', recursive=True): + filenames.append(poly.replace('_gtFine_polygons.json', '')) + with open(osp.join(out_dir, f'{split}.txt'), 'w') as f: + f.writelines(f + '\n' for f in filenames) + + +if __name__ == '__main__': + main() diff --git a/tools/convert_datasets/coco_stuff10k.py b/tools/dataset_converters/coco_stuff10k.py similarity index 93% rename from tools/convert_datasets/coco_stuff10k.py rename to tools/dataset_converters/coco_stuff10k.py index 374f819703..920127ee10 100644 --- a/tools/convert_datasets/coco_stuff10k.py +++ b/tools/dataset_converters/coco_stuff10k.py @@ -4,8 +4,9 @@ import shutil from functools import partial -import mmcv import numpy as np +from mmengine.utils import (mkdir_or_exist, track_parallel_progress, + track_progress) from PIL import Image from scipy.io import loadmat @@ -251,10 +252,10 @@ def main(): out_img_dir = osp.join(out_dir, 'images') out_mask_dir = osp.join(out_dir, 'annotations') - mmcv.mkdir_or_exist(osp.join(out_img_dir, 'train2014')) - mmcv.mkdir_or_exist(osp.join(out_img_dir, 'test2014')) - mmcv.mkdir_or_exist(osp.join(out_mask_dir, 'train2014')) - mmcv.mkdir_or_exist(osp.join(out_mask_dir, 'test2014')) + mkdir_or_exist(osp.join(out_img_dir, 'train2014')) + mkdir_or_exist(osp.join(out_img_dir, 'test2014')) + mkdir_or_exist(osp.join(out_mask_dir, 'train2014')) + mkdir_or_exist(osp.join(out_mask_dir, 'test2014')) train_list, test_list = generate_coco_list(coco_path) assert (len(train_list) + @@ -262,7 +263,7 @@ def main(): len(train_list), len(test_list)) if args.nproc > 1: - mmcv.track_parallel_progress( + track_parallel_progress( partial( convert_to_trainID, in_img_dir=osp.join(coco_path, 'images'), @@ -272,7 +273,7 @@ def main(): is_train=True), train_list, nproc=nproc) - mmcv.track_parallel_progress( + track_parallel_progress( partial( convert_to_trainID, in_img_dir=osp.join(coco_path, 'images'), @@ -283,7 +284,7 @@ def main(): 
test_list, nproc=nproc) else: - mmcv.track_progress( + track_progress( partial( convert_to_trainID, in_img_dir=osp.join(coco_path, 'images'), @@ -291,7 +292,7 @@ def main(): out_img_dir=out_img_dir, out_mask_dir=out_mask_dir, is_train=True), train_list) - mmcv.track_progress( + track_progress( partial( convert_to_trainID, in_img_dir=osp.join(coco_path, 'images'), diff --git a/tools/convert_datasets/coco_stuff164k.py b/tools/dataset_converters/coco_stuff164k.py similarity index 93% rename from tools/convert_datasets/coco_stuff164k.py rename to tools/dataset_converters/coco_stuff164k.py index 6d8e2f2a31..a13114ab1e 100644 --- a/tools/convert_datasets/coco_stuff164k.py +++ b/tools/dataset_converters/coco_stuff164k.py @@ -5,8 +5,9 @@ from functools import partial from glob import glob -import mmcv import numpy as np +from mmengine.utils import (mkdir_or_exist, track_parallel_progress, + track_progress) from PIL import Image COCO_LEN = 123287 @@ -222,8 +223,8 @@ def main(): out_img_dir = osp.join(out_dir, 'images') out_mask_dir = osp.join(out_dir, 'annotations') - mmcv.mkdir_or_exist(osp.join(out_mask_dir, 'train2017')) - mmcv.mkdir_or_exist(osp.join(out_mask_dir, 'val2017')) + mkdir_or_exist(osp.join(out_mask_dir, 'train2017')) + mkdir_or_exist(osp.join(out_mask_dir, 'val2017')) if out_dir != coco_path: shutil.copytree(osp.join(coco_path, 'images'), out_img_dir) @@ -237,22 +238,22 @@ def main(): len(train_list), len(test_list)) if args.nproc > 1: - mmcv.track_parallel_progress( + track_parallel_progress( partial( convert_to_trainID, out_mask_dir=out_mask_dir, is_train=True), train_list, nproc=nproc) - mmcv.track_parallel_progress( + track_parallel_progress( partial( convert_to_trainID, out_mask_dir=out_mask_dir, is_train=False), test_list, nproc=nproc) else: - mmcv.track_progress( + track_progress( partial( convert_to_trainID, out_mask_dir=out_mask_dir, is_train=True), train_list) - mmcv.track_progress( + track_progress( partial( convert_to_trainID, out_mask_dir=out_mask_dir, is_train=False), test_list) diff --git a/tools/convert_datasets/drive.py b/tools/dataset_converters/drive.py similarity index 90% rename from tools/convert_datasets/drive.py rename to tools/dataset_converters/drive.py index f547579b2d..076fd05a20 100644 --- a/tools/convert_datasets/drive.py +++ b/tools/dataset_converters/drive.py @@ -7,6 +7,7 @@ import cv2 import mmcv +from mmengine.utils import mkdir_or_exist def parse_args(): @@ -32,13 +33,13 @@ def main(): out_dir = args.out_dir print('Making directories...') - mmcv.mkdir_or_exist(out_dir) - mmcv.mkdir_or_exist(osp.join(out_dir, 'images')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'images', 'training')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'images', 'validation')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'annotations')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'annotations', 'training')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'annotations', 'validation')) + mkdir_or_exist(out_dir) + mkdir_or_exist(osp.join(out_dir, 'images')) + mkdir_or_exist(osp.join(out_dir, 'images', 'training')) + mkdir_or_exist(osp.join(out_dir, 'images', 'validation')) + mkdir_or_exist(osp.join(out_dir, 'annotations')) + mkdir_or_exist(osp.join(out_dir, 'annotations', 'training')) + mkdir_or_exist(osp.join(out_dir, 'annotations', 'validation')) with tempfile.TemporaryDirectory(dir=args.tmp_dir) as tmp_dir: print('Extracting training.zip...') diff --git a/tools/convert_datasets/hrf.py b/tools/dataset_converters/hrf.py similarity index 87% rename from tools/convert_datasets/hrf.py rename to 
tools/dataset_converters/hrf.py index 5e016e3cae..3bfd80c9ee 100644 --- a/tools/convert_datasets/hrf.py +++ b/tools/dataset_converters/hrf.py @@ -6,6 +6,7 @@ import zipfile import mmcv +from mmengine.utils import mkdir_or_exist HRF_LEN = 15 TRAINING_LEN = 5 @@ -47,13 +48,13 @@ def main(): out_dir = args.out_dir print('Making directories...') - mmcv.mkdir_or_exist(out_dir) - mmcv.mkdir_or_exist(osp.join(out_dir, 'images')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'images', 'training')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'images', 'validation')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'annotations')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'annotations', 'training')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'annotations', 'validation')) + mkdir_or_exist(out_dir) + mkdir_or_exist(osp.join(out_dir, 'images')) + mkdir_or_exist(osp.join(out_dir, 'images', 'training')) + mkdir_or_exist(osp.join(out_dir, 'images', 'validation')) + mkdir_or_exist(osp.join(out_dir, 'annotations')) + mkdir_or_exist(osp.join(out_dir, 'annotations', 'training')) + mkdir_or_exist(osp.join(out_dir, 'annotations', 'validation')) print('Generating images...') for now_path in images_path: @@ -62,7 +63,7 @@ def main(): zip_file.extractall(tmp_dir) assert len(os.listdir(tmp_dir)) == HRF_LEN, \ - 'len(os.listdir(tmp_dir)) != {}'.format(HRF_LEN) + f'len(os.listdir(tmp_dir)) != {HRF_LEN}' for filename in sorted(os.listdir(tmp_dir))[:TRAINING_LEN]: img = mmcv.imread(osp.join(tmp_dir, filename)) @@ -84,7 +85,7 @@ def main(): zip_file.extractall(tmp_dir) assert len(os.listdir(tmp_dir)) == HRF_LEN, \ - 'len(os.listdir(tmp_dir)) != {}'.format(HRF_LEN) + f'len(os.listdir(tmp_dir)) != {HRF_LEN}' for filename in sorted(os.listdir(tmp_dir))[:TRAINING_LEN]: img = mmcv.imread(osp.join(tmp_dir, filename)) diff --git a/tools/convert_datasets/isaid.py b/tools/dataset_converters/isaid.py similarity index 90% rename from tools/convert_datasets/isaid.py rename to tools/dataset_converters/isaid.py index 314fb894f8..1d5ccd9c77 100644 --- a/tools/convert_datasets/isaid.py +++ b/tools/dataset_converters/isaid.py @@ -9,6 +9,7 @@ import mmcv import numpy as np +from mmengine.utils import ProgressBar, mkdir_or_exist from PIL import Image iSAID_palette = \ @@ -90,7 +91,7 @@ def slide_crop_image(src_path, out_dir, mode, patch_H, patch_W, overlap): x_end) + '.png' # print(image) save_path_image = osp.join(out_dir, 'img_dir', mode, str(image)) - img_patch.save(save_path_image) + img_patch.save(save_path_image, format='BMP') def slide_crop_label(src_path, out_dir, mode, patch_H, patch_W, overlap): @@ -178,26 +179,26 @@ def main(): out_dir = args.out_dir print('Making directories...') - mmcv.mkdir_or_exist(osp.join(out_dir, 'img_dir', 'train')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'img_dir', 'val')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'img_dir', 'test')) + mkdir_or_exist(osp.join(out_dir, 'img_dir', 'train')) + mkdir_or_exist(osp.join(out_dir, 'img_dir', 'val')) + mkdir_or_exist(osp.join(out_dir, 'img_dir', 'test')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'ann_dir', 'train')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'ann_dir', 'val')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'ann_dir', 'test')) + mkdir_or_exist(osp.join(out_dir, 'ann_dir', 'train')) + mkdir_or_exist(osp.join(out_dir, 'ann_dir', 'val')) + mkdir_or_exist(osp.join(out_dir, 'ann_dir', 'test')) assert os.path.exists(os.path.join(dataset_path, 'train')), \ - 'train is not in {}'.format(dataset_path) + f'train is not in {dataset_path}' assert os.path.exists(os.path.join(dataset_path, 
'val')), \ - 'val is not in {}'.format(dataset_path) + f'val is not in {dataset_path}' assert os.path.exists(os.path.join(dataset_path, 'test')), \ - 'test is not in {}'.format(dataset_path) + f'test is not in {dataset_path}' with tempfile.TemporaryDirectory(dir=args.tmp_dir) as tmp_dir: for dataset_mode in ['train', 'val', 'test']: # for dataset_mode in [ 'test']: - print('Extracting {}ing.zip...'.format(dataset_mode)) + print(f'Extracting {dataset_mode}ing.zip...') img_zipp_list = glob.glob( os.path.join(dataset_path, dataset_mode, 'images', '*.zip')) print('Find the data', img_zipp_list) @@ -207,7 +208,7 @@ def main(): src_path_list = glob.glob( os.path.join(tmp_dir, dataset_mode, 'img', 'images', '*.png')) - src_prog_bar = mmcv.ProgressBar(len(src_path_list)) + src_prog_bar = ProgressBar(len(src_path_list)) for i, img_path in enumerate(src_path_list): if dataset_mode != 'test': slide_crop_image(img_path, out_dir, dataset_mode, patch_H, @@ -230,7 +231,7 @@ def main(): lab_path_list = glob.glob( os.path.join(tmp_dir, dataset_mode, 'lab', 'images', '*.png')) - lab_prog_bar = mmcv.ProgressBar(len(lab_path_list)) + lab_prog_bar = ProgressBar(len(lab_path_list)) for i, lab_path in enumerate(lab_path_list): slide_crop_label(lab_path, out_dir, dataset_mode, patch_H, patch_W, overlap) diff --git a/tools/dataset_converters/levircd.py b/tools/dataset_converters/levircd.py new file mode 100644 index 0000000000..8717f3e856 --- /dev/null +++ b/tools/dataset_converters/levircd.py @@ -0,0 +1,99 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import glob +import math +import os +import os.path as osp + +import mmcv +import numpy as np +from mmengine.utils import ProgressBar + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Convert levir-cd dataset to mmsegmentation format') + parser.add_argument('--dataset_path', help='LEVIR-CD folder path') + parser.add_argument('-o', '--out_dir', help='output path') + parser.add_argument( + '--clip_size', + type=int, + help='clipped size of image after preparation', + default=256) + parser.add_argument( + '--stride_size', + type=int, + help='stride of clipping original images', + default=256) + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + input_folder = args.dataset_path + png_files = glob.glob( + os.path.join(input_folder, '**/*.png'), recursive=True) + output_folder = args.out_dir + prog_bar = ProgressBar(len(png_files)) + for png_file in png_files: + new_path = os.path.join( + output_folder, + os.path.relpath(os.path.dirname(png_file), input_folder)) + os.makedirs(os.path.dirname(new_path), exist_ok=True) + label = False + if 'label' in png_file: + label = True + clip_big_image(png_file, new_path, args, label) + prog_bar.update() + + +def clip_big_image(image_path, clip_save_dir, args, to_label=False): + image = mmcv.imread(image_path) + + h, w, c = image.shape + clip_size = args.clip_size + stride_size = args.stride_size + + num_rows = math.ceil((h - clip_size) / stride_size) if math.ceil( + (h - clip_size) / + stride_size) * stride_size + clip_size >= h else math.ceil( + (h - clip_size) / stride_size) + 1 + num_cols = math.ceil((w - clip_size) / stride_size) if math.ceil( + (w - clip_size) / + stride_size) * stride_size + clip_size >= w else math.ceil( + (w - clip_size) / stride_size) + 1 + + x, y = np.meshgrid(np.arange(num_cols + 1), np.arange(num_rows + 1)) + xmin = x * clip_size + ymin = y * clip_size + + xmin = xmin.ravel() + ymin = ymin.ravel()
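To make the tiling arithmetic above concrete, assume the default clip_size = stride_size = 256 on a hypothetical 1024 x 1024 image (a sketch, not part of the patch):

import math
import numpy as np

h = w = 1024
clip_size = stride_size = 256
# ceil((1024 - 256) / 256) = 3, and 3 * 256 + 256 >= 1024, so 3 is kept
num_rows = num_cols = math.ceil((h - clip_size) / stride_size)
x, y = np.meshgrid(np.arange(num_cols + 1), np.arange(num_rows + 1))
print(np.unique(x * clip_size))  # [  0 256 512 768]
# a 4 x 4 tiling of the image; the offsets computed next only pull the last
# window back inside the bounds when clip_size does not divide w or h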
+ xmin_offset = np.where(xmin + clip_size > w, w - xmin - clip_size, + np.zeros_like(xmin)) + ymin_offset = np.where(ymin + clip_size > h, h - ymin - clip_size, + np.zeros_like(ymin)) + boxes = np.stack([ + xmin + xmin_offset, ymin + ymin_offset, + np.minimum(xmin + clip_size, w), + np.minimum(ymin + clip_size, h) + ], + axis=1) + + if to_label: + image[image == 255] = 1 + image = image[:, :, 0] + for box in boxes: + start_x, start_y, end_x, end_y = box + clipped_image = image[start_y:end_y, start_x:end_x] \ + if to_label else image[start_y:end_y, start_x:end_x, :] + idx = osp.basename(image_path).split('.')[0] + mmcv.imwrite( + clipped_image.astype(np.uint8), + osp.join(clip_save_dir, + f'{idx}_{start_x}_{start_y}_{end_x}_{end_y}.png')) + + +if __name__ == '__main__': + main() diff --git a/tools/convert_datasets/loveda.py b/tools/dataset_converters/loveda.py similarity index 76% rename from tools/convert_datasets/loveda.py rename to tools/dataset_converters/loveda.py index 3a0626837b..5b0ef4bb8b 100644 --- a/tools/convert_datasets/loveda.py +++ b/tools/dataset_converters/loveda.py @@ -6,7 +6,7 @@ import tempfile import zipfile -import mmcv +from mmengine.utils import mkdir_or_exist def parse_args(): @@ -28,21 +28,21 @@ def main(): out_dir = args.out_dir print('Making directories...') - mmcv.mkdir_or_exist(out_dir) - mmcv.mkdir_or_exist(osp.join(out_dir, 'img_dir')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'img_dir', 'train')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'img_dir', 'val')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'img_dir', 'test')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'ann_dir')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'ann_dir', 'train')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'ann_dir', 'val')) + mkdir_or_exist(out_dir) + mkdir_or_exist(osp.join(out_dir, 'img_dir')) + mkdir_or_exist(osp.join(out_dir, 'img_dir', 'train')) + mkdir_or_exist(osp.join(out_dir, 'img_dir', 'val')) + mkdir_or_exist(osp.join(out_dir, 'img_dir', 'test')) + mkdir_or_exist(osp.join(out_dir, 'ann_dir')) + mkdir_or_exist(osp.join(out_dir, 'ann_dir', 'train')) + mkdir_or_exist(osp.join(out_dir, 'ann_dir', 'val')) assert 'Train.zip' in os.listdir(dataset_path), \ - 'Train.zip is not in {}'.format(dataset_path) + f'Train.zip is not in {dataset_path}' assert 'Val.zip' in os.listdir(dataset_path), \ - 'Val.zip is not in {}'.format(dataset_path) + f'Val.zip is not in {dataset_path}' assert 'Test.zip' in os.listdir(dataset_path), \ - 'Test.zip is not in {}'.format(dataset_path) + f'Test.zip is not in {dataset_path}' with tempfile.TemporaryDirectory(dir=args.tmp_dir) as tmp_dir: for dataset in ['Train', 'Val', 'Test']: diff --git a/tools/dataset_converters/nyu.py b/tools/dataset_converters/nyu.py new file mode 100644 index 0000000000..49e09e7af6 --- /dev/null +++ b/tools/dataset_converters/nyu.py @@ -0,0 +1,89 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os.path as osp +import shutil +import tempfile +import zipfile + +from mmengine.utils import mkdir_or_exist + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Convert NYU Depth dataset to mmsegmentation format') + parser.add_argument('raw_data', help='the path of raw data') + parser.add_argument( + '-o', '--out_dir', help='output path', default='./data/nyu') + args = parser.parse_args() + return args + + +def reorganize(raw_data_dir: str, out_dir: str): + """Reorganize NYU Depth dataset files into the required directory + structure. + + Args: + raw_data_dir (str): Path to the raw data directory.
+ out_dir (str): Output directory for the organized dataset. + """ + + def move_data(data_list, dst_prefix, fname_func): + """Move data files from source to destination directory. + + Args: + data_list (list): List of data file paths. + dst_prefix (str): Prefix to be added to destination paths. + fname_func (callable): Function to process file names + """ + for data_item in data_list: + data_item = data_item.strip().strip('/') + new_item = fname_func(data_item) + shutil.move( + osp.join(raw_data_dir, data_item), + osp.join(out_dir, dst_prefix, new_item)) + + def process_phase(phase): + """Process a dataset phase (e.g., 'train' or 'test').""" + with open(osp.join(raw_data_dir, f'nyu_{phase}.txt')) as f: + data = filter(lambda x: len(x.strip()) > 0, f.readlines()) + data = map(lambda x: x.split()[:2], data) + images, annos = zip(*data) + + move_data(images, f'images/{phase}', + lambda x: x.replace('/rgb', '')) + move_data(annos, f'annotations/{phase}', + lambda x: x.replace('/sync_depth', '')) + + process_phase('train') + process_phase('test') + + +def main(): + args = parse_args() + + print('Making directories...') + mkdir_or_exist(args.out_dir) + for subdir in [ + 'images/train', 'images/test', 'annotations/train', + 'annotations/test' + ]: + mkdir_or_exist(osp.join(args.out_dir, subdir)) + + print('Generating images and annotations...') + + if args.raw_data.endswith('.zip'): + with tempfile.TemporaryDirectory() as tmp_dir: + zip_file = zipfile.ZipFile(args.raw_data) + zip_file.extractall(tmp_dir) + reorganize(osp.join(tmp_dir, 'nyu'), args.out_dir) + else: + assert osp.isdir( + args.raw_data + ), 'the argument --raw-data should be either a zip file or directory.' + reorganize(args.raw_data, args.out_dir) + + print('Done!') + + +if __name__ == '__main__': + main() diff --git a/tools/convert_datasets/pascal_context.py b/tools/dataset_converters/pascal_context.py similarity index 94% rename from tools/convert_datasets/pascal_context.py rename to tools/dataset_converters/pascal_context.py index 03b79d5186..a92d1dc641 100644 --- a/tools/convert_datasets/pascal_context.py +++ b/tools/dataset_converters/pascal_context.py @@ -3,9 +3,9 @@ import os.path as osp from functools import partial -import mmcv import numpy as np from detail import Detail +from mmengine.utils import mkdir_or_exist, track_progress from PIL import Image _mapping = np.sort( @@ -53,7 +53,7 @@ def main(): else: out_dir = args.out_dir json_path = args.json_path - mmcv.mkdir_or_exist(out_dir) + mkdir_or_exist(out_dir) img_dir = osp.join(devkit_path, 'VOC2010', 'JPEGImages') train_detail = Detail(json_path, img_dir, 'train') @@ -62,10 +62,10 @@ def main(): val_detail = Detail(json_path, img_dir, 'val') val_ids = val_detail.getImgs() - mmcv.mkdir_or_exist( + mkdir_or_exist( osp.join(devkit_path, 'VOC2010/ImageSets/SegmentationContext')) - train_list = mmcv.track_progress( + train_list = track_progress( partial(generate_labels, detail=train_detail, out_dir=out_dir), train_ids) with open( @@ -73,7 +73,7 @@ def main(): 'train.txt'), 'w') as f: f.writelines(line + '\n' for line in sorted(train_list)) - val_list = mmcv.track_progress( + val_list = track_progress( partial(generate_labels, detail=val_detail, out_dir=out_dir), val_ids) with open( osp.join(devkit_path, 'VOC2010/ImageSets/SegmentationContext', diff --git a/tools/convert_datasets/potsdam.py b/tools/dataset_converters/potsdam.py similarity index 94% rename from tools/convert_datasets/potsdam.py rename to tools/dataset_converters/potsdam.py index 87e67d5f17..f3c713ee2a 100644 
--- a/tools/convert_datasets/potsdam.py +++ b/tools/dataset_converters/potsdam.py @@ -9,6 +9,7 @@ import mmcv import numpy as np +from mmengine.utils import ProgressBar, mkdir_or_exist def parse_args(): @@ -118,10 +119,10 @@ def main(): out_dir = args.out_dir print('Making directories...') - mmcv.mkdir_or_exist(osp.join(out_dir, 'img_dir', 'train')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'img_dir', 'val')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'ann_dir', 'train')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'ann_dir', 'val')) + mkdir_or_exist(osp.join(out_dir, 'img_dir', 'train')) + mkdir_or_exist(osp.join(out_dir, 'img_dir', 'val')) + mkdir_or_exist(osp.join(out_dir, 'ann_dir', 'train')) + mkdir_or_exist(osp.join(out_dir, 'ann_dir', 'val')) zipp_list = glob.glob(os.path.join(dataset_path, '*.zip')) print('Find the data', zipp_list) @@ -135,7 +136,7 @@ def main(): sub_tmp_dir = os.path.join(tmp_dir, os.listdir(tmp_dir)[0]) src_path_list = glob.glob(os.path.join(sub_tmp_dir, '*.tif')) - prog_bar = mmcv.ProgressBar(len(src_path_list)) + prog_bar = ProgressBar(len(src_path_list)) for i, src_path in enumerate(src_path_list): idx_i, idx_j = osp.basename(src_path).split('_')[2:4] data_type = 'train' if f'{idx_i}_{idx_j}' in splits[ diff --git a/tools/dataset_converters/refuge.py b/tools/dataset_converters/refuge.py new file mode 100644 index 0000000000..1186866ab3 --- /dev/null +++ b/tools/dataset_converters/refuge.py @@ -0,0 +1,110 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os +import os.path as osp +import tempfile +import zipfile + +import mmcv +import numpy as np +from mmengine.utils import mkdir_or_exist + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Convert REFUGE dataset to mmsegmentation format') + parser.add_argument('--raw_data_root', help='the root path of raw data') + + parser.add_argument('--tmp_dir', help='path of the temporary directory') + parser.add_argument('-o', '--out_dir', help='output path') + args = parser.parse_args() + return args + + +def extract_img(root: str, + cur_dir: str, + out_dir: str, + mode: str = 'train', + file_type: str = 'img') -> None: + """Extract images or annotations from one zip file of the REFUGE dataset. + + Args: + root (str): root where the extracted data is saved + cur_dir (str): path of the zip file to extract + out_dir (str): root dir where the data is saved + mode (str, optional): dataset split, i.e. 'training', 'validation' + or 'test'. Defaults to 'train'. + file_type (str, optional): 'images' or 'annotations'. Defaults to 'img'.
+ """ + zip_file = zipfile.ZipFile(cur_dir) + zip_file.extractall(root) + for cur_dir, dirs, files in os.walk(root): + # filter child dirs and directories with "Illustration" and "MACOSX" + if len(dirs) == 0 and \ + cur_dir.split('\\')[-1].find('Illustration') == -1 and \ + cur_dir.find('MACOSX') == -1: + + file_names = [ + file for file in files + if file.endswith('.jpg') or file.endswith('.bmp') + ] + for filename in sorted(file_names): + img = mmcv.imread(osp.join(cur_dir, filename)) + + if file_type == 'annotations': + img = img[:, :, 0] + img[np.where(img == 0)] = 1 + img[np.where(img == 128)] = 2 + img[np.where(img == 255)] = 0 + mmcv.imwrite( + img, + osp.join(out_dir, file_type, mode, + osp.splitext(filename)[0] + '.png')) + + +def main(): + args = parse_args() + + raw_data_root = args.raw_data_root + if args.out_dir is None: + out_dir = osp.join('./data', 'REFUGE') + + else: + out_dir = args.out_dir + + print('Making directories...') + mkdir_or_exist(out_dir) + mkdir_or_exist(osp.join(out_dir, 'images')) + mkdir_or_exist(osp.join(out_dir, 'images', 'training')) + mkdir_or_exist(osp.join(out_dir, 'images', 'validation')) + mkdir_or_exist(osp.join(out_dir, 'images', 'test')) + mkdir_or_exist(osp.join(out_dir, 'annotations')) + mkdir_or_exist(osp.join(out_dir, 'annotations', 'training')) + mkdir_or_exist(osp.join(out_dir, 'annotations', 'validation')) + mkdir_or_exist(osp.join(out_dir, 'annotations', 'test')) + + print('Generating images and annotations...') + # process data from the child dir on the first rank + cur_dir, dirs, files = list(os.walk(raw_data_root))[0] + print('====================') + + files = list(filter(lambda x: x.endswith('.zip'), files)) + + with tempfile.TemporaryDirectory(dir=args.tmp_dir) as tmp_dir: + for file in files: + # search data folders for training,validation,test + mode = list( + filter(lambda x: file.lower().find(x) != -1, + ['training', 'test', 'validation']))[0] + file_root = osp.join(tmp_dir, file[:-4]) + file_type = 'images' if file.find('Anno') == -1 and file.find( + 'GT') == -1 else 'annotations' + extract_img(file_root, osp.join(cur_dir, file), out_dir, mode, + file_type) + + print('Done!') + + +if __name__ == '__main__': + main() diff --git a/tools/convert_datasets/stare.py b/tools/dataset_converters/stare.py similarity index 84% rename from tools/convert_datasets/stare.py rename to tools/dataset_converters/stare.py index 29b78c0003..4a23ba4dd8 100644 --- a/tools/convert_datasets/stare.py +++ b/tools/dataset_converters/stare.py @@ -7,6 +7,7 @@ import tempfile import mmcv +from mmengine.utils import mkdir_or_exist STARE_LEN = 20 TRAINING_LEN = 10 @@ -42,17 +43,17 @@ def main(): out_dir = args.out_dir print('Making directories...') - mmcv.mkdir_or_exist(out_dir) - mmcv.mkdir_or_exist(osp.join(out_dir, 'images')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'images', 'training')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'images', 'validation')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'annotations')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'annotations', 'training')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'annotations', 'validation')) + mkdir_or_exist(out_dir) + mkdir_or_exist(osp.join(out_dir, 'images')) + mkdir_or_exist(osp.join(out_dir, 'images', 'training')) + mkdir_or_exist(osp.join(out_dir, 'images', 'validation')) + mkdir_or_exist(osp.join(out_dir, 'annotations')) + mkdir_or_exist(osp.join(out_dir, 'annotations', 'training')) + mkdir_or_exist(osp.join(out_dir, 'annotations', 'validation')) with tempfile.TemporaryDirectory(dir=args.tmp_dir) as 
tmp_dir: - mmcv.mkdir_or_exist(osp.join(tmp_dir, 'gz')) - mmcv.mkdir_or_exist(osp.join(tmp_dir, 'files')) + mkdir_or_exist(osp.join(tmp_dir, 'gz')) + mkdir_or_exist(osp.join(tmp_dir, 'files')) print('Extracting stare-images.tar...') with tarfile.open(image_path) as f: @@ -67,7 +68,7 @@ def main(): now_dir = osp.join(tmp_dir, 'files') assert len(os.listdir(now_dir)) == STARE_LEN, \ - 'len(os.listdir(now_dir)) != {}'.format(STARE_LEN) + f'len(os.listdir(now_dir)) != {STARE_LEN}' for filename in sorted(os.listdir(now_dir))[:TRAINING_LEN]: img = mmcv.imread(osp.join(now_dir, filename)) @@ -86,8 +87,8 @@ def main(): print('Removing the temporary files...') with tempfile.TemporaryDirectory(dir=args.tmp_dir) as tmp_dir: - mmcv.mkdir_or_exist(osp.join(tmp_dir, 'gz')) - mmcv.mkdir_or_exist(osp.join(tmp_dir, 'files')) + mkdir_or_exist(osp.join(tmp_dir, 'gz')) + mkdir_or_exist(osp.join(tmp_dir, 'files')) print('Extracting labels-ah.tar...') with tarfile.open(labels_ah) as f: @@ -102,7 +103,7 @@ def main(): now_dir = osp.join(tmp_dir, 'files') assert len(os.listdir(now_dir)) == STARE_LEN, \ - 'len(os.listdir(now_dir)) != {}'.format(STARE_LEN) + f'len(os.listdir(now_dir)) != {STARE_LEN}' for filename in sorted(os.listdir(now_dir))[:TRAINING_LEN]: img = mmcv.imread(osp.join(now_dir, filename)) @@ -125,8 +126,8 @@ def main(): print('Removing the temporary files...') with tempfile.TemporaryDirectory(dir=args.tmp_dir) as tmp_dir: - mmcv.mkdir_or_exist(osp.join(tmp_dir, 'gz')) - mmcv.mkdir_or_exist(osp.join(tmp_dir, 'files')) + mkdir_or_exist(osp.join(tmp_dir, 'gz')) + mkdir_or_exist(osp.join(tmp_dir, 'files')) print('Extracting labels-vk.tar...') with tarfile.open(labels_vk) as f: @@ -141,7 +142,7 @@ def main(): now_dir = osp.join(tmp_dir, 'files') assert len(os.listdir(now_dir)) == STARE_LEN, \ - 'len(os.listdir(now_dir)) != {}'.format(STARE_LEN) + f'len(os.listdir(now_dir)) != {STARE_LEN}' for filename in sorted(os.listdir(now_dir))[:TRAINING_LEN]: img = mmcv.imread(osp.join(now_dir, filename)) diff --git a/tools/dataset_converters/synapse.py b/tools/dataset_converters/synapse.py new file mode 100644 index 0000000000..42dac6b7ef --- /dev/null +++ b/tools/dataset_converters/synapse.py @@ -0,0 +1,155 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import argparse +import os.path as osp + +import nibabel as nib +import numpy as np +from mmengine.utils import mkdir_or_exist +from PIL import Image + + +def read_files_from_txt(txt_path): + with open(txt_path) as f: + files = f.readlines() + files = [file.strip() for file in files] + return files + + +def read_nii_file(nii_path): + img = nib.load(nii_path).get_fdata() + return img + + +def split_3d_image(img): + c, _, _ = img.shape + res = [] + for i in range(c): + res.append(img[i, :, :]) + return res + + +def label_mapping(label): + """Label mapping from TransUNet paper setting. It only has 9 classes, which + are 'background', 'aorta', 'gallbladder', 'left_kidney', 'right_kidney', + 'liver', 'pancreas', 'spleen', 'stomach', respectively. Other foreground + classes in original dataset are all set to background. 
+ + More details can be found here: https://arxiv.org/abs/2102.04306 + """ + mapped_label = np.zeros_like(label) + mapped_label[label == 8] = 1 + mapped_label[label == 4] = 2 + mapped_label[label == 3] = 3 + mapped_label[label == 2] = 4 + mapped_label[label == 6] = 5 + mapped_label[label == 11] = 6 + mapped_label[label == 1] = 7 + mapped_label[label == 7] = 8 + return mapped_label + + +def parse_args(): + parser = argparse.ArgumentParser( + description='Convert synapse dataset to mmsegmentation format') + parser.add_argument( + '--dataset-path', type=str, help='synapse dataset path.') + parser.add_argument( + '--save-path', + default='data/synapse', + type=str, + help='save path of the dataset.') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + dataset_path = args.dataset_path + save_path = args.save_path + + if not osp.exists(dataset_path): + raise ValueError('The dataset path does not exist. ' + 'Please enter a correct dataset path.') + if not osp.exists(osp.join(dataset_path, 'img')) \ + or not osp.exists(osp.join(dataset_path, 'label')): + raise FileNotFoundError('The dataset structure is incorrect. ' + 'Please check your dataset.') + + train_id = read_files_from_txt(osp.join(dataset_path, 'train.txt')) + train_id = [idx[3:7] for idx in train_id] + + test_id = read_files_from_txt(osp.join(dataset_path, 'val.txt')) + test_id = [idx[3:7] for idx in test_id] + + mkdir_or_exist(osp.join(save_path, 'img_dir/train')) + mkdir_or_exist(osp.join(save_path, 'img_dir/val')) + mkdir_or_exist(osp.join(save_path, 'ann_dir/train')) + mkdir_or_exist(osp.join(save_path, 'ann_dir/val')) + + # It follows data preparation pipeline from here: + # https://github.com/Beckschen/TransUNet/tree/main/datasets + for i, idx in enumerate(train_id): + img_3d = read_nii_file( + osp.join(dataset_path, 'img', 'img' + idx + '.nii.gz')) + label_3d = read_nii_file( + osp.join(dataset_path, 'label', 'label' + idx + '.nii.gz')) + + img_3d = np.clip(img_3d, -125, 275) + img_3d = (img_3d + 125) / 400 + img_3d *= 255 + img_3d = np.transpose(img_3d, [2, 0, 1]) + img_3d = np.flip(img_3d, 2) + + label_3d = np.transpose(label_3d, [2, 0, 1]) + label_3d = np.flip(label_3d, 2) + label_3d = label_mapping(label_3d) + + for c in range(img_3d.shape[0]): + img = img_3d[c] + label = label_3d[c] + + img = Image.fromarray(img).convert('RGB') + label = Image.fromarray(label).convert('L') + img.save( + osp.join( + save_path, 'img_dir/train', 'case' + idx.zfill(4) + + '_slice' + str(c).zfill(3) + '.jpg')) + label.save( + osp.join( + save_path, 'ann_dir/train', 'case' + idx.zfill(4) + + '_slice' + str(c).zfill(3) + '.png'))
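The loop above (and the mirror-image validation loop that follows) windows raw CT intensities to [-125, 275] and maps them linearly onto [0, 255]; a quick numeric check of that mapping (made-up values, illustrative only):

import numpy as np

hu = np.array([-500.0, -125.0, 75.0, 275.0, 1000.0])  # hypothetical CT values
img = np.clip(hu, -125, 275)     # window to [-125, 275]
img = (img + 125) / 400 * 255    # affine map [-125, 275] -> [0, 255]
print(img)  # [  0.    0.  127.5 255.  255. ]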
== '__main__': + main() diff --git a/tools/convert_datasets/vaihingen.py b/tools/dataset_converters/vaihingen.py similarity index 94% rename from tools/convert_datasets/vaihingen.py rename to tools/dataset_converters/vaihingen.py index b025ae5a89..db980144eb 100644 --- a/tools/convert_datasets/vaihingen.py +++ b/tools/dataset_converters/vaihingen.py @@ -9,6 +9,7 @@ import mmcv import numpy as np +from mmengine.utils import ProgressBar, mkdir_or_exist def parse_args(): @@ -111,10 +112,10 @@ def main(): out_dir = args.out_dir print('Making directories...') - mmcv.mkdir_or_exist(osp.join(out_dir, 'img_dir', 'train')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'img_dir', 'val')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'ann_dir', 'train')) - mmcv.mkdir_or_exist(osp.join(out_dir, 'ann_dir', 'val')) + mkdir_or_exist(osp.join(out_dir, 'img_dir', 'train')) + mkdir_or_exist(osp.join(out_dir, 'img_dir', 'val')) + mkdir_or_exist(osp.join(out_dir, 'ann_dir', 'train')) + mkdir_or_exist(osp.join(out_dir, 'ann_dir', 'val')) zipp_list = glob.glob(os.path.join(dataset_path, '*.zip')) print('Find the data', zipp_list) @@ -133,7 +134,7 @@ def main(): for area_ann in src_path_list: if 'area9' in area_ann: src_path_list.remove(area_ann) - prog_bar = mmcv.ProgressBar(len(src_path_list)) + prog_bar = ProgressBar(len(src_path_list)) for i, src_path in enumerate(src_path_list): area_idx = osp.basename(src_path).split('_')[3].strip('.tif') data_type = 'train' if area_idx in splits['train'] else 'val' diff --git a/tools/convert_datasets/voc_aug.py b/tools/dataset_converters/voc_aug.py similarity index 94% rename from tools/convert_datasets/voc_aug.py rename to tools/dataset_converters/voc_aug.py index 1d42c27047..a536f4290d 100644 --- a/tools/convert_datasets/voc_aug.py +++ b/tools/dataset_converters/voc_aug.py @@ -3,8 +3,8 @@ import os.path as osp from functools import partial -import mmcv import numpy as np +from mmengine.utils import mkdir_or_exist, scandir, track_parallel_progress from PIL import Image from scipy.io import loadmat @@ -43,12 +43,12 @@ def main(): out_dir = osp.join(devkit_path, 'VOC2012', 'SegmentationClassAug') else: out_dir = args.out_dir - mmcv.mkdir_or_exist(out_dir) + mkdir_or_exist(out_dir) in_dir = osp.join(aug_path, 'dataset', 'cls') - mmcv.track_parallel_progress( + track_parallel_progress( partial(convert_mat, in_dir=in_dir, out_dir=out_dir), - list(mmcv.scandir(in_dir, suffix='.mat')), + list(scandir(in_dir, suffix='.mat')), nproc=nproc) full_aug_list = [] diff --git a/tools/pytorch2torchscript.py b/tools/deployment/pytorch2torchscript.py similarity index 97% rename from tools/pytorch2torchscript.py rename to tools/deployment/pytorch2torchscript.py index d76f5ecb95..e69e705bb1 100644 --- a/tools/pytorch2torchscript.py +++ b/tools/deployment/pytorch2torchscript.py @@ -1,12 +1,12 @@ # Copyright (c) OpenMMLab. All rights reserved. 
import argparse -import mmcv import numpy as np import torch import torch._C import torch.serialization -from mmcv.runner import load_checkpoint +from mmengine import Config +from mmengine.runner import load_checkpoint from torch import nn from mmseg.models import build_segmentor @@ -126,7 +126,7 @@ def pytorch2libtorch(model, print(traced_model.graph) traced_model.save(output_file) - print('Successfully exported TorchScript model: {}'.format(output_file)) + print(f'Successfully exported TorchScript model: {output_file}') def parse_args(): @@ -163,7 +163,7 @@ def parse_args(): else: raise ValueError('invalid input shape') - cfg = mmcv.Config.fromfile(args.config) + cfg = Config.fromfile(args.config) cfg.model.pretrained = None # build the model and load checkpoint diff --git a/tools/get_flops.py b/tools/get_flops.py deleted file mode 100644 index e30c36fdfc..0000000000 --- a/tools/get_flops.py +++ /dev/null @@ -1,60 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import argparse - -from mmcv import Config -from mmcv.cnn import get_model_complexity_info - -from mmseg.models import build_segmentor - - -def parse_args(): - parser = argparse.ArgumentParser( - description='Get the FLOPs of a segmentor') - parser.add_argument('config', help='train config file path') - parser.add_argument( - '--shape', - type=int, - nargs='+', - default=[2048, 1024], - help='input image size') - args = parser.parse_args() - return args - - -def main(): - - args = parse_args() - - if len(args.shape) == 1: - input_shape = (3, args.shape[0], args.shape[0]) - elif len(args.shape) == 2: - input_shape = (3, ) + tuple(args.shape) - else: - raise ValueError('invalid input shape') - - cfg = Config.fromfile(args.config) - cfg.model.pretrained = None - model = build_segmentor( - cfg.model, - train_cfg=cfg.get('train_cfg'), - test_cfg=cfg.get('test_cfg')).cuda() - model.eval() - - if hasattr(model, 'forward_dummy'): - model.forward = model.forward_dummy - else: - raise NotImplementedError( - 'FLOPs counter is currently not currently supported with {}'. - format(model.__class__.__name__)) - - flops, params = get_model_complexity_info(model, input_shape) - split_line = '=' * 30 - print('{0}\nInput shape: {1}\nFlops: {2}\nParams: {3}\n{0}'.format( - split_line, input_shape, flops, params)) - print('!!!Please be cautious if you use the results in papers. ' - 'You may need to check if all ops are supported and verify that the ' - 'flops computation is correct.') - - -if __name__ == '__main__': - main() diff --git a/tools/misc/browse_dataset.py b/tools/misc/browse_dataset.py new file mode 100644 index 0000000000..7863eb74f2 --- /dev/null +++ b/tools/misc/browse_dataset.py @@ -0,0 +1,73 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
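+# Visualize the samples produced by the training pipeline of a config,
+# which is useful for checking augmentations and annotations before
+# training. A typical invocation (paths are illustrative):
+#   python tools/misc/browse_dataset.py configs/xxx/yyy.py \
+#       --output-dir work_dirs/browse --not-show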
+import argparse +import os.path as osp + +from mmengine import Config, DictAction +from mmengine.registry import init_default_scope +from mmengine.utils import ProgressBar + +from mmseg.registry import DATASETS, VISUALIZERS + + +def parse_args(): + parser = argparse.ArgumentParser(description='Browse a dataset') + parser.add_argument('config', help='train config file path') + parser.add_argument( + '--output-dir', + default=None, + type=str, + help='If there is no display interface, you can save it') + parser.add_argument('--not-show', default=False, action='store_true') + parser.add_argument( + '--show-interval', + type=float, + default=2, + help='the interval of show (s)') + parser.add_argument( + '--cfg-options', + nargs='+', + action=DictAction, + help='override some settings in the used config, the key-value pair ' + 'in xxx=yyy format will be merged into config file. If the value to ' + 'be overwritten is a list, it should be like key="[a,b]" or key=a,b ' + 'It also allows nested list/tuple values, e.g. key="[(a,b),(c,d)]" ' + 'Note that the quotation marks are necessary and that no white space ' + 'is allowed.') + args = parser.parse_args() + return args + + +def main(): + args = parse_args() + cfg = Config.fromfile(args.config) + if args.cfg_options is not None: + cfg.merge_from_dict(args.cfg_options) + + # register all modules in mmseg into the registries + init_default_scope('mmseg') + + dataset = DATASETS.build(cfg.train_dataloader.dataset) + cfg.visualizer['save_dir'] = args.output_dir + visualizer = VISUALIZERS.build(cfg.visualizer) + visualizer.dataset_meta = dataset.METAINFO + + progress_bar = ProgressBar(len(dataset)) + for item in dataset: + img = item['inputs'].permute(1, 2, 0).numpy() + data_sample = item['data_samples'].numpy() + img_path = osp.basename(item['data_samples'].img_path) + + img = img[..., [2, 1, 0]] # bgr to rgb + + visualizer.add_datasample( + osp.basename(img_path), + img, + data_sample, + show=not args.not_show, + wait_time=args.show_interval) + + progress_bar.update() + + +if __name__ == '__main__': + main() diff --git a/tools/print_config.py b/tools/misc/print_config.py similarity index 98% rename from tools/print_config.py rename to tools/misc/print_config.py index 3dbbff5ea7..2a1c024a6a 100644 --- a/tools/print_config.py +++ b/tools/misc/print_config.py @@ -2,7 +2,7 @@ import argparse import warnings -from mmcv import Config, DictAction +from mmengine import Config, DictAction from mmseg.apis import init_model diff --git a/tools/misc/publish_model.py b/tools/misc/publish_model.py new file mode 100644 index 0000000000..e035ad90e8 --- /dev/null +++ b/tools/misc/publish_model.py @@ -0,0 +1,50 @@ +# Copyright (c) OpenMMLab. All rights reserved. 
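+# Prepare a checkpoint for release: drop the optimizer state and rename
+# the file with a short hash of its final contents, e.g. (paths are
+# illustrative):
+#   python tools/misc/publish_model.py work_dirs/iter_40000.pth psp.pth
+# which produces something like psp-0a1b2c3d.pth.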
+import argparse
+import subprocess
+from hashlib import sha256
+
+import torch
+
+BLOCK_SIZE = 128 * 1024
+
+
+def parse_args():
+    parser = argparse.ArgumentParser(
+        description='Process a checkpoint to be published')
+    parser.add_argument('in_file', help='input checkpoint filename')
+    parser.add_argument('out_file', help='output checkpoint filename')
+    args = parser.parse_args()
+    return args
+
+
+def sha256sum(filename: str) -> str:
+    """Compute SHA256 message digest from a file."""
+    hash_func = sha256()
+    byte_array = bytearray(BLOCK_SIZE)
+    memory_view = memoryview(byte_array)
+    with open(filename, 'rb', buffering=0) as file:
+        for block in iter(lambda: file.readinto(memory_view), 0):
+            hash_func.update(memory_view[:block])
+    return hash_func.hexdigest()
+
+
+def process_checkpoint(in_file, out_file):
+    checkpoint = torch.load(in_file, map_location='cpu')
+    # remove optimizer for smaller file size
+    if 'optimizer' in checkpoint:
+        del checkpoint['optimizer']
+    # if it is necessary to remove some sensitive data in checkpoint['meta'],
+    # add the code here.
+    torch.save(checkpoint, out_file)
+    # hash the published file, not the input, so the name suffix matches
+    # the file users actually download
+    sha = sha256sum(out_file)
+    # str.rstrip('.pth') would strip any trailing '.', 'p', 't' or 'h'
+    # characters, so cut the extension off explicitly instead
+    if out_file.endswith('.pth'):
+        out_file_name = out_file[:-4]
+    else:
+        out_file_name = out_file
+    final_file = out_file_name + f'-{sha[:8]}.pth'
+    subprocess.run(['mv', out_file, final_file], check=True)
+
+
+def main():
+    args = parse_args()
+    process_checkpoint(args.in_file, args.out_file)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/model_converters/beit2mmseg.py b/tools/model_converters/beit2mmseg.py
index d23cfdb0b3..20f8f0f450 100644
--- a/tools/model_converters/beit2mmseg.py
+++ b/tools/model_converters/beit2mmseg.py
@@ -3,9 +3,9 @@
 import os.path as osp
 from collections import OrderedDict
 
-import mmcv
+import mmengine
 import torch
-from mmcv.runner import CheckpointLoader
+from mmengine.runner import CheckpointLoader
 
 
 def convert_beit(ckpt):
@@ -48,7 +48,7 @@ def main():
     else:
         state_dict = checkpoint
     weight = convert_beit(state_dict)
-    mmcv.mkdir_or_exist(osp.dirname(args.dst))
+    mmengine.mkdir_or_exist(osp.dirname(args.dst))
     torch.save(weight, args.dst)
 
 
diff --git a/tools/model_converters/clip2mmseg.py b/tools/model_converters/clip2mmseg.py
new file mode 100644
index 0000000000..9a97e4b04a
--- /dev/null
+++ b/tools/model_converters/clip2mmseg.py
@@ -0,0 +1,163 @@
+# Copyright (c) OpenMMLab. All rights reserved.
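+# Convert an official CLIP ViT-B/16 or ViT-L/14 checkpoint into the key
+# layout used by MMSegmentation's SAN implementation: the first
+# `visual_split` visual transformer layers map to the image encoder, and
+# the remaining layers map to the mask-recovery head
+# (`decode_head.rec_with_attnbias`).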
+import argparse +import os.path as osp +from collections import OrderedDict + +import mmengine +import torch +from mmengine.runner import CheckpointLoader + + +def convert_vitlayer(paras): + new_para_name = '' + if paras[0] == 'ln_1': + new_para_name = '.'.join(['ln1'] + paras[1:]) + elif paras[0] == 'attn': + new_para_name = '.'.join(['attn.attn'] + paras[1:]) + elif paras[0] == 'ln_2': + new_para_name = '.'.join(['ln2'] + paras[1:]) + elif paras[0] == 'mlp': + if paras[1] == 'c_fc': + new_para_name = '.'.join(['ffn.layers.0.0'] + paras[-1:]) + else: + new_para_name = '.'.join(['ffn.layers.1'] + paras[-1:]) + else: + print(f'Wrong for {paras}') + return new_para_name + + +def convert_translayer(paras): + new_para_name = '' + if paras[0] == 'attn': + new_para_name = '.'.join(['attentions.0.attn'] + paras[1:]) + elif paras[0] == 'ln_1': + new_para_name = '.'.join(['norms.0'] + paras[1:]) + elif paras[0] == 'ln_2': + new_para_name = '.'.join(['norms.1'] + paras[1:]) + elif paras[0] == 'mlp': + if paras[1] == 'c_fc': + new_para_name = '.'.join(['ffns.0.layers.0.0'] + paras[2:]) + elif paras[1] == 'c_proj': + new_para_name = '.'.join(['ffns.0.layers.1'] + paras[2:]) + else: + print(f'Wrong for {paras}') + else: + print(f'Wrong for {paras}') + return new_para_name + + +def convert_key_name(ckpt, visual_split): + new_ckpt = OrderedDict() + for k, v in ckpt.items(): + key_list = k.split('.') + if key_list[0] == 'visual': + new_transform_name = 'image_encoder' + if key_list[1] == 'class_embedding': + new_name = '.'.join([new_transform_name, 'cls_token']) + elif key_list[1] == 'positional_embedding': + new_name = '.'.join([new_transform_name, 'pos_embed']) + elif key_list[1] == 'conv1': + new_name = '.'.join([ + new_transform_name, 'patch_embed.projection', key_list[2] + ]) + elif key_list[1] == 'ln_pre': + new_name = '.'.join( + [new_transform_name, key_list[1], key_list[2]]) + elif key_list[1] == 'transformer': + new_layer_name = 'layers' + layer_index = key_list[3] + paras = key_list[4:] + if int(layer_index) < visual_split: + new_para_name = convert_vitlayer(paras) + new_name = '.'.join([ + new_transform_name, new_layer_name, layer_index, + new_para_name + ]) + else: + new_para_name = convert_translayer(paras) + new_transform_name = 'decode_head.rec_with_attnbias' + new_layer_name = 'layers' + layer_index = str(int(layer_index) - visual_split) + new_name = '.'.join([ + new_transform_name, new_layer_name, layer_index, + new_para_name + ]) + elif key_list[1] == 'proj': + new_name = 'decode_head.rec_with_attnbias.proj.weight' + elif key_list[1] == 'ln_post': + new_name = k.replace('visual', 'decode_head.rec_with_attnbias') + else: + print(f'pop parameter: {k}') + continue + else: + text_encoder_name = 'text_encoder' + if key_list[0] == 'transformer': + layer_name = 'transformer' + layer_index = key_list[2] + paras = key_list[3:] + new_para_name = convert_translayer(paras) + new_name = '.'.join([ + text_encoder_name, layer_name, layer_index, new_para_name + ]) + elif key_list[0] in [ + 'positional_embedding', 'text_projection', 'bg_embed', + 'attn_mask', 'logit_scale', 'token_embedding', 'ln_final' + ]: + new_name = 'text_encoder.' 
+ k
+            else:
+                print(f'pop parameter: {k}')
+                continue
+        new_ckpt[new_name] = v
+
+    return new_ckpt
+
+
+def convert_tensor(ckpt):
+    cls_token = ckpt['image_encoder.cls_token']
+    new_cls_token = cls_token.unsqueeze(0).unsqueeze(0)
+    ckpt['image_encoder.cls_token'] = new_cls_token
+    pos_embed = ckpt['image_encoder.pos_embed']
+    new_pos_embed = pos_embed.unsqueeze(0)
+    ckpt['image_encoder.pos_embed'] = new_pos_embed
+    proj_weight = ckpt['decode_head.rec_with_attnbias.proj.weight']
+    new_proj_weight = proj_weight.transpose(1, 0)
+    ckpt['decode_head.rec_with_attnbias.proj.weight'] = new_proj_weight
+    return ckpt
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Convert keys in official CLIP pretrained models to '
+        'MMSegmentation style.')
+    parser.add_argument('src', help='src model path or url')
+    # The dst path must be a full path of the new checkpoint.
+    parser.add_argument('dst', help='save path')
+    args = parser.parse_args()
+
+    if any([s in args.src for s in ['B-16', 'b16', 'base_patch16']]):
+        visual_split = 9
+    elif any([s in args.src for s in ['L-14', 'l14', 'large_patch14']]):
+        visual_split = 18
+    else:
+        # without a known split point the key conversion below would
+        # silently produce a broken checkpoint, so fail early
+        raise ValueError('Make sure the CLIP model is ViT-B/16 or ViT-L/14!')
+    checkpoint = CheckpointLoader.load_checkpoint(args.src, map_location='cpu')
+    if isinstance(checkpoint, torch.jit.RecursiveScriptModule):
+        state_dict = checkpoint.state_dict()
+    else:
+        if 'state_dict' in checkpoint:
+            # timm checkpoint
+            state_dict = checkpoint['state_dict']
+        elif 'model' in checkpoint:
+            # deit checkpoint
+            state_dict = checkpoint['model']
+        else:
+            state_dict = checkpoint
+    weight = convert_key_name(state_dict, visual_split)
+    weight = convert_tensor(weight)
+    mmengine.mkdir_or_exist(osp.dirname(args.dst))
+    torch.save(weight, args.dst)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tools/model_converters/mit2mmseg.py b/tools/model_converters/mit2mmseg.py
index 2eff1f7b7a..f10cbbf9d4 100644
--- a/tools/model_converters/mit2mmseg.py
+++ b/tools/model_converters/mit2mmseg.py
@@ -3,9 +3,9 @@
 import os.path as osp
 from collections import OrderedDict
 
-import mmcv
+import mmengine
 import torch
-from mmcv.runner import CheckpointLoader
+from mmengine.runner import CheckpointLoader
 
 
 def convert_mit(ckpt):
@@ -74,7 +74,7 @@ def main():
     else:
         state_dict = checkpoint
     weight = convert_mit(state_dict)
-    mmcv.mkdir_or_exist(osp.dirname(args.dst))
+    mmengine.mkdir_or_exist(osp.dirname(args.dst))
     torch.save(weight, args.dst)
 
 
diff --git a/tools/model_converters/san2mmseg.py b/tools/model_converters/san2mmseg.py
new file mode 100644
index 0000000000..301a46608e
--- /dev/null
+++ b/tools/model_converters/san2mmseg.py
@@ -0,0 +1,220 @@
+# Copyright (c) OpenMMLab. All rights reserved.
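+# Convert an official SAN (Side Adapter Network) checkpoint into
+# MMSegmentation's key layout: `clip_visual_extractor` maps to the image
+# encoder, `side_adapter_network` and `clip_rec_head` map to the decode
+# head, and `ov_classifier` maps to the text encoder.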
+import argparse +import os.path as osp +from collections import OrderedDict + +import mmengine +import torch +from mmengine.runner import CheckpointLoader + + +def convert_key_name(ckpt): + new_ckpt = OrderedDict() + + for k, v in ckpt.items(): + key_list = k.split('.') + if key_list[0] == 'clip_visual_extractor': + new_transform_name = 'image_encoder' + if key_list[1] == 'class_embedding': + new_name = '.'.join([new_transform_name, 'cls_token']) + elif key_list[1] == 'positional_embedding': + new_name = '.'.join([new_transform_name, 'pos_embed']) + elif key_list[1] == 'conv1': + new_name = '.'.join([ + new_transform_name, 'patch_embed.projection', key_list[2] + ]) + elif key_list[1] == 'ln_pre': + new_name = '.'.join( + [new_transform_name, key_list[1], key_list[2]]) + elif key_list[1] == 'resblocks': + new_layer_name = 'layers' + layer_index = key_list[2] + paras = key_list[3:] + if paras[0] == 'ln_1': + new_para_name = '.'.join(['ln1'] + key_list[4:]) + elif paras[0] == 'attn': + new_para_name = '.'.join(['attn.attn'] + key_list[4:]) + elif paras[0] == 'ln_2': + new_para_name = '.'.join(['ln2'] + key_list[4:]) + elif paras[0] == 'mlp': + if paras[1] == 'c_fc': + new_para_name = '.'.join(['ffn.layers.0.0'] + + key_list[-1:]) + else: + new_para_name = '.'.join(['ffn.layers.1'] + + key_list[-1:]) + new_name = '.'.join([ + new_transform_name, new_layer_name, layer_index, + new_para_name + ]) + elif key_list[0] == 'side_adapter_network': + decode_head_name = 'decode_head' + module_name = 'side_adapter_network' + if key_list[1] == 'vit_model': + if key_list[2] == 'blocks': + layer_name = 'encode_layers' + layer_index = key_list[3] + paras = key_list[4:] + if paras[0] == 'norm1': + new_para_name = '.'.join(['ln1'] + key_list[5:]) + elif paras[0] == 'attn': + new_para_name = '.'.join(key_list[4:]) + new_para_name = new_para_name.replace( + 'attn.qkv.', 'attn.attn.in_proj_') + new_para_name = new_para_name.replace( + 'attn.proj', 'attn.attn.out_proj') + elif paras[0] == 'norm2': + new_para_name = '.'.join(['ln2'] + key_list[5:]) + elif paras[0] == 'mlp': + new_para_name = '.'.join(['ffn'] + key_list[5:]) + new_para_name = new_para_name.replace( + 'fc1', 'layers.0.0') + new_para_name = new_para_name.replace( + 'fc2', 'layers.1') + else: + print(f'Wrong for {k}') + new_name = '.'.join([ + decode_head_name, module_name, layer_name, layer_index, + new_para_name + ]) + elif key_list[2] == 'pos_embed': + new_name = '.'.join( + [decode_head_name, module_name, 'pos_embed']) + elif key_list[2] == 'patch_embed': + new_name = '.'.join([ + decode_head_name, module_name, 'patch_embed', + 'projection', key_list[4] + ]) + else: + print(f'Wrong for {k}') + elif key_list[1] == 'query_embed' or key_list[ + 1] == 'query_pos_embed': + new_name = '.'.join( + [decode_head_name, module_name, key_list[1]]) + elif key_list[1] == 'fusion_layers': + layer_name = 'conv_clips' + layer_index = key_list[2][-1] + paras = '.'.join(key_list[3:]) + new_para_name = paras.replace('input_proj.0', '0') + new_para_name = new_para_name.replace('input_proj.1', '1.conv') + new_name = '.'.join([ + decode_head_name, module_name, layer_name, layer_index, + new_para_name + ]) + elif key_list[1] == 'mask_decoder': + new_name = 'decode_head.' 
+ k
+            else:
+                print(f'Wrong for {k}')
+        elif key_list[0] == 'clip_rec_head':
+            # define the prefix locally so this branch does not rely on a
+            # 'side_adapter_network' key having been seen first
+            decode_head_name = 'decode_head'
+            module_name = 'rec_with_attnbias'
+            if key_list[1] == 'proj':
+                new_name = '.'.join(
+                    [decode_head_name, module_name, 'proj.weight'])
+            elif key_list[1] == 'ln_post':
+                new_name = '.'.join(
+                    [decode_head_name, module_name, 'ln_post', key_list[2]])
+            elif key_list[1] == 'resblocks':
+                new_layer_name = 'layers'
+                layer_index = key_list[2]
+                paras = key_list[3:]
+                if paras[0] == 'ln_1':
+                    new_para_name = '.'.join(['norms.0'] + paras[1:])
+                elif paras[0] == 'attn':
+                    new_para_name = '.'.join(['attentions.0.attn'] + paras[1:])
+                elif paras[0] == 'ln_2':
+                    new_para_name = '.'.join(['norms.1'] + paras[1:])
+                elif paras[0] == 'mlp':
+                    if paras[1] == 'c_fc':
+                        new_para_name = '.'.join(['ffns.0.layers.0.0'] +
+                                                 paras[2:])
+                    elif paras[1] == 'c_proj':
+                        new_para_name = '.'.join(['ffns.0.layers.1'] +
+                                                 paras[2:])
+                    else:
+                        print(f'Wrong for {k}')
+                new_name = '.'.join([
+                    decode_head_name, module_name, new_layer_name, layer_index,
+                    new_para_name
+                ])
+            else:
+                print(f'Wrong for {k}')
+        elif key_list[0] == 'ov_classifier':
+            text_encoder_name = 'text_encoder'
+            if key_list[1] == 'transformer':
+                layer_name = 'transformer'
+                layer_index = key_list[3]
+                paras = key_list[4:]
+                if paras[0] == 'attn':
+                    new_para_name = '.'.join(['attentions.0.attn'] + paras[1:])
+                elif paras[0] == 'ln_1':
+                    new_para_name = '.'.join(['norms.0'] + paras[1:])
+                elif paras[0] == 'ln_2':
+                    new_para_name = '.'.join(['norms.1'] + paras[1:])
+                elif paras[0] == 'mlp':
+                    if paras[1] == 'c_fc':
+                        new_para_name = '.'.join(['ffns.0.layers.0.0'] +
+                                                 paras[2:])
+                    elif paras[1] == 'c_proj':
+                        new_para_name = '.'.join(['ffns.0.layers.1'] +
+                                                 paras[2:])
+                    else:
+                        print(f'Wrong for {k}')
+                else:
+                    print(f'Wrong for {k}')
+                new_name = '.'.join([
+                    text_encoder_name, layer_name, layer_index, new_para_name
+                ])
+            elif key_list[1] in [
+                    'positional_embedding', 'text_projection', 'bg_embed',
+                    'attn_mask', 'logit_scale', 'token_embedding', 'ln_final'
+            ]:
+                new_name = k.replace('ov_classifier', 'text_encoder')
+            else:
+                print(f'Wrong for {k}')
+        elif key_list[0] == 'criterion':
+            new_name = k
+        else:
+            print(f'Wrong for {k}')
+        new_ckpt[new_name] = v
+    return new_ckpt
+
+
+def convert_tensor(ckpt):
+    cls_token = ckpt['image_encoder.cls_token']
+    new_cls_token = cls_token.unsqueeze(0).unsqueeze(0)
+    ckpt['image_encoder.cls_token'] = new_cls_token
+    pos_embed = ckpt['image_encoder.pos_embed']
+    new_pos_embed = pos_embed.unsqueeze(0)
+    ckpt['image_encoder.pos_embed'] = new_pos_embed
+    proj_weight = ckpt['decode_head.rec_with_attnbias.proj.weight']
+    new_proj_weight = proj_weight.transpose(1, 0)
+    ckpt['decode_head.rec_with_attnbias.proj.weight'] = new_proj_weight
+    return ckpt
+
+
+def main():
+    parser = argparse.ArgumentParser(
+        description='Convert keys in official SAN pretrained models to '
+        'MMSegmentation style.')
+    parser.add_argument('src', help='src model path or url')
+    # The dst path must be a full path of the new checkpoint.
+ parser.add_argument('dst', help='save path') + args = parser.parse_args() + + checkpoint = CheckpointLoader.load_checkpoint(args.src, map_location='cpu') + if 'state_dict' in checkpoint: + # timm checkpoint + state_dict = checkpoint['state_dict'] + elif 'model' in checkpoint: + # deit checkpoint + state_dict = checkpoint['model'] + else: + state_dict = checkpoint + weight = convert_key_name(state_dict) + weight = convert_tensor(weight) + mmengine.mkdir_or_exist(osp.dirname(args.dst)) + torch.save(weight, args.dst) + + +if __name__ == '__main__': + main() diff --git a/tools/model_converters/stdc2mmseg.py b/tools/model_converters/stdc2mmseg.py index 9241f86a15..6ea3b8342f 100644 --- a/tools/model_converters/stdc2mmseg.py +++ b/tools/model_converters/stdc2mmseg.py @@ -2,9 +2,9 @@ import argparse import os.path as osp -import mmcv +import mmengine import torch -from mmcv.runner import CheckpointLoader +from mmengine.runner import CheckpointLoader def convert_stdc(ckpt, stdc_type): @@ -63,7 +63,7 @@ def main(): assert args.type in ['STDC1', 'STDC2'], 'STD type should be STDC1 or STDC2!' weight = convert_stdc(state_dict, args.type) - mmcv.mkdir_or_exist(osp.dirname(args.dst)) + mmengine.mkdir_or_exist(osp.dirname(args.dst)) torch.save(weight, args.dst) diff --git a/tools/model_converters/swin2mmseg.py b/tools/model_converters/swin2mmseg.py index 03b24ceaa4..d434f9465b 100644 --- a/tools/model_converters/swin2mmseg.py +++ b/tools/model_converters/swin2mmseg.py @@ -3,9 +3,9 @@ import os.path as osp from collections import OrderedDict -import mmcv +import mmengine import torch -from mmcv.runner import CheckpointLoader +from mmengine.runner import CheckpointLoader def convert_swin(ckpt): @@ -79,7 +79,7 @@ def main(): else: state_dict = checkpoint weight = convert_swin(state_dict) - mmcv.mkdir_or_exist(osp.dirname(args.dst)) + mmengine.mkdir_or_exist(osp.dirname(args.dst)) torch.save(weight, args.dst) diff --git a/tools/model_converters/twins2mmseg.py b/tools/model_converters/twins2mmseg.py index ab64aa526e..647d41784a 100644 --- a/tools/model_converters/twins2mmseg.py +++ b/tools/model_converters/twins2mmseg.py @@ -3,9 +3,9 @@ import os.path as osp from collections import OrderedDict -import mmcv +import mmengine import torch -from mmcv.runner import CheckpointLoader +from mmengine.runner import CheckpointLoader def convert_twins(args, ckpt): @@ -79,7 +79,7 @@ def main(): state_dict = checkpoint weight = convert_twins(args, state_dict) - mmcv.mkdir_or_exist(osp.dirname(args.dst)) + mmengine.mkdir_or_exist(osp.dirname(args.dst)) torch.save(weight, args.dst) diff --git a/tools/model_converters/vit2mmseg.py b/tools/model_converters/vit2mmseg.py index bc18ebed88..1d1f8a427e 100644 --- a/tools/model_converters/vit2mmseg.py +++ b/tools/model_converters/vit2mmseg.py @@ -3,9 +3,9 @@ import os.path as osp from collections import OrderedDict -import mmcv +import mmengine import torch -from mmcv.runner import CheckpointLoader +from mmengine.runner import CheckpointLoader def convert_vit(ckpt): @@ -62,7 +62,7 @@ def main(): else: state_dict = checkpoint weight = convert_vit(state_dict) - mmcv.mkdir_or_exist(osp.dirname(args.dst)) + mmengine.mkdir_or_exist(osp.dirname(args.dst)) torch.save(weight, args.dst) diff --git a/tools/model_converters/vitjax2mmseg.py b/tools/model_converters/vitjax2mmseg.py index 585f408368..81bc2ea020 100644 --- a/tools/model_converters/vitjax2mmseg.py +++ b/tools/model_converters/vitjax2mmseg.py @@ -2,7 +2,7 @@ import argparse import os.path as osp -import mmcv +import mmengine 
import numpy as np import torch @@ -115,7 +115,7 @@ def main(): else: num_layer = 12 torch_weights = vit_jax_to_torch(jax_weights_tensor, num_layer) - mmcv.mkdir_or_exist(osp.dirname(args.dst)) + mmengine.mkdir_or_exist(osp.dirname(args.dst)) torch.save(torch_weights, args.dst) diff --git a/tools/publish_model.py b/tools/publish_model.py deleted file mode 100644 index e2660578af..0000000000 --- a/tools/publish_model.py +++ /dev/null @@ -1,36 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import argparse -import subprocess - -import torch - - -def parse_args(): - parser = argparse.ArgumentParser( - description='Process a checkpoint to be published') - parser.add_argument('in_file', help='input checkpoint filename') - parser.add_argument('out_file', help='output checkpoint filename') - args = parser.parse_args() - return args - - -def process_checkpoint(in_file, out_file): - checkpoint = torch.load(in_file, map_location='cpu') - # remove optimizer for smaller file size - if 'optimizer' in checkpoint: - del checkpoint['optimizer'] - # if it is necessary to remove some sensitive data in checkpoint['meta'], - # add the code here. - torch.save(checkpoint, out_file) - sha = subprocess.check_output(['sha256sum', out_file]).decode() - final_file = out_file.rstrip('.pth') + '-{}.pth'.format(sha[:8]) - subprocess.Popen(['mv', out_file, final_file]) - - -def main(): - args = parse_args() - process_checkpoint(args.in_file, args.out_file) - - -if __name__ == '__main__': - main() diff --git a/tools/test.py b/tools/test.py index e4e1b5d4d7..0d7f39b3a8 100644 --- a/tools/test.py +++ b/tools/test.py @@ -6,8 +6,6 @@ from mmengine.config import Config, DictAction from mmengine.runner import Runner -from mmseg.utils import register_all_modules - # TODO: support fuse_conv_bn, visualization, and format_only def parse_args(): @@ -19,6 +17,19 @@ def parse_args(): '--work-dir', help=('if specified, the evaluation metric results will be dumped' 'into the directory as json')) + parser.add_argument( + '--out', + type=str, + help='The directory to save output prediction for offline evaluation') + parser.add_argument( + '--show', action='store_true', help='show prediction results') + parser.add_argument( + '--show-dir', + help='directory where painted images will be saved. ' + 'If specified, it will be automatically saved ' + 'to the work_dir/timestamp/show_dir') + parser.add_argument( + '--wait-time', type=float, default=2, help='the interval of show (s)') parser.add_argument( '--cfg-options', nargs='+', @@ -34,7 +45,12 @@ def parse_args(): choices=['none', 'pytorch', 'slurm', 'mpi'], default='none', help='job launcher') - parser.add_argument('--local_rank', type=int, default=0) + parser.add_argument( + '--tta', action='store_true', help='Test time augmentation') + # When using PyTorch version >= 2.0.0, the `torch.distributed.launch` + # will pass the `--local-rank` parameter to `tools/train.py` instead + # of `--local_rank`. 
+    parser.add_argument('--local_rank', '--local-rank', type=int, default=0)
     args = parser.parse_args()
     if 'LOCAL_RANK' not in os.environ:
         os.environ['LOCAL_RANK'] = str(args.local_rank)
@@ -42,13 +58,30 @@ def parse_args():
     return args
 
 
+def trigger_visualization_hook(cfg, args):
+    default_hooks = cfg.default_hooks
+    if 'visualization' in default_hooks:
+        visualization_hook = default_hooks['visualization']
+        # Turn on visualization
+        visualization_hook['draw'] = True
+        if args.show:
+            visualization_hook['show'] = True
+            visualization_hook['wait_time'] = args.wait_time
+        if args.show_dir:
+            visualizer = cfg.visualizer
+            visualizer['save_dir'] = args.show_dir
+    else:
+        raise RuntimeError(
+            'VisualizationHook must be included in default_hooks. '
+            'Refer to the usage '
+            '"visualization=dict(type=\'VisualizationHook\')".')
+
+    return cfg
+
+
 def main():
     args = parse_args()
 
-    # register all modules in mmseg into the registries
-    # do not init the default scope here because it will be init in the runner
-    register_all_modules(init_default_scope=False)
-
     # load config
     cfg = Config.fromfile(args.config)
     cfg.launcher = args.launcher
@@ -66,6 +99,19 @@ def main():
 
     cfg.load_from = args.checkpoint
 
+    if args.show or args.show_dir:
+        cfg = trigger_visualization_hook(cfg, args)
+
+    if args.tta:
+        cfg.test_dataloader.dataset.pipeline = cfg.tta_pipeline
+        cfg.tta_model.module = cfg.model
+        cfg.model = cfg.tta_model
+
+    # add output_dir in metric
+    if args.out is not None:
+        cfg.test_evaluator['output_dir'] = args.out
+        cfg.test_evaluator['keep_results'] = True
+
     # build the runner from config
     runner = Runner.from_cfg(cfg)
 
diff --git a/tools/torchserve/mmseg2torchserve.py b/tools/torchserve/mmseg2torchserve.py
index 9063634845..23f99638e7 100644
--- a/tools/torchserve/mmseg2torchserve.py
+++ b/tools/torchserve/mmseg2torchserve.py
@@ -3,7 +3,8 @@
 from pathlib import Path
 from tempfile import TemporaryDirectory
 
-import mmcv
+from mmengine import Config
+from mmengine.utils import mkdir_or_exist
 
 try:
     from model_archiver.model_packaging import package_model
@@ -43,9 +44,9 @@ def mmseg2torchserve(
         If True, if there is an existing `{model_name}.mar` file under
         `output_folder` it will be overwritten.
""" - mmcv.mkdir_or_exist(output_folder) + mkdir_or_exist(output_folder) - config = mmcv.Config.fromfile(config_file) + config = Config.fromfile(config_file) with TemporaryDirectory() as tmpdir: config.dump(f'{tmpdir}/config.py') diff --git a/tools/torchserve/mmseg_handler.py b/tools/torchserve/mmseg_handler.py index fb6809df15..dbe5ded848 100644 --- a/tools/torchserve/mmseg_handler.py +++ b/tools/torchserve/mmseg_handler.py @@ -5,7 +5,7 @@ import cv2 import mmcv import torch -from mmcv.cnn.utils.sync_bn import revert_sync_batchnorm +from mmengine.model.utils import revert_sync_batchnorm from ts.torch_handler.base_handler import BaseHandler from mmseg.apis import inference_model, init_model diff --git a/tools/train.py b/tools/train.py index 878d78c31c..10fdaa1874 100644 --- a/tools/train.py +++ b/tools/train.py @@ -8,13 +8,18 @@ from mmengine.logging import print_log from mmengine.runner import Runner -from mmseg.utils import register_all_modules +from mmseg.registry import RUNNERS def parse_args(): parser = argparse.ArgumentParser(description='Train a segmentor') parser.add_argument('config', help='train config file path') parser.add_argument('--work-dir', help='the dir to save logs and models') + parser.add_argument( + '--resume', + action='store_true', + default=False, + help='resume from the latest checkpoint in the work_dir automatically') parser.add_argument( '--amp', action='store_true', @@ -35,7 +40,10 @@ def parse_args(): choices=['none', 'pytorch', 'slurm', 'mpi'], default='none', help='job launcher') - parser.add_argument('--local_rank', type=int, default=0) + # When using PyTorch version >= 2.0.0, the `torch.distributed.launch` + # will pass the `--local-rank` parameter to `tools/train.py` instead + # of `--local_rank`. + parser.add_argument('--local_rank', '--local-rank', type=int, default=0) args = parser.parse_args() if 'LOCAL_RANK' not in os.environ: os.environ['LOCAL_RANK'] = str(args.local_rank) @@ -46,10 +54,6 @@ def parse_args(): def main(): args = parse_args() - # register all modules in mmseg into the registries - # do not init the default scope here because it will be init in the runner - register_all_modules(init_default_scope=False) - # load config cfg = Config.fromfile(args.config) cfg.launcher = args.launcher @@ -80,8 +84,17 @@ def main(): cfg.optim_wrapper.type = 'AmpOptimWrapper' cfg.optim_wrapper.loss_scale = 'dynamic' + # resume training + cfg.resume = args.resume + # build the runner from config - runner = Runner.from_cfg(cfg) + if 'runner_type' not in cfg: + # build the default runner + runner = Runner.from_cfg(cfg) + else: + # build customized runner from the registry + # if 'runner_type' is set in the cfg + runner = RUNNERS.build(cfg) # start training runner.train() diff --git a/zero_mould_v1_cfg.py b/zero_mould_v1_cfg.py new file mode 100644 index 0000000000..3be77b8f66 --- /dev/null +++ b/zero_mould_v1_cfg.py @@ -0,0 +1,287 @@ +# https://github.com/open-mmlab/mmsegmentation/blob/main/configs/unet/README.md +crop_size = ( + 256, + 256, +) +data_preprocessor = dict( + bgr_to_rgb=True, + mean=[ + 123.675, + 116.28, + 103.53, + ], + pad_val=0, + seg_pad_val=255, + size=crop_size, + std=[ + 58.395, + 57.12, + 57.375, + ], + type='SegDataPreProcessor') +data_root = 'data/zero_mould_v1/' +dataset_type = 'ZeroMouldV1Dataset' +default_hooks = dict( + checkpoint=dict(by_epoch=False, interval=4000, type='CheckpointHook'), + logger=dict(interval=50, log_metric_by_epoch=False, type='LoggerHook'), + param_scheduler=dict(type='ParamSchedulerHook'), + 
sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='SegVisualizationHook')) +default_scope = 'mmseg' +device = 'cuda' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +gpu_ids = range(0, 1) +img_ratios = [ + 0.5, + 0.75, + 1.0, + 1.25, + 1.5, + 1.75, +] +load_from = 'checkpoints/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf_20211210_202032-59daf7a4.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=False) +model = dict( + auxiliary_head=dict( + align_corners=False, + channels=256, + concat_input=False, + dropout_ratio=0.1, + in_channels=1024, + in_index=2, + loss_decode=dict( + loss_weight=0.4, type='CrossEntropyLoss', use_sigmoid=False), + norm_cfg=dict(requires_grad=True, type='BN'), + num_classes=5, + num_convs=1, + type='FCNHead'), + backbone=dict( + contract_dilation=True, + depth=50, + dilations=( + 1, + 1, + 2, + 4, + ), + norm_cfg=dict(requires_grad=True, type='BN'), + norm_eval=False, + num_stages=4, + out_indices=( + 0, + 1, + 2, + 3, + ), + strides=( + 1, + 2, + 1, + 1, + ), + style='pytorch', + type='ResNetV1c'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 123.675, + 116.28, + 103.53, + ], + pad_val=0, + seg_pad_val=255, + size=crop_size, + std=[ + 58.395, + 57.12, + 57.375, + ], + type='SegDataPreProcessor'), + decode_head=dict( + align_corners=False, + channels=512, + dropout_ratio=0.1, + in_channels=2048, + in_index=3, + loss_decode=dict( + loss_weight=1.0, type='CrossEntropyLoss', use_sigmoid=False), + norm_cfg=dict(requires_grad=True, type='BN'), + num_classes=5, + pool_scales=( + 1, + 2, + 3, + 6, + ), + type='PSPHead'), + pretrained='open-mmlab://resnet50_v1c', + test_cfg=dict(mode='whole'), + train_cfg=dict(), + type='EncoderDecoder') +norm_cfg = dict(requires_grad=True, type='BN') +optim_wrapper = dict( + clip_grad=None, + optimizer=dict(lr=0.01, momentum=0.9, type='SGD', weight_decay=0.0005), + type='OptimWrapper') +optimizer = dict(lr=0.01, momentum=0.9, type='SGD', weight_decay=0.0005) +param_scheduler = [ + dict( + begin=0, + by_epoch=False, + end=40000, + eta_min=0.0001, + power=0.9, + type='PolyLR'), +] +resume = False +seed = 0 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + data_prefix=dict( + img_path='img_dir', seg_map_path='ann_dir'), + data_root=data_root, + pipeline=[ + dict(type='LoadImageFromFile'), + dict(keep_ratio=True, scale=( + 2048, + 1024, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), + ], + type=dataset_type), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + iou_metrics=[ + 'mIoU', + ], type='IoUMetric') +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(keep_ratio=True, scale=( + 2048, + 1024, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), +] +train_cfg = dict(max_iters=1000, type='IterBasedTrainLoop', val_interval=100) +# train_cfg = dict(max_iters=40000, type='IterBasedTrainLoop', val_interval=4000) +train_dataloader = dict( + batch_size=2, + drop_last=False, + dataset=dict( + data_prefix=dict( + img_path='img_dir', seg_map_path='ann_dir'), + data_root=data_root, + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict( + keep_ratio=True, + ratio_range=( + 0.5, + 2.0, + ), + scale=( + 2048, + 1024, + ), + type='RandomResize'), + dict( + 
cat_max_ratio=0.75, crop_size=crop_size, type='RandomCrop'), + dict(prob=0.5, type='RandomFlip'), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs'), + ], + type=dataset_type), + num_workers=2, + persistent_workers=True, + sampler=dict(shuffle=True, type='InfiniteSampler')) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict( + keep_ratio=True, + ratio_range=( + 0.5, + 2.0, + ), + scale=( + 2048, + 1024, + ), + type='RandomResize'), + dict(cat_max_ratio=0.75, crop_size=crop_size, type='RandomCrop'), + dict(prob=0.5, type='RandomFlip'), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs'), +] +tta_model = dict(type='SegTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict(keep_ratio=True, scale_factor=0.5, type='Resize'), + dict(keep_ratio=True, scale_factor=0.75, type='Resize'), + dict(keep_ratio=True, scale_factor=1.0, type='Resize'), + dict(keep_ratio=True, scale_factor=1.25, type='Resize'), + dict(keep_ratio=True, scale_factor=1.5, type='Resize'), + dict(keep_ratio=True, scale_factor=1.75, type='Resize'), + ], + [ + dict(direction='horizontal', prob=0.0, type='RandomFlip'), + dict(direction='horizontal', prob=1.0, type='RandomFlip'), + ], + [ + dict(type='LoadAnnotations'), + ], + [ + dict(type='PackSegInputs'), + ], + ], + type='TestTimeAug'), +] +val_cfg = dict(type='ValLoop') +val_dataloader = dict( + batch_size=1, + dataset=dict( + data_prefix=dict( + img_path='img_dir', seg_map_path='ann_dir'), + data_root=data_root, + pipeline=[ + dict(type='LoadImageFromFile'), + dict(keep_ratio=True, scale=( + 2048, + 1024, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), + ], + type=dataset_type), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + iou_metrics=[ + 'mIoU', + ], type='IoUMetric') +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='SegLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +work_dir = './work_dirs/zero_mould_v1' diff --git a/zero_mould_v2_cfg.py b/zero_mould_v2_cfg.py new file mode 100644 index 0000000000..34c6a8babb --- /dev/null +++ b/zero_mould_v2_cfg.py @@ -0,0 +1,287 @@ +# https://github.com/open-mmlab/mmsegmentation/blob/main/configs/unet/README.md +crop_size = ( + 256, + 256, +) +data_preprocessor = dict( + bgr_to_rgb=True, + mean=[ + 123.675, + 116.28, + 103.53, + ], + pad_val=0, + seg_pad_val=255, + size=crop_size, + std=[ + 58.395, + 57.12, + 57.375, + ], + type='SegDataPreProcessor') +data_root = 'data/zero_mould_v2/' +dataset_type = 'ZeroMouldV2Dataset' +default_hooks = dict( + checkpoint=dict(by_epoch=False, interval=4000, type='CheckpointHook'), + logger=dict(interval=50, log_metric_by_epoch=False, type='LoggerHook'), + param_scheduler=dict(type='ParamSchedulerHook'), + sampler_seed=dict(type='DistSamplerSeedHook'), + timer=dict(type='IterTimerHook'), + visualization=dict(type='SegVisualizationHook')) +default_scope = 'mmseg' +device = 'cuda' +env_cfg = dict( + cudnn_benchmark=True, + dist_cfg=dict(backend='nccl'), + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0)) +gpu_ids = range(0, 1) +img_ratios = [ + 0.5, + 0.75, + 1.0, + 1.25, + 1.5, + 1.75, +] +load_from = 'checkpoints/deeplabv3_unet_s5-d16_ce-1.0-dice-3.0_256x256_40k_hrf_20211210_202032-59daf7a4.pth' +log_level = 'INFO' +log_processor = dict(by_epoch=False) +model = dict( + 
auxiliary_head=dict( + align_corners=False, + channels=256, + concat_input=False, + dropout_ratio=0.1, + in_channels=1024, + in_index=2, + loss_decode=dict( + loss_weight=0.4, type='CrossEntropyLoss', use_sigmoid=False), + norm_cfg=dict(requires_grad=True, type='BN'), + num_classes=2, + num_convs=1, + type='FCNHead'), + backbone=dict( + contract_dilation=True, + depth=50, + dilations=( + 1, + 1, + 2, + 4, + ), + norm_cfg=dict(requires_grad=True, type='BN'), + norm_eval=False, + num_stages=4, + out_indices=( + 0, + 1, + 2, + 3, + ), + strides=( + 1, + 2, + 1, + 1, + ), + style='pytorch', + type='ResNetV1c'), + data_preprocessor=dict( + bgr_to_rgb=True, + mean=[ + 123.675, + 116.28, + 103.53, + ], + pad_val=0, + seg_pad_val=255, + size=crop_size, + std=[ + 58.395, + 57.12, + 57.375, + ], + type='SegDataPreProcessor'), + decode_head=dict( + align_corners=False, + channels=512, + dropout_ratio=0.1, + in_channels=2048, + in_index=3, + loss_decode=dict( + loss_weight=1.0, type='CrossEntropyLoss', use_sigmoid=False), + norm_cfg=dict(requires_grad=True, type='BN'), + num_classes=2, + pool_scales=( + 1, + 2, + 3, + 6, + ), + type='PSPHead'), + pretrained='open-mmlab://resnet50_v1c', + test_cfg=dict(mode='whole'), + train_cfg=dict(), + type='EncoderDecoder') +norm_cfg = dict(requires_grad=True, type='BN') +optim_wrapper = dict( + clip_grad=None, + optimizer=dict(lr=0.01, momentum=0.9, type='SGD', weight_decay=0.0005), + type='OptimWrapper') +optimizer = dict(lr=0.01, momentum=0.9, type='SGD', weight_decay=0.0005) +param_scheduler = [ + dict( + begin=0, + by_epoch=False, + end=40000, + eta_min=0.0001, + power=0.9, + type='PolyLR'), +] +resume = False +seed = 0 +test_cfg = dict(type='TestLoop') +test_dataloader = dict( + batch_size=1, + dataset=dict( + data_prefix=dict( + img_path='img_dir', seg_map_path='ann_dir'), + data_root=data_root, + pipeline=[ + dict(type='LoadImageFromFile'), + dict(keep_ratio=True, scale=( + 2048, + 1024, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), + ], + type=dataset_type), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +test_evaluator = dict( + iou_metrics=[ + 'mIoU', + ], type='IoUMetric') +test_pipeline = [ + dict(type='LoadImageFromFile'), + dict(keep_ratio=True, scale=( + 2048, + 1024, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), +] +train_cfg = dict(max_iters=1000, type='IterBasedTrainLoop', val_interval=100) +# train_cfg = dict(max_iters=40000, type='IterBasedTrainLoop', val_interval=4000) +train_dataloader = dict( + batch_size=2, + drop_last=False, + dataset=dict( + data_prefix=dict( + img_path='img_dir', seg_map_path='ann_dir'), + data_root=data_root, + pipeline=[ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict( + keep_ratio=True, + ratio_range=( + 0.5, + 2.0, + ), + scale=( + 2048, + 1024, + ), + type='RandomResize'), + dict( + cat_max_ratio=0.75, crop_size=crop_size, type='RandomCrop'), + dict(prob=0.5, type='RandomFlip'), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs'), + ], + type=dataset_type), + num_workers=2, + persistent_workers=True, + sampler=dict(shuffle=True, type='InfiniteSampler')) +train_pipeline = [ + dict(type='LoadImageFromFile'), + dict(type='LoadAnnotations'), + dict( + keep_ratio=True, + ratio_range=( + 0.5, + 2.0, + ), + scale=( + 2048, + 1024, + ), + type='RandomResize'), + dict(cat_max_ratio=0.75, crop_size=crop_size, type='RandomCrop'), + dict(prob=0.5, 
type='RandomFlip'), + dict(type='PhotoMetricDistortion'), + dict(type='PackSegInputs'), +] +tta_model = dict(type='SegTTAModel') +tta_pipeline = [ + dict(backend_args=None, type='LoadImageFromFile'), + dict( + transforms=[ + [ + dict(keep_ratio=True, scale_factor=0.5, type='Resize'), + dict(keep_ratio=True, scale_factor=0.75, type='Resize'), + dict(keep_ratio=True, scale_factor=1.0, type='Resize'), + dict(keep_ratio=True, scale_factor=1.25, type='Resize'), + dict(keep_ratio=True, scale_factor=1.5, type='Resize'), + dict(keep_ratio=True, scale_factor=1.75, type='Resize'), + ], + [ + dict(direction='horizontal', prob=0.0, type='RandomFlip'), + dict(direction='horizontal', prob=1.0, type='RandomFlip'), + ], + [ + dict(type='LoadAnnotations'), + ], + [ + dict(type='PackSegInputs'), + ], + ], + type='TestTimeAug'), +] +val_cfg = dict(type='ValLoop') +val_dataloader = dict( + batch_size=1, + dataset=dict( + data_prefix=dict( + img_path='img_dir', seg_map_path='ann_dir'), + data_root=data_root, + pipeline=[ + dict(type='LoadImageFromFile'), + dict(keep_ratio=True, scale=( + 2048, + 1024, + ), type='Resize'), + dict(type='LoadAnnotations'), + dict(type='PackSegInputs'), + ], + type=dataset_type), + num_workers=4, + persistent_workers=True, + sampler=dict(shuffle=False, type='DefaultSampler')) +val_evaluator = dict( + iou_metrics=[ + 'mIoU', + ], type='IoUMetric') +vis_backends = [ + dict(type='LocalVisBackend'), +] +visualizer = dict( + name='visualizer', + type='SegLocalVisualizer', + vis_backends=[ + dict(type='LocalVisBackend'), + ]) +work_dir = './work_dirs/zero_mould_v2'
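+
+# A minimal sketch of how this config is consumed (it assumes mmseg is
+# installed and that `ZeroMouldV2Dataset` is registered under the 'mmseg'
+# scope before the runner builds the dataloaders):
+#
+#     from mmengine.config import Config
+#     from mmengine.runner import Runner
+#
+#     cfg = Config.fromfile('zero_mould_v2_cfg.py')
+#     runner = Runner.from_cfg(cfg)  # builds model, loops, hooks, loaders
+#     runner.train()  # runs 1000 iters, validating every 100 iters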