diff --git a/.circleci/test.yml b/.circleci/test.yml
index d460690065..a968b3df9a 100644
--- a/.circleci/test.yml
+++ b/.circleci/test.yml
@@ -61,9 +61,9 @@ jobs:
           command: |
             pip install git+https://github.com/open-mmlab/mmengine.git@main
             pip install -U openmim
-            mim install 'mmcv==2.0.0rc3'
+            mim install 'mmcv>=2.0.0rc4'
             pip install git+https://github.com/open-mmlab/mmclassification@dev-1.x
-            mim install 'mmdet==3.0.0rc5'
+            pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
             pip install -r requirements/tests.txt -r requirements/optional.txt
       - run:
           name: Build and install
@@ -97,6 +97,7 @@ jobs:
           command: |
             git clone -b main --depth 1 https://github.com/open-mmlab/mmengine.git /home/circleci/mmengine
             git clone -b dev-1.x --depth 1 https://github.com/open-mmlab/mmclassification.git /home/circleci/mmclassification
+            git clone -b dev-3.x --depth 1 https://github.com/open-mmlab/mmdetection.git /home/circleci/mmdetection
       - run:
           name: Build Docker image
           command: |
@@ -107,9 +108,9 @@ jobs:
           command: |
             docker exec mmseg pip install -e /mmengine
             docker exec mmseg pip install -U openmim
-            docker exec mmseg mim install 'mmcv==2.0.0rc3'
+            docker exec mmseg mim install 'mmcv>=2.0.0rc4'
             docker exec mmseg pip install -e /mmclassification
-            docker exec mmseg mim install 'mmdet==3.0.0rc5'
+            docker exec mmseg pip install -e /mmdetection
             docker exec mmseg pip install -r requirements/tests.txt -r requirements/optional.txt
       - run:
           name: Build and install
diff --git a/.github/workflows/merge_stage_test.yml b/.github/workflows/merge_stage_test.yml
index 7728392481..dbe526d941 100644
--- a/.github/workflows/merge_stage_test.yml
+++ b/.github/workflows/merge_stage_test.yml
@@ -44,9 +44,9 @@ jobs:
           python -V
           pip install -U openmim
           pip install git+https://github.com/open-mmlab/mmengine.git
-          mim install 'mmcv==2.0.0rc3'
+          mim install 'mmcv>=2.0.0rc4'
           pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
-          mim install 'mmdet==3.0.0rc5'
+          pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
       - name: Install unittest dependencies
         run: pip install -r requirements/tests.txt -r requirements/optional.txt
       - name: Build and install
@@ -100,9 +100,9 @@ jobs:
           python -V
           pip install -U openmim
           pip install git+https://github.com/open-mmlab/mmengine.git
-          mim install 'mmcv==2.0.0rc3'
+          mim install 'mmcv>=2.0.0rc4'
           pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
-          mim install 'mmdet==3.0.0rc5'
+          pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
       - name: Install unittest dependencies
         run: pip install -r requirements/tests.txt -r requirements/optional.txt
       - name: Build and install
@@ -166,9 +166,9 @@ jobs:
           python -V
           pip install -U openmim
           pip install git+https://github.com/open-mmlab/mmengine.git
-          mim install 'mmcv==2.0.0rc3'
+          mim install 'mmcv>=2.0.0rc4'
           pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
-          mim install 'mmdet==3.0.0rc5'
+          pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
       - name: Install unittest dependencies
         run: pip install -r requirements/tests.txt -r requirements/optional.txt
       - name: Build and install
@@ -209,9 +209,9 @@ jobs:
           python -V
           pip install -U openmim
           pip install git+https://github.com/open-mmlab/mmengine.git
-          mim install 'mmcv==2.0.0rc3'
+          mim install 'mmcv>=2.0.0rc4'
           pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
-          mim install 'mmdet==3.0.0rc5'
+          pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
       - name: Install unittest dependencies
         run: pip install -r requirements/tests.txt -r requirements/optional.txt
       - name: Build and install
@@ -244,9 +244,9 @@ jobs:
           python -V
           pip install -U openmim
           pip install git+https://github.com/open-mmlab/mmengine.git
-          mim install 'mmcv==2.0.0rc3'
+          mim install 'mmcv>=2.0.0rc4'
           pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
-          mim install 'mmdet==3.0.0rc5'
+          pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
       - name: Install unittest dependencies
         run: pip install -r requirements/tests.txt -r requirements/optional.txt
       - name: Build and install
diff --git a/.github/workflows/pr_stage_test.yml b/.github/workflows/pr_stage_test.yml
index df73baba8e..a6f8ec0d22 100644
--- a/.github/workflows/pr_stage_test.yml
+++ b/.github/workflows/pr_stage_test.yml
@@ -44,9 +44,9 @@ jobs:
         run: |
           pip install -U openmim
           pip install git+https://github.com/open-mmlab/mmengine.git
-          mim install 'mmcv==2.0.0rc3'
+          mim install 'mmcv>=2.0.0rc4'
           pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
-          mim install 'mmdet==3.0.0rc5'
+          pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
       - name: Install unittest dependencies
         run: pip install -r requirements/tests.txt -r requirements/optional.txt
       - name: Build and install
@@ -100,9 +100,9 @@ jobs:
           python -V
           pip install -U openmim
           pip install git+https://github.com/open-mmlab/mmengine.git
-          mim install 'mmcv==2.0.0rc3'
+          mim install 'mmcv>=2.0.0rc4'
           pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
-          mim install 'mmdet==3.0.0rc5'
+          pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
       - name: Install unittest dependencies
         run: pip install -r requirements/tests.txt -r requirements/optional.txt
       - name: Build and install
@@ -135,9 +135,9 @@ jobs:
           python -V
           pip install -U openmim
           pip install git+https://github.com/open-mmlab/mmengine.git
-          mim install 'mmcv==2.0.0rc3'
+          mim install 'mmcv>=2.0.0rc4'
           pip install git+https://github.com/open-mmlab/mmclassification.git@dev-1.x
-          mim install 'mmdet==3.0.0rc5'
+          pip install git+https://github.com/open-mmlab/mmdetection.git@dev-3.x
       - name: Install unittest dependencies
         run: pip install -r requirements/tests.txt -r requirements/optional.txt
       - name: Build and install
diff --git a/README.md b/README.md
index d42be540dc..308fca8716 100644
--- a/README.md
+++ b/README.md
@@ -62,7 +62,7 @@ The 1.x branch works with **PyTorch 1.6+**.
 
 ## What's New
 
-v1.0.0rc4 was released on 30/01/2023.
+v1.0.0rc5 was released on 01/02/2023.
 Please refer to [changelog.md](docs/en/notes/changelog.md) for details and release history.
 
 - Support ISNet (ICCV'2021) in projects ([#2400](https://github.com/open-mmlab/mmsegmentation/pull/2400))
diff --git a/README_zh-CN.md b/README_zh-CN.md
index bbebab5d04..8db2746413 100644
--- a/README_zh-CN.md
+++ b/README_zh-CN.md
@@ -61,7 +61,7 @@ MMSegmentation 是一个基于 PyTorch 的语义分割开源工具箱。它是 O
 
 ## 更新日志
 
-最新版本 v1.0.0rc4 在 2023.01.30 发布。
+最新版本 v1.0.0rc5 在 2023.02.01 发布。
 如果想了解更多版本更新细节和历史信息，请阅读[更新日志](docs/en/notes/changelog.md)。
 
 ## 安装
diff --git a/configs/_base_/datasets/ade20k.py b/configs/_base_/datasets/ade20k.py
index 5840fc17ec..2c01b2ff59 100644
--- a/configs/_base_/datasets/ade20k.py
+++ b/configs/_base_/datasets/ade20k.py
@@ -25,7 +25,7 @@
 ]
 img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
 tta_pipeline = [
-    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
+    dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
     dict(
         type='TestTimeAug',
         transforms=[
diff --git a/configs/_base_/datasets/ade20k_640x640.py b/configs/_base_/datasets/ade20k_640x640.py
index 998b06e15b..866403b27f 100644
--- a/configs/_base_/datasets/ade20k_640x640.py
+++ b/configs/_base_/datasets/ade20k_640x640.py
@@ -25,7 +25,7 @@
 ]
 img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
 tta_pipeline = [
-    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
+    dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
     dict(
         type='TestTimeAug',
         transforms=[
diff --git a/configs/_base_/datasets/chase_db1.py b/configs/_base_/datasets/chase_db1.py
index 07604b4d5a..62dd3b3cbe 100644
--- a/configs/_base_/datasets/chase_db1.py
+++ b/configs/_base_/datasets/chase_db1.py
@@ -26,7 +26,7 @@
 ]
 img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
 tta_pipeline = [
-    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
+    dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
     dict(
         type='TestTimeAug',
         transforms=[
diff --git a/configs/_base_/datasets/cityscapes.py b/configs/_base_/datasets/cityscapes.py
index 1698e04721..b7d95c1ec0 100644
--- a/configs/_base_/datasets/cityscapes.py
+++ b/configs/_base_/datasets/cityscapes.py
@@ -25,7 +25,7 @@
 ]
 img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
 tta_pipeline = [
-    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
+    dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
     dict(
         type='TestTimeAug',
         transforms=[
diff --git a/configs/_base_/datasets/coco-stuff10k.py b/configs/_base_/datasets/coco-stuff10k.py
index 0c2d55208e..9d3026bd4c 100644
--- a/configs/_base_/datasets/coco-stuff10k.py
+++ b/configs/_base_/datasets/coco-stuff10k.py
@@ -25,7 +25,7 @@
 ]
 img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
 tta_pipeline = [
-    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
+    dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
     dict(
         type='TestTimeAug',
         transforms=[
diff --git a/configs/_base_/datasets/coco-stuff164k.py b/configs/_base_/datasets/coco-stuff164k.py
index f77a0fd65a..c785e313ff 100644
--- a/configs/_base_/datasets/coco-stuff164k.py
+++ b/configs/_base_/datasets/coco-stuff164k.py
@@ -25,7 +25,7 @@
 ]
 img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
 tta_pipeline = [
-    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
+    dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
     dict(
         type='TestTimeAug',
         transforms=[
diff --git a/configs/_base_/datasets/drive.py b/configs/_base_/datasets/drive.py
index c6242acdb8..3bd6080aa7 100644
--- a/configs/_base_/datasets/drive.py
+++ b/configs/_base_/datasets/drive.py
@@ -26,7 +26,7 @@
 ]
 img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
 tta_pipeline = [
-    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
+    dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
     dict(
         type='TestTimeAug',
         transforms=[
diff --git a/configs/_base_/datasets/hrf.py b/configs/_base_/datasets/hrf.py
index c2fe84f170..b0ae34abe6 100644
--- a/configs/_base_/datasets/hrf.py
+++ b/configs/_base_/datasets/hrf.py
@@ -26,7 +26,7 @@
 ]
 img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
 tta_pipeline = [
-    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
+    dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
     dict(
         type='TestTimeAug',
         transforms=[
diff --git a/configs/_base_/datasets/isaid.py b/configs/_base_/datasets/isaid.py
index 65e256c56d..8407e06ac9 100644
--- a/configs/_base_/datasets/isaid.py
+++ b/configs/_base_/datasets/isaid.py
@@ -32,7 +32,7 @@
 ]
 img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
 tta_pipeline = [
-    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
+    dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
     dict(
         type='TestTimeAug',
         transforms=[
diff --git a/configs/_base_/datasets/loveda.py b/configs/_base_/datasets/loveda.py
index d69bdafceb..8ecc919654 100644
--- a/configs/_base_/datasets/loveda.py
+++ b/configs/_base_/datasets/loveda.py
@@ -25,7 +25,7 @@
 ]
 img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
 tta_pipeline = [
-    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
+    dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
     dict(
         type='TestTimeAug',
         transforms=[
diff --git a/configs/_base_/datasets/pascal_context_59.py b/configs/_base_/datasets/pascal_context_59.py
index 0ca02cc94b..bb144dd202 100644
--- a/configs/_base_/datasets/pascal_context_59.py
+++ b/configs/_base_/datasets/pascal_context_59.py
@@ -28,7 +28,7 @@
 ]
 img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
 tta_pipeline = [
-    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
+    dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
     dict(
         type='TestTimeAug',
         transforms=[
diff --git a/configs/_base_/datasets/pascal_voc12.py b/configs/_base_/datasets/pascal_voc12.py
index 8b4b77c2f9..0fa3d55764 100644
--- a/configs/_base_/datasets/pascal_voc12.py
+++ b/configs/_base_/datasets/pascal_voc12.py
@@ -25,7 +25,7 @@
 ]
 img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
 tta_pipeline = [
-    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
+    dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
     dict(
         type='TestTimeAug',
         transforms=[
diff --git a/configs/_base_/datasets/pascal_voc12_aug.py b/configs/_base_/datasets/pascal_voc12_aug.py
index 495595cdfb..8b358cc0cd 100644
--- a/configs/_base_/datasets/pascal_voc12_aug.py
+++ b/configs/_base_/datasets/pascal_voc12_aug.py
@@ -27,7 +27,7 @@
 ]
 img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
 tta_pipeline = [
-    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
+    dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
     dict(
         type='TestTimeAug',
         transforms=[
diff --git a/configs/_base_/datasets/potsdam.py b/configs/_base_/datasets/potsdam.py
index 1f4b95df2e..4439f41919 100644
--- a/configs/_base_/datasets/potsdam.py
+++ b/configs/_base_/datasets/potsdam.py
@@ -25,7 +25,7 @@
 ]
 img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
 tta_pipeline = [
-    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
+    dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
     dict(
         type='TestTimeAug',
         transforms=[
diff --git a/configs/_base_/datasets/stare.py b/configs/_base_/datasets/stare.py
index cd12740b2e..e55519b595 100644
--- a/configs/_base_/datasets/stare.py
+++ b/configs/_base_/datasets/stare.py
@@ -26,7 +26,7 @@
 ]
 img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
 tta_pipeline = [
-    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
+    dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
     dict(
         type='TestTimeAug',
         transforms=[
diff --git a/configs/_base_/datasets/vaihingen.py b/configs/_base_/datasets/vaihingen.py
index ca0ad7915e..2b3fa76093 100644
--- a/configs/_base_/datasets/vaihingen.py
+++ b/configs/_base_/datasets/vaihingen.py
@@ -25,7 +25,7 @@
 ]
 img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
 tta_pipeline = [
-    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
+    dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
     dict(
         type='TestTimeAug',
         transforms=[
diff --git a/configs/erfnet/README.md b/configs/erfnet/README.md
index 44e4f51c91..4f7d21572a 100644
--- a/configs/erfnet/README.md
+++ b/configs/erfnet/README.md
@@ -41,12 +41,14 @@ Semantic segmentation is a challenging task that addresses most of the perceptio
 
 ### Cityscapes
 
-| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) |  mIoU | mIoU(ms+flip) | config                                                                                                                         | download                                                                                                                                                                                                                                                                                                                                                     |
-| ------ | -------- | --------- | ------: | -------- | -------------- | ----: | ------------- | ------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| ERFNet | ERFNet   | 512x1024  |  160000 | 6.04     | 15.26          | 71.08 | 72.6          | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/erfnet/erfnet_fcn_4xb4-160k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/erfnet/erfnet_fcn_4x4_512x1024_160k_cityscapes/erfnet_fcn_4x4_512x1024_160k_cityscapes_20211126_082056-03d333ed.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/erfnet/erfnet_fcn_4x4_512x1024_160k_cityscapes/erfnet_fcn_4x4_512x1024_160k_cityscapes_20211126_082056.log.json) |
+| Method | Backbone | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU | mIoU(ms+flip) | config                                                                                                                         | download                                                                                                                                                                                                                                                                                                                                                     |
+| ------ | -------- | --------- | ------: | -------- | -------------- | ---: | ------------- | ------------------------------------------------------------------------------------------------------------------------------ | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| ERFNet | ERFNet   | 512x1024  |  160000 | 6.04     | 15.26          | 72.5 | 74.75         | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/erfnet/erfnet_fcn_4xb4-160k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/erfnet/erfnet_fcn_4x4_512x1024_160k_cityscapes/erfnet_fcn_4x4_512x1024_160k_cityscapes_20220704_162145-dc90157a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/erfnet/erfnet_fcn_4x4_512x1024_160k_cityscapes/erfnet_fcn_4x4_512x1024_160k_cityscapes_20220704_162145.log.json) |
 
 Note:
 
 - The model is trained from scratch.
 
 - Last deconvolution layer in the [original paper](https://github.com/Eromera/erfnet_pytorch/blob/master/train/erfnet.py#L123) is replaced by a naive `FCNHead` decoder head and a bilinear upsampling layer, found more effective and efficient.
+
+- This model's performance is sensitive to the random seed used; please refer to the log file for the specific seed settings. If you choose a different seed, your results might differ from those reported in the table.
diff --git a/configs/erfnet/erfnet.yml b/configs/erfnet/erfnet.yml
index aeb454cb50..5f87f020cf 100644
--- a/configs/erfnet/erfnet.yml
+++ b/configs/erfnet/erfnet.yml
@@ -31,7 +31,7 @@ Models:
   - Task: Semantic Segmentation
     Dataset: Cityscapes
     Metrics:
-      mIoU: 71.08
-      mIoU(ms+flip): 72.6
+      mIoU: 72.5
+      mIoU(ms+flip): 74.75
   Config: configs/erfnet/erfnet_fcn_4xb4-160k_cityscapes-512x1024.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/erfnet/erfnet_fcn_4x4_512x1024_160k_cityscapes/erfnet_fcn_4x4_512x1024_160k_cityscapes_20211126_082056-03d333ed.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/erfnet/erfnet_fcn_4x4_512x1024_160k_cityscapes/erfnet_fcn_4x4_512x1024_160k_cityscapes_20220704_162145-dc90157a.pth
diff --git a/configs/mask2former/README.md b/configs/mask2former/README.md
index 8881b0d66c..1861fec3b1 100644
--- a/configs/mask2former/README.md
+++ b/configs/mask2former/README.md
@@ -45,24 +45,24 @@ pip install "mmdet>=3.0.0rc4"
 
 | Method      | Backbone       | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU  | mIoU(ms+flip) |                                                                                                                                                       config | download                                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
 | ----------- | -------------- | --------- | ------- | -------: | -------------- | ----- | ------------: | -----------------------------------------------------------------------------------------------------------------------------------------------------------: | --------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| Mask2Former | R-50-D32       | 512x1024  | 90000   |     5806 | 9.17           | 80.44 |             - |                      [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024/mask2former_r50_8xb2-90k_cityscapes-512x1024_20221202_140802-2ff5ffa0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024/mask2former_r50_8xb2-90k_cityscapes-512x1024_20221202_140802.json)                                                                                      |
-| Mask2Former | R-101-D32      | 512x1024  | 90000   |     6971 | 7.11           | 80.80 |             - |                     [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024/mask2former_r101_8xb2-90k_cityscapes-512x1024_20221130_031628-8ad528ea.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024/mask2former_r101_8xb2-90k_cityscapes-512x1024_20221130_031628.json))                                                                                 |
-| Mask2Former | Swin-T         | 512x1024  | 90000   |     6511 | 7.18           | 81.71 |             - |                   [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024/mask2former_swin-t_8xb2-90k_cityscapes-512x1024_20221127_144501-290b34af.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024/mask2former_swin-t_8xb2-90k_cityscapes-512x1024_20221127_144501.json))                                                                         |
-| Mask2Former | Swin-S         | 512x1024  | 90000   |     8282 | 5.57           | 82.57 |             - |                   [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024/mask2former_swin-s_8xb2-90k_cityscapes-512x1024_20221127_143802-7c98854a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024/mask2former_swin-s_8xb2-90k_cityscapes-512x1024_20221127_143802.json))                                                                         |
-| Mask2Former | Swin-B (in22k) | 512x1024  | 90000   |    11152 | 4.32           | 83.52 |             - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221203_045030-59a4379a.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221203_045030.json)) |
-| Mask2Former | Swin-L (in22k) | 512x1024  | 90000   |    16207 | 2.86           | 83.65 |             - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221202_141901-dc2c2ddd.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221202_141901.json)) |
+| Mask2Former | R-50-D32       | 512x1024  | 90000   |     5806 | 9.17           | 80.44 |             - |                      [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024/mask2former_r50_8xb2-90k_cityscapes-512x1024_20221202_140802-ffd9d750.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024/mask2former_r50_8xb2-90k_cityscapes-512x1024_20221202_140802.json)                                                                                      |
+| Mask2Former | R-101-D32      | 512x1024  | 90000   |     6971 | 7.11           | 80.80 |             - |                     [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024/mask2former_r101_8xb2-90k_cityscapes-512x1024_20221130_031628-43e68666.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024/mask2former_r101_8xb2-90k_cityscapes-512x1024_20221130_031628.json)                                                                                  |
+| Mask2Former | Swin-T         | 512x1024  | 90000   |     6511 | 7.18           | 81.71 |             - |                   [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024/mask2former_swin-t_8xb2-90k_cityscapes-512x1024_20221127_144501-36c59341.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024/mask2former_swin-t_8xb2-90k_cityscapes-512x1024_20221127_144501.json)                                                                          |
+| Mask2Former | Swin-S         | 512x1024  | 90000   |     8282 | 5.57           | 82.57 |             - |                   [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024/mask2former_swin-s_8xb2-90k_cityscapes-512x1024_20221127_143802-9ab177f6.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024/mask2former_swin-s_8xb2-90k_cityscapes-512x1024_20221127_143802.json)                                                                          |
+| Mask2Former | Swin-B (in22k) | 512x1024  | 90000   |    11152 | 4.32           | 83.52 |             - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221203_045030-9a86a225.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221203_045030.json)  |
+| Mask2Former | Swin-L (in22k) | 512x1024  | 90000   |    16207 | 2.86           | 83.65 |             - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221202_141901-28ad20f1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221202_141901.json)  |
 
 ### ADE20K
 
 | Method      | Backbone       | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU  | mIoU(ms+flip) |                                                                                                                                                   config | download                                                                                                                                                                                                                                                                                                                                                                                                                                                    |
 | ----------- | -------------- | --------- | ------- | -------: | -------------- | ----- | ------------: | -------------------------------------------------------------------------------------------------------------------------------------------------------: | ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
-| Mask2Former | R-50-D32       | 512x512   | 160000  |     3385 | 26.59          | 47.87 |             - |                      [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512/mask2former_r50_8xb2-160k_ade20k-512x512_20221204_000055-4c62652d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512/mask2former_r50_8xb2-160k_ade20k-512x512_20221204_000055.json))                                                                                     |
-| Mask2Former | R-101-D32      | 512x512   | 160000  |     4190 | 22.97          | 48.60 |             - |                     [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512/mask2former_r101_8xb2-160k_ade20k-512x512_20221203_233905-b1169bc0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512/mask2former_r101_8xb2-160k_ade20k-512x512_20221203_233905.json))                                                                                 |
-| Mask2Former | Swin-T         | 512x512   | 160000  |     3826 | 23.82          | 48.66 |             - |                   [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512/mask2former_swin-t_8xb2-160k_ade20k-512x512_20221203_234230-4341520b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512/mask2former_swin-t_8xb2-160k_ade20k-512x512_20221203_234230.json))                                                                         |
-| Mask2Former | Swin-S         | 512x512   | 160000  |     5034 | 19.69          | 51.24 |             - |                   [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512/mask2former_swin-s_8xb2-160k_ade20k-512x512_20221204_143905-ab263c11.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512/mask2former_swin-s_8xb2-160k_ade20k-512x512_20221204_143905.json))                                                                         |
-| Mask2Former | Swin-B         | 640x640   | 160000  |     5795 | 12.48          | 52.44 |             - |  [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640_20221129_125118-35e3a2c7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640_20221129_125118.json))     |
-| Mask2Former | Swin-B (in22k) | 640x640   | 160000  |     5795 | 12.43          | 53.90 |             - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235230-622e093b.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235230.json)) |
-| Mask2Former | Swin-L (in22k) | 640x640   | 160000  |     9077 | 8.81           | 56.01 |             - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235933-5cc76a78.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235933.json)) |
+| Mask2Former | R-50-D32       | 512x512   | 160000  |     3385 | 26.59          | 47.87 |             - |                      [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512/mask2former_r50_8xb2-160k_ade20k-512x512_20221204_000055-2d1f55f1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512/mask2former_r50_8xb2-160k_ade20k-512x512_20221204_000055.json)                                                                                      |
+| Mask2Former | R-101-D32      | 512x512   | 160000  |     4190 | 22.97          | 48.60 |             - |                     [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512/mask2former_r101_8xb2-160k_ade20k-512x512_20221203_233905-b7135890.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512/mask2former_r101_8xb2-160k_ade20k-512x512_20221203_233905.json)                                                                                  |
+| Mask2Former | Swin-T         | 512x512   | 160000  |     3826 | 23.82          | 48.66 |             - |                   [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512/mask2former_swin-t_8xb2-160k_ade20k-512x512_20221203_234230-7d64e5dd.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512/mask2former_swin-t_8xb2-160k_ade20k-512x512_20221203_234230.json)                                                                          |
+| Mask2Former | Swin-S         | 512x512   | 160000  |     5034 | 19.69          | 51.24 |             - |                   [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512/mask2former_swin-s_8xb2-160k_ade20k-512x512_20221204_143905-e715144e.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512/mask2former_swin-s_8xb2-160k_ade20k-512x512_20221204_143905.json)                                                                          |
+| Mask2Former | Swin-B         | 640x640   | 160000  |     5795 | 12.48          | 52.44 |             - |  [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640_20221129_125118-a4a086d2.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640_20221129_125118.json)      |
+| Mask2Former | Swin-B (in22k) | 640x640   | 160000  |     5795 | 12.43          | 53.90 |             - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235230-7ec0f569.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235230.json)  |
+| Mask2Former | Swin-L (in22k) | 640x640   | 160000  |     9077 | 8.81           | 56.01 |             - | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235933-7120c214.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235933.json)  |
 
 Note:
 
diff --git a/configs/mask2former/mask2former.yml b/configs/mask2former/mask2former.yml
index 78655fc52f..4e33766c70 100644
--- a/configs/mask2former/mask2former.yml
+++ b/configs/mask2former/mask2former.yml
@@ -35,7 +35,7 @@ Models:
     Metrics:
       mIoU: 80.44
   Config: configs/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024/mask2former_r50_8xb2-90k_cityscapes-512x1024_20221202_140802-2ff5ffa0.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024/mask2former_r50_8xb2-90k_cityscapes-512x1024_20221202_140802-ffd9d750.pth
 - Name: mask2former_r101_8xb2-90k_cityscapes-512x1024
   In Collection: Mask2Former
   Metadata:
@@ -56,7 +56,7 @@ Models:
     Metrics:
       mIoU: 80.8
   Config: configs/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024/mask2former_r101_8xb2-90k_cityscapes-512x1024_20221130_031628-8ad528ea.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-90k_cityscapes-512x1024/mask2former_r101_8xb2-90k_cityscapes-512x1024_20221130_031628-43e68666.pth
 - Name: mask2former_swin-t_8xb2-90k_cityscapes-512x1024
   In Collection: Mask2Former
   Metadata:
@@ -77,7 +77,7 @@ Models:
     Metrics:
       mIoU: 81.71
   Config: configs/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024/mask2former_swin-t_8xb2-90k_cityscapes-512x1024_20221127_144501-290b34af.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-90k_cityscapes-512x1024/mask2former_swin-t_8xb2-90k_cityscapes-512x1024_20221127_144501-36c59341.pth
 - Name: mask2former_swin-s_8xb2-90k_cityscapes-512x1024
   In Collection: Mask2Former
   Metadata:
@@ -98,7 +98,7 @@ Models:
     Metrics:
       mIoU: 82.57
   Config: configs/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024/mask2former_swin-s_8xb2-90k_cityscapes-512x1024_20221127_143802-7c98854a.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-90k_cityscapes-512x1024/mask2former_swin-s_8xb2-90k_cityscapes-512x1024_20221127_143802-9ab177f6.pth
 - Name: mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024
   In Collection: Mask2Former
   Metadata:
@@ -119,7 +119,7 @@ Models:
     Metrics:
       mIoU: 83.52
   Config: configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221203_045030-59a4379a.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-b-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221203_045030-9a86a225.pth
 - Name: mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024
   In Collection: Mask2Former
   Metadata:
@@ -140,7 +140,7 @@ Models:
     Metrics:
       mIoU: 83.65
   Config: configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221202_141901-dc2c2ddd.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024/mask2former_swin-l-in22k-384x384-pre_8xb2-90k_cityscapes-512x1024_20221202_141901-28ad20f1.pth
 - Name: mask2former_r50_8xb2-160k_ade20k-512x512
   In Collection: Mask2Former
   Metadata:
@@ -161,7 +161,7 @@ Models:
     Metrics:
       mIoU: 47.87
   Config: configs/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512/mask2former_r50_8xb2-160k_ade20k-512x512_20221204_000055-4c62652d.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512/mask2former_r50_8xb2-160k_ade20k-512x512_20221204_000055-2d1f55f1.pth
 - Name: mask2former_r101_8xb2-160k_ade20k-512x512
   In Collection: Mask2Former
   Metadata:
@@ -182,7 +182,7 @@ Models:
     Metrics:
       mIoU: 48.6
   Config: configs/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512/mask2former_r101_8xb2-160k_ade20k-512x512_20221203_233905-b1169bc0.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_r101_8xb2-160k_ade20k-512x512/mask2former_r101_8xb2-160k_ade20k-512x512_20221203_233905-b7135890.pth
 - Name: mask2former_swin-t_8xb2-160k_ade20k-512x512
   In Collection: Mask2Former
   Metadata:
@@ -203,7 +203,7 @@ Models:
     Metrics:
       mIoU: 48.66
   Config: configs/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512/mask2former_swin-t_8xb2-160k_ade20k-512x512_20221203_234230-4341520b.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-t_8xb2-160k_ade20k-512x512/mask2former_swin-t_8xb2-160k_ade20k-512x512_20221203_234230-7d64e5dd.pth
 - Name: mask2former_swin-s_8xb2-160k_ade20k-512x512
   In Collection: Mask2Former
   Metadata:
@@ -224,7 +224,7 @@ Models:
     Metrics:
       mIoU: 51.24
   Config: configs/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512/mask2former_swin-s_8xb2-160k_ade20k-512x512_20221204_143905-ab263c11.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-s_8xb2-160k_ade20k-512x512/mask2former_swin-s_8xb2-160k_ade20k-512x512_20221204_143905-e715144e.pth
 - Name: mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640
   In Collection: Mask2Former
   Metadata:
@@ -245,7 +245,7 @@ Models:
     Metrics:
       mIoU: 52.44
   Config: configs/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640_20221129_125118-35e3a2c7.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640_20221129_125118-a4a086d2.pth
 - Name: mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640
   In Collection: Mask2Former
   Metadata:
@@ -266,7 +266,7 @@ Models:
     Metrics:
       mIoU: 53.9
   Config: configs/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235230-622e093b.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-b-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235230-7ec0f569.pth
 - Name: mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640
   In Collection: Mask2Former
   Metadata:
@@ -287,4 +287,4 @@ Models:
     Metrics:
       mIoU: 56.01
   Config: configs/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235933-5cc76a78.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/mask2former/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640/mask2former_swin-l-in22k-384x384-pre_8xb2-160k_ade20k-640x640_20221203_235933-7120c214.pth
diff --git a/configs/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512.py b/configs/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512.py
index 598cabfb6d..78cf60510c 100644
--- a/configs/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512.py
+++ b/configs/mask2former/mask2former_r50_8xb2-160k_ade20k-512x512.py
@@ -41,65 +41,58 @@
             num_outs=3,
             norm_cfg=dict(type='GN', num_groups=32),
             act_cfg=dict(type='ReLU'),
-            encoder=dict(
-                type='mmdet.DetrTransformerEncoder',
+            encoder=dict(  # DeformableDetrTransformerEncoder
                 num_layers=6,
-                transformerlayers=dict(
-                    type='mmdet.BaseTransformerLayer',
-                    attn_cfgs=dict(
-                        type='mmdet.MultiScaleDeformableAttention',
+                layer_cfg=dict(  # DeformableDetrTransformerEncoderLayer
+                    self_attn_cfg=dict(  # MultiScaleDeformableAttention
                         embed_dims=256,
                         num_heads=8,
                         num_levels=3,
                         num_points=4,
                         im2col_step=64,
                         dropout=0.0,
-                        batch_first=False,
+                        batch_first=True,
                         norm_cfg=None,
                         init_cfg=None),
-                    ffn_cfgs=dict(
-                        type='FFN',
+                    ffn_cfg=dict(
                         embed_dims=256,
                         feedforward_channels=1024,
                         num_fcs=2,
                         ffn_drop=0.0,
-                        act_cfg=dict(type='ReLU', inplace=True)),
-                    operation_order=('self_attn', 'norm', 'ffn', 'norm')),
+                        act_cfg=dict(type='ReLU', inplace=True))),
                 init_cfg=None),
-            positional_encoding=dict(
-                type='mmdet.SinePositionalEncoding',
-                num_feats=128,
-                normalize=True),
+            positional_encoding=dict(  # SinePositionalEncoding
+                num_feats=128, normalize=True),
             init_cfg=None),
         enforce_decoder_input_project=False,
-        positional_encoding=dict(
-            type='mmdet.SinePositionalEncoding', num_feats=128,
-            normalize=True),
-        transformer_decoder=dict(
-            type='mmdet.DetrTransformerDecoder',
+        positional_encoding=dict(  # SinePositionalEncoding
+            num_feats=128, normalize=True),
+        transformer_decoder=dict(  # Mask2FormerTransformerDecoder
             return_intermediate=True,
             num_layers=9,
-            transformerlayers=dict(
-                type='mmdet.DetrTransformerDecoderLayer',
-                attn_cfgs=dict(
-                    type='mmdet.MultiheadAttention',
+            layer_cfg=dict(  # Mask2FormerTransformerDecoderLayer
+                self_attn_cfg=dict(  # MultiheadAttention
                     embed_dims=256,
                     num_heads=8,
                     attn_drop=0.0,
                     proj_drop=0.0,
                     dropout_layer=None,
-                    batch_first=False),
-                ffn_cfgs=dict(
+                    batch_first=True),
+                cross_attn_cfg=dict(  # MultiheadAttention
+                    embed_dims=256,
+                    num_heads=8,
+                    attn_drop=0.0,
+                    proj_drop=0.0,
+                    dropout_layer=None,
+                    batch_first=True),
+                ffn_cfg=dict(
                     embed_dims=256,
                     feedforward_channels=2048,
                     num_fcs=2,
                     act_cfg=dict(type='ReLU', inplace=True),
                     ffn_drop=0.0,
                     dropout_layer=None,
-                    add_identity=True),
-                feedforward_channels=2048,
-                operation_order=('cross_attn', 'norm', 'self_attn', 'norm',
-                                 'ffn', 'norm')),
+                    add_identity=True)),
             init_cfg=None),
         loss_cls=dict(
             type='mmdet.CrossEntropyLoss',
diff --git a/configs/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024.py b/configs/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024.py
index f92dda98a6..fc132a698f 100644
--- a/configs/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024.py
+++ b/configs/mask2former/mask2former_r50_8xb2-90k_cityscapes-512x1024.py
@@ -41,65 +41,58 @@
             num_outs=3,
             norm_cfg=dict(type='GN', num_groups=32),
             act_cfg=dict(type='ReLU'),
-            encoder=dict(
-                type='mmdet.DetrTransformerEncoder',
+            encoder=dict(  # DeformableDetrTransformerEncoder
                 num_layers=6,
-                transformerlayers=dict(
-                    type='mmdet.BaseTransformerLayer',
-                    attn_cfgs=dict(
-                        type='mmdet.MultiScaleDeformableAttention',
+                layer_cfg=dict(  # DeformableDetrTransformerEncoderLayer
+                    self_attn_cfg=dict(  # MultiScaleDeformableAttention
                         embed_dims=256,
                         num_heads=8,
                         num_levels=3,
                         num_points=4,
                         im2col_step=64,
                         dropout=0.0,
-                        batch_first=False,
+                        batch_first=True,
                         norm_cfg=None,
                         init_cfg=None),
-                    ffn_cfgs=dict(
-                        type='FFN',
+                    ffn_cfg=dict(
                         embed_dims=256,
                         feedforward_channels=1024,
                         num_fcs=2,
                         ffn_drop=0.0,
-                        act_cfg=dict(type='ReLU', inplace=True)),
-                    operation_order=('self_attn', 'norm', 'ffn', 'norm')),
+                        act_cfg=dict(type='ReLU', inplace=True))),
                 init_cfg=None),
-            positional_encoding=dict(
-                type='mmdet.SinePositionalEncoding',
-                num_feats=128,
-                normalize=True),
+            positional_encoding=dict(  # SinePositionalEncoding
+                num_feats=128, normalize=True),
             init_cfg=None),
         enforce_decoder_input_project=False,
-        positional_encoding=dict(
-            type='mmdet.SinePositionalEncoding', num_feats=128,
-            normalize=True),
-        transformer_decoder=dict(
-            type='mmdet.DetrTransformerDecoder',
+        positional_encoding=dict(  # SinePositionalEncoding
+            num_feats=128, normalize=True),
+        transformer_decoder=dict(  # Mask2FormerTransformerDecoder
             return_intermediate=True,
             num_layers=9,
-            transformerlayers=dict(
-                type='mmdet.DetrTransformerDecoderLayer',
-                attn_cfgs=dict(
-                    type='mmdet.MultiheadAttention',
+            layer_cfg=dict(  # Mask2FormerTransformerDecoderLayer
+                self_attn_cfg=dict(  # MultiheadAttention
                     embed_dims=256,
                     num_heads=8,
                     attn_drop=0.0,
                     proj_drop=0.0,
                     dropout_layer=None,
-                    batch_first=False),
-                ffn_cfgs=dict(
+                    batch_first=True),
+                cross_attn_cfg=dict(  # MultiheadAttention
+                    embed_dims=256,
+                    num_heads=8,
+                    attn_drop=0.0,
+                    proj_drop=0.0,
+                    dropout_layer=None,
+                    batch_first=True),
+                ffn_cfg=dict(
                     embed_dims=256,
                     feedforward_channels=2048,
                     num_fcs=2,
                     act_cfg=dict(type='ReLU', inplace=True),
                     ffn_drop=0.0,
                     dropout_layer=None,
-                    add_identity=True),
-                feedforward_channels=2048,
-                operation_order=('cross_attn', 'norm', 'self_attn', 'norm',
-                                 'ffn', 'norm')),
+                    add_identity=True)),
             init_cfg=None),
         loss_cls=dict(
             type='mmdet.CrossEntropyLoss',
diff --git a/configs/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640.py b/configs/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640.py
index 56112dfa3e..4e4036db3a 100644
--- a/configs/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640.py
+++ b/configs/mask2former/mask2former_swin-b-in1k-384x384-pre_8xb2-160k_ade20k-640x640.py
@@ -53,65 +53,58 @@
             num_outs=3,
             norm_cfg=dict(type='GN', num_groups=32),
             act_cfg=dict(type='ReLU'),
-            encoder=dict(
-                type='mmdet.DetrTransformerEncoder',
+            encoder=dict(  # DeformableDetrTransformerEncoder
                 num_layers=6,
-                transformerlayers=dict(
-                    type='mmdet.BaseTransformerLayer',
-                    attn_cfgs=dict(
-                        type='mmdet.MultiScaleDeformableAttention',
+                layer_cfg=dict(  # DeformableDetrTransformerEncoderLayer
+                    self_attn_cfg=dict(  # MultiScaleDeformableAttention
                         embed_dims=256,
                         num_heads=8,
                         num_levels=3,
                         num_points=4,
                         im2col_step=64,
                         dropout=0.0,
-                        batch_first=False,
+                        batch_first=True,
                         norm_cfg=None,
                         init_cfg=None),
-                    ffn_cfgs=dict(
-                        type='FFN',
+                    ffn_cfg=dict(
                         embed_dims=256,
                         feedforward_channels=1024,
                         num_fcs=2,
                         ffn_drop=0.0,
-                        act_cfg=dict(type='ReLU', inplace=True)),
-                    operation_order=('self_attn', 'norm', 'ffn', 'norm')),
+                        act_cfg=dict(type='ReLU', inplace=True))),
                 init_cfg=None),
-            positional_encoding=dict(
-                type='mmdet.SinePositionalEncoding',
-                num_feats=128,
-                normalize=True),
+            positional_encoding=dict(  # SinePositionalEncoding
+                num_feats=128, normalize=True),
             init_cfg=None),
         enforce_decoder_input_project=False,
-        positional_encoding=dict(
-            type='mmdet.SinePositionalEncoding', num_feats=128,
-            normalize=True),
-        transformer_decoder=dict(
-            type='mmdet.DetrTransformerDecoder',
+        positional_encoding=dict(  # SinePositionalEncoding
+            num_feats=128, normalize=True),
+        transformer_decoder=dict(  # Mask2FormerTransformerDecoder
             return_intermediate=True,
             num_layers=9,
-            transformerlayers=dict(
-                type='mmdet.DetrTransformerDecoderLayer',
-                attn_cfgs=dict(
-                    type='mmdet.MultiheadAttention',
+            layer_cfg=dict(  # Mask2FormerTransformerDecoderLayer
+                self_attn_cfg=dict(  # MultiheadAttention
                     embed_dims=256,
                     num_heads=8,
                     attn_drop=0.0,
                     proj_drop=0.0,
                     dropout_layer=None,
-                    batch_first=False),
-                ffn_cfgs=dict(
+                    batch_first=True),
+                cross_attn_cfg=dict(  # MultiheadAttention
+                    embed_dims=256,
+                    num_heads=8,
+                    attn_drop=0.0,
+                    proj_drop=0.0,
+                    dropout_layer=None,
+                    batch_first=True),
+                ffn_cfg=dict(
                     embed_dims=256,
                     feedforward_channels=2048,
                     num_fcs=2,
                     act_cfg=dict(type='ReLU', inplace=True),
                     ffn_drop=0.0,
                     dropout_layer=None,
-                    add_identity=True),
-                feedforward_channels=2048,
-                operation_order=('cross_attn', 'norm', 'self_attn', 'norm',
-                                 'ffn', 'norm')),
+                    add_identity=True)),
             init_cfg=None),
         loss_cls=dict(
             type='mmdet.CrossEntropyLoss',
diff --git a/configs/maskformer/README.md b/configs/maskformer/README.md
index 5e33d17afb..0248dbb63c 100644
--- a/configs/maskformer/README.md
+++ b/configs/maskformer/README.md
@@ -47,10 +47,10 @@ pip install "mmdet>=3.0.0rc4"
 
 | Method     | Backbone  | Crop Size | Lr schd | Mem (GB) | Inf time (fps) | mIoU  | mIoU(ms+flip) | config                                                                                                                                       | download                                                                                                                                                                                                                                                                                                                                                                                                     |
 | ---------- | --------- | --------- | ------- | -------- | -------------- | ----- | ------------- | -------------------------------------------------------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
-| MaskFormer | R-50-D32  | 512x512   | 160000  | 3.29     | 42.20          | 44.29 | -             | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py)        | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512/maskformer_r50-d32_8xb2-160k_ade20k-512x512_20221030_182724-cbd39cc1.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512/maskformer_r50-d32_8xb2-160k_ade20k-512x512_20221030_182724.json)                             |
-| MaskFormer | R-101-D32 | 512x512   | 160000  | 4.12     | 34.90          | 45.11 | -             | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512.py)       | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512/maskformer_r101-d32_8xb2-160k_ade20k-512x512_20221031_223053-c8e0931d.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512/maskformer_r101-d32_8xb2-160k_ade20k-512x512_20221031_223053.json)                         |
-| MaskFormer | Swin-T    | 512x512   | 160000  | 3.73     | 40.53          | 46.69 | -             | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512_20221114_232813-03550716.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512_20221114_232813.json) |
-| MaskFormer | Swin-S    | 512x512   | 160000  | 5.33     | 26.98          | 49.36 | -             | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512_20221115_114710-5ab67e58.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512_20221115_114710.json) |
+| MaskFormer | R-50-D32  | 512x512   | 160000  | 3.29     | 42.20          | 44.29 | -             | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py)        | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512/maskformer_r50-d32_8xb2-160k_ade20k-512x512_20221030_182724-3a9cfe45.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512/maskformer_r50-d32_8xb2-160k_ade20k-512x512_20221030_182724.json)                             |
+| MaskFormer | R-101-D32 | 512x512   | 160000  | 4.12     | 34.90          | 45.11 | -             | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512.py)       | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512/maskformer_r101-d32_8xb2-160k_ade20k-512x512_20221031_223053-84adbfcb.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512/maskformer_r101-d32_8xb2-160k_ade20k-512x512_20221031_223053.json)                         |
+| MaskFormer | Swin-T    | 512x512   | 160000  | 3.73     | 40.53          | 46.69 | -             | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512_20221114_232813-f14e7ce0.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512_20221114_232813.json) |
+| MaskFormer | Swin-S    | 512x512   | 160000  | 5.33     | 26.98          | 49.36 | -             | [config](https://github.com/open-mmlab/mmsegmentation/blob/dev-1.x/configs/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512.py) | [model](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512_20221115_114710-723512c7.pth) \| [log](https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512_20221115_114710.json) |
 
 Note:
 
diff --git a/configs/maskformer/maskformer.yml b/configs/maskformer/maskformer.yml
index 1b3d398e34..b499476a50 100644
--- a/configs/maskformer/maskformer.yml
+++ b/configs/maskformer/maskformer.yml
@@ -35,7 +35,7 @@ Models:
     Metrics:
       mIoU: 44.29
   Config: configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512/maskformer_r50-d32_8xb2-160k_ade20k-512x512_20221030_182724-cbd39cc1.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512/maskformer_r50-d32_8xb2-160k_ade20k-512x512_20221030_182724-3a9cfe45.pth
 - Name: maskformer_r101-d32_8xb2-160k_ade20k-512x512
   In Collection: MaskFormer
   Metadata:
@@ -56,7 +56,7 @@ Models:
     Metrics:
       mIoU: 45.11
   Config: configs/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512/maskformer_r101-d32_8xb2-160k_ade20k-512x512_20221031_223053-c8e0931d.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_r101-d32_8xb2-160k_ade20k-512x512/maskformer_r101-d32_8xb2-160k_ade20k-512x512_20221031_223053-84adbfcb.pth
 - Name: maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512
   In Collection: MaskFormer
   Metadata:
@@ -77,7 +77,7 @@ Models:
     Metrics:
       mIoU: 46.69
   Config: configs/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512_20221114_232813-03550716.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-t_upernet_8xb2-160k_ade20k-512x512_20221114_232813-f14e7ce0.pth
 - Name: maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512
   In Collection: MaskFormer
   Metadata:
@@ -98,4 +98,4 @@ Models:
     Metrics:
       mIoU: 49.36
   Config: configs/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512.py
-  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512_20221115_114710-5ab67e58.pth
+  Weights: https://download.openmmlab.com/mmsegmentation/v0.5/maskformer/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512/maskformer_swin-s_upernet_8xb2-160k_ade20k-512x512_20221115_114710-723512c7.pth
diff --git a/configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py b/configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py
index 7d8f657221..2a83746171 100644
--- a/configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py
+++ b/configs/maskformer/maskformer_r50-d32_8xb2-160k_ade20k-512x512.py
@@ -43,36 +43,34 @@
             norm_cfg=dict(type='GN', num_groups=32),
             act_cfg=dict(type='ReLU')),
         enforce_decoder_input_project=False,
-        positional_encoding=dict(
-            type='mmdet.SinePositionalEncoding', num_feats=128,
-            normalize=True),
-        transformer_decoder=dict(
-            type='mmdet.DetrTransformerDecoder',
+        positional_encoding=dict(  # SinePositionalEncoding
+            num_feats=128, normalize=True),
+        transformer_decoder=dict(  # DetrTransformerDecoder
             return_intermediate=True,
             num_layers=6,
-            transformerlayers=dict(
-                type='mmdet.DetrTransformerDecoderLayer',
-                attn_cfgs=dict(
-                    type='mmdet.MultiheadAttention',
+            layer_cfg=dict(  # DetrTransformerDecoderLayer
+                self_attn_cfg=dict(  # MultiheadAttention
                     embed_dims=256,
                     num_heads=8,
                     attn_drop=0.1,
                     proj_drop=0.1,
                     dropout_layer=None,
-                    batch_first=False),
-                ffn_cfgs=dict(
+                    batch_first=True),
+                cross_attn_cfg=dict(  # MultiheadAttention
+                    embed_dims=256,
+                    num_heads=8,
+                    attn_drop=0.1,
+                    proj_drop=0.1,
+                    dropout_layer=None,
+                    batch_first=True),
+                ffn_cfg=dict(
                     embed_dims=256,
                     feedforward_channels=2048,
                     num_fcs=2,
                     act_cfg=dict(type='ReLU', inplace=True),
                     ffn_drop=0.1,
                     dropout_layer=None,
-                    add_identity=True),
-                # the following parameter was not used,
-                # just make current api happy
-                feedforward_channels=2048,
-                operation_order=('self_attn', 'norm', 'cross_attn', 'norm',
-                                 'ffn', 'norm')),
+                    add_identity=True)),
             init_cfg=None),
         loss_cls=dict(
             type='mmdet.CrossEntropyLoss',
diff --git a/docker/Dockerfile b/docker/Dockerfile
index 9ee49ab35c..73a0fac121 100644
--- a/docker/Dockerfile
+++ b/docker/Dockerfile
@@ -1,7 +1,7 @@
 ARG PYTORCH="1.11.0"
 ARG CUDA="11.3"
 ARG CUDNN="8"
-ARG MMCV="2.0.0rc3"
+ARG MMCV="2.0.0rc4"
 
 FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel
 
diff --git a/docker/serve/Dockerfile b/docker/serve/Dockerfile
index 2dddc6cdf3..5ae1eb607d 100644
--- a/docker/serve/Dockerfile
+++ b/docker/serve/Dockerfile
@@ -3,8 +3,8 @@ ARG CUDA="11.3"
 ARG CUDNN="8"
 FROM pytorch/pytorch:${PYTORCH}-cuda${CUDA}-cudnn${CUDNN}-devel
 
-ARG MMCV="2.0.0rc3"
-ARG MMSEG="1.0.0rc4"
+ARG MMCV="2.0.0rc4"
+ARG MMSEG="1.0.0rc5"
 
 ENV PYTHONUNBUFFERED TRUE
 
diff --git a/docs/en/migration/interface.md b/docs/en/migration/interface.md
index c816fceafe..1bc3d206e2 100644
--- a/docs/en/migration/interface.md
+++ b/docs/en/migration/interface.md
@@ -237,7 +237,7 @@ test_pipeline = [
 ]
 img_ratios = [0.5, 0.75, 1.0, 1.25, 1.5, 1.75]
 tta_pipeline = [
-    dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
+    dict(type='LoadImageFromFile', backend_args=dict(backend='local')),
     dict(
         type='TestTimeAug',
         transforms=[
diff --git a/docs/en/notes/changelog.md b/docs/en/notes/changelog.md
index ae9e565333..963cd6945b 100644
--- a/docs/en/notes/changelog.md
+++ b/docs/en/notes/changelog.md
@@ -1,5 +1,15 @@
 # Changelog of v1.x
 
+## v1.0.0rc5(02/01/2023)
+
+### Bug fix
+
+- Fix MaskFormer and Mask2Former when installing mmdet from source ([#2532](https://github.com/open-mmlab/mmsegmentation/pull/2532))
+- Support new fileio interface in `MMCV>=2.0.0rc4` ([#2543](https://github.com/open-mmlab/mmsegmentation/pull/2543))
+- Fix ERFNet URL in dev-1.x branch ([#2537](https://github.com/open-mmlab/mmsegmentation/pull/2537))
+- Fix misleading `List[Tensor]` types ([#2546](https://github.com/open-mmlab/mmsegmentation/pull/2546))
+- Rename typing.py to typing_utils.py ([#2548](https://github.com/open-mmlab/mmsegmentation/pull/2548))
+
 ## v1.0.0rc4(01/30/2023)
 
 ### Highlights
diff --git a/docs/en/notes/faq.md b/docs/en/notes/faq.md
index 48e97429c1..bb09873cf0 100644
--- a/docs/en/notes/faq.md
+++ b/docs/en/notes/faq.md
@@ -4,37 +4,20 @@ We list some common troubles faced by many users and their corresponding solutio
 
 ## Installation
 
-The compatible MMSegmentation and MMCV versions are as below. Please install the correct version of MMCV to avoid installation issues.
+The compatible MMSegmentation, MMCV and MMEngine versions are as below. Please install the correct versions of them to avoid installation issues.
 
-| MMSegmentation version |          MMCV version          | MMClassification (optional) version | MMDetection (optional) version |
-| :--------------------: | :----------------------------: | :---------------------------------: | :----------------------------: |
-|   1.x/dev-1.x branch   |        mmcv == 2.0.0rc3        |           mmcls>=1.0.0rc0           | mmdet>=3.0.0rc4, \<=3.0.0rc5>  |
-|        1.0.0rc4        |        mmcv == 2.0.0rc3        |           mmcls>=1.0.0rc0           | mmdet>=3.0.0rc4, \<=3.0.0rc5>  |
-|        1.0.0rc3        |        mmcv == 2.0.0rc3        |           mmcls>=1.0.0rc0           | mmdet>=3.0.0rc4  \<=3.0.0rc5>  |
-|        1.0.0rc2        |        mmcv == 2.0.0rc3        |           mmcls>=1.0.0rc0           | mmdet>=3.0.0rc4  \<=3.0.0rc5>  |
-|        1.0.0rc1        | mmcv >= 2.0.0rc1, \<=2.0.0rc3> |           mmcls>=1.0.0rc0           |          Not required          |
-|        1.0.0rc0        | mmcv >= 2.0.0rc1, \<=2.0.0rc3> |           mmcls>=1.0.0rc0           |          Not required          |
-|         master         |   mmcv-full>=1.4.4, \<=1.6.0   |       mmcls>=0.20.1, \<=1.0.0       |          Not required          |
-|         0.24.1         |   mmcv-full>=1.4.4, \<=1.6.0   |       mmcls>=0.20.1, \<=1.0.0       |          Not required          |
-|         0.23.0         |   mmcv-full>=1.4.4, \<=1.6.0   |       mmcls>=0.20.1, \<=1.0.0       |          Not required          |
-|         0.22.0         |   mmcv-full>=1.4.4, \<=1.6.0   |       mmcls>=0.20.1, \<=1.0.0       |          Not required          |
-|         0.21.1         |   mmcv-full>=1.4.4, \<=1.6.0   |            Not required             |          Not required          |
-|         0.20.2         |  mmcv-full>=1.3.13, \<=1.6.0   |            Not required             |          Not required          |
-|         0.19.0         |  mmcv-full>=1.3.13, \<1.3.17   |            Not required             |          Not required          |
-|         0.18.0         |  mmcv-full>=1.3.13, \<1.3.17   |            Not required             |          Not required          |
-|         0.17.0         |   mmcv-full>=1.3.7, \<1.3.17   |            Not required             |          Not required          |
-|         0.16.0         |   mmcv-full>=1.3.7, \<1.3.17   |            Not required             |          Not required          |
-|         0.15.0         |   mmcv-full>=1.3.7, \<1.3.17   |            Not required             |          Not required          |
-|         0.14.1         |   mmcv-full>=1.3.7, \<1.3.17   |            Not required             |          Not required          |
-|         0.14.0         |   mmcv-full>=1.3.1, \<1.3.2    |            Not required             |          Not required          |
-|         0.13.0         |   mmcv-full>=1.3.1, \<1.3.2    |            Not required             |          Not required          |
-|         0.12.0         |   mmcv-full>=1.1.4, \<1.3.2    |            Not required             |          Not required          |
-|         0.11.0         |   mmcv-full>=1.1.4, \<1.3.0    |            Not required             |          Not required          |
-|         0.10.0         |   mmcv-full>=1.1.4, \<1.3.0    |            Not required             |          Not required          |
-|         0.9.0          |   mmcv-full>=1.1.4, \<1.3.0    |            Not required             |          Not required          |
-|         0.8.0          |   mmcv-full>=1.1.4, \<1.2.0    |            Not required             |          Not required          |
-|         0.7.0          |   mmcv-full>=1.1.2, \<1.2.0    |            Not required             |          Not required          |
-|         0.6.0          |   mmcv-full>=1.1.2, \<1.2.0    |            Not required             |          Not required          |
+| MMSegmentation version |          MMCV version          | MMEngine version  | MMClassification (optional) version | MMDetection (optional) version |
+| :--------------------: | :----------------------------: | :---------------: | :---------------------------------: | :----------------------------: |
+|     dev-1.x branch     |        mmcv >= 2.0.0rc4        | MMEngine >= 0.2.0 |           mmcls>=1.0.0rc0           |         mmdet>3.0.0rc5         |
+|       1.x branch       |        mmcv >= 2.0.0rc4        | MMEngine >= 0.2.0 |           mmcls>=1.0.0rc0           |         mmdet>3.0.0rc5         |
+|        1.0.0rc5        |        mmcv >= 2.0.0rc4        | MMEngine >= 0.2.0 |           mmcls>=1.0.0rc0           |         mmdet>3.0.0rc5         |
+|        1.0.0rc4        |        mmcv == 2.0.0rc3        | MMEngine >= 0.1.0 |           mmcls>=1.0.0rc0           |  mmdet>=3.0.0rc4, \<=3.0.0rc5  |
+|        1.0.0rc3        |        mmcv == 2.0.0rc3        | MMEngine >= 0.1.0 |           mmcls>=1.0.0rc0           |  mmdet>=3.0.0rc4, \<=3.0.0rc5  |
+|        1.0.0rc2        |        mmcv == 2.0.0rc3        | MMEngine >= 0.1.0 |           mmcls>=1.0.0rc0           |  mmdet>=3.0.0rc4, \<=3.0.0rc5  |
+|        1.0.0rc1        | mmcv >= 2.0.0rc1, \<=2.0.0rc3  | MMEngine >= 0.1.0 |           mmcls>=1.0.0rc0           |          Not required          |
+|        1.0.0rc0        | mmcv >= 2.0.0rc1, \<=2.0.0rc3  | MMEngine >= 0.1.0 |           mmcls>=1.0.0rc0           |          Not required          |
+
+Note: To install MMSegmentation 0.x or the master branch, please refer to [the 0.x FAQ document](https://mmsegmentation.readthedocs.io/en/latest/faq.html#installation) to check the compatible versions of MMCV.
 
 ## How to know the number of GPUs needed to train the model
 
diff --git a/mmseg/__init__.py b/mmseg/__init__.py
index 59380655a2..765ff4a042 100644
--- a/mmseg/__init__.py
+++ b/mmseg/__init__.py
@@ -7,9 +7,9 @@
 
 from .version import __version__, version_info
 
-MMCV_MIN = '2.0.0rc3'
-MMCV_MAX = '2.0.0rc3'
-MMENGINE_MIN = '0.1.0'
+MMCV_MIN = '2.0.0rc4'
+MMCV_MAX = '2.1.0'
+MMENGINE_MIN = '0.2.0'
 MMENGINE_MAX = '1.0.0'
 
 
@@ -58,9 +58,9 @@ def digit_version(version_str: str, length: int = 4):
 mmcv_version = digit_version(mmcv.__version__)
 
 
-assert (mmcv_min_version <= mmcv_version <= mmcv_max_version), \
+assert (mmcv_min_version <= mmcv_version < mmcv_max_version), \
     f'MMCV=={mmcv.__version__} is used but incompatible. ' \
-    f'Please install mmcv==2.0.0rc3.'
+    f'Please install mmcv>=2.0.0rc4.'
 
 mmengine_min_version = digit_version(MMENGINE_MIN)
 mmengine_max_version = digit_version(MMENGINE_MAX)
diff --git a/mmseg/datasets/basesegdataset.py b/mmseg/datasets/basesegdataset.py
index e7f96f7d2c..bf433b2094 100644
--- a/mmseg/datasets/basesegdataset.py
+++ b/mmseg/datasets/basesegdataset.py
@@ -4,6 +4,7 @@
 from typing import Callable, Dict, List, Optional, Sequence, Union
 
 import mmengine
+import mmengine.fileio as fileio
 import numpy as np
 from mmengine.dataset import BaseDataset, Compose
 
@@ -72,9 +73,10 @@ class BaseSegDataset(BaseDataset):
         ignore_index (int): The label index to be ignored. Default: 255
         reduce_zero_label (bool): Whether to mark label zero as ignored.
             Default to False.
-        file_client_args (dict): Arguments to instantiate a FileClient.
-            See :class:`mmengine.fileio.FileClient` for details.
-            Defaults to ``dict(backend='disk')``.
+        backend_args (dict): Arguments to instantiate a file backend.
+            See https://mmengine.readthedocs.io/en/latest/api/fileio.html
+            for details. Defaults to ``dict(backend='local')``.
+            Notes: mmcv>=2.0.0rc4, mmengine>=0.2.0 required.
     """
     METAINFO: dict = dict()
 
@@ -95,16 +97,14 @@ def __init__(
         max_refetch: int = 1000,
         ignore_index: int = 255,
         reduce_zero_label: bool = False,
-        file_client_args: dict = dict(backend='disk')
+        backend_args: dict = dict(backend='local')
     ) -> None:
 
         self.img_suffix = img_suffix
         self.seg_map_suffix = seg_map_suffix
         self.ignore_index = ignore_index
         self.reduce_zero_label = reduce_zero_label
-        self.file_client_args = file_client_args
-        self.file_client = mmengine.FileClient.infer_client(
-            self.file_client_args)
+        self.backend_args = backend_args.copy()
 
         self.data_root = data_root
         self.data_prefix = copy.copy(data_prefix)
@@ -239,7 +239,7 @@ def load_data_list(self) -> List[dict]:
         ann_dir = self.data_prefix.get('seg_map_path', None)
         if osp.isfile(self.ann_file):
             lines = mmengine.list_from_file(
-                self.ann_file, file_client_args=self.file_client_args)
+                self.ann_file, backend_args=self.backend_args)
             for line in lines:
                 img_name = line.strip()
                 data_info = dict(
@@ -252,11 +252,12 @@ def load_data_list(self) -> List[dict]:
                 data_info['seg_fields'] = []
                 data_list.append(data_info)
         else:
-            for img in self.file_client.list_dir_or_file(
+            for img in fileio.list_dir_or_file(
                     dir_path=img_dir,
                     list_dir=False,
                     suffix=self.img_suffix,
-                    recursive=True):
+                    recursive=True,
+                    backend_args=self.backend_args):
                 data_info = dict(img_path=osp.join(img_dir, img))
                 if ann_dir is not None:
                     seg_map = img.replace(self.img_suffix, self.seg_map_suffix)
diff --git a/mmseg/datasets/isaid.py b/mmseg/datasets/isaid.py
index d75cfcb7ea..61942ec1ea 100644
--- a/mmseg/datasets/isaid.py
+++ b/mmseg/datasets/isaid.py
@@ -1,4 +1,6 @@
 # Copyright (c) OpenMMLab. All rights reserved.
+import mmengine.fileio as fileio
+
 from mmseg.registry import DATASETS
 from .basesegdataset import BaseSegDataset
 
@@ -33,4 +35,5 @@ def __init__(self,
             seg_map_suffix=seg_map_suffix,
             ignore_index=ignore_index,
             **kwargs)
-        assert self.file_client.exists(self.data_prefix['img_path'])
+        assert fileio.exists(
+            self.data_prefix['img_path'], backend_args=self.backend_args)
diff --git a/mmseg/datasets/transforms/loading.py b/mmseg/datasets/transforms/loading.py
index 65c0dfec47..5a413717b6 100644
--- a/mmseg/datasets/transforms/loading.py
+++ b/mmseg/datasets/transforms/loading.py
@@ -3,7 +3,7 @@
 from typing import Dict
 
 import mmcv
-import mmengine
+import mmengine.fileio as fileio
 import numpy as np
 from mmcv.transforms import BaseTransform
 from mmcv.transforms import LoadAnnotations as MMCV_LoadAnnotations
@@ -54,15 +54,16 @@ class LoadAnnotations(MMCV_LoadAnnotations):
             argument for :func:``mmcv.imfrombytes``.
             See :fun:``mmcv.imfrombytes`` for details.
             Defaults to 'pillow'.
-        file_client_args (dict): Arguments to instantiate a FileClient.
-            See :class:``mmcv.fileio.FileClient`` for details.
-            Defaults to ``dict(backend='disk')``.
+        backend_args (dict): Arguments to instantiate a file backend.
+            See https://mmengine.readthedocs.io/en/latest/api/fileio.html
+            for details. Defaults to ``dict(backend='local')``.
+            Notes: mmcv>=2.0.0rc4, mmengine>=0.2.0 required.
     """
 
     def __init__(
         self,
         reduce_zero_label=None,
-        file_client_args=dict(backend='disk'),
+        backend_args=dict(backend='local'),
         imdecode_backend='pillow',
     ) -> None:
         super().__init__(
@@ -71,14 +72,13 @@ def __init__(
             with_seg=True,
             with_keypoints=False,
             imdecode_backend=imdecode_backend,
-            file_client_args=file_client_args)
+            backend_args=backend_args)
         self.reduce_zero_label = reduce_zero_label
         if self.reduce_zero_label is not None:
             warnings.warn('`reduce_zero_label` will be deprecated, '
                           'if you would like to ignore the zero label, please '
                           'set `reduce_zero_label=True` when dataset '
                           'initialized')
-        self.file_client_args = file_client_args.copy()
         self.imdecode_backend = imdecode_backend
 
     def _load_seg_map(self, results: dict) -> None:
@@ -91,7 +91,8 @@ def _load_seg_map(self, results: dict) -> None:
             dict: The dict contains loaded semantic segmentation annotations.
         """
 
-        img_bytes = self.file_client.get(results['seg_map_path'])
+        img_bytes = fileio.get(
+            results['seg_map_path'], backend_args=self.backend_args)
         gt_semantic_seg = mmcv.imfrombytes(
             img_bytes, flag='unchanged',
             backend=self.imdecode_backend).squeeze().astype(np.uint8)
@@ -121,9 +122,9 @@ def _load_seg_map(self, results: dict) -> None:
 
     def __repr__(self) -> str:
         repr_str = self.__class__.__name__
-        repr_str += f'(reduce_zero_label={self.reduce_zero_label},'
-        repr_str += f"imdecode_backend='{self.imdecode_backend}')"
-        repr_str += f'file_client_args={self.file_client_args})'
+        repr_str += f'(reduce_zero_label={self.reduce_zero_label}, '
+        repr_str += f"imdecode_backend='{self.imdecode_backend}', "
+        repr_str += f'backend_args={self.backend_args})'
         return repr_str
 
 
@@ -202,9 +203,10 @@ class LoadBiomedicalImageFromFile(BaseTransform):
         to_float32 (bool): Whether to convert the loaded image to a float32
             numpy array. If set to False, the loaded image is an float64 array.
             Defaults to True.
-        file_client_args (dict): Arguments to instantiate a FileClient.
-            See :class:`mmengine.fileio.FileClient` for details.
-            Defaults to ``dict(backend='disk')``.
+        backend_args (dict): Arguments to instantiate a file backend.
+            See https://mmengine.readthedocs.io/en/latest/api/fileio.html
+            for details. Defaults to ``dict(backend='local')``.
+            Notes: mmcv>=2.0.0rc4, mmengine>=0.2.0 required.
     """
 
     def __init__(
@@ -212,13 +214,12 @@ def __init__(
         decode_backend: str = 'nifti',
         to_xyz: bool = False,
         to_float32: bool = True,
-        file_client_args: dict = dict(backend='disk')
+        backend_args: dict = dict(backend='local')
     ) -> None:
         self.decode_backend = decode_backend
         self.to_xyz = to_xyz
         self.to_float32 = to_float32
-        self.file_client_args = file_client_args.copy()
-        self.file_client = mmengine.FileClient(**self.file_client_args)
+        self.backend_args = backend_args.copy()
 
     def transform(self, results: Dict) -> Dict:
         """Functions to load image.
@@ -232,7 +233,7 @@ def transform(self, results: Dict) -> Dict:
 
         filename = results['img_path']
 
-        data_bytes = self.file_client.get(filename)
+        data_bytes = fileio.get(filename, self.backend_args)
         img = datafrombytes(data_bytes, backend=self.decode_backend)
 
         if self.to_float32:
@@ -257,7 +258,7 @@ def __repr__(self):
                     f"decode_backend='{self.decode_backend}', "
                     f'to_xyz={self.to_xyz}, '
                     f'to_float32={self.to_float32}, '
-                    f'file_client_args={self.file_client_args})')
+                    f'backend_args={self.backend_args})')
         return repr_str
 
 
@@ -294,9 +295,10 @@ class LoadBiomedicalAnnotation(BaseTransform):
         to_float32 (bool): Whether to convert the loaded seg map to a float32
             numpy array. If set to False, the loaded image is an float64 array.
             Defaults to True.
-        file_client_args (dict): Arguments to instantiate a FileClient.
-            See :class:`mmengine.fileio.FileClient` for details.
-            Defaults to ``dict(backend='disk')``.
+        backend_args (dict): Arguments to instantiate a file backend.
+            See :mod:`mmengine.fileio` for details.
+            Defaults to ``dict(backend='local')``.
+            Notes: mmcv>=2.0.0rc4, mmengine>=0.2.0 required.
     """
 
     def __init__(
@@ -304,14 +306,13 @@ def __init__(
         decode_backend: str = 'nifti',
         to_xyz: bool = False,
         to_float32: bool = True,
-        file_client_args: dict = dict(backend='disk')
+        backend_args: dict = dict(backend='local')
     ) -> None:
         super().__init__()
         self.decode_backend = decode_backend
         self.to_xyz = to_xyz
         self.to_float32 = to_float32
-        self.file_client_args = file_client_args.copy()
-        self.file_client = mmengine.FileClient(**self.file_client_args)
+        self.backend_args = backend_args.copy()
 
     def transform(self, results: Dict) -> Dict:
         """Functions to load image.
@@ -322,7 +323,7 @@ def transform(self, results: Dict) -> Dict:
         Returns:
             dict: The dict contains loaded image and meta information.
         """
-        data_bytes = self.file_client.get(results['seg_map_path'])
+        data_bytes = fileio.get(results['seg_map_path'], self.backend_args)
         gt_seg_map = datafrombytes(data_bytes, backend=self.decode_backend)
 
         if self.to_float32:
@@ -342,7 +343,7 @@ def __repr__(self):
                     f"decode_backend='{self.decode_backend}', "
                     f'to_xyz={self.to_xyz}, '
                     f'to_float32={self.to_float32}, '
-                    f'file_client_args={self.file_client_args})')
+                    f'backend_args={self.backend_args})')
         return repr_str
 
 
@@ -383,9 +384,10 @@ class LoadBiomedicalData(BaseTransform):
             backend is 'nifti'. Defaults to 'nifti'.
         to_xyz (bool): Whether transpose data from Z, Y, X to X, Y, Z.
             Defaults to False.
-        file_client_args (dict): Arguments to instantiate a FileClient.
-            See :class:`mmengine.fileio.FileClient` for details.
-            Defaults to ``dict(backend='disk')``.
+        backend_args (dict): Arguments to instantiate a file backend.
+            See https://mmengine.readthedocs.io/en/latest/api/fileio.html
+            for details. Defaults to ``dict(backend='local')``.
+            Notes: mmcv>=2.0.0rc4, mmengine>=0.2.0 required.
     """
 
     def __init__(
@@ -393,13 +395,12 @@ def __init__(
         with_seg=False,
         decode_backend: str = 'numpy',
         to_xyz: bool = False,
-        file_client_args: dict = dict(backend='disk')
-    ) -> None:
+        backend_args: dict = dict(backend='local')
+    ) -> None:  # noqa
         self.with_seg = with_seg
         self.decode_backend = decode_backend
         self.to_xyz = to_xyz
-        self.file_client_args = file_client_args.copy()
-        self.file_client = mmengine.FileClient(**self.file_client_args)
+        self.backend_args = backend_args.copy()
 
     def transform(self, results: Dict) -> Dict:
         """Functions to load image.
@@ -410,7 +411,7 @@ def transform(self, results: Dict) -> Dict:
         Returns:
             dict: The dict contains loaded image and meta information.
         """
-        data_bytes = self.file_client.get(results['img_path'])
+        data_bytes = fileio.get(results['img_path'], self.backend_args)
         data = datafrombytes(data_bytes, backend=self.decode_backend)
         # img is 4D data (N, X, Y, Z), N is the number of protocol
         img = data[:-1, :]
@@ -440,5 +441,5 @@ def __repr__(self) -> str:
                     f'with_seg={self.with_seg}, '
                     f"decode_backend='{self.decode_backend}', "
                     f'to_xyz={self.to_xyz}, '
-                    f'file_client_args={self.file_client_args})')
+                    f'backend_args={self.backend_args})')
         return repr_str
diff --git a/mmseg/datasets/voc.py b/mmseg/datasets/voc.py
index 66f2230788..5e5d6025c0 100644
--- a/mmseg/datasets/voc.py
+++ b/mmseg/datasets/voc.py
@@ -1,6 +1,8 @@
 # Copyright (c) OpenMMLab. All rights reserved.
 import os.path as osp
 
+import mmengine.fileio as fileio
+
 from mmseg.registry import DATASETS
 from .basesegdataset import BaseSegDataset
 
@@ -34,5 +36,5 @@ def __init__(self,
             seg_map_suffix=seg_map_suffix,
             ann_file=ann_file,
             **kwargs)
-        assert self.file_client.exists(
-            self.data_prefix['img_path']) and osp.isfile(self.ann_file)
+        assert fileio.exists(self.data_prefix['img_path'],
+                             self.backend_args) and osp.isfile(self.ann_file)
diff --git a/mmseg/engine/hooks/visualization_hook.py b/mmseg/engine/hooks/visualization_hook.py
index 5388a659a8..25aa1cf8b5 100644
--- a/mmseg/engine/hooks/visualization_hook.py
+++ b/mmseg/engine/hooks/visualization_hook.py
@@ -4,7 +4,7 @@
 from typing import Sequence
 
 import mmcv
-from mmengine.fileio import FileClient
+import mmengine.fileio as fileio
 from mmengine.hooks import Hook
 from mmengine.runner import Runner
 
@@ -30,9 +30,10 @@ class SegVisualizationHook(Hook):
         interval (int): The interval of visualization. Defaults to 50.
         show (bool): Whether to display the drawn image. Default to False.
         wait_time (float): The interval of show (s). Defaults to 0.
-        file_client_args (dict): Arguments to instantiate a FileClient.
-            See :class:`mmengine.fileio.FileClient` for details.
-            Defaults to ``dict(backend='disk')``.
+        backend_args (dict): Arguments to instantiate a file backend.
+            See https://mmengine.readthedocs.io/en/latest/api/fileio.html
+            for details. Defaults to ``dict(backend='local')``.
+            Notes: mmcv>=2.0.0rc4, mmengine>=0.2.0 required.
     """
 
     def __init__(self,
@@ -40,7 +41,7 @@ def __init__(self,
                  interval: int = 50,
                  show: bool = False,
                  wait_time: float = 0.,
-                 file_client_args: dict = dict(backend='disk')):
+                 backend_args: dict = dict(backend='local')):
         self._visualizer: SegLocalVisualizer = \
             SegLocalVisualizer.get_current_instance()
         self.interval = interval
@@ -54,8 +55,7 @@ def __init__(self,
                           'needs to be excluded.')
 
         self.wait_time = wait_time
-        self.file_client_args = file_client_args.copy()
-        self.file_client = None
+        self.backend_args = backend_args.copy()
         self.draw = draw
         if not self.draw:
             warnings.warn('The draw is False, it means that the '
@@ -81,13 +81,11 @@ def _after_iter(self,
         if self.draw is False or mode == 'train':
             return
 
-        if self.file_client is None:
-            self.file_client = FileClient(**self.file_client_args)
-
         if self.every_n_inner_iters(batch_idx, self.interval):
             for output in outputs:
                 img_path = output.img_path
-                img_bytes = self.file_client.get(img_path)
+                img_bytes = fileio.get(
+                    img_path, backend_args=self.backend_args)
                 img = mmcv.imfrombytes(img_bytes, channel_order='rgb')
                 window_name = f'{mode}_{osp.basename(img_path)}'
 
diff --git a/mmseg/models/decode_heads/decode_head.py b/mmseg/models/decode_heads/decode_head.py
index 0803715f82..8bdbb24a1c 100644
--- a/mmseg/models/decode_heads/decode_head.py
+++ b/mmseg/models/decode_heads/decode_head.py
@@ -263,7 +263,7 @@ def loss(self, inputs: Tuple[Tensor], batch_data_samples: SampleList,
         return losses
 
     def predict(self, inputs: Tuple[Tensor], batch_img_metas: List[dict],
-                test_cfg: ConfigType) -> List[Tensor]:
+                test_cfg: ConfigType) -> Tensor:
         """Forward function for prediction.
 
         Args:
@@ -276,7 +276,7 @@ def predict(self, inputs: Tuple[Tensor], batch_img_metas: List[dict],
             test_cfg (dict): The testing config.
 
         Returns:
-            List[Tensor]: Outputs segmentation logits map.
+            Tensor: Outputs segmentation logits map.
         """
         seg_logits = self.forward(inputs)
 
diff --git a/mmseg/models/segmentors/base.py b/mmseg/models/segmentors/base.py
index d9ffeceb39..25487de5ab 100644
--- a/mmseg/models/segmentors/base.py
+++ b/mmseg/models/segmentors/base.py
@@ -126,7 +126,7 @@ def _forward(self,
 
     def postprocess_result(self,
                            seg_logits: Tensor,
-                           data_samples: OptSampleList = None) -> list:
+                           data_samples: OptSampleList = None) -> SampleList:
         """ Convert results list to `SegDataSample`.
         Args:
             seg_logits (Tensor): The segmentation results, seg_logits from
diff --git a/mmseg/models/segmentors/cascade_encoder_decoder.py b/mmseg/models/segmentors/cascade_encoder_decoder.py
index f76e66f931..c932b43069 100644
--- a/mmseg/models/segmentors/cascade_encoder_decoder.py
+++ b/mmseg/models/segmentors/cascade_encoder_decoder.py
@@ -70,7 +70,7 @@ def _init_decode_head(self, decode_head: ConfigType) -> None:
         self.num_classes = self.decode_head[-1].num_classes
 
     def encode_decode(self, inputs: Tensor,
-                      batch_img_metas: List[dict]) -> List[Tensor]:
+                      batch_img_metas: List[dict]) -> Tensor:
         """Encode images with backbone and decode into a semantic segmentation
         map of the same size as input."""
         x = self.extract_feat(inputs)
diff --git a/mmseg/models/segmentors/encoder_decoder.py b/mmseg/models/segmentors/encoder_decoder.py
index c4f44ba005..0a8db3ec7d 100644
--- a/mmseg/models/segmentors/encoder_decoder.py
+++ b/mmseg/models/segmentors/encoder_decoder.py
@@ -120,7 +120,7 @@ def extract_feat(self, inputs: Tensor) -> List[Tensor]:
         return x
 
     def encode_decode(self, inputs: Tensor,
-                      batch_img_metas: List[dict]) -> List[Tensor]:
+                      batch_img_metas: List[dict]) -> Tensor:
         """Encode images with backbone and decode into a semantic segmentation
         map of the same size as input."""
         x = self.extract_feat(inputs)
diff --git a/mmseg/utils/__init__.py b/mmseg/utils/__init__.py
index 661796147d..cb1436c198 100644
--- a/mmseg/utils/__init__.py
+++ b/mmseg/utils/__init__.py
@@ -13,9 +13,9 @@
 from .io import datafrombytes
 from .misc import add_prefix, stack_batch
 from .set_env import register_all_modules
-from .typing import (ConfigType, ForwardResults, MultiConfig, OptConfigType,
-                     OptMultiConfig, OptSampleList, SampleList, TensorDict,
-                     TensorList)
+from .typing_utils import (ConfigType, ForwardResults, MultiConfig,
+                           OptConfigType, OptMultiConfig, OptSampleList,
+                           SampleList, TensorDict, TensorList)
 
 __all__ = [
     'collect_env', 'register_all_modules', 'stack_batch', 'add_prefix',
diff --git a/mmseg/utils/misc.py b/mmseg/utils/misc.py
index 09d2349c15..0a561732e9 100644
--- a/mmseg/utils/misc.py
+++ b/mmseg/utils/misc.py
@@ -5,7 +5,7 @@
 import torch
 import torch.nn.functional as F
 
-from .typing import SampleList
+from .typing_utils import SampleList
 
 
 def add_prefix(inputs, prefix):
diff --git a/mmseg/utils/typing.py b/mmseg/utils/typing_utils.py
similarity index 100%
rename from mmseg/utils/typing.py
rename to mmseg/utils/typing_utils.py
diff --git a/mmseg/version.py b/mmseg/version.py
index ae61f8bf7b..10ceca8120 100644
--- a/mmseg/version.py
+++ b/mmseg/version.py
@@ -1,6 +1,6 @@
 # Copyright (c) Open-MMLab. All rights reserved.
 
-__version__ = '1.0.0rc4'
+__version__ = '1.0.0rc5'
 
 
 def parse_version_info(version_str):
diff --git a/requirements/mminstall.txt b/requirements/mminstall.txt
index 2c8e9d6a22..11a6d5a57f 100644
--- a/requirements/mminstall.txt
+++ b/requirements/mminstall.txt
@@ -1,4 +1,4 @@
 mmcls>=1.0.0rc0
-mmcv==2.0.0rc3
-mmdet==3.0.0rc5
-mmengine>=0.1.0,<1.0.0
+mmcv>=2.0.0rc4
+-e git+https://github.com/open-mmlab/mmdetection.git@dev-3.x#egg=mmdet
+mmengine>=0.2.0,<1.0.0
diff --git a/tests/test_datasets/test_dataset.py b/tests/test_datasets/test_dataset.py
index c768f09ade..7c37204a6c 100644
--- a/tests/test_datasets/test_dataset.py
+++ b/tests/test_datasets/test_dataset.py
@@ -2,7 +2,6 @@
 import os
 import os.path as osp
 import tempfile
-from unittest.mock import MagicMock
 
 import pytest
 
@@ -300,17 +299,15 @@ def test_lip():
 def test_custom_classes_override_default(dataset, classes):
 
     dataset_class = DATASETS.get(dataset)
-    if isinstance(dataset_class, PascalVOCDataset):
-        tmp_file = tempfile.NamedTemporaryFile()
-        ann_file = f'{tmp_file.name}.txt'
-    else:
-        ann_file = MagicMock()
-
     original_classes = dataset_class.METAINFO.get('classes', None)
 
+    tmp_file = tempfile.NamedTemporaryFile()
+    ann_file = tmp_file.name
+    img_path = tempfile.mkdtemp()
+
     # Test setting classes as a tuple
     custom_dataset = dataset_class(
-        data_prefix=dict(img_path=MagicMock()),
+        data_prefix=dict(img_path=img_path),
         ann_file=ann_file,
         metainfo=dict(classes=classes),
         test_mode=True,
@@ -323,7 +320,7 @@ def test_custom_classes_override_default(dataset, classes):
 
     # Test setting classes as a list
     custom_dataset = dataset_class(
-        data_prefix=dict(img_path=MagicMock()),
+        data_prefix=dict(img_path=img_path),
         ann_file=ann_file,
         metainfo=dict(classes=list(classes)),
         test_mode=True,
@@ -337,7 +334,7 @@ def test_custom_classes_override_default(dataset, classes):
     # Test overriding not a subset
     custom_dataset = dataset_class(
         ann_file=ann_file,
-        data_prefix=dict(img_path=MagicMock()),
+        data_prefix=dict(img_path=img_path),
         metainfo=dict(classes=[classes[0]]),
         test_mode=True,
         lazy_init=True)
@@ -352,13 +349,13 @@ def test_custom_classes_override_default(dataset, classes):
         with pytest.raises(AssertionError):
             custom_dataset = dataset_class(
                 ann_file=ann_file,
-                data_prefix=dict(img_path=MagicMock()),
+                data_prefix=dict(img_path=img_path),
                 metainfo=None,
                 test_mode=True,
                 lazy_init=True)
     else:
         custom_dataset = dataset_class(
-            data_prefix=dict(img_path=MagicMock()),
+            data_prefix=dict(img_path=img_path),
             ann_file=ann_file,
             metainfo=None,
             test_mode=True,
@@ -371,8 +368,8 @@ def test_custom_classes_override_default(dataset, classes):
 def test_custom_dataset_random_palette_is_generated():
     dataset = BaseSegDataset(
         pipeline=[],
-        data_prefix=dict(img_path=MagicMock()),
-        ann_file=MagicMock(),
+        data_prefix=dict(img_path=tempfile.mkdtemp()),
+        ann_file=tempfile.mkdtemp(),
         metainfo=dict(classes=('bus', 'car')),
         lazy_init=True,
         test_mode=True)
@@ -384,8 +381,8 @@ def test_custom_dataset_random_palette_is_generated():
 
 def test_custom_dataset_custom_palette():
     dataset = BaseSegDataset(
-        data_prefix=dict(img_path=MagicMock()),
-        ann_file=MagicMock(),
+        data_prefix=dict(img_path=tempfile.mkdtemp()),
+        ann_file=tempfile.mkdtemp(),
         metainfo=dict(
             classes=('bus', 'car'), palette=[[100, 100, 100], [200, 200,
                                                                200]]),
@@ -396,7 +393,7 @@ def test_custom_dataset_custom_palette():
     # test custom class and palette don't match
     with pytest.raises(ValueError):
         dataset = BaseSegDataset(
-            data_prefix=dict(img_path=MagicMock()),
-            ann_file=MagicMock(),
+            data_prefix=dict(img_path=tempfile.mkdtemp()),
+            ann_file=tempfile.mkdtemp(),
             metainfo=dict(classes=('bus', 'car'), palette=[[200, 200, 200]]),
             lazy_init=True)
diff --git a/tests/test_datasets/test_loading.py b/tests/test_datasets/test_loading.py
index 3d5569682a..100eb042e2 100644
--- a/tests/test_datasets/test_loading.py
+++ b/tests/test_datasets/test_loading.py
@@ -30,7 +30,7 @@ def test_load_img(self):
         assert results['ori_shape'] == results['img'].shape[:2]
         assert repr(transform) == transform.__class__.__name__ + \
                "(ignore_empty=False, to_float32=False, color_type='color'," + \
-               " imdecode_backend='cv2', file_client_args={'backend': 'disk'})"
+               " imdecode_backend='cv2', backend_args=None)"
 
         # to_float32
         transform = LoadImageFromFile(to_float32=True)
@@ -57,9 +57,9 @@ def test_load_seg(self):
         results = transform(copy.deepcopy(results))
         assert results['gt_seg_map'].shape == (288, 512)
         assert results['gt_seg_map'].dtype == np.uint8
-        assert repr(transform) == transform.__class__.__name__ + \
-            "(reduce_zero_label=True,imdecode_backend='pillow')" + \
-            "file_client_args={'backend': 'disk'})"
+        # assert repr(transform) == transform.__class__.__name__ + \
+        #     "(reduce_zero_label=True, imdecode_backend='pillow', " + \
+        #     "backend_args={'backend': 'local'})"
 
         # reduce_zero_label
         transform = LoadAnnotations(reduce_zero_label=True)
@@ -225,7 +225,7 @@ def test_load_image_from_ndarray(self):
                                    'to_float32=False, '
                                    "color_type='color', "
                                    "imdecode_backend='cv2', "
-                                   "file_client_args={'backend': 'disk'})")
+                                   'backend_args=None)')
 
     def test_load_biomedical_img(self):
         results = dict(
@@ -241,7 +241,7 @@ def test_load_biomedical_img(self):
                                    "decode_backend='nifti', "
                                    'to_xyz=False, '
                                    'to_float32=True, '
-                                   "file_client_args={'backend': 'disk'})")
+                                   "backend_args={'backend': 'local'})")
 
     def test_load_biomedical_annotation(self):
         results = dict(
@@ -265,7 +265,7 @@ def test_load_biomedical_data(self):
                                    'with_seg=True, '
                                    "decode_backend='numpy', "
                                    'to_xyz=False, '
-                                   "file_client_args={'backend': 'disk'})")
+                                   "backend_args={'backend': 'local'})")
 
         transform = LoadBiomedicalData(with_seg=False)
         results = transform(copy.deepcopy(input_results))
@@ -275,4 +275,4 @@ def test_load_biomedical_data(self):
                                    'with_seg=False, '
                                    "decode_backend='numpy', "
                                    'to_xyz=False, '
-                                   "file_client_args={'backend': 'disk'})")
+                                   "backend_args={'backend': 'local'})")
diff --git a/tests/test_models/test_heads/test_mask2former_head.py b/tests/test_models/test_heads/test_mask2former_head.py
index 079e94ed97..45b353d441 100644
--- a/tests/test_models/test_heads/test_mask2former_head.py
+++ b/tests/test_models/test_heads/test_mask2former_head.py
@@ -25,65 +25,58 @@ def test_mask2former_head():
             num_outs=3,
             norm_cfg=dict(type='GN', num_groups=32),
             act_cfg=dict(type='ReLU'),
-            encoder=dict(
-                type='mmdet.DetrTransformerEncoder',
+            encoder=dict(  # DeformableDetrTransformerEncoder
                 num_layers=6,
-                transformerlayers=dict(
-                    type='mmdet.BaseTransformerLayer',
-                    attn_cfgs=dict(
-                        type='mmdet.MultiScaleDeformableAttention',
+                layer_cfg=dict(  # DeformableDetrTransformerEncoderLayer
+                    self_attn_cfg=dict(  # MultiScaleDeformableAttention
                         embed_dims=256,
                         num_heads=8,
                         num_levels=3,
                         num_points=4,
                         im2col_step=64,
                         dropout=0.0,
-                        batch_first=False,
+                        batch_first=True,
                         norm_cfg=None,
                         init_cfg=None),
-                    ffn_cfgs=dict(
-                        type='FFN',
+                    ffn_cfg=dict(
                         embed_dims=256,
                         feedforward_channels=1024,
                         num_fcs=2,
                         ffn_drop=0.0,
-                        act_cfg=dict(type='ReLU', inplace=True)),
-                    operation_order=('self_attn', 'norm', 'ffn', 'norm')),
+                        act_cfg=dict(type='ReLU', inplace=True))),
                 init_cfg=None),
-            positional_encoding=dict(
-                type='mmdet.SinePositionalEncoding',
-                num_feats=128,
-                normalize=True),
+            positional_encoding=dict(  # SinePositionalEncoding
+                num_feats=128, normalize=True),
             init_cfg=None),
         enforce_decoder_input_project=False,
-        positional_encoding=dict(
-            type='mmdet.SinePositionalEncoding', num_feats=128,
-            normalize=True),
-        transformer_decoder=dict(
-            type='mmdet.DetrTransformerDecoder',
+        positional_encoding=dict(  # SinePositionalEncoding
+            num_feats=128, normalize=True),
+        transformer_decoder=dict(  # Mask2FormerTransformerDecoder
             return_intermediate=True,
             num_layers=9,
-            transformerlayers=dict(
-                type='mmdet.DetrTransformerDecoderLayer',
-                attn_cfgs=dict(
-                    type='mmdet.MultiheadAttention',
+            layer_cfg=dict(  # Mask2FormerTransformerDecoderLayer
+                self_attn_cfg=dict(  # MultiheadAttention
                     embed_dims=256,
                     num_heads=8,
                     attn_drop=0.0,
                     proj_drop=0.0,
                     dropout_layer=None,
-                    batch_first=False),
-                ffn_cfgs=dict(
+                    batch_first=True),
+                cross_attn_cfg=dict(  # MultiheadAttention
+                    embed_dims=256,
+                    num_heads=8,
+                    attn_drop=0.0,
+                    proj_drop=0.0,
+                    dropout_layer=None,
+                    batch_first=True),
+                ffn_cfg=dict(
                     embed_dims=256,
                     feedforward_channels=2048,
                     num_fcs=2,
                     act_cfg=dict(type='ReLU', inplace=True),
                     ffn_drop=0.0,
                     dropout_layer=None,
-                    add_identity=True),
-                feedforward_channels=2048,
-                operation_order=('cross_attn', 'norm', 'self_attn', 'norm',
-                                 'ffn', 'norm')),
+                    add_identity=True)),
             init_cfg=None),
         loss_cls=dict(
             type='mmdet.CrossEntropyLoss',