WHY IS BINARY SEGMENTATION SO DIFFICULT? #3753

Mayolov · 2024-08-06T00:40:15Z

Hi

I'm trying to do binary segmentation and am getting metric values that I shouldn't on the validation and test data. After 3 iterations it reports that the IoU is perfect, when it shouldn't be. The masks are just black-and-white images with shape (512, 512). I'm following this git repo for mae_unet: https://github.com/implus/mae_segmentation

I can't figure out how I'm supposed to make it work.

Test data:

+----------------------+-------+-------+
| Class                | IoU   | Acc   |
+----------------------+-------+-------+
| obj of Interest      | 100.0 | 100.0 |
| Background           | 0.0   | nan   |
+----------------------+-------+-------+
Summary:

+--------+------+-------+-------+
| Scope  | mIoU | mAcc  | aAcc  |
+--------+------+-------+-------+
| global | 50.0 | 100.0 | 100.0 |
+--------+------+-------+-------+

validation:

+----------------------+-------+-------+
| Class                | IoU   | Acc   |
+----------------------+-------+-------+
| obj of Interest      | 100.0 | 100.0 |
| Background           | 0.0   | nan   |
+----------------------+-------+-------+
Summary:

+--------+------+-------+-------+
| Scope  | mIoU | mAcc  | aAcc  |
+--------+------+-------+-------+
| global | 50.0 | 100.0 | 100.0 |
+--------+------+-------+-------+

Custom Dataloader:

from .custom import CustomDataset
from .builder import DATASETS
import os.path as osp
    
@DATASETS.register_module()
class CustomBinarySegDataset(CustomDataset):
    """Two-class (particle vs. background) segmentation dataset.

    Images are looked up as ``<name>.png`` and their annotations as
    ``<name>_mask.png``; ``reduce_zero_label`` is always disabled. All other
    keyword arguments are forwarded unchanged to ``CustomDataset``.

    NOTE(review): presumably ``CLASSES`` is indexed by label id, which makes
    label 0 'Particle of Interest' and label 1 'Background'.
    ``ConvertToGrayScaleMask`` writes 1 for every non-zero mask pixel, so
    confirm this ordering matches how the masks are drawn.
    """

    CLASSES = ('Particle of Interest', 'Background', )
    # One RGB triplet per class, in the same order as CLASSES. The previous
    # one-element entries ([1], [0]) were not valid RGB colours.
    PALETTE = [[255, 255, 255], [0, 0, 0]]

    def __init__(self, **kwargs):
        """Build the dataset with the fixed suffixes used by this project.

        Args:
            **kwargs: Remaining ``CustomDataset`` options (e.g. ``img_dir``,
                ``ann_dir``, ``pipeline``).

        Raises:
            AssertionError: If ``self.img_dir`` does not exist on disk.
        """
        super().__init__(
            img_suffix='.png',
            seg_map_suffix='_mask.png',
            reduce_zero_label=False,
            **kwargs
        )
        # Fail early when the image directory is missing.
        assert osp.exists(self.img_dir)

config:

# Normalisation layer config, shared by the decode head and the auxiliary head.
norm_cfg = dict(type='SyncBN', requires_grad=True)

model = dict(
    type='EncoderDecoder',
    pretrained='/vast/home/mayolo/mae_git/mae/output_dir/checkpoint-799.pth',
    # Transformer backbone settings.
    backbone=dict(
        type='MAE',
        img_size=512,
        patch_size=16,
        embed_dim=768,
        depth=12,
        num_heads=12,
        mlp_ratio=4,
        qkv_bias=True,
        use_abs_pos_emb=True,
        use_rel_pos_bias=True,
        init_values=1.0,
        drop_path_rate=0.1,
        out_indices=[3, 5, 7, 11],
    ),
    # Main head: consumes all four backbone outputs listed in out_indices.
    decode_head=dict(
        type='UPerHead',
        in_channels=[768, 768, 768, 768],
        in_index=[0, 1, 2, 3],
        pool_scales=(1, 2, 3, 6),
        channels=768,
        dropout_ratio=0.1,
        num_classes=2,
        norm_cfg=norm_cfg,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
    ),
    # Auxiliary head: reads only backbone output index 2; its loss is
    # weighted 0.4 against 1.0 for the main head.
    auxiliary_head=dict(
        type='FCNHead',
        in_channels=768,
        in_index=2,
        channels=256,
        num_convs=1,
        concat_input=False,
        dropout_ratio=0.1,
        num_classes=2,
        norm_cfg=norm_cfg,
        align_corners=False,
        loss_decode=dict(
            type='CrossEntropyLoss', use_sigmoid=False, loss_weight=0.4),
    ),
    train_cfg=dict(),
    # Sliding-window inference with 512x512 crops and a stride of 341.
    test_cfg=dict(mode='slide', crop_size=(512, 512), stride=(341, 341)),
)
# Dataset settings.
dataset_type = 'CustomBinarySegDataset'
data_root = '/vast/home/mayolo/512x512_Seg_Aug_images/base'
# NOTE(review): none of the pipelines below contains a Normalize step, so
# img_norm_cfg is defined but not applied anywhere in this config -- confirm
# that feeding un-normalised images is intended.
img_norm_cfg = dict(mean=[0, 0, 0], std=[254, 254, 254], to_rgb=True)
crop_size = (512, 512)

# Training-time loading and augmentation.
train_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(type='LoadAnnotations', reduce_zero_label=False),
    dict(type='ConvertToGrayScaleMask'),
    dict(type='Resize', img_scale=(2048, 512), ratio_range=(0.5, 2.0)),
    dict(type='RandomCrop', crop_size=crop_size, cat_max_ratio=1),
    dict(type='RandomFlip', prob=0.5),
    dict(type='PhotoMetricDistortion'),
    dict(type='Pad', size=crop_size, pad_val=0, seg_pad_val=255),
    dict(type='DefaultFormatBundle'),
    dict(type='Collect', keys=['img', 'gt_semantic_seg']),
]

# Validation / test-time loading (single scale, no flipping).
# NOTE(review): there is no LoadAnnotations step here, so
# ConvertToGrayScaleMask presumably finds no segmentation fields to convert
# at test time -- confirm how the ground-truth masks used for evaluation
# are binarised.
test_pipeline = [
    dict(type='LoadImageFromFile'),
    dict(
        type='MultiScaleFlipAug',
        img_scale=(2048, 512),
        flip=False,
        transforms=[
            dict(type='Resize', keep_ratio=True),
            dict(type='RandomFlip'),
            dict(type='ConvertToGrayScaleMask'),
            dict(type='ImageToTensor', keys=['img']),
            dict(type='Collect', keys=['img']),
        ]),
]

# The val and test splits both point at the same 'val' directories.
data = dict(
    samples_per_gpu=2,
    workers_per_gpu=2,
    train=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir=data_root + '/images/train',
        ann_dir=data_root + '/annotations/train',
        pipeline=train_pipeline),
    val=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir=data_root + '/images/val',
        ann_dir=data_root + '/annotations/val',
        pipeline=test_pipeline),
    test=dict(
        type=dataset_type,
        data_root=data_root,
        img_dir=data_root + '/images/val',
        ann_dir=data_root + '/annotations/val',
        pipeline=test_pipeline))
# Logging and runtime settings.
log_config = dict(
    hooks=[dict(type='TextLoggerHook', by_epoch=False)],
    interval=50)
dist_params = dict(backend='nccl')
log_level = 'INFO'
load_from = None
resume_from = None
workflow = [('train', 1)]
cudnn_benchmark = True

# Optimiser: AdamW built by the layer-decay constructor (12 layers, rate 0.65).
optimizer = dict(
    type='AdamW',
    lr=0.0001,
    betas=(0.9, 0.999),
    weight_decay=0.01,
    constructor='LayerDecayOptimizerConstructor',
    paramwise_cfg=dict(num_layers=12, layer_decay_rate=0.65))
optimizer_config = dict(
    type='DistOptimizerHook',
    update_interval=1,
    grad_clip=None,
    coalesce=True,
    bucket_size_mb=-1,
    use_fp16=True)

# Learning-rate schedule: linear warmup for 1500 iterations, then poly decay.
lr_config = dict(
    policy='poly',
    power=1.0,
    min_lr=0.0,
    by_epoch=False,
    warmup='linear',
    warmup_iters=1500,
    warmup_ratio=1e-06)

# Iteration-based training; checkpoint every 2000 and evaluate every 100
# iterations.
max_iters = 200000
runner = dict(type='IterBasedRunnerAmp', max_iters=max_iters)
checkpoint_config = dict(by_epoch=False, interval=2000)
evaluation = dict(interval=100, metric='mIoU')
# NOTE(review): fp16 is None here while optimizer_config sets use_fp16=True
# -- confirm which of the two controls mixed precision.
fp16 = None
# NOTE(review): the directory name still says "160k_ade20k" although
# max_iters is 200000 and the dataset is a custom one -- presumably left over
# from a template config.
work_dir = './work_dirs/upernet_mae_base_12_512_slide_160k_ade20k'
gpu_ids = range(1)

The greyscale transform also acts as a normalization function:

@PIPELINES.register_module()
class ConvertToGrayScaleMask(object):
    """Pipeline step that binarises every segmentation map in ``results``.

    Each entry named in ``results['seg_fields']`` is replaced by a float32
    array holding 1 where the map is greater than zero and 0 elsewhere.
    Three-channel maps are first collapsed to one grey-scale channel and
    min-max rescaled to the 0-255 range before thresholding.
    """

    def __init__(self):
        # The transform takes no configuration options.
        pass

    def __call__(self, results):
        """Binarise the segmentation maps in place.

        Args:
            results (dict): Pipeline result dict. Only the keys listed under
                ``'seg_fields'`` are modified; if that entry is absent,
                nothing is converted.

        Returns:
            dict: The same ``results`` object with each listed segmentation
                map replaced by its 0/1 float32 mask.
        """
        for field in results.get("seg_fields", []):
            seg_map = results[field]
            is_three_channel = len(seg_map.shape) == 3 and seg_map.shape[2] == 3
            if is_three_channel:
                gray = cv2.cvtColor(seg_map, cv2.COLOR_BGR2GRAY)
                seg_map = cv2.normalize(
                    gray, None, alpha=0, beta=255, norm_type=cv2.NORM_MINMAX)
            # Stored as float32: allows for test.py to work.
            results[field] = np.where(seg_map > 0, 1, 0).astype(np.float32)
        return results

    def __repr__(self):
        return type(self).__name__

The text was updated successfully, but these errors were encountered:

dongxinyu1030 · 2024-08-13T07:04:48Z

Try using a smaller learning rate and warmup.

0xD4rky · 2024-08-19T11:40:04Z

The main thing you can do is introduce Dice loss along with the IoU metric, and then optimize your model based on the results of both.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

WHY IS BINARY SEGMENTATION SO DIFFICULT? #3753

WHY IS BINARY SEGMENTATION SO DIFFICULT? #3753

Mayolov commented Aug 6, 2024

dongxinyu1030 commented Aug 13, 2024

0xD4rky commented Aug 19, 2024

WHY IS BINARY SEGMENTATION SO DIFFICULT? #3753

WHY IS BINARY SEGMENTATION SO DIFFICULT? #3753

Comments

Mayolov commented Aug 6, 2024

dongxinyu1030 commented Aug 13, 2024

0xD4rky commented Aug 19, 2024