# raycv/model/image_translation/MUNIT.py

import torch
import torch.nn as nn

from model import MODEL
from model.base.module import Conv2dBlock, ResidualBlock, LinearBlock


def _get_down_sampling_sequence(in_channels, base_channels, num_conv, max_down_sampling_multiple=2,
                                padding_mode='reflect', activation_type="ReLU", norm_type="NONE"):
    """Build the list of conv blocks shared by the style and content encoders.

    The list starts with one 7x7, stride-1 block mapping ``in_channels`` to
    ``base_channels`` and is followed by ``num_conv`` 4x4, stride-2 blocks.
    The channel multiplier of the ``i``-th stride-2 block is ``2 ** i``, capped
    at ``2 ** max_down_sampling_multiple``.

    Returns:
        A tuple ``(blocks, out_channels)`` where ``blocks`` is a plain Python
        list of ``Conv2dBlock`` and ``out_channels`` is the channel count
        produced by the last block.
    """
    # Options that every block in the stack has in common.
    shared = dict(padding_mode=padding_mode, activation_type=activation_type, norm_type=norm_type)

    layers = [Conv2dBlock(in_channels, base_channels, kernel_size=7, stride=1, padding=3, **shared)]

    width = base_channels
    for step in range(1, num_conv + 1):
        next_width = min(2 ** step, 2 ** max_down_sampling_multiple) * base_channels
        layers.append(Conv2dBlock(width, next_width, kernel_size=4, stride=2, padding=1, **shared))
        width = next_width

    return layers, width


class StyleEncoder(nn.Module):
    """Encode an image into a flat style code of ``out_dim`` values per sample.

    The network is the shared down-sampling stack, followed by global average
    pooling to a 1x1 map and a 1x1 convolution projecting to ``out_dim``.
    """

    def __init__(self, in_channels, out_dim, num_conv, base_channels=64,
                 max_down_sampling_multiple=2, padding_mode='reflect', activation_type="ReLU", norm_type="NONE"):
        super().__init__()

        layers, feature_channels = _get_down_sampling_sequence(
            in_channels, base_channels, num_conv,
            max_down_sampling_multiple=max_down_sampling_multiple,
            padding_mode=padding_mode,
            activation_type=activation_type,
            norm_type=norm_type,
        )
        layers.extend([
            nn.AdaptiveAvgPool2d(1),
            # On a (batch_size, channels, 1, 1) tensor a 1x1 conv acts as a fully
            # connected layer; kept as a conv to stay the same as the origin code.
            nn.Conv2d(feature_channels, out_dim, kernel_size=1, stride=1, padding=0),
        ])
        self.sequence = nn.Sequential(*layers)

    def forward(self, image):
        """Return the style code of ``image`` reshaped to ``(batch, -1)``."""
        batch_size = image.size(0)
        code = self.sequence(image)
        return code.view(batch_size, -1)


class ContentEncoder(nn.Module):
    """Encode an image into a content feature map.

    The network is the shared down-sampling stack followed by
    ``num_residual_blocks`` residual blocks that keep the channel count of the
    last down-sampling block.
    """

    def __init__(self, in_channels, num_down_sampling, num_residual_blocks, base_channels=64,
                 max_down_sampling_multiple=2,
                 padding_mode='reflect', activation_type="ReLU", norm_type="IN"):
        super().__init__()

        layers, feature_channels = _get_down_sampling_sequence(
            in_channels, base_channels, num_down_sampling,
            max_down_sampling_multiple=max_down_sampling_multiple,
            padding_mode=padding_mode,
            activation_type=activation_type,
            norm_type=norm_type,
        )
        for _ in range(num_residual_blocks):
            layers.append(
                ResidualBlock(feature_channels, feature_channels, padding_mode, activation_type, norm_type)
            )

        self.sequence = nn.Sequential(*layers)

    def forward(self, image):
        """Return the content features produced by the encoder stack."""
        features = self.sequence(image)
        return features


class Decoder(nn.Module):
    """Decode a content feature map into an image, conditioned on style parameters.

    The decoder applies ``num_residual_blocks`` residual blocks (normalised with
    ``res_norm_type``), then ``num_up_sampling`` stages of x2 up-sampling plus a
    5x5 conv block that halves the channel count, and finally a 7x7 conv block
    with ``Tanh`` activation producing ``out_channels`` channels.

    Args:
        in_channels: channel count of the incoming content features.
        out_channels: channel count of the generated image.
        num_up_sampling: number of x2 up-sampling stages.
        num_residual_blocks: number of style-conditioned residual blocks.
        res_norm_type: normalisation used inside the residual blocks.
        norm_type: normalisation used inside the up-sampling conv blocks.
        activation_type: activation used everywhere except the output block.
        padding_mode: padding mode used by every conv block, including the
            output block.
    """

    def __init__(self, in_channels, out_channels, num_up_sampling, num_residual_blocks,
                 res_norm_type="AdaIN", norm_type="LN", activation_type="ReLU", padding_mode='reflect'):
        super().__init__()
        self.residual_blocks = nn.ModuleList([
            ResidualBlock(in_channels, in_channels, padding_mode, activation_type, norm_type=res_norm_type)
            for _ in range(num_residual_blocks)
        ])

        sequence = []
        channels = in_channels
        for _ in range(num_up_sampling):
            sequence.append(nn.Sequential(
                nn.Upsample(scale_factor=2),
                Conv2dBlock(channels, channels // 2,
                            kernel_size=5, stride=1, padding=2, padding_mode=padding_mode,
                            activation_type=activation_type, norm_type=norm_type),
            ))
            channels = channels // 2
        # The output block follows the configured padding mode like every other
        # conv in the decoder; only its activation and normalisation are fixed.
        sequence.append(Conv2dBlock(channels, out_channels,
                                    kernel_size=7, stride=1, padding=3, padding_mode=padding_mode,
                                    activation_type="Tanh", norm_type="NONE"))

        self.up_sequence = nn.Sequential(*sequence)

    def forward(self, x, style):
        """Decode ``x`` using ``style`` as parameters for the residual blocks.

        ``style`` is split along dim 1 into ``2 * len(self.residual_blocks)``
        chunks; each residual block receives one chunk for ``conv1`` and one for
        ``conv2`` through ``normalization.set_style``.
        """
        # NOTE(review): dim 1 of ``style`` is presumably divisible by the chunk
        # count; otherwise torch.chunk can return fewer chunks and the indexing
        # below fails -- confirm against the caller.
        as_param_style = torch.chunk(style, 2 * len(self.residual_blocks), dim=1)
        # set style for decoder
        for i, blk in enumerate(self.residual_blocks):
            blk.conv1.normalization.set_style(as_param_style[2 * i])
            blk.conv2.normalization.set_style(as_param_style[2 * i + 1])
            x = blk(x)
        return self.up_sequence(x)


class MLPFusion(nn.Module):
    """Stack of ``LinearBlock`` layers mapping ``in_features`` to ``out_features``.

    The stack is one input block (``in_features`` -> ``base_features``),
    ``n_blocks - 2`` hidden blocks of width ``base_features`` and one output
    block (``base_features`` -> ``out_features``). Every block, the output block
    included, is built with the same ``activation_type`` and ``norm_type``.

    NOTE(review): the output block also receives ``activation_type``; whether an
    activation on the final layer is intended should be confirmed.
    """

    def __init__(self, in_features, out_features, base_features, n_blocks, activation_type="ReLU", norm_type="NONE"):
        super().__init__()

        # Feature width at every layer boundary; consecutive pairs give the
        # (in, out) sizes of each block. A non-positive repeat count yields no
        # hidden blocks.
        widths = [in_features, base_features] + [base_features] * (n_blocks - 2) + [out_features]
        blocks = [
            LinearBlock(n_in, n_out, activation_type=activation_type, norm_type=norm_type)
            for n_in, n_out in zip(widths, widths[1:])
        ]
        self.sequence = nn.Sequential(*blocks)

    def forward(self, x):
        """Run ``x`` through the block stack."""
        fused = self.sequence(x)
        return fused


@MODEL.register_module("MUNIT-Generator")
class Generator(nn.Module):
    """MUNIT generator: content encoder, style encoder, style MLP and decoder.

    ``encode`` splits an image into content features and a style code;
    ``decode`` turns a (content, style) pair back into an image; ``forward``
    chains the two on a single input.

    NOTE(review): the decoder's channel count and number of up-sampling stages
    are both derived from ``max_down_sampling_multiple``, while the content
    encoder down-samples ``num_content_down_sampling`` times. The wiring looks
    like it assumes the two values are equal (as they are by default) --
    confirm before using other combinations.
    """

    def __init__(self, in_channels, out_channels, base_channels=64, style_dim=8,
                 num_mlp_base_feature=256, num_mlp_blocks=3,
                 max_down_sampling_multiple=2, num_content_down_sampling=2, num_style_down_sampling=2,
                 encoder_num_residual_blocks=4, decoder_num_residual_blocks=4,
                 padding_mode='reflect', activation_type="ReLU"):
        super().__init__()
        self.content_encoder = ContentEncoder(
            in_channels, num_content_down_sampling, encoder_num_residual_blocks,
            base_channels, max_down_sampling_multiple,
            padding_mode, activation_type, norm_type="IN")

        self.style_encoder = StyleEncoder(in_channels, style_dim, num_style_down_sampling, base_channels,
                                          max_down_sampling_multiple, padding_mode, activation_type,
                                          norm_type="NONE")

        content_channels = base_channels * (2 ** max_down_sampling_multiple)

        # The decoder splits the fused style into two chunks per residual block,
        # so each chunk is ``content_channels * 2`` wide.
        self.fusion = MLPFusion(style_dim, decoder_num_residual_blocks * 2 * content_channels * 2,
                                num_mlp_base_feature, num_mlp_blocks, activation_type,
                                norm_type="NONE")

        self.decoder = Decoder(content_channels, out_channels, max_down_sampling_multiple, decoder_num_residual_blocks,
                               res_norm_type="AdaIN", norm_type="LN", activation_type=activation_type,
                               padding_mode=padding_mode)

    def encode(self, x):
        """Return ``(content, style)`` for image batch ``x``."""
        return self.content_encoder(x), self.style_encoder(x)

    def decode(self, content, style):
        """Return the image decoded from ``content`` conditioned on ``style``.

        ``style`` is first expanded by the fusion MLP into the parameters the
        decoder distributes to its residual blocks.
        """
        # The decoder output must be returned; otherwise both decode() and
        # forward() yield None.
        return self.decoder(content, self.fusion(style))

    def forward(self, x):
        """Encode ``x`` and decode it again with its own style code."""
        content, style = self.encode(x)
        return self.decode(content, style)