# raycv/model/GAN/TAFG.py

import torch
import torch.nn as nn
from .base import ResidualBlock
from model.registry import MODEL
from torchvision.models import vgg19
from model.normalization import select_norm_layer


class VGG19StyleEncoder(nn.Module):
    """Style encoder that fuses a trainable conv pyramid with frozen VGG19 features.

    The input is processed by two branches:

    * a frozen, truncated ``vgg19(pretrained=True).features`` whose outputs at
      the indices in ``vgg19_layers`` are collected, and
    * a trainable branch: a 7x7 stride-1 conv (``conv0``) followed by three
      4x4 stride-2 convs (``conv``).

    Before every trainable stride-2 conv the running activation is concatenated
    (channel dimension) with the matching VGG feature. The last VGG feature is
    concatenated right before global average pooling, and a 1x1 conv projects
    the pooled vector to ``style_dim``.

    Args:
        in_channels: channel count expected by ``conv0``. The same tensor is
            also fed to VGG19 (presumably requires 3 channels — confirm).
        base_channels: width of ``conv0``; stage ``i`` (1..3) of ``conv`` has
            ``base_channels * 2**i`` input and output channels, so each tapped
            VGG feature must have as many channels as the trainable branch at
            that point.
        style_dim: size of the returned style vector.
        padding_mode: padding mode passed to the trainable convs.
        norm_type: key passed to ``select_norm_layer``.
        vgg19_layers: indices into ``vgg19.features`` to tap; the last entry
            also determines where the VGG is truncated.
    """

    def __init__(self, in_channels, base_channels=64, style_dim=512, padding_mode='reflect', norm_type="NONE",
                 vgg19_layers=(0, 5, 10, 19)):
        super().__init__()
        self.vgg19_layers = vgg19_layers
        # Keep only the layers up to (and including) the deepest tap, frozen.
        self.vgg19 = vgg19(pretrained=True).features[:vgg19_layers[-1] + 1]
        self.vgg19.requires_grad_(False)

        norm_layer = select_norm_layer(norm_type)

        self.conv0 = nn.Sequential(
            nn.Conv2d(in_channels, base_channels, kernel_size=7, stride=1, padding=3, padding_mode=padding_mode,
                      bias=True),
            norm_layer(base_channels),
            nn.ReLU(True),
        )
        self.conv = nn.ModuleList([
            nn.Sequential(
                nn.Conv2d(base_channels * (2 ** i), base_channels * (2 ** i), kernel_size=4, stride=2, padding=1,
                          padding_mode=padding_mode, bias=True),
                # The norm must match the conv's output width, not base_channels;
                # the old value only worked because the default norm is "NONE".
                norm_layer(base_channels * (2 ** i)),
                nn.ReLU(True),
            ) for i in range(1, 4)
        ])
        self.pool = nn.AdaptiveAvgPool2d(1)
        self.conv1x1 = nn.Conv2d(base_channels * (2 ** 4), style_dim, kernel_size=1, stride=1, padding=0)

    def fixed_style_features(self, x):
        """Run ``x`` through the truncated VGG19 and return the tapped outputs.

        Returns a list with one tensor per index in ``self.vgg19_layers`` that
        is reached, in network order.
        """
        # NOTE(review): torchvision's VGG appears to use in-place ReLUs, so a
        # tensor captured right after a conv may be overwritten by the
        # following activation — confirm post-ReLU features are intended.
        features = []
        for idx, layer in enumerate(self.vgg19):
            x = layer(x)
            if idx in self.vgg19_layers:
                features.append(x)
        return features

    def forward(self, x):
        """Return a flattened ``(batch, -1)`` style code for image batch ``x``."""
        fsf = self.fixed_style_features(x)
        x = self.conv0(x)
        for i, stage in enumerate(self.conv):
            # Fuse the trainable activation with the VGG feature of this stage.
            x = stage(torch.cat([x, fsf[i]], dim=1))
        x = self.pool(torch.cat([x, fsf[-1]], dim=1))
        x = self.conv1x1(x)
        return x.view(x.size(0), -1)


class ContentEncoder(nn.Module):
    """Convolutional content encoder: stem conv, strided down-sampling, residual stack.

    Args:
        in_channels: channel count of the input image.
        base_channels: width of the 7x7 stem conv; every down-sampling step
            doubles the channel count.
        num_blocks: number of ``ResidualBlock`` modules after down-sampling.
        padding_mode: padding mode for the stem conv and the residual blocks.
            The strided down-sampling convs do not receive it.
        norm_type: key passed to ``select_norm_layer`` and ``ResidualBlock``.
        num_down_sampling: number of 4x4 stride-2 convs (default 2, the value
            that used to be hard-coded).

    The output has ``base_channels * 2**num_down_sampling`` channels.
    """

    def __init__(self, in_channels, base_channels=64, num_blocks=8, padding_mode='reflect', norm_type="IN",
                 num_down_sampling=2):
        super().__init__()
        norm_layer = select_norm_layer(norm_type)

        self.start_conv = nn.Sequential(
            nn.Conv2d(in_channels, base_channels, kernel_size=7, stride=1, padding_mode=padding_mode, padding=3,
                      bias=True),
            norm_layer(num_features=base_channels),
            nn.ReLU(inplace=True)
        )

        # down sampling: each step halves the resolution and doubles the width
        submodules = []
        for i in range(num_down_sampling):
            multiple = 2 ** i
            submodules += [
                nn.Conv2d(in_channels=base_channels * multiple, out_channels=base_channels * multiple * 2,
                          kernel_size=4, stride=2, padding=1, bias=True),
                norm_layer(num_features=base_channels * multiple * 2),
                nn.ReLU(inplace=True)
            ]
        self.encoder = nn.Sequential(*submodules)
        # Width after n doublings is base * 2**n. The previous expression
        # (n ** 2) only matched this for n == 2.
        res_block_channels = 2 ** num_down_sampling * base_channels
        self.resnet = nn.Sequential(
            *[ResidualBlock(res_block_channels, padding_mode, norm_type, use_bias=True) for _ in range(num_blocks)])

    def forward(self, x):
        """Apply the stem conv, the down-sampling convs and the residual stack in order."""
        x = self.start_conv(x)
        x = self.encoder(x)
        x = self.resnet(x)
        return x


class Decoder(nn.Module):
    """Image decoder: optional residual stack, up-sampling stages, 7x7 output conv with tanh.

    Args:
        out_channels: channel count of the generated image.
        base_channels: width right before the output conv; the decoder input
            must have ``base_channels * 2**num_down_sampling`` channels.
        num_blocks: number of ``ResidualBlock`` modules applied first (may be 0).
        num_down_sampling: number of up-sampling stages; each stage doubles the
            resolution with ``nn.Upsample`` and halves the width with a 5x5 conv.
        padding_mode: padding mode for the convs and the residual blocks.
        norm_type: key passed to ``select_norm_layer`` and ``ResidualBlock``.
            The up-sampling convs only get a bias when it equals ``"IN"``.
    """

    def __init__(self, out_channels, base_channels=64, num_blocks=4, num_down_sampling=2, padding_mode='reflect',
                 norm_type="LN"):
        super(Decoder, self).__init__()
        norm_layer = select_norm_layer(norm_type)
        use_bias = norm_type == "IN"

        # The residual stack runs at the input width, which is what the first
        # up-sampling conv consumes. It used to be fixed at 4 * base_channels,
        # which only matched num_down_sampling == 2.
        res_block_channels = (2 ** num_down_sampling) * base_channels

        self.resnet = nn.Sequential(
            *[ResidualBlock(res_block_channels, padding_mode, norm_type, use_bias=True) for _ in range(num_blocks)])

        # up sampling
        submodules = []
        for i in range(num_down_sampling):
            multiple = 2 ** (num_down_sampling - i)
            submodules += [
                nn.Upsample(scale_factor=2),
                nn.Conv2d(base_channels * multiple, base_channels * multiple // 2, kernel_size=5, stride=1,
                          padding=2, padding_mode=padding_mode, bias=use_bias),
                norm_layer(num_features=base_channels * multiple // 2),
                nn.ReLU(inplace=True),
            ]
        self.decoder = nn.Sequential(*submodules)
        self.end_conv = nn.Sequential(
            nn.Conv2d(base_channels, out_channels, kernel_size=7, padding=3, padding_mode=padding_mode),
            nn.Tanh()
        )

    def forward(self, x):
        """Apply the residual stack, the up-sampling stages and the output conv in order."""
        x = self.resnet(x)
        x = self.decoder(x)
        x = self.end_conv(x)
        return x


class Fusion(nn.Module):
    """Fully connected stack mapping ``in_features`` to ``out_features``.

    Layout: one input block (Linear -> norm -> ReLU), then ``n_blocks - 2``
    hidden blocks of width ``base_features`` with the same structure, then a
    plain Linear output layer without norm or activation.

    Args:
        in_features: size of the input vector.
        out_features: size of the output vector.
        base_features: width of the input and hidden blocks.
        n_blocks: total block count; values below 3 yield no hidden blocks.
        norm_type: key passed to ``select_norm_layer``.
    """

    def __init__(self, in_features, out_features, base_features, n_blocks, norm_type="NONE"):
        super().__init__()
        make_norm = select_norm_layer(norm_type)

        def fc_block(fan_in):
            # Linear -> norm -> in-place ReLU, always emitting base_features.
            return nn.Sequential(
                nn.Linear(fan_in, base_features),
                make_norm(base_features),
                nn.ReLU(True),
            )

        self.start_fc = fc_block(in_features)

        hidden_blocks = []
        for _ in range(n_blocks - 2):
            hidden_blocks.append(fc_block(base_features))
        self.fcs = nn.Sequential(*hidden_blocks)

        self.end_fc = nn.Sequential(
            nn.Linear(base_features, out_features),
        )

    def forward(self, x):
        """Feed ``x`` through the input block, the hidden blocks and the output layer."""
        return self.end_fc(self.fcs(self.start_fc(x)))


class StyleGenerator(nn.Module):
    """Turns a style image into one flat vector of style parameters.

    Pipeline: ``VGG19StyleEncoder`` -> Linear + ReLU -> ``Fusion``.

    Args:
        style_in_channels: channel count of the style image.
        style_dim: size of the style code produced by the encoder.
        num_blocks: number of residual blocks the output is sized for.
        base_channels: base width forwarded to the encoder; also sets the
            output size.
        padding_mode: padding mode forwarded to the encoder.
    """

    def __init__(self, style_in_channels, style_dim=512, num_blocks=8, base_channels=64, padding_mode="reflect"):
        super().__init__()
        self.num_blocks = num_blocks

        self.style_encoder = VGG19StyleEncoder(
            style_in_channels,
            base_channels,
            style_dim=style_dim,
            padding_mode=padding_mode,
            norm_type="NONE",
        )
        self.fc = nn.Sequential(
            nn.Linear(style_dim, style_dim),
            nn.ReLU(True),
        )

        # Output size: (num_blocks * 2) slices, each (res_block_channels * 2)
        # wide — presumably scale and shift for two norms per block; confirm
        # against the consumer of this vector.
        res_block_channels = 2 ** 2 * base_channels
        slice_count = num_blocks * 2
        slice_width = res_block_channels * 2
        self.fusion = Fusion(
            style_dim,
            slice_count * slice_width,
            base_features=256,
            n_blocks=3,
            norm_type="NONE",
        )

    def forward(self, x):
        """Return the flat style-parameter vector for style image batch ``x``."""
        style_code = self.style_encoder(x)
        style_code = self.fc(style_code)
        return self.fusion(style_code)


@MODEL.register_module("TAFG-Generator")
class Generator(nn.Module):
    """Two-domain generator with a shared content encoder and per-domain style paths.

    For each key ``"a"`` and ``"b"`` the module holds a ``StyleGenerator``, a
    list of ``num_blocks`` AdaIN ``ResidualBlock`` modules and a ``Decoder``.
    The ``ContentEncoder`` is shared by both keys.

    Args:
        style_in_channels: channel count of the style image.
        content_in_channels: channel count of the content image.
        out_channels: channel count of the generated image.
        style_dim: size of the intermediate style code.
        num_blocks: residual block count for the content encoder and for each
            AdaIN stack.
        base_channels: base width shared by all sub-networks.
        padding_mode: padding mode shared by all sub-networks.
    """

    def __init__(self, style_in_channels, content_in_channels=3, out_channels=3, style_dim=512, num_blocks=8,
                 base_channels=64, padding_mode="reflect"):
        super().__init__()
        self.num_blocks = num_blocks
        domain_keys = ("a", "b")

        self.style_encoders = nn.ModuleDict({
            key: StyleGenerator(style_in_channels, style_dim=style_dim, num_blocks=num_blocks,
                                base_channels=base_channels, padding_mode=padding_mode)
            for key in domain_keys
        })

        self.content_encoder = ContentEncoder(content_in_channels, base_channels, num_blocks=num_blocks,
                                              padding_mode=padding_mode, norm_type="IN")

        res_block_channels = 2 ** 2 * base_channels

        def adain_stack():
            # One list of AdaIN residual blocks; built once per domain.
            blocks = [
                ResidualBlock(res_block_channels, padding_mode, "AdaIN", use_bias=True)
                for _ in range(num_blocks)
            ]
            return nn.ModuleList(blocks)

        self.adain_resnet_a = adain_stack()
        self.adain_resnet_b = adain_stack()

        self.decoders = nn.ModuleDict({
            key: Decoder(out_channels, base_channels, norm_type="LN", num_blocks=0, padding_mode=padding_mode)
            for key in domain_keys
        })

    def forward(self, content_img, style_img, which_decoder: str = "a"):
        """Generate an image from ``content_img`` styled by ``style_img``.

        ``which_decoder`` selects the style encoder and the decoder by key.
        The ``"a"`` AdaIN stack is used for ``"a"``, the ``"b"`` stack otherwise.
        """
        hidden = self.content_encoder(content_img)

        style_vector = self.style_encoders[which_decoder](style_img)
        # Two consecutive slices per residual block: one for norm1, one for norm2.
        style_slices = torch.chunk(style_vector, self.num_blocks * 2, dim=1)

        if which_decoder == "a":
            adain_blocks = self.adain_resnet_a
        else:
            adain_blocks = self.adain_resnet_b

        for block_idx, block in enumerate(adain_blocks):
            offset = 2 * block_idx
            block.norm1.set_style(style_slices[offset])
            block.norm2.set_style(style_slices[offset + 1])
            hidden = block(hidden)

        decoder = self.decoders[which_decoder]
        return decoder(hidden)


@MODEL.register_module("TAFG-Discriminator")
class Discriminator(nn.Module):
    """Convolutional discriminator: 7x7 stem, strided 3x3 convs, residual blocks.

    The stem is a nested ``nn.Sequential`` (conv, norm, ReLU). Each
    down-sampling step is a stride-2 3x3 conv, a norm and a LeakyReLU(0.2), with
    the width multiplier capped at 4. ``num_blocks`` ``ResidualBlock`` modules
    follow; ``forward`` returns their output directly.

    Args:
        in_channels: channel count of the input image.
        base_channels: width of the stem conv.
        num_down_sampling: number of stride-2 convs.
        num_blocks: number of trailing residual blocks.
        norm_type: key passed to ``select_norm_layer`` and ``ResidualBlock``;
            convs only get a bias when it equals ``"IN"``.
        padding_mode: padding mode for the stem conv and the residual blocks.
    """

    def __init__(self, in_channels=3, base_channels=64, num_down_sampling=2, num_blocks=3, norm_type="IN",
                 padding_mode="reflect"):
        super().__init__()

        make_norm = select_norm_layer(norm_type)
        conv_bias = norm_type == "IN"

        stem = nn.Sequential(
            nn.Conv2d(in_channels, base_channels, kernel_size=7, stride=1, padding_mode=padding_mode, padding=3,
                      bias=conv_bias),
            make_norm(num_features=base_channels),
            nn.ReLU(inplace=True)
        )
        layers = [stem]

        # Strided stages: the width doubles per level until the x4 cap.
        width_mult = 1
        for level in range(1, num_down_sampling + 1):
            prev_mult, width_mult = width_mult, min(2 ** level, 4)
            layers.append(nn.Conv2d(base_channels * prev_mult, base_channels * width_mult, kernel_size=3,
                                    padding=1, stride=2, bias=conv_bias))
            layers.append(make_norm(base_channels * width_mult))
            layers.append(nn.LeakyReLU(0.2, inplace=True))

        layers.extend(
            ResidualBlock(base_channels * width_mult, padding_mode, norm_type) for _ in range(num_blocks)
        )
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the full layer stack and return the result."""
        return self.model(x)