import torch
import torch.nn as nn
from torchvision.models import vgg19

from model.normalization import select_norm_layer
from model.registry import MODEL
from .MUNIT import ContentEncoder, Fusion, Decoder, StyleEncoder
from .base import ResBlock


def _set_adain_style(decoder, style_params):
    """Push per-block style parameters into a decoder's residual blocks.

    Block ``i`` receives ``style_params[2 * i]`` for its first convolution's
    normalization layer and ``style_params[2 * i + 1]`` for its second.

    Args:
        decoder: module exposing an iterable ``res_blocks`` whose items have
            ``conv1.normalization`` and ``conv2.normalization`` objects with a
            ``set_style`` method.
        style_params: indexable sequence holding at least two entries per
            residual block.
    """
    for i, blk in enumerate(decoder.res_blocks):
        blk.conv1.normalization.set_style(style_params[2 * i])
        blk.conv2.normalization.set_style(style_params[2 * i + 1])


class VGG19StyleEncoder(nn.Module):
    """Style encoder mixing a learned conv branch with pretrained VGG19 features.

    The input is run through a truncated torchvision VGG19 ``features`` stack,
    and the activations at ``vgg19_layers`` are concatenated (along the channel
    axis) into a learned, strided convolution branch. The result is globally
    average-pooled and projected to ``style_dim`` by a 1x1 convolution.

    NOTE(review): the channel widths hard-coded below (``base_channels * 2**i``
    and ``base_channels * 2**4``) appear to assume ``base_channels=64`` and the
    default ``vgg19_layers`` so that the concatenated widths line up -- confirm
    before changing either. The input is also fed directly to VGG19, so
    ``in_channels`` presumably has to match what VGG19 expects.

    Args:
        in_channels: number of channels of the input image.
        base_channels: width of the first learned convolution.
        style_dim: size of the returned style vector.
        padding_mode: padding mode used by the learned convolutions.
        norm_type: key passed to ``select_norm_layer``.
        vgg19_layers: indices into ``vgg19().features`` whose outputs are
            tapped; the stack is truncated after the last index.
        fix_vgg19: when true, gradients are disabled for the VGG19 weights.
    """

    def __init__(self, in_channels, base_channels=64, style_dim=512, padding_mode='reflect',
                 norm_type="NONE", vgg19_layers=(0, 5, 10, 19), fix_vgg19=True):
        super().__init__()
        self.vgg19_layers = vgg19_layers
        # Keep only the layers up to (and including) the last tapped one.
        self.vgg19 = vgg19(pretrained=True).features[:vgg19_layers[-1] + 1]
        self.vgg19.requires_grad_(not fix_vgg19)

        norm_layer = select_norm_layer(norm_type)

        self.conv0 = nn.Sequential(
            nn.Conv2d(in_channels, base_channels, kernel_size=7, stride=1, padding=3,
                      padding_mode=padding_mode, bias=True),
            norm_layer(base_channels),
            nn.ReLU(True),
        )

        stages = []
        for i in range(1, 4):
            channels = base_channels * (2 ** i)
            stages.append(nn.Sequential(
                nn.Conv2d(channels, channels, kernel_size=4, stride=2, padding=1,
                          padding_mode=padding_mode, bias=True),
                # The norm layer must match the conv's output width, not the
                # base width; the original passed base_channels here.
                norm_layer(channels),
                nn.ReLU(True),
            ))
        self.conv = nn.ModuleList(stages)

        self.pool = nn.AdaptiveAvgPool2d(1)
        self.conv1x1 = nn.Conv2d(base_channels * (2 ** 4), style_dim, kernel_size=1, stride=1,
                                 padding=0)

    def fixed_style_features(self, x):
        """Return the VGG19 activations at the indices in ``self.vgg19_layers``.

        Args:
            x: input tensor fed to the truncated VGG19 stack.

        Returns:
            List of tensors, one per tapped layer, in network order.
        """
        features = []
        for i, layer in enumerate(self.vgg19):
            x = layer(x)
            if i in self.vgg19_layers:
                features.append(x)
        return features

    def forward(self, x):
        """Encode ``x`` into a flat style vector.

        Returns:
            Tensor reshaped to ``(x.size(0), -1)`` after the 1x1 projection.
        """
        fsf = self.fixed_style_features(x)
        x = self.conv0(x)
        # Each learned stage consumes its input concatenated with the matching
        # VGG19 feature map along the channel axis.
        for i, stage in enumerate(self.conv):
            x = stage(torch.cat([x, fsf[i]], dim=1))
        x = self.pool(torch.cat([x, fsf[-1]], dim=1))
        x = self.conv1x1(x)
        return x.view(x.size(0), -1)


@MODEL.register_module("TAFG-ResGenerator")
class ResGenerator(nn.Module):
    """Generator built from a ``ContentEncoder`` followed by a ``Decoder``.

    The decoder is constructed with zero residual blocks and "IN" as its
    residual-block normalization argument; no style input is used.

    NOTE(review): ``base_channels`` only sets the decoder input width
    (``4 * base_channels``); it is not forwarded to ``ContentEncoder``, so the
    encoder's output width is assumed to match -- confirm.

    Args:
        in_channels: channels of the input image.
        out_channels: channels of the generated image.
        use_spectral_norm: forwarded to the encoder and decoder.
        num_res_blocks: number of residual blocks in the content encoder.
        base_channels: base width used to derive the decoder input width.
    """

    def __init__(self, in_channels, out_channels=3, use_spectral_norm=False, num_res_blocks=8,
                 base_channels=64):
        super().__init__()
        self.content_encoder = ContentEncoder(in_channels, 2, num_res_blocks=num_res_blocks,
                                              use_spectral_norm=use_spectral_norm)
        resnet_channels = 2 ** 2 * base_channels
        self.decoder = Decoder(resnet_channels, out_channels, 2, 0, use_spectral_norm, "IN",
                               norm_type="LN", padding_mode="reflect")

    def forward(self, x):
        """Decode the content encoding of ``x``."""
        return self.decoder(self.content_encoder(x))


@MODEL.register_module("TAFG-SingleGenerator")
class SingleGenerator(nn.Module):
    """Single-domain generator: content encoder + style encoder + AdaIN decoder.

    The style code is mapped by a ``Fusion`` module to one parameter chunk per
    AdaIN layer (two layers per residual block), which are set on the decoder
    before decoding the content features.

    Args:
        style_in_channels: channels of the style image.
        content_in_channels: channels of the content image.
        out_channels: channels of the generated image.
        use_spectral_norm: forwarded to the sub-modules that accept it.
        style_encoder_type: ``"StyleEncoder"`` or ``"VGG19StyleEncoder"``.
        num_style_conv: forwarded to ``StyleEncoder``.
        style_dim: size of the style code.
        num_adain_blocks: number of AdaIN residual blocks in the decoder.
        num_res_blocks: number of residual blocks in the content encoder.
        base_channels: base channel width.
        padding_mode: padding mode for the style encoder and decoder.

    Raises:
        NotImplementedError: if ``style_encoder_type`` is not supported.
    """

    def __init__(self, style_in_channels, content_in_channels, out_channels=3,
                 use_spectral_norm=False, style_encoder_type="StyleEncoder", num_style_conv=4,
                 style_dim=512, num_adain_blocks=8, num_res_blocks=8, base_channels=64,
                 padding_mode="reflect"):
        super().__init__()
        self.num_adain_blocks = num_adain_blocks
        if style_encoder_type == "StyleEncoder":
            self.style_encoder = StyleEncoder(
                style_in_channels, style_dim, num_style_conv, base_channels, use_spectral_norm,
                max_multiple=4, padding_mode=padding_mode, norm_type="NONE"
            )
        elif style_encoder_type == "VGG19StyleEncoder":
            self.style_encoder = VGG19StyleEncoder(
                style_in_channels, base_channels, style_dim=style_dim, padding_mode=padding_mode,
                norm_type="NONE"
            )
        else:
            # NotImplemented is a non-callable constant; the exception class is
            # NotImplementedError.
            raise NotImplementedError(f"do not support {style_encoder_type}")

        resnet_channels = 2 ** 2 * base_channels
        # Output width: one chunk of (2 * resnet_channels) values for each of
        # the two normalization layers in every AdaIN block.
        self.style_converter = Fusion(style_dim, num_adain_blocks * 2 * resnet_channels * 2,
                                      base_features=256, n_blocks=3, norm_type="NONE")
        self.content_encoder = ContentEncoder(content_in_channels, 2,
                                              num_res_blocks=num_res_blocks,
                                              use_spectral_norm=use_spectral_norm)

        self.decoder = Decoder(resnet_channels, out_channels, 2, num_adain_blocks,
                               use_spectral_norm, "AdaIN", norm_type="LN",
                               padding_mode=padding_mode)

    def forward(self, content_img, style_img):
        """Generate an image from ``content_img`` styled by ``style_img``."""
        content = self.content_encoder(content_img)
        style = self.style_encoder(style_img)
        as_param_style = torch.chunk(self.style_converter(style), self.num_adain_blocks * 2,
                                     dim=1)
        # set style for decoder
        _set_adain_style(self.decoder, as_param_style)
        return self.decoder(content)


@MODEL.register_module("TAFG-Generator")
class Generator(nn.Module):
    """Two-domain generator with per-domain encoders/decoders keyed "a" and "b".

    Each domain has its own style encoder, style converter, content encoder and
    decoder; the residual stack ``content_resnet`` is shared between domains.
    Content encoder "b" is built with a hard-coded single input channel.

    Args:
        style_in_channels: channels of the style images.
        content_in_channels: channels of domain-"a" content images.
        out_channels: channels of the generated image.
        use_spectral_norm: forwarded to the sub-modules that accept it.
        style_encoder_type: ``"StyleEncoder"`` or ``"VGG19StyleEncoder"``.
        num_style_conv: forwarded to ``StyleEncoder``.
        style_dim: size of the style code.
        num_adain_blocks: number of AdaIN residual blocks in each decoder.
        num_res_blocks: number of blocks in the shared ``content_resnet``.
        base_channels: base channel width.
        padding_mode: padding mode for the sub-modules that accept it.

    Raises:
        NotImplementedError: if ``style_encoder_type`` is not supported.
    """

    def __init__(self, style_in_channels, content_in_channels=3, out_channels=3,
                 use_spectral_norm=False, style_encoder_type="StyleEncoder", num_style_conv=4,
                 style_dim=512, num_adain_blocks=8, num_res_blocks=8, base_channels=64,
                 padding_mode="reflect"):
        super(Generator, self).__init__()
        self.num_adain_blocks = num_adain_blocks
        if style_encoder_type == "StyleEncoder":
            self.style_encoders = nn.ModuleDict(dict(
                a=StyleEncoder(style_in_channels, style_dim, num_style_conv, base_channels,
                               use_spectral_norm, max_multiple=4, padding_mode=padding_mode,
                               norm_type="NONE"),
                b=StyleEncoder(style_in_channels, style_dim, num_style_conv, base_channels,
                               use_spectral_norm, max_multiple=4, padding_mode=padding_mode,
                               norm_type="NONE"),
            ))
        elif style_encoder_type == "VGG19StyleEncoder":
            # NOTE(review): only encoder "b" unfreezes its VGG19 weights
            # (fix_vgg19=False); confirm the asymmetry is intended.
            self.style_encoders = nn.ModuleDict(dict(
                a=VGG19StyleEncoder(style_in_channels, base_channels, style_dim=style_dim,
                                    padding_mode=padding_mode, norm_type="NONE"),
                b=VGG19StyleEncoder(style_in_channels, base_channels, style_dim=style_dim,
                                    padding_mode=padding_mode, norm_type="NONE",
                                    fix_vgg19=False)
            ))
        else:
            # NotImplemented is a non-callable constant; the exception class is
            # NotImplementedError.
            raise NotImplementedError(f"do not support {style_encoder_type}")

        resnet_channels = 2 ** 2 * base_channels
        self.style_converters = nn.ModuleDict(dict(
            a=Fusion(style_dim, num_adain_blocks * 2 * resnet_channels * 2, base_features=256,
                     n_blocks=3, norm_type="NONE"),
            b=Fusion(style_dim, num_adain_blocks * 2 * resnet_channels * 2, base_features=256,
                     n_blocks=3, norm_type="NONE"),
        ))
        self.content_encoders = nn.ModuleDict({
            "a": ContentEncoder(content_in_channels, 2, num_res_blocks=0,
                                use_spectral_norm=use_spectral_norm),
            "b": ContentEncoder(1, 2, num_res_blocks=0, use_spectral_norm=use_spectral_norm)
        })

        # Residual stack shared by both content encoders.
        self.content_resnet = nn.Sequential(*[
            ResBlock(resnet_channels, use_spectral_norm, padding_mode, "IN")
            for _ in range(num_res_blocks)
        ])
        self.decoders = nn.ModuleDict(dict(
            a=Decoder(resnet_channels, out_channels, 2, num_adain_blocks, use_spectral_norm,
                      "AdaIN", norm_type="LN", padding_mode=padding_mode),
            b=Decoder(resnet_channels, out_channels, 2, num_adain_blocks, use_spectral_norm,
                      "AdaIN", norm_type="LN", padding_mode=padding_mode),
        ))

    def encode(self, content_img, style_img, which_content, which_style):
        """Encode content and style images with the selected domain encoders.

        Args:
            content_img: image passed to ``content_encoders[which_content]``
                and then the shared ``content_resnet``.
            style_img: image passed to ``style_encoders[which_style]``.
            which_content: key selecting the content encoder.
            which_style: key selecting the style encoder.

        Returns:
            Tuple ``(content, style)``.
        """
        content = self.content_resnet(self.content_encoders[which_content](content_img))
        style = self.style_encoders[which_style](style_img)
        return content, style

    def decode(self, content, style, which):
        """Decode ``content`` with decoder ``which`` conditioned on ``style``.

        The style code is converted by ``style_converters[which]`` and split
        into ``2 * num_adain_blocks`` chunks along dim 1, which are set on the
        decoder's residual blocks before decoding.
        """
        decoder = self.decoders[which]
        as_param_style = torch.chunk(self.style_converters[which](style),
                                     self.num_adain_blocks * 2, dim=1)
        # set style for decoder
        _set_adain_style(decoder, as_param_style)
        return decoder(content)

    def forward(self, content_img, style_img, which_content, which_style):
        """Encode both inputs, then decode with the ``which_style`` decoder."""
        content, style = self.encode(content_img, style_img, which_content, which_style)
        return self.decode(content, style, which_style)