# MIT License

# Copyright (c) 2022 Intelligent Systems Lab Org

# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:

# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.

# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# File author: Zhenyu Li
import torch
import torch.nn as nn
import torch.nn.functional as F

from zoedepth.models.layers.swin_layers import G2LFusion
from zoedepth.models.layers.transformer import TransformerEncoder, TransformerEncoderLayer

from torchvision.ops import roi_align as torch_roi_align
class DoubleConvWOBN(nn.Module):
    """(convolution => ReLU) * 2, without BatchNorm (hence "WOBN")."""

    def __init__(self, in_channels, out_channels, mid_channels=None):
        super().__init__()
        if not mid_channels:
            mid_channels = out_channels
        self.double_conv = nn.Sequential(
            nn.Conv2d(in_channels, mid_channels, kernel_size=3, padding=1, bias=True),
            nn.ReLU(inplace=True),
            nn.Conv2d(mid_channels, out_channels, kernel_size=3, padding=1, bias=True),
            nn.ReLU(inplace=True))

    def forward(self, x):
        return self.double_conv(x)
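
# Quick shape sketch (an illustrative addition, not part of the original
# file): DoubleConvWOBN preserves spatial size (3x3 convs, padding=1) and,
# unlike DoubleConv below, uses biased convolutions with no BatchNorm.
def _example_double_conv_wobn():
    block = DoubleConvWOBN(in_channels=64, out_channels=32)
    x = torch.randn(1, 64, 48, 64)
    return block(x).shape  # torch.Size([1, 32, 48, 64])
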
class DoubleConv(nn.Module):
    """(convolution => [BN] => ReLU) * 2"""

    def __init__(self, in_channels, out_channels, mid_channels=None):
        super().__init__()
        if not mid_channels:
            mid_channels = out_channels
        self.double_conv = nn.Sequential(
            nn.Conv2d(in_channels, mid_channels, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(mid_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(mid_channels, out_channels, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True)
        )

    def forward(self, x):
        return self.double_conv(x)

class Down(nn.Module):
    """Downscaling with maxpool then double conv"""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.maxpool_conv = nn.Sequential(
            nn.MaxPool2d(2),
            DoubleConv(in_channels, out_channels)
        )

    def forward(self, x):
        return self.maxpool_conv(x)

class Upv1(nn.Module):
    """Upscaling then double conv"""

    def __init__(self, in_channels, out_channels, mid_channels=None):
        super().__init__()
        self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
        if mid_channels is not None:
            self.conv = DoubleConvWOBN(in_channels, out_channels, mid_channels)
        else:
            self.conv = DoubleConvWOBN(in_channels, out_channels, in_channels)

    def forward(self, x1, x2):
        # Upsample the coarse feature x1 by 2x, then fuse with the skip
        # feature x2; in_channels must equal channels(x1) + channels(x2).
        x1 = self.up(x1)
        x = torch.cat([x2, x1], dim=1)
        return self.conv(x)
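
# Illustrative sketch (an addition, not from the original file): with two
# 256-channel inputs, Upv1 needs in_channels=512 and returns the skip
# feature's resolution.
def _example_upv1():
    up = Upv1(in_channels=256 + 256, out_channels=256)
    x1 = torch.randn(1, 256, 12, 16)  # coarse feature, upsampled to 24x32
    x2 = torch.randn(1, 256, 24, 32)  # skip feature at the target resolution
    return up(x1, x2).shape  # torch.Size([1, 256, 24, 32])
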
class UNetv1(nn.Module):
    def __init__(self, n_channels, g2l, pos_embed=False, use_area_prior=True):
        super(UNetv1, self).__init__()
        self.n_channels = n_channels

        # Encoder: stem plus five downsampling stages.
        self.inc = DoubleConv(n_channels, 32)
        self.down1 = Down(32, 256)
        self.down2 = Down(256, 256)
        self.down3 = Down(256, 256)
        self.down4 = Down(256, 256)
        self.down5 = Down(256, 256)

        # Decoder: each stage takes the previous decoder output concatenated
        # with a 256-channel guide feature, plus the encoder skip connection.
        self.up1 = Upv1(256 + 256 + 256, 256, 384)
        self.up2 = Upv1(256 + 256 + 256, 256, 384)
        self.up3 = Upv1(256 + 256 + 256, 256, 384)
        self.up4 = Upv1(256 + 256 + 256, 256, 384)
        self.up5 = Upv1(256 + 32 + 256, 32, 272)

        self.g2l = g2l

        if self.g2l:
            self.g2l_att = nn.ModuleList()
            win = 12
            in_channels = [32, 256, 256, 256, 256, 256]
            crf_dims = [32, 256, 256, 256, 256, 256]

            # Global-to-local fusion at each scale; num_patches corresponds to
            # the feature-map size of a 384x512 input at that scale.
            self.g2l5 = G2LFusion(input_dim=in_channels[5], embed_dim=crf_dims[5], window_size=win, num_heads=32, depth=4, num_patches=12*16)
            self.g2l4 = G2LFusion(input_dim=in_channels[4], embed_dim=crf_dims[4], window_size=win, num_heads=32, depth=4, num_patches=24*32)
            self.g2l3 = G2LFusion(input_dim=in_channels[3], embed_dim=crf_dims[3], window_size=win, num_heads=16, depth=3, num_patches=48*64)
            self.g2l2 = G2LFusion(input_dim=in_channels[2], embed_dim=crf_dims[2], window_size=win, num_heads=16, depth=3, num_patches=96*128)
            self.g2l1 = G2LFusion(input_dim=in_channels[1], embed_dim=crf_dims[1], window_size=win, num_heads=8, depth=2, num_patches=192*256)
            self.g2l0 = G2LFusion(input_dim=in_channels[0], embed_dim=crf_dims[0], window_size=win, num_heads=8, depth=2, num_patches=384*512)

            # Fusion convs that merge the decoder feature with the roi-aligned
            # global feature at the same scale (hence the * 2 input width).
            self.conv5 = DoubleConvWOBN(in_channels[4] * 2, in_channels[4], in_channels[4])
            self.conv4 = DoubleConvWOBN(in_channels[4] * 2, in_channels[4], in_channels[4])
            self.conv3 = DoubleConvWOBN(in_channels[3] * 2, in_channels[3], in_channels[3])
            self.conv2 = DoubleConvWOBN(in_channels[2] * 2, in_channels[2], in_channels[2])
            self.conv1 = DoubleConvWOBN(in_channels[1] * 2, in_channels[1], in_channels[1])
            self.conv0 = DoubleConvWOBN(in_channels[0] * 2, in_channels[0], in_channels[0])
    def forward(self,
                input_tensor,
                guide_plus,
                guide_cat,
                crop_area_resize=None,
                bbox=None,
                fine_feat_crop=None,
                coarse_feat_whole=None,
                coarse_feat_whole_hack=None,
                coarse_feat_crop=None):
        # apply unscaled feat to swin
        if coarse_feat_whole_hack is not None:
            coarse_feat_whole = coarse_feat_whole_hack

        # Note: guide_plus, fine_feat_crop, and coarse_feat_crop are accepted
        # but not used in this forward pass.
        not_use_prior = crop_area_resize is None  # currently unused below

        # Encoder: six feature scales, each Down halving the resolution.
        x1 = self.inc(input_tensor)
        x2 = self.down1(x1)
        x3 = self.down2(x2)
        x4 = self.down3(x3)
        x5 = self.down4(x4)
        x6 = self.down5(x5)

        # Decoder: at each scale, optionally fuse a global-to-local feature.
        # G2LFusion attends over the whole image; roi_align then crops the
        # bbox region (given in full-image coordinates of a 384-high input)
        # to the local grid, e.g. spatial_scale=12/384 for the 12x16 map.
        if self.g2l:
            g2l_feat5 = self.g2l5(coarse_feat_whole[0], crop_area_resize[0])
            g2l_feat5 = torch_roi_align(g2l_feat5, bbox, (12, 16), 12/384, aligned=True)
            x6 = self.conv5(torch.cat([x6, g2l_feat5], dim=1))

        x5 = self.up1(torch.cat([x6, guide_cat[0]], dim=1), x5)
        if self.g2l:
            g2l_feat4 = self.g2l4(coarse_feat_whole[1], crop_area_resize[1])
            g2l_feat4 = torch_roi_align(g2l_feat4, bbox, (24, 32), 24/384, aligned=True)
            x5 = self.conv4(torch.cat([x5, g2l_feat4], dim=1))

        x4 = self.up2(torch.cat([x5, guide_cat[1]], dim=1), x4)
        if self.g2l:
            g2l_feat3 = self.g2l3(coarse_feat_whole[2], crop_area_resize[2])
            g2l_feat3 = torch_roi_align(g2l_feat3, bbox, (48, 64), 48/384, aligned=True)
            x4 = self.conv3(torch.cat([x4, g2l_feat3], dim=1))

        x3 = self.up3(torch.cat([x4, guide_cat[2]], dim=1), x3)
        if self.g2l:
            g2l_feat2 = self.g2l2(coarse_feat_whole[3], crop_area_resize[3])
            g2l_feat2 = torch_roi_align(g2l_feat2, bbox, (96, 128), 96/384, aligned=True)
            x3 = self.conv2(torch.cat([x3, g2l_feat2], dim=1))

        x2 = self.up4(torch.cat([x3, guide_cat[3]], dim=1), x2)
        if self.g2l:
            g2l_feat1 = self.g2l1(coarse_feat_whole[4], crop_area_resize[4])
            g2l_feat1 = torch_roi_align(g2l_feat1, bbox, (192, 256), 192/384, aligned=True)
            x2 = self.conv1(torch.cat([x2, g2l_feat1], dim=1))

        x1 = self.up5(torch.cat([x2, guide_cat[4]], dim=1), x1)
        if self.g2l:
            g2l_feat0 = self.g2l0(coarse_feat_whole[5], crop_area_resize[5])
            g2l_feat0 = torch_roi_align(g2l_feat0, bbox, (384, 512), 384/384, aligned=True)
            x1 = self.conv0(torch.cat([x1, g2l_feat0], dim=1))

        # Multi-scale output, finest (32-channel) to coarsest (256-channel).
        output = [x1, x2, x3, x4, x5, x6]
        return output
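
# Minimal end-to-end sketch (an assumption based on the channel math above,
# not part of the original file). With g2l=False the G2LFusion / roi_align
# branches are skipped, so only input_tensor and guide_cat are consumed;
# a 384x512 input matches the num_patches values used by the g2l branch.
def _example_unetv1():
    net = UNetv1(n_channels=3, g2l=False).eval()
    x = torch.randn(1, 3, 384, 512)
    # One 256-channel guide per decoder stage, coarsest (12x16) first.
    guide_cat = [
        torch.randn(1, 256, 12, 16),
        torch.randn(1, 256, 24, 32),
        torch.randn(1, 256, 48, 64),
        torch.randn(1, 256, 96, 128),
        torch.randn(1, 256, 192, 256),
    ]
    with torch.no_grad():
        feats = net(x, guide_plus=None, guide_cat=guide_cat)
    # Six multi-scale features, finest (1, 32, 384, 512) first.
    return [f.shape for f in feats]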