NeoPy committed on
Commit
e399705
·
verified ·
1 Parent(s): bc940cc

Delete infer/lib/uvr5_pack

Browse files
Files changed (43) hide show
  1. infer/lib/uvr5_pack/lib_v5/dataset.py +0 -183
  2. infer/lib/uvr5_pack/lib_v5/layers.py +0 -118
  3. infer/lib/uvr5_pack/lib_v5/layers_123812KB .py +0 -118
  4. infer/lib/uvr5_pack/lib_v5/layers_123821KB.py +0 -118
  5. infer/lib/uvr5_pack/lib_v5/layers_33966KB.py +0 -126
  6. infer/lib/uvr5_pack/lib_v5/layers_537227KB.py +0 -126
  7. infer/lib/uvr5_pack/lib_v5/layers_537238KB.py +0 -126
  8. infer/lib/uvr5_pack/lib_v5/layers_new.py +0 -125
  9. infer/lib/uvr5_pack/lib_v5/model_param_init.py +0 -69
  10. infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json +0 -19
  11. infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json +0 -19
  12. infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json +0 -19
  13. infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json +0 -19
  14. infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json +0 -19
  15. infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json +0 -19
  16. infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json +0 -19
  17. infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json +0 -30
  18. infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json +0 -30
  19. infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json +0 -30
  20. infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json +0 -42
  21. infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json +0 -43
  22. infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json +0 -43
  23. infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json +0 -54
  24. infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json +0 -55
  25. infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json +0 -55
  26. infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json +0 -55
  27. infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json +0 -55
  28. infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json +0 -55
  29. infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json +0 -54
  30. infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json +0 -55
  31. infer/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json +0 -54
  32. infer/lib/uvr5_pack/lib_v5/modelparams/ensemble.json +0 -43
  33. infer/lib/uvr5_pack/lib_v5/nets.py +0 -123
  34. infer/lib/uvr5_pack/lib_v5/nets_123812KB.py +0 -122
  35. infer/lib/uvr5_pack/lib_v5/nets_123821KB.py +0 -122
  36. infer/lib/uvr5_pack/lib_v5/nets_33966KB.py +0 -122
  37. infer/lib/uvr5_pack/lib_v5/nets_537227KB.py +0 -123
  38. infer/lib/uvr5_pack/lib_v5/nets_537238KB.py +0 -123
  39. infer/lib/uvr5_pack/lib_v5/nets_61968KB.py +0 -122
  40. infer/lib/uvr5_pack/lib_v5/nets_new.py +0 -133
  41. infer/lib/uvr5_pack/lib_v5/spec_utils.py +0 -672
  42. infer/lib/uvr5_pack/name_params.json +0 -263
  43. infer/lib/uvr5_pack/utils.py +0 -121
infer/lib/uvr5_pack/lib_v5/dataset.py DELETED
@@ -1,183 +0,0 @@
1
- import os
2
- import random
3
-
4
- import numpy as np
5
- import torch
6
- import torch.utils.data
7
- from tqdm import tqdm
8
-
9
- from . import spec_utils
10
-
11
-
12
- class VocalRemoverValidationSet(torch.utils.data.Dataset):
13
- def __init__(self, patch_list):
14
- self.patch_list = patch_list
15
-
16
- def __len__(self):
17
- return len(self.patch_list)
18
-
19
- def __getitem__(self, idx):
20
- path = self.patch_list[idx]
21
- data = np.load(path)
22
-
23
- X, y = data["X"], data["y"]
24
-
25
- X_mag = np.abs(X)
26
- y_mag = np.abs(y)
27
-
28
- return X_mag, y_mag
29
-
30
-
31
- def make_pair(mix_dir, inst_dir):
32
- input_exts = [".wav", ".m4a", ".mp3", ".mp4", ".flac"]
33
-
34
- X_list = sorted(
35
- [
36
- os.path.join(mix_dir, fname)
37
- for fname in os.listdir(mix_dir)
38
- if os.path.splitext(fname)[1] in input_exts
39
- ]
40
- )
41
- y_list = sorted(
42
- [
43
- os.path.join(inst_dir, fname)
44
- for fname in os.listdir(inst_dir)
45
- if os.path.splitext(fname)[1] in input_exts
46
- ]
47
- )
48
-
49
- filelist = list(zip(X_list, y_list))
50
-
51
- return filelist
52
-
53
-
54
- def train_val_split(dataset_dir, split_mode, val_rate, val_filelist):
55
- if split_mode == "random":
56
- filelist = make_pair(
57
- os.path.join(dataset_dir, "mixtures"),
58
- os.path.join(dataset_dir, "instruments"),
59
- )
60
-
61
- random.shuffle(filelist)
62
-
63
- if len(val_filelist) == 0:
64
- val_size = int(len(filelist) * val_rate)
65
- train_filelist = filelist[:-val_size]
66
- val_filelist = filelist[-val_size:]
67
- else:
68
- train_filelist = [
69
- pair for pair in filelist if list(pair) not in val_filelist
70
- ]
71
- elif split_mode == "subdirs":
72
- if len(val_filelist) != 0:
73
- raise ValueError(
74
- "The `val_filelist` option is not available in `subdirs` mode"
75
- )
76
-
77
- train_filelist = make_pair(
78
- os.path.join(dataset_dir, "training/mixtures"),
79
- os.path.join(dataset_dir, "training/instruments"),
80
- )
81
-
82
- val_filelist = make_pair(
83
- os.path.join(dataset_dir, "validation/mixtures"),
84
- os.path.join(dataset_dir, "validation/instruments"),
85
- )
86
-
87
- return train_filelist, val_filelist
88
-
89
-
90
- def augment(X, y, reduction_rate, reduction_mask, mixup_rate, mixup_alpha):
91
- perm = np.random.permutation(len(X))
92
- for i, idx in enumerate(tqdm(perm)):
93
- if np.random.uniform() < reduction_rate:
94
- y[idx] = spec_utils.reduce_vocal_aggressively(
95
- X[idx], y[idx], reduction_mask
96
- )
97
-
98
- if np.random.uniform() < 0.5:
99
- # swap channel
100
- X[idx] = X[idx, ::-1]
101
- y[idx] = y[idx, ::-1]
102
- if np.random.uniform() < 0.02:
103
- # mono
104
- X[idx] = X[idx].mean(axis=0, keepdims=True)
105
- y[idx] = y[idx].mean(axis=0, keepdims=True)
106
- if np.random.uniform() < 0.02:
107
- # inst
108
- X[idx] = y[idx]
109
-
110
- if np.random.uniform() < mixup_rate and i < len(perm) - 1:
111
- lam = np.random.beta(mixup_alpha, mixup_alpha)
112
- X[idx] = lam * X[idx] + (1 - lam) * X[perm[i + 1]]
113
- y[idx] = lam * y[idx] + (1 - lam) * y[perm[i + 1]]
114
-
115
- return X, y
116
-
117
-
118
- def make_padding(width, cropsize, offset):
119
- left = offset
120
- roi_size = cropsize - left * 2
121
- if roi_size == 0:
122
- roi_size = cropsize
123
- right = roi_size - (width % roi_size) + left
124
-
125
- return left, right, roi_size
126
-
127
-
128
- def make_training_set(filelist, cropsize, patches, sr, hop_length, n_fft, offset):
129
- len_dataset = patches * len(filelist)
130
-
131
- X_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
132
- y_dataset = np.zeros((len_dataset, 2, n_fft // 2 + 1, cropsize), dtype=np.complex64)
133
-
134
- for i, (X_path, y_path) in enumerate(tqdm(filelist)):
135
- X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
136
- coef = np.max([np.abs(X).max(), np.abs(y).max()])
137
- X, y = X / coef, y / coef
138
-
139
- l, r, roi_size = make_padding(X.shape[2], cropsize, offset)
140
- X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant")
141
- y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant")
142
-
143
- starts = np.random.randint(0, X_pad.shape[2] - cropsize, patches)
144
- ends = starts + cropsize
145
- for j in range(patches):
146
- idx = i * patches + j
147
- X_dataset[idx] = X_pad[:, :, starts[j] : ends[j]]
148
- y_dataset[idx] = y_pad[:, :, starts[j] : ends[j]]
149
-
150
- return X_dataset, y_dataset
151
-
152
-
153
- def make_validation_set(filelist, cropsize, sr, hop_length, n_fft, offset):
154
- patch_list = []
155
- patch_dir = "cs{}_sr{}_hl{}_nf{}_of{}".format(
156
- cropsize, sr, hop_length, n_fft, offset
157
- )
158
- os.makedirs(patch_dir, exist_ok=True)
159
-
160
- for i, (X_path, y_path) in enumerate(tqdm(filelist)):
161
- basename = os.path.splitext(os.path.basename(X_path))[0]
162
-
163
- X, y = spec_utils.cache_or_load(X_path, y_path, sr, hop_length, n_fft)
164
- coef = np.max([np.abs(X).max(), np.abs(y).max()])
165
- X, y = X / coef, y / coef
166
-
167
- l, r, roi_size = make_padding(X.shape[2], cropsize, offset)
168
- X_pad = np.pad(X, ((0, 0), (0, 0), (l, r)), mode="constant")
169
- y_pad = np.pad(y, ((0, 0), (0, 0), (l, r)), mode="constant")
170
-
171
- len_dataset = int(np.ceil(X.shape[2] / roi_size))
172
- for j in range(len_dataset):
173
- outpath = os.path.join(patch_dir, "{}_p{}.npz".format(basename, j))
174
- start = j * roi_size
175
- if not os.path.exists(outpath):
176
- np.savez(
177
- outpath,
178
- X=X_pad[:, :, start : start + cropsize],
179
- y=y_pad[:, :, start : start + cropsize],
180
- )
181
- patch_list.append(outpath)
182
-
183
- return VocalRemoverValidationSet(patch_list)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/layers.py DELETED
@@ -1,118 +0,0 @@
1
- import torch
2
- import torch.nn.functional as F
3
- from torch import nn
4
-
5
- from . import spec_utils
6
-
7
-
8
- class Conv2DBNActiv(nn.Module):
9
- def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
10
- super(Conv2DBNActiv, self).__init__()
11
- self.conv = nn.Sequential(
12
- nn.Conv2d(
13
- nin,
14
- nout,
15
- kernel_size=ksize,
16
- stride=stride,
17
- padding=pad,
18
- dilation=dilation,
19
- bias=False,
20
- ),
21
- nn.BatchNorm2d(nout),
22
- activ(),
23
- )
24
-
25
- def __call__(self, x):
26
- return self.conv(x)
27
-
28
-
29
- class SeperableConv2DBNActiv(nn.Module):
30
- def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
31
- super(SeperableConv2DBNActiv, self).__init__()
32
- self.conv = nn.Sequential(
33
- nn.Conv2d(
34
- nin,
35
- nin,
36
- kernel_size=ksize,
37
- stride=stride,
38
- padding=pad,
39
- dilation=dilation,
40
- groups=nin,
41
- bias=False,
42
- ),
43
- nn.Conv2d(nin, nout, kernel_size=1, bias=False),
44
- nn.BatchNorm2d(nout),
45
- activ(),
46
- )
47
-
48
- def __call__(self, x):
49
- return self.conv(x)
50
-
51
-
52
- class Encoder(nn.Module):
53
- def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
54
- super(Encoder, self).__init__()
55
- self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
56
- self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
57
-
58
- def __call__(self, x):
59
- skip = self.conv1(x)
60
- h = self.conv2(skip)
61
-
62
- return h, skip
63
-
64
-
65
- class Decoder(nn.Module):
66
- def __init__(
67
- self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
68
- ):
69
- super(Decoder, self).__init__()
70
- self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
71
- self.dropout = nn.Dropout2d(0.1) if dropout else None
72
-
73
- def __call__(self, x, skip=None):
74
- x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
75
- if skip is not None:
76
- skip = spec_utils.crop_center(skip, x)
77
- x = torch.cat([x, skip], dim=1)
78
- h = self.conv(x)
79
-
80
- if self.dropout is not None:
81
- h = self.dropout(h)
82
-
83
- return h
84
-
85
-
86
- class ASPPModule(nn.Module):
87
- def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
88
- super(ASPPModule, self).__init__()
89
- self.conv1 = nn.Sequential(
90
- nn.AdaptiveAvgPool2d((1, None)),
91
- Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
92
- )
93
- self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
94
- self.conv3 = SeperableConv2DBNActiv(
95
- nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
96
- )
97
- self.conv4 = SeperableConv2DBNActiv(
98
- nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
99
- )
100
- self.conv5 = SeperableConv2DBNActiv(
101
- nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
102
- )
103
- self.bottleneck = nn.Sequential(
104
- Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
105
- )
106
-
107
- def forward(self, x):
108
- _, _, h, w = x.size()
109
- feat1 = F.interpolate(
110
- self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
111
- )
112
- feat2 = self.conv2(x)
113
- feat3 = self.conv3(x)
114
- feat4 = self.conv4(x)
115
- feat5 = self.conv5(x)
116
- out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
117
- bottle = self.bottleneck(out)
118
- return bottle
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/layers_123812KB .py DELETED
@@ -1,118 +0,0 @@
1
- import torch
2
- import torch.nn.functional as F
3
- from torch import nn
4
-
5
- from . import spec_utils
6
-
7
-
8
- class Conv2DBNActiv(nn.Module):
9
- def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
10
- super(Conv2DBNActiv, self).__init__()
11
- self.conv = nn.Sequential(
12
- nn.Conv2d(
13
- nin,
14
- nout,
15
- kernel_size=ksize,
16
- stride=stride,
17
- padding=pad,
18
- dilation=dilation,
19
- bias=False,
20
- ),
21
- nn.BatchNorm2d(nout),
22
- activ(),
23
- )
24
-
25
- def __call__(self, x):
26
- return self.conv(x)
27
-
28
-
29
- class SeperableConv2DBNActiv(nn.Module):
30
- def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
31
- super(SeperableConv2DBNActiv, self).__init__()
32
- self.conv = nn.Sequential(
33
- nn.Conv2d(
34
- nin,
35
- nin,
36
- kernel_size=ksize,
37
- stride=stride,
38
- padding=pad,
39
- dilation=dilation,
40
- groups=nin,
41
- bias=False,
42
- ),
43
- nn.Conv2d(nin, nout, kernel_size=1, bias=False),
44
- nn.BatchNorm2d(nout),
45
- activ(),
46
- )
47
-
48
- def __call__(self, x):
49
- return self.conv(x)
50
-
51
-
52
- class Encoder(nn.Module):
53
- def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
54
- super(Encoder, self).__init__()
55
- self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
56
- self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
57
-
58
- def __call__(self, x):
59
- skip = self.conv1(x)
60
- h = self.conv2(skip)
61
-
62
- return h, skip
63
-
64
-
65
- class Decoder(nn.Module):
66
- def __init__(
67
- self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
68
- ):
69
- super(Decoder, self).__init__()
70
- self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
71
- self.dropout = nn.Dropout2d(0.1) if dropout else None
72
-
73
- def __call__(self, x, skip=None):
74
- x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
75
- if skip is not None:
76
- skip = spec_utils.crop_center(skip, x)
77
- x = torch.cat([x, skip], dim=1)
78
- h = self.conv(x)
79
-
80
- if self.dropout is not None:
81
- h = self.dropout(h)
82
-
83
- return h
84
-
85
-
86
- class ASPPModule(nn.Module):
87
- def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
88
- super(ASPPModule, self).__init__()
89
- self.conv1 = nn.Sequential(
90
- nn.AdaptiveAvgPool2d((1, None)),
91
- Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
92
- )
93
- self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
94
- self.conv3 = SeperableConv2DBNActiv(
95
- nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
96
- )
97
- self.conv4 = SeperableConv2DBNActiv(
98
- nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
99
- )
100
- self.conv5 = SeperableConv2DBNActiv(
101
- nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
102
- )
103
- self.bottleneck = nn.Sequential(
104
- Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
105
- )
106
-
107
- def forward(self, x):
108
- _, _, h, w = x.size()
109
- feat1 = F.interpolate(
110
- self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
111
- )
112
- feat2 = self.conv2(x)
113
- feat3 = self.conv3(x)
114
- feat4 = self.conv4(x)
115
- feat5 = self.conv5(x)
116
- out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
117
- bottle = self.bottleneck(out)
118
- return bottle
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/layers_123821KB.py DELETED
@@ -1,118 +0,0 @@
1
- import torch
2
- import torch.nn.functional as F
3
- from torch import nn
4
-
5
- from . import spec_utils
6
-
7
-
8
- class Conv2DBNActiv(nn.Module):
9
- def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
10
- super(Conv2DBNActiv, self).__init__()
11
- self.conv = nn.Sequential(
12
- nn.Conv2d(
13
- nin,
14
- nout,
15
- kernel_size=ksize,
16
- stride=stride,
17
- padding=pad,
18
- dilation=dilation,
19
- bias=False,
20
- ),
21
- nn.BatchNorm2d(nout),
22
- activ(),
23
- )
24
-
25
- def __call__(self, x):
26
- return self.conv(x)
27
-
28
-
29
- class SeperableConv2DBNActiv(nn.Module):
30
- def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
31
- super(SeperableConv2DBNActiv, self).__init__()
32
- self.conv = nn.Sequential(
33
- nn.Conv2d(
34
- nin,
35
- nin,
36
- kernel_size=ksize,
37
- stride=stride,
38
- padding=pad,
39
- dilation=dilation,
40
- groups=nin,
41
- bias=False,
42
- ),
43
- nn.Conv2d(nin, nout, kernel_size=1, bias=False),
44
- nn.BatchNorm2d(nout),
45
- activ(),
46
- )
47
-
48
- def __call__(self, x):
49
- return self.conv(x)
50
-
51
-
52
- class Encoder(nn.Module):
53
- def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
54
- super(Encoder, self).__init__()
55
- self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
56
- self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
57
-
58
- def __call__(self, x):
59
- skip = self.conv1(x)
60
- h = self.conv2(skip)
61
-
62
- return h, skip
63
-
64
-
65
- class Decoder(nn.Module):
66
- def __init__(
67
- self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
68
- ):
69
- super(Decoder, self).__init__()
70
- self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
71
- self.dropout = nn.Dropout2d(0.1) if dropout else None
72
-
73
- def __call__(self, x, skip=None):
74
- x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
75
- if skip is not None:
76
- skip = spec_utils.crop_center(skip, x)
77
- x = torch.cat([x, skip], dim=1)
78
- h = self.conv(x)
79
-
80
- if self.dropout is not None:
81
- h = self.dropout(h)
82
-
83
- return h
84
-
85
-
86
- class ASPPModule(nn.Module):
87
- def __init__(self, nin, nout, dilations=(4, 8, 16), activ=nn.ReLU):
88
- super(ASPPModule, self).__init__()
89
- self.conv1 = nn.Sequential(
90
- nn.AdaptiveAvgPool2d((1, None)),
91
- Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
92
- )
93
- self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
94
- self.conv3 = SeperableConv2DBNActiv(
95
- nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
96
- )
97
- self.conv4 = SeperableConv2DBNActiv(
98
- nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
99
- )
100
- self.conv5 = SeperableConv2DBNActiv(
101
- nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
102
- )
103
- self.bottleneck = nn.Sequential(
104
- Conv2DBNActiv(nin * 5, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
105
- )
106
-
107
- def forward(self, x):
108
- _, _, h, w = x.size()
109
- feat1 = F.interpolate(
110
- self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
111
- )
112
- feat2 = self.conv2(x)
113
- feat3 = self.conv3(x)
114
- feat4 = self.conv4(x)
115
- feat5 = self.conv5(x)
116
- out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
117
- bottle = self.bottleneck(out)
118
- return bottle
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/layers_33966KB.py DELETED
@@ -1,126 +0,0 @@
1
- import torch
2
- import torch.nn.functional as F
3
- from torch import nn
4
-
5
- from . import spec_utils
6
-
7
-
8
- class Conv2DBNActiv(nn.Module):
9
- def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
10
- super(Conv2DBNActiv, self).__init__()
11
- self.conv = nn.Sequential(
12
- nn.Conv2d(
13
- nin,
14
- nout,
15
- kernel_size=ksize,
16
- stride=stride,
17
- padding=pad,
18
- dilation=dilation,
19
- bias=False,
20
- ),
21
- nn.BatchNorm2d(nout),
22
- activ(),
23
- )
24
-
25
- def __call__(self, x):
26
- return self.conv(x)
27
-
28
-
29
- class SeperableConv2DBNActiv(nn.Module):
30
- def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
31
- super(SeperableConv2DBNActiv, self).__init__()
32
- self.conv = nn.Sequential(
33
- nn.Conv2d(
34
- nin,
35
- nin,
36
- kernel_size=ksize,
37
- stride=stride,
38
- padding=pad,
39
- dilation=dilation,
40
- groups=nin,
41
- bias=False,
42
- ),
43
- nn.Conv2d(nin, nout, kernel_size=1, bias=False),
44
- nn.BatchNorm2d(nout),
45
- activ(),
46
- )
47
-
48
- def __call__(self, x):
49
- return self.conv(x)
50
-
51
-
52
- class Encoder(nn.Module):
53
- def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
54
- super(Encoder, self).__init__()
55
- self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
56
- self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
57
-
58
- def __call__(self, x):
59
- skip = self.conv1(x)
60
- h = self.conv2(skip)
61
-
62
- return h, skip
63
-
64
-
65
- class Decoder(nn.Module):
66
- def __init__(
67
- self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
68
- ):
69
- super(Decoder, self).__init__()
70
- self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
71
- self.dropout = nn.Dropout2d(0.1) if dropout else None
72
-
73
- def __call__(self, x, skip=None):
74
- x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
75
- if skip is not None:
76
- skip = spec_utils.crop_center(skip, x)
77
- x = torch.cat([x, skip], dim=1)
78
- h = self.conv(x)
79
-
80
- if self.dropout is not None:
81
- h = self.dropout(h)
82
-
83
- return h
84
-
85
-
86
- class ASPPModule(nn.Module):
87
- def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
88
- super(ASPPModule, self).__init__()
89
- self.conv1 = nn.Sequential(
90
- nn.AdaptiveAvgPool2d((1, None)),
91
- Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
92
- )
93
- self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
94
- self.conv3 = SeperableConv2DBNActiv(
95
- nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
96
- )
97
- self.conv4 = SeperableConv2DBNActiv(
98
- nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
99
- )
100
- self.conv5 = SeperableConv2DBNActiv(
101
- nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
102
- )
103
- self.conv6 = SeperableConv2DBNActiv(
104
- nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
105
- )
106
- self.conv7 = SeperableConv2DBNActiv(
107
- nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
108
- )
109
- self.bottleneck = nn.Sequential(
110
- Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
111
- )
112
-
113
- def forward(self, x):
114
- _, _, h, w = x.size()
115
- feat1 = F.interpolate(
116
- self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
117
- )
118
- feat2 = self.conv2(x)
119
- feat3 = self.conv3(x)
120
- feat4 = self.conv4(x)
121
- feat5 = self.conv5(x)
122
- feat6 = self.conv6(x)
123
- feat7 = self.conv7(x)
124
- out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1)
125
- bottle = self.bottleneck(out)
126
- return bottle
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/layers_537227KB.py DELETED
@@ -1,126 +0,0 @@
1
- import torch
2
- import torch.nn.functional as F
3
- from torch import nn
4
-
5
- from . import spec_utils
6
-
7
-
8
- class Conv2DBNActiv(nn.Module):
9
- def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
10
- super(Conv2DBNActiv, self).__init__()
11
- self.conv = nn.Sequential(
12
- nn.Conv2d(
13
- nin,
14
- nout,
15
- kernel_size=ksize,
16
- stride=stride,
17
- padding=pad,
18
- dilation=dilation,
19
- bias=False,
20
- ),
21
- nn.BatchNorm2d(nout),
22
- activ(),
23
- )
24
-
25
- def __call__(self, x):
26
- return self.conv(x)
27
-
28
-
29
- class SeperableConv2DBNActiv(nn.Module):
30
- def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
31
- super(SeperableConv2DBNActiv, self).__init__()
32
- self.conv = nn.Sequential(
33
- nn.Conv2d(
34
- nin,
35
- nin,
36
- kernel_size=ksize,
37
- stride=stride,
38
- padding=pad,
39
- dilation=dilation,
40
- groups=nin,
41
- bias=False,
42
- ),
43
- nn.Conv2d(nin, nout, kernel_size=1, bias=False),
44
- nn.BatchNorm2d(nout),
45
- activ(),
46
- )
47
-
48
- def __call__(self, x):
49
- return self.conv(x)
50
-
51
-
52
- class Encoder(nn.Module):
53
- def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
54
- super(Encoder, self).__init__()
55
- self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
56
- self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
57
-
58
- def __call__(self, x):
59
- skip = self.conv1(x)
60
- h = self.conv2(skip)
61
-
62
- return h, skip
63
-
64
-
65
- class Decoder(nn.Module):
66
- def __init__(
67
- self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
68
- ):
69
- super(Decoder, self).__init__()
70
- self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
71
- self.dropout = nn.Dropout2d(0.1) if dropout else None
72
-
73
- def __call__(self, x, skip=None):
74
- x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
75
- if skip is not None:
76
- skip = spec_utils.crop_center(skip, x)
77
- x = torch.cat([x, skip], dim=1)
78
- h = self.conv(x)
79
-
80
- if self.dropout is not None:
81
- h = self.dropout(h)
82
-
83
- return h
84
-
85
-
86
- class ASPPModule(nn.Module):
87
- def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
88
- super(ASPPModule, self).__init__()
89
- self.conv1 = nn.Sequential(
90
- nn.AdaptiveAvgPool2d((1, None)),
91
- Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
92
- )
93
- self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
94
- self.conv3 = SeperableConv2DBNActiv(
95
- nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
96
- )
97
- self.conv4 = SeperableConv2DBNActiv(
98
- nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
99
- )
100
- self.conv5 = SeperableConv2DBNActiv(
101
- nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
102
- )
103
- self.conv6 = SeperableConv2DBNActiv(
104
- nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
105
- )
106
- self.conv7 = SeperableConv2DBNActiv(
107
- nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
108
- )
109
- self.bottleneck = nn.Sequential(
110
- Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
111
- )
112
-
113
- def forward(self, x):
114
- _, _, h, w = x.size()
115
- feat1 = F.interpolate(
116
- self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
117
- )
118
- feat2 = self.conv2(x)
119
- feat3 = self.conv3(x)
120
- feat4 = self.conv4(x)
121
- feat5 = self.conv5(x)
122
- feat6 = self.conv6(x)
123
- feat7 = self.conv7(x)
124
- out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1)
125
- bottle = self.bottleneck(out)
126
- return bottle
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/layers_537238KB.py DELETED
@@ -1,126 +0,0 @@
1
- import torch
2
- import torch.nn.functional as F
3
- from torch import nn
4
-
5
- from . import spec_utils
6
-
7
-
8
- class Conv2DBNActiv(nn.Module):
9
- def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
10
- super(Conv2DBNActiv, self).__init__()
11
- self.conv = nn.Sequential(
12
- nn.Conv2d(
13
- nin,
14
- nout,
15
- kernel_size=ksize,
16
- stride=stride,
17
- padding=pad,
18
- dilation=dilation,
19
- bias=False,
20
- ),
21
- nn.BatchNorm2d(nout),
22
- activ(),
23
- )
24
-
25
- def __call__(self, x):
26
- return self.conv(x)
27
-
28
-
29
- class SeperableConv2DBNActiv(nn.Module):
30
- def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
31
- super(SeperableConv2DBNActiv, self).__init__()
32
- self.conv = nn.Sequential(
33
- nn.Conv2d(
34
- nin,
35
- nin,
36
- kernel_size=ksize,
37
- stride=stride,
38
- padding=pad,
39
- dilation=dilation,
40
- groups=nin,
41
- bias=False,
42
- ),
43
- nn.Conv2d(nin, nout, kernel_size=1, bias=False),
44
- nn.BatchNorm2d(nout),
45
- activ(),
46
- )
47
-
48
- def __call__(self, x):
49
- return self.conv(x)
50
-
51
-
52
- class Encoder(nn.Module):
53
- def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
54
- super(Encoder, self).__init__()
55
- self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
56
- self.conv2 = Conv2DBNActiv(nout, nout, ksize, stride, pad, activ=activ)
57
-
58
- def __call__(self, x):
59
- skip = self.conv1(x)
60
- h = self.conv2(skip)
61
-
62
- return h, skip
63
-
64
-
65
- class Decoder(nn.Module):
66
- def __init__(
67
- self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
68
- ):
69
- super(Decoder, self).__init__()
70
- self.conv = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
71
- self.dropout = nn.Dropout2d(0.1) if dropout else None
72
-
73
- def __call__(self, x, skip=None):
74
- x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
75
- if skip is not None:
76
- skip = spec_utils.crop_center(skip, x)
77
- x = torch.cat([x, skip], dim=1)
78
- h = self.conv(x)
79
-
80
- if self.dropout is not None:
81
- h = self.dropout(h)
82
-
83
- return h
84
-
85
-
86
- class ASPPModule(nn.Module):
87
- def __init__(self, nin, nout, dilations=(4, 8, 16, 32, 64), activ=nn.ReLU):
88
- super(ASPPModule, self).__init__()
89
- self.conv1 = nn.Sequential(
90
- nn.AdaptiveAvgPool2d((1, None)),
91
- Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ),
92
- )
93
- self.conv2 = Conv2DBNActiv(nin, nin, 1, 1, 0, activ=activ)
94
- self.conv3 = SeperableConv2DBNActiv(
95
- nin, nin, 3, 1, dilations[0], dilations[0], activ=activ
96
- )
97
- self.conv4 = SeperableConv2DBNActiv(
98
- nin, nin, 3, 1, dilations[1], dilations[1], activ=activ
99
- )
100
- self.conv5 = SeperableConv2DBNActiv(
101
- nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
102
- )
103
- self.conv6 = SeperableConv2DBNActiv(
104
- nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
105
- )
106
- self.conv7 = SeperableConv2DBNActiv(
107
- nin, nin, 3, 1, dilations[2], dilations[2], activ=activ
108
- )
109
- self.bottleneck = nn.Sequential(
110
- Conv2DBNActiv(nin * 7, nout, 1, 1, 0, activ=activ), nn.Dropout2d(0.1)
111
- )
112
-
113
- def forward(self, x):
114
- _, _, h, w = x.size()
115
- feat1 = F.interpolate(
116
- self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
117
- )
118
- feat2 = self.conv2(x)
119
- feat3 = self.conv3(x)
120
- feat4 = self.conv4(x)
121
- feat5 = self.conv5(x)
122
- feat6 = self.conv6(x)
123
- feat7 = self.conv7(x)
124
- out = torch.cat((feat1, feat2, feat3, feat4, feat5, feat6, feat7), dim=1)
125
- bottle = self.bottleneck(out)
126
- return bottle
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/layers_new.py DELETED
@@ -1,125 +0,0 @@
1
- import torch
2
- import torch.nn.functional as F
3
- from torch import nn
4
-
5
- from . import spec_utils
6
-
7
-
8
- class Conv2DBNActiv(nn.Module):
9
- def __init__(self, nin, nout, ksize=3, stride=1, pad=1, dilation=1, activ=nn.ReLU):
10
- super(Conv2DBNActiv, self).__init__()
11
- self.conv = nn.Sequential(
12
- nn.Conv2d(
13
- nin,
14
- nout,
15
- kernel_size=ksize,
16
- stride=stride,
17
- padding=pad,
18
- dilation=dilation,
19
- bias=False,
20
- ),
21
- nn.BatchNorm2d(nout),
22
- activ(),
23
- )
24
-
25
- def __call__(self, x):
26
- return self.conv(x)
27
-
28
-
29
- class Encoder(nn.Module):
30
- def __init__(self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.LeakyReLU):
31
- super(Encoder, self).__init__()
32
- self.conv1 = Conv2DBNActiv(nin, nout, ksize, stride, pad, activ=activ)
33
- self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)
34
-
35
- def __call__(self, x):
36
- h = self.conv1(x)
37
- h = self.conv2(h)
38
-
39
- return h
40
-
41
-
42
- class Decoder(nn.Module):
43
- def __init__(
44
- self, nin, nout, ksize=3, stride=1, pad=1, activ=nn.ReLU, dropout=False
45
- ):
46
- super(Decoder, self).__init__()
47
- self.conv1 = Conv2DBNActiv(nin, nout, ksize, 1, pad, activ=activ)
48
- # self.conv2 = Conv2DBNActiv(nout, nout, ksize, 1, pad, activ=activ)
49
- self.dropout = nn.Dropout2d(0.1) if dropout else None
50
-
51
- def __call__(self, x, skip=None):
52
- x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=True)
53
-
54
- if skip is not None:
55
- skip = spec_utils.crop_center(skip, x)
56
- x = torch.cat([x, skip], dim=1)
57
-
58
- h = self.conv1(x)
59
- # h = self.conv2(h)
60
-
61
- if self.dropout is not None:
62
- h = self.dropout(h)
63
-
64
- return h
65
-
66
-
67
- class ASPPModule(nn.Module):
68
- def __init__(self, nin, nout, dilations=(4, 8, 12), activ=nn.ReLU, dropout=False):
69
- super(ASPPModule, self).__init__()
70
- self.conv1 = nn.Sequential(
71
- nn.AdaptiveAvgPool2d((1, None)),
72
- Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ),
73
- )
74
- self.conv2 = Conv2DBNActiv(nin, nout, 1, 1, 0, activ=activ)
75
- self.conv3 = Conv2DBNActiv(
76
- nin, nout, 3, 1, dilations[0], dilations[0], activ=activ
77
- )
78
- self.conv4 = Conv2DBNActiv(
79
- nin, nout, 3, 1, dilations[1], dilations[1], activ=activ
80
- )
81
- self.conv5 = Conv2DBNActiv(
82
- nin, nout, 3, 1, dilations[2], dilations[2], activ=activ
83
- )
84
- self.bottleneck = Conv2DBNActiv(nout * 5, nout, 1, 1, 0, activ=activ)
85
- self.dropout = nn.Dropout2d(0.1) if dropout else None
86
-
87
- def forward(self, x):
88
- _, _, h, w = x.size()
89
- feat1 = F.interpolate(
90
- self.conv1(x), size=(h, w), mode="bilinear", align_corners=True
91
- )
92
- feat2 = self.conv2(x)
93
- feat3 = self.conv3(x)
94
- feat4 = self.conv4(x)
95
- feat5 = self.conv5(x)
96
- out = torch.cat((feat1, feat2, feat3, feat4, feat5), dim=1)
97
- out = self.bottleneck(out)
98
-
99
- if self.dropout is not None:
100
- out = self.dropout(out)
101
-
102
- return out
103
-
104
-
105
- class LSTMModule(nn.Module):
106
- def __init__(self, nin_conv, nin_lstm, nout_lstm):
107
- super(LSTMModule, self).__init__()
108
- self.conv = Conv2DBNActiv(nin_conv, 1, 1, 1, 0)
109
- self.lstm = nn.LSTM(
110
- input_size=nin_lstm, hidden_size=nout_lstm // 2, bidirectional=True
111
- )
112
- self.dense = nn.Sequential(
113
- nn.Linear(nout_lstm, nin_lstm), nn.BatchNorm1d(nin_lstm), nn.ReLU()
114
- )
115
-
116
- def forward(self, x):
117
- N, _, nbins, nframes = x.size()
118
- h = self.conv(x)[:, 0] # N, nbins, nframes
119
- h = h.permute(2, 0, 1) # nframes, N, nbins
120
- h, _ = self.lstm(h)
121
- h = self.dense(h.reshape(-1, h.size()[-1])) # nframes * N, nbins
122
- h = h.reshape(nframes, N, 1, nbins)
123
- h = h.permute(1, 2, 3, 0)
124
-
125
- return h
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/model_param_init.py DELETED
@@ -1,69 +0,0 @@
1
- import json
2
- import os
3
- import pathlib
4
-
5
- default_param = {}
6
- default_param["bins"] = 768
7
- default_param["unstable_bins"] = 9 # training only
8
- default_param["reduction_bins"] = 762 # training only
9
- default_param["sr"] = 44100
10
- default_param["pre_filter_start"] = 757
11
- default_param["pre_filter_stop"] = 768
12
- default_param["band"] = {}
13
-
14
-
15
- default_param["band"][1] = {
16
- "sr": 11025,
17
- "hl": 128,
18
- "n_fft": 960,
19
- "crop_start": 0,
20
- "crop_stop": 245,
21
- "lpf_start": 61, # inference only
22
- "res_type": "polyphase",
23
- }
24
-
25
- default_param["band"][2] = {
26
- "sr": 44100,
27
- "hl": 512,
28
- "n_fft": 1536,
29
- "crop_start": 24,
30
- "crop_stop": 547,
31
- "hpf_start": 81, # inference only
32
- "res_type": "sinc_best",
33
- }
34
-
35
-
36
- def int_keys(d):
37
- r = {}
38
- for k, v in d:
39
- if k.isdigit():
40
- k = int(k)
41
- r[k] = v
42
- return r
43
-
44
-
45
- class ModelParameters(object):
46
- def __init__(self, config_path=""):
47
- if ".pth" == pathlib.Path(config_path).suffix:
48
- import zipfile
49
-
50
- with zipfile.ZipFile(config_path, "r") as zip:
51
- self.param = json.loads(
52
- zip.read("param.json"), object_pairs_hook=int_keys
53
- )
54
- elif ".json" == pathlib.Path(config_path).suffix:
55
- with open(config_path, "r") as f:
56
- self.param = json.loads(f.read(), object_pairs_hook=int_keys)
57
- else:
58
- self.param = default_param
59
-
60
- for k in [
61
- "mid_side",
62
- "mid_side_b",
63
- "mid_side_b2",
64
- "stereo_w",
65
- "stereo_n",
66
- "reverse",
67
- ]:
68
- if not k in self.param:
69
- self.param[k] = False
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json DELETED
@@ -1,19 +0,0 @@
1
- {
2
- "bins": 1024,
3
- "unstable_bins": 0,
4
- "reduction_bins": 0,
5
- "band": {
6
- "1": {
7
- "sr": 16000,
8
- "hl": 512,
9
- "n_fft": 2048,
10
- "crop_start": 0,
11
- "crop_stop": 1024,
12
- "hpf_start": -1,
13
- "res_type": "sinc_best"
14
- }
15
- },
16
- "sr": 16000,
17
- "pre_filter_start": 1023,
18
- "pre_filter_stop": 1024
19
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json DELETED
@@ -1,19 +0,0 @@
1
- {
2
- "bins": 1024,
3
- "unstable_bins": 0,
4
- "reduction_bins": 0,
5
- "band": {
6
- "1": {
7
- "sr": 32000,
8
- "hl": 512,
9
- "n_fft": 2048,
10
- "crop_start": 0,
11
- "crop_stop": 1024,
12
- "hpf_start": -1,
13
- "res_type": "kaiser_fast"
14
- }
15
- },
16
- "sr": 32000,
17
- "pre_filter_start": 1000,
18
- "pre_filter_stop": 1021
19
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json DELETED
@@ -1,19 +0,0 @@
1
- {
2
- "bins": 1024,
3
- "unstable_bins": 0,
4
- "reduction_bins": 0,
5
- "band": {
6
- "1": {
7
- "sr": 33075,
8
- "hl": 384,
9
- "n_fft": 2048,
10
- "crop_start": 0,
11
- "crop_stop": 1024,
12
- "hpf_start": -1,
13
- "res_type": "sinc_best"
14
- }
15
- },
16
- "sr": 33075,
17
- "pre_filter_start": 1000,
18
- "pre_filter_stop": 1021
19
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json DELETED
@@ -1,19 +0,0 @@
1
- {
2
- "bins": 1024,
3
- "unstable_bins": 0,
4
- "reduction_bins": 0,
5
- "band": {
6
- "1": {
7
- "sr": 44100,
8
- "hl": 1024,
9
- "n_fft": 2048,
10
- "crop_start": 0,
11
- "crop_stop": 1024,
12
- "hpf_start": -1,
13
- "res_type": "sinc_best"
14
- }
15
- },
16
- "sr": 44100,
17
- "pre_filter_start": 1023,
18
- "pre_filter_stop": 1024
19
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json DELETED
@@ -1,19 +0,0 @@
1
- {
2
- "bins": 256,
3
- "unstable_bins": 0,
4
- "reduction_bins": 0,
5
- "band": {
6
- "1": {
7
- "sr": 44100,
8
- "hl": 256,
9
- "n_fft": 512,
10
- "crop_start": 0,
11
- "crop_stop": 256,
12
- "hpf_start": -1,
13
- "res_type": "sinc_best"
14
- }
15
- },
16
- "sr": 44100,
17
- "pre_filter_start": 256,
18
- "pre_filter_stop": 256
19
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json DELETED
@@ -1,19 +0,0 @@
1
- {
2
- "bins": 1024,
3
- "unstable_bins": 0,
4
- "reduction_bins": 0,
5
- "band": {
6
- "1": {
7
- "sr": 44100,
8
- "hl": 512,
9
- "n_fft": 2048,
10
- "crop_start": 0,
11
- "crop_stop": 1024,
12
- "hpf_start": -1,
13
- "res_type": "sinc_best"
14
- }
15
- },
16
- "sr": 44100,
17
- "pre_filter_start": 1023,
18
- "pre_filter_stop": 1024
19
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512_cut.json DELETED
@@ -1,19 +0,0 @@
1
- {
2
- "bins": 1024,
3
- "unstable_bins": 0,
4
- "reduction_bins": 0,
5
- "band": {
6
- "1": {
7
- "sr": 44100,
8
- "hl": 512,
9
- "n_fft": 2048,
10
- "crop_start": 0,
11
- "crop_stop": 700,
12
- "hpf_start": -1,
13
- "res_type": "sinc_best"
14
- }
15
- },
16
- "sr": 44100,
17
- "pre_filter_start": 1023,
18
- "pre_filter_stop": 700
19
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json DELETED
@@ -1,30 +0,0 @@
1
- {
2
- "bins": 768,
3
- "unstable_bins": 7,
4
- "reduction_bins": 705,
5
- "band": {
6
- "1": {
7
- "sr": 6000,
8
- "hl": 66,
9
- "n_fft": 512,
10
- "crop_start": 0,
11
- "crop_stop": 240,
12
- "lpf_start": 60,
13
- "lpf_stop": 118,
14
- "res_type": "sinc_fastest"
15
- },
16
- "2": {
17
- "sr": 32000,
18
- "hl": 352,
19
- "n_fft": 1024,
20
- "crop_start": 22,
21
- "crop_stop": 505,
22
- "hpf_start": 44,
23
- "hpf_stop": 23,
24
- "res_type": "sinc_medium"
25
- }
26
- },
27
- "sr": 32000,
28
- "pre_filter_start": 710,
29
- "pre_filter_stop": 731
30
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json DELETED
@@ -1,30 +0,0 @@
1
- {
2
- "bins": 512,
3
- "unstable_bins": 7,
4
- "reduction_bins": 510,
5
- "band": {
6
- "1": {
7
- "sr": 11025,
8
- "hl": 160,
9
- "n_fft": 768,
10
- "crop_start": 0,
11
- "crop_stop": 192,
12
- "lpf_start": 41,
13
- "lpf_stop": 139,
14
- "res_type": "sinc_fastest"
15
- },
16
- "2": {
17
- "sr": 44100,
18
- "hl": 640,
19
- "n_fft": 1024,
20
- "crop_start": 10,
21
- "crop_stop": 320,
22
- "hpf_start": 47,
23
- "hpf_stop": 15,
24
- "res_type": "sinc_medium"
25
- }
26
- },
27
- "sr": 44100,
28
- "pre_filter_start": 510,
29
- "pre_filter_stop": 512
30
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json DELETED
@@ -1,30 +0,0 @@
1
- {
2
- "bins": 768,
3
- "unstable_bins": 7,
4
- "reduction_bins": 705,
5
- "band": {
6
- "1": {
7
- "sr": 6000,
8
- "hl": 66,
9
- "n_fft": 512,
10
- "crop_start": 0,
11
- "crop_stop": 240,
12
- "lpf_start": 60,
13
- "lpf_stop": 240,
14
- "res_type": "sinc_fastest"
15
- },
16
- "2": {
17
- "sr": 48000,
18
- "hl": 528,
19
- "n_fft": 1536,
20
- "crop_start": 22,
21
- "crop_stop": 505,
22
- "hpf_start": 82,
23
- "hpf_stop": 22,
24
- "res_type": "sinc_medium"
25
- }
26
- },
27
- "sr": 48000,
28
- "pre_filter_start": 710,
29
- "pre_filter_stop": 731
30
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json DELETED
@@ -1,42 +0,0 @@
1
- {
2
- "bins": 768,
3
- "unstable_bins": 5,
4
- "reduction_bins": 733,
5
- "band": {
6
- "1": {
7
- "sr": 11025,
8
- "hl": 128,
9
- "n_fft": 768,
10
- "crop_start": 0,
11
- "crop_stop": 278,
12
- "lpf_start": 28,
13
- "lpf_stop": 140,
14
- "res_type": "polyphase"
15
- },
16
- "2": {
17
- "sr": 22050,
18
- "hl": 256,
19
- "n_fft": 768,
20
- "crop_start": 14,
21
- "crop_stop": 322,
22
- "hpf_start": 70,
23
- "hpf_stop": 14,
24
- "lpf_start": 283,
25
- "lpf_stop": 314,
26
- "res_type": "polyphase"
27
- },
28
- "3": {
29
- "sr": 44100,
30
- "hl": 512,
31
- "n_fft": 768,
32
- "crop_start": 131,
33
- "crop_stop": 313,
34
- "hpf_start": 154,
35
- "hpf_stop": 141,
36
- "res_type": "sinc_medium"
37
- }
38
- },
39
- "sr": 44100,
40
- "pre_filter_start": 757,
41
- "pre_filter_stop": 768
42
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json DELETED
@@ -1,43 +0,0 @@
1
- {
2
- "mid_side": true,
3
- "bins": 768,
4
- "unstable_bins": 5,
5
- "reduction_bins": 733,
6
- "band": {
7
- "1": {
8
- "sr": 11025,
9
- "hl": 128,
10
- "n_fft": 768,
11
- "crop_start": 0,
12
- "crop_stop": 278,
13
- "lpf_start": 28,
14
- "lpf_stop": 140,
15
- "res_type": "polyphase"
16
- },
17
- "2": {
18
- "sr": 22050,
19
- "hl": 256,
20
- "n_fft": 768,
21
- "crop_start": 14,
22
- "crop_stop": 322,
23
- "hpf_start": 70,
24
- "hpf_stop": 14,
25
- "lpf_start": 283,
26
- "lpf_stop": 314,
27
- "res_type": "polyphase"
28
- },
29
- "3": {
30
- "sr": 44100,
31
- "hl": 512,
32
- "n_fft": 768,
33
- "crop_start": 131,
34
- "crop_stop": 313,
35
- "hpf_start": 154,
36
- "hpf_stop": 141,
37
- "res_type": "sinc_medium"
38
- }
39
- },
40
- "sr": 44100,
41
- "pre_filter_start": 757,
42
- "pre_filter_stop": 768
43
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json DELETED
@@ -1,43 +0,0 @@
1
- {
2
- "mid_side_b2": true,
3
- "bins": 640,
4
- "unstable_bins": 7,
5
- "reduction_bins": 565,
6
- "band": {
7
- "1": {
8
- "sr": 11025,
9
- "hl": 108,
10
- "n_fft": 1024,
11
- "crop_start": 0,
12
- "crop_stop": 187,
13
- "lpf_start": 92,
14
- "lpf_stop": 186,
15
- "res_type": "polyphase"
16
- },
17
- "2": {
18
- "sr": 22050,
19
- "hl": 216,
20
- "n_fft": 768,
21
- "crop_start": 0,
22
- "crop_stop": 212,
23
- "hpf_start": 68,
24
- "hpf_stop": 34,
25
- "lpf_start": 174,
26
- "lpf_stop": 209,
27
- "res_type": "polyphase"
28
- },
29
- "3": {
30
- "sr": 44100,
31
- "hl": 432,
32
- "n_fft": 640,
33
- "crop_start": 66,
34
- "crop_stop": 307,
35
- "hpf_start": 86,
36
- "hpf_stop": 72,
37
- "res_type": "kaiser_fast"
38
- }
39
- },
40
- "sr": 44100,
41
- "pre_filter_start": 639,
42
- "pre_filter_stop": 640
43
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json DELETED
@@ -1,54 +0,0 @@
1
- {
2
- "bins": 768,
3
- "unstable_bins": 7,
4
- "reduction_bins": 668,
5
- "band": {
6
- "1": {
7
- "sr": 11025,
8
- "hl": 128,
9
- "n_fft": 1024,
10
- "crop_start": 0,
11
- "crop_stop": 186,
12
- "lpf_start": 37,
13
- "lpf_stop": 73,
14
- "res_type": "polyphase"
15
- },
16
- "2": {
17
- "sr": 11025,
18
- "hl": 128,
19
- "n_fft": 512,
20
- "crop_start": 4,
21
- "crop_stop": 185,
22
- "hpf_start": 36,
23
- "hpf_stop": 18,
24
- "lpf_start": 93,
25
- "lpf_stop": 185,
26
- "res_type": "polyphase"
27
- },
28
- "3": {
29
- "sr": 22050,
30
- "hl": 256,
31
- "n_fft": 512,
32
- "crop_start": 46,
33
- "crop_stop": 186,
34
- "hpf_start": 93,
35
- "hpf_stop": 46,
36
- "lpf_start": 164,
37
- "lpf_stop": 186,
38
- "res_type": "polyphase"
39
- },
40
- "4": {
41
- "sr": 44100,
42
- "hl": 512,
43
- "n_fft": 768,
44
- "crop_start": 121,
45
- "crop_stop": 382,
46
- "hpf_start": 138,
47
- "hpf_stop": 123,
48
- "res_type": "sinc_medium"
49
- }
50
- },
51
- "sr": 44100,
52
- "pre_filter_start": 740,
53
- "pre_filter_stop": 768
54
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json DELETED
@@ -1,55 +0,0 @@
1
- {
2
- "bins": 768,
3
- "unstable_bins": 7,
4
- "mid_side": true,
5
- "reduction_bins": 668,
6
- "band": {
7
- "1": {
8
- "sr": 11025,
9
- "hl": 128,
10
- "n_fft": 1024,
11
- "crop_start": 0,
12
- "crop_stop": 186,
13
- "lpf_start": 37,
14
- "lpf_stop": 73,
15
- "res_type": "polyphase"
16
- },
17
- "2": {
18
- "sr": 11025,
19
- "hl": 128,
20
- "n_fft": 512,
21
- "crop_start": 4,
22
- "crop_stop": 185,
23
- "hpf_start": 36,
24
- "hpf_stop": 18,
25
- "lpf_start": 93,
26
- "lpf_stop": 185,
27
- "res_type": "polyphase"
28
- },
29
- "3": {
30
- "sr": 22050,
31
- "hl": 256,
32
- "n_fft": 512,
33
- "crop_start": 46,
34
- "crop_stop": 186,
35
- "hpf_start": 93,
36
- "hpf_stop": 46,
37
- "lpf_start": 164,
38
- "lpf_stop": 186,
39
- "res_type": "polyphase"
40
- },
41
- "4": {
42
- "sr": 44100,
43
- "hl": 512,
44
- "n_fft": 768,
45
- "crop_start": 121,
46
- "crop_stop": 382,
47
- "hpf_start": 138,
48
- "hpf_stop": 123,
49
- "res_type": "sinc_medium"
50
- }
51
- },
52
- "sr": 44100,
53
- "pre_filter_start": 740,
54
- "pre_filter_stop": 768
55
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json DELETED
@@ -1,55 +0,0 @@
1
- {
2
- "mid_side_b": true,
3
- "bins": 768,
4
- "unstable_bins": 7,
5
- "reduction_bins": 668,
6
- "band": {
7
- "1": {
8
- "sr": 11025,
9
- "hl": 128,
10
- "n_fft": 1024,
11
- "crop_start": 0,
12
- "crop_stop": 186,
13
- "lpf_start": 37,
14
- "lpf_stop": 73,
15
- "res_type": "polyphase"
16
- },
17
- "2": {
18
- "sr": 11025,
19
- "hl": 128,
20
- "n_fft": 512,
21
- "crop_start": 4,
22
- "crop_stop": 185,
23
- "hpf_start": 36,
24
- "hpf_stop": 18,
25
- "lpf_start": 93,
26
- "lpf_stop": 185,
27
- "res_type": "polyphase"
28
- },
29
- "3": {
30
- "sr": 22050,
31
- "hl": 256,
32
- "n_fft": 512,
33
- "crop_start": 46,
34
- "crop_stop": 186,
35
- "hpf_start": 93,
36
- "hpf_stop": 46,
37
- "lpf_start": 164,
38
- "lpf_stop": 186,
39
- "res_type": "polyphase"
40
- },
41
- "4": {
42
- "sr": 44100,
43
- "hl": 512,
44
- "n_fft": 768,
45
- "crop_start": 121,
46
- "crop_stop": 382,
47
- "hpf_start": 138,
48
- "hpf_stop": 123,
49
- "res_type": "sinc_medium"
50
- }
51
- },
52
- "sr": 44100,
53
- "pre_filter_start": 740,
54
- "pre_filter_stop": 768
55
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json DELETED
@@ -1,55 +0,0 @@
1
- {
2
- "mid_side_b": true,
3
- "bins": 768,
4
- "unstable_bins": 7,
5
- "reduction_bins": 668,
6
- "band": {
7
- "1": {
8
- "sr": 11025,
9
- "hl": 128,
10
- "n_fft": 1024,
11
- "crop_start": 0,
12
- "crop_stop": 186,
13
- "lpf_start": 37,
14
- "lpf_stop": 73,
15
- "res_type": "polyphase"
16
- },
17
- "2": {
18
- "sr": 11025,
19
- "hl": 128,
20
- "n_fft": 512,
21
- "crop_start": 4,
22
- "crop_stop": 185,
23
- "hpf_start": 36,
24
- "hpf_stop": 18,
25
- "lpf_start": 93,
26
- "lpf_stop": 185,
27
- "res_type": "polyphase"
28
- },
29
- "3": {
30
- "sr": 22050,
31
- "hl": 256,
32
- "n_fft": 512,
33
- "crop_start": 46,
34
- "crop_stop": 186,
35
- "hpf_start": 93,
36
- "hpf_stop": 46,
37
- "lpf_start": 164,
38
- "lpf_stop": 186,
39
- "res_type": "polyphase"
40
- },
41
- "4": {
42
- "sr": 44100,
43
- "hl": 512,
44
- "n_fft": 768,
45
- "crop_start": 121,
46
- "crop_stop": 382,
47
- "hpf_start": 138,
48
- "hpf_stop": 123,
49
- "res_type": "sinc_medium"
50
- }
51
- },
52
- "sr": 44100,
53
- "pre_filter_start": 740,
54
- "pre_filter_stop": 768
55
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json DELETED
@@ -1,55 +0,0 @@
1
- {
2
- "reverse": true,
3
- "bins": 768,
4
- "unstable_bins": 7,
5
- "reduction_bins": 668,
6
- "band": {
7
- "1": {
8
- "sr": 11025,
9
- "hl": 128,
10
- "n_fft": 1024,
11
- "crop_start": 0,
12
- "crop_stop": 186,
13
- "lpf_start": 37,
14
- "lpf_stop": 73,
15
- "res_type": "polyphase"
16
- },
17
- "2": {
18
- "sr": 11025,
19
- "hl": 128,
20
- "n_fft": 512,
21
- "crop_start": 4,
22
- "crop_stop": 185,
23
- "hpf_start": 36,
24
- "hpf_stop": 18,
25
- "lpf_start": 93,
26
- "lpf_stop": 185,
27
- "res_type": "polyphase"
28
- },
29
- "3": {
30
- "sr": 22050,
31
- "hl": 256,
32
- "n_fft": 512,
33
- "crop_start": 46,
34
- "crop_stop": 186,
35
- "hpf_start": 93,
36
- "hpf_stop": 46,
37
- "lpf_start": 164,
38
- "lpf_stop": 186,
39
- "res_type": "polyphase"
40
- },
41
- "4": {
42
- "sr": 44100,
43
- "hl": 512,
44
- "n_fft": 768,
45
- "crop_start": 121,
46
- "crop_stop": 382,
47
- "hpf_start": 138,
48
- "hpf_stop": 123,
49
- "res_type": "sinc_medium"
50
- }
51
- },
52
- "sr": 44100,
53
- "pre_filter_start": 740,
54
- "pre_filter_stop": 768
55
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json DELETED
@@ -1,55 +0,0 @@
1
- {
2
- "stereo_w": true,
3
- "bins": 768,
4
- "unstable_bins": 7,
5
- "reduction_bins": 668,
6
- "band": {
7
- "1": {
8
- "sr": 11025,
9
- "hl": 128,
10
- "n_fft": 1024,
11
- "crop_start": 0,
12
- "crop_stop": 186,
13
- "lpf_start": 37,
14
- "lpf_stop": 73,
15
- "res_type": "polyphase"
16
- },
17
- "2": {
18
- "sr": 11025,
19
- "hl": 128,
20
- "n_fft": 512,
21
- "crop_start": 4,
22
- "crop_stop": 185,
23
- "hpf_start": 36,
24
- "hpf_stop": 18,
25
- "lpf_start": 93,
26
- "lpf_stop": 185,
27
- "res_type": "polyphase"
28
- },
29
- "3": {
30
- "sr": 22050,
31
- "hl": 256,
32
- "n_fft": 512,
33
- "crop_start": 46,
34
- "crop_stop": 186,
35
- "hpf_start": 93,
36
- "hpf_stop": 46,
37
- "lpf_start": 164,
38
- "lpf_stop": 186,
39
- "res_type": "polyphase"
40
- },
41
- "4": {
42
- "sr": 44100,
43
- "hl": 512,
44
- "n_fft": 768,
45
- "crop_start": 121,
46
- "crop_stop": 382,
47
- "hpf_start": 138,
48
- "hpf_stop": 123,
49
- "res_type": "sinc_medium"
50
- }
51
- },
52
- "sr": 44100,
53
- "pre_filter_start": 740,
54
- "pre_filter_stop": 768
55
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json DELETED
@@ -1,54 +0,0 @@
1
- {
2
- "bins": 672,
3
- "unstable_bins": 8,
4
- "reduction_bins": 637,
5
- "band": {
6
- "1": {
7
- "sr": 7350,
8
- "hl": 80,
9
- "n_fft": 640,
10
- "crop_start": 0,
11
- "crop_stop": 85,
12
- "lpf_start": 25,
13
- "lpf_stop": 53,
14
- "res_type": "polyphase"
15
- },
16
- "2": {
17
- "sr": 7350,
18
- "hl": 80,
19
- "n_fft": 320,
20
- "crop_start": 4,
21
- "crop_stop": 87,
22
- "hpf_start": 25,
23
- "hpf_stop": 12,
24
- "lpf_start": 31,
25
- "lpf_stop": 62,
26
- "res_type": "polyphase"
27
- },
28
- "3": {
29
- "sr": 14700,
30
- "hl": 160,
31
- "n_fft": 512,
32
- "crop_start": 17,
33
- "crop_stop": 216,
34
- "hpf_start": 48,
35
- "hpf_stop": 24,
36
- "lpf_start": 139,
37
- "lpf_stop": 210,
38
- "res_type": "polyphase"
39
- },
40
- "4": {
41
- "sr": 44100,
42
- "hl": 480,
43
- "n_fft": 960,
44
- "crop_start": 78,
45
- "crop_stop": 383,
46
- "hpf_start": 130,
47
- "hpf_stop": 86,
48
- "res_type": "kaiser_fast"
49
- }
50
- },
51
- "sr": 44100,
52
- "pre_filter_start": 668,
53
- "pre_filter_stop": 672
54
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json DELETED
@@ -1,55 +0,0 @@
1
- {
2
- "bins": 672,
3
- "unstable_bins": 8,
4
- "reduction_bins": 637,
5
- "band": {
6
- "1": {
7
- "sr": 7350,
8
- "hl": 80,
9
- "n_fft": 640,
10
- "crop_start": 0,
11
- "crop_stop": 85,
12
- "lpf_start": 25,
13
- "lpf_stop": 53,
14
- "res_type": "polyphase"
15
- },
16
- "2": {
17
- "sr": 7350,
18
- "hl": 80,
19
- "n_fft": 320,
20
- "crop_start": 4,
21
- "crop_stop": 87,
22
- "hpf_start": 25,
23
- "hpf_stop": 12,
24
- "lpf_start": 31,
25
- "lpf_stop": 62,
26
- "res_type": "polyphase"
27
- },
28
- "3": {
29
- "sr": 14700,
30
- "hl": 160,
31
- "n_fft": 512,
32
- "crop_start": 17,
33
- "crop_stop": 216,
34
- "hpf_start": 48,
35
- "hpf_stop": 24,
36
- "lpf_start": 139,
37
- "lpf_stop": 210,
38
- "res_type": "polyphase"
39
- },
40
- "4": {
41
- "sr": 44100,
42
- "hl": 480,
43
- "n_fft": 960,
44
- "crop_start": 78,
45
- "crop_stop": 383,
46
- "hpf_start": 130,
47
- "hpf_stop": 86,
48
- "convert_channels": "stereo_n",
49
- "res_type": "kaiser_fast"
50
- }
51
- },
52
- "sr": 44100,
53
- "pre_filter_start": 668,
54
- "pre_filter_stop": 672
55
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/modelparams/4band_v3.json DELETED
@@ -1,54 +0,0 @@
1
- {
2
- "bins": 672,
3
- "unstable_bins": 8,
4
- "reduction_bins": 530,
5
- "band": {
6
- "1": {
7
- "sr": 7350,
8
- "hl": 80,
9
- "n_fft": 640,
10
- "crop_start": 0,
11
- "crop_stop": 85,
12
- "lpf_start": 25,
13
- "lpf_stop": 53,
14
- "res_type": "polyphase"
15
- },
16
- "2": {
17
- "sr": 7350,
18
- "hl": 80,
19
- "n_fft": 320,
20
- "crop_start": 4,
21
- "crop_stop": 87,
22
- "hpf_start": 25,
23
- "hpf_stop": 12,
24
- "lpf_start": 31,
25
- "lpf_stop": 62,
26
- "res_type": "polyphase"
27
- },
28
- "3": {
29
- "sr": 14700,
30
- "hl": 160,
31
- "n_fft": 512,
32
- "crop_start": 17,
33
- "crop_stop": 216,
34
- "hpf_start": 48,
35
- "hpf_stop": 24,
36
- "lpf_start": 139,
37
- "lpf_stop": 210,
38
- "res_type": "polyphase"
39
- },
40
- "4": {
41
- "sr": 44100,
42
- "hl": 480,
43
- "n_fft": 960,
44
- "crop_start": 78,
45
- "crop_stop": 383,
46
- "hpf_start": 130,
47
- "hpf_stop": 86,
48
- "res_type": "kaiser_fast"
49
- }
50
- },
51
- "sr": 44100,
52
- "pre_filter_start": 668,
53
- "pre_filter_stop": 672
54
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/modelparams/ensemble.json DELETED
@@ -1,43 +0,0 @@
1
- {
2
- "mid_side_b2": true,
3
- "bins": 1280,
4
- "unstable_bins": 7,
5
- "reduction_bins": 565,
6
- "band": {
7
- "1": {
8
- "sr": 11025,
9
- "hl": 108,
10
- "n_fft": 2048,
11
- "crop_start": 0,
12
- "crop_stop": 374,
13
- "lpf_start": 92,
14
- "lpf_stop": 186,
15
- "res_type": "polyphase"
16
- },
17
- "2": {
18
- "sr": 22050,
19
- "hl": 216,
20
- "n_fft": 1536,
21
- "crop_start": 0,
22
- "crop_stop": 424,
23
- "hpf_start": 68,
24
- "hpf_stop": 34,
25
- "lpf_start": 348,
26
- "lpf_stop": 418,
27
- "res_type": "polyphase"
28
- },
29
- "3": {
30
- "sr": 44100,
31
- "hl": 432,
32
- "n_fft": 1280,
33
- "crop_start": 132,
34
- "crop_stop": 614,
35
- "hpf_start": 172,
36
- "hpf_stop": 144,
37
- "res_type": "polyphase"
38
- }
39
- },
40
- "sr": 44100,
41
- "pre_filter_start": 1280,
42
- "pre_filter_stop": 1280
43
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/nets.py DELETED
@@ -1,123 +0,0 @@
1
- import layers
2
- import torch
3
- import torch.nn.functional as F
4
- from torch import nn
5
-
6
- from . import spec_utils
7
-
8
-
9
- class BaseASPPNet(nn.Module):
10
- def __init__(self, nin, ch, dilations=(4, 8, 16)):
11
- super(BaseASPPNet, self).__init__()
12
- self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
13
- self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
14
- self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
15
- self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)
16
-
17
- self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)
18
-
19
- self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
20
- self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
21
- self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
22
- self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)
23
-
24
- def __call__(self, x):
25
- h, e1 = self.enc1(x)
26
- h, e2 = self.enc2(h)
27
- h, e3 = self.enc3(h)
28
- h, e4 = self.enc4(h)
29
-
30
- h = self.aspp(h)
31
-
32
- h = self.dec4(h, e4)
33
- h = self.dec3(h, e3)
34
- h = self.dec2(h, e2)
35
- h = self.dec1(h, e1)
36
-
37
- return h
38
-
39
-
40
- class CascadedASPPNet(nn.Module):
41
- def __init__(self, n_fft):
42
- super(CascadedASPPNet, self).__init__()
43
- self.stg1_low_band_net = BaseASPPNet(2, 16)
44
- self.stg1_high_band_net = BaseASPPNet(2, 16)
45
-
46
- self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0)
47
- self.stg2_full_band_net = BaseASPPNet(8, 16)
48
-
49
- self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
50
- self.stg3_full_band_net = BaseASPPNet(16, 32)
51
-
52
- self.out = nn.Conv2d(32, 2, 1, bias=False)
53
- self.aux1_out = nn.Conv2d(16, 2, 1, bias=False)
54
- self.aux2_out = nn.Conv2d(16, 2, 1, bias=False)
55
-
56
- self.max_bin = n_fft // 2
57
- self.output_bin = n_fft // 2 + 1
58
-
59
- self.offset = 128
60
-
61
- def forward(self, x, aggressiveness=None):
62
- mix = x.detach()
63
- x = x.clone()
64
-
65
- x = x[:, :, : self.max_bin]
66
-
67
- bandw = x.size()[2] // 2
68
- aux1 = torch.cat(
69
- [
70
- self.stg1_low_band_net(x[:, :, :bandw]),
71
- self.stg1_high_band_net(x[:, :, bandw:]),
72
- ],
73
- dim=2,
74
- )
75
-
76
- h = torch.cat([x, aux1], dim=1)
77
- aux2 = self.stg2_full_band_net(self.stg2_bridge(h))
78
-
79
- h = torch.cat([x, aux1, aux2], dim=1)
80
- h = self.stg3_full_band_net(self.stg3_bridge(h))
81
-
82
- mask = torch.sigmoid(self.out(h))
83
- mask = F.pad(
84
- input=mask,
85
- pad=(0, 0, 0, self.output_bin - mask.size()[2]),
86
- mode="replicate",
87
- )
88
-
89
- if self.training:
90
- aux1 = torch.sigmoid(self.aux1_out(aux1))
91
- aux1 = F.pad(
92
- input=aux1,
93
- pad=(0, 0, 0, self.output_bin - aux1.size()[2]),
94
- mode="replicate",
95
- )
96
- aux2 = torch.sigmoid(self.aux2_out(aux2))
97
- aux2 = F.pad(
98
- input=aux2,
99
- pad=(0, 0, 0, self.output_bin - aux2.size()[2]),
100
- mode="replicate",
101
- )
102
- return mask * mix, aux1 * mix, aux2 * mix
103
- else:
104
- if aggressiveness:
105
- mask[:, :, : aggressiveness["split_bin"]] = torch.pow(
106
- mask[:, :, : aggressiveness["split_bin"]],
107
- 1 + aggressiveness["value"] / 3,
108
- )
109
- mask[:, :, aggressiveness["split_bin"] :] = torch.pow(
110
- mask[:, :, aggressiveness["split_bin"] :],
111
- 1 + aggressiveness["value"],
112
- )
113
-
114
- return mask * mix
115
-
116
- def predict(self, x_mag, aggressiveness=None):
117
- h = self.forward(x_mag, aggressiveness)
118
-
119
- if self.offset > 0:
120
- h = h[:, :, :, self.offset : -self.offset]
121
- assert h.size()[3] > 0
122
-
123
- return h
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/nets_123812KB.py DELETED
@@ -1,122 +0,0 @@
1
- import torch
2
- import torch.nn.functional as F
3
- from torch import nn
4
-
5
- from . import layers_123821KB as layers
6
-
7
-
8
class BaseASPPNet(nn.Module):
    """Encoder/decoder network with an ASPP bottleneck and skip connections.

    Four ``layers.Encoder`` stages are built with channel arguments growing
    from ``ch`` to ``ch * 8``; each returns an ``(output, skip)`` pair. The
    deepest output passes through ``layers.ASPPModule`` and four
    ``layers.Decoder`` stages then merge it with the skips in reverse order.

    Args:
        nin: Number of input channels.
        ch: Base channel width; deeper stages use multiples of it.
        dilations: Dilation rates handed to ``layers.ASPPModule``.
    """

    def __init__(self, nin, ch, dilations=(4, 8, 16)):
        super().__init__()
        # The trailing ``3, 2, 1`` / ``3, 1, 1`` arguments are presumably
        # kernel size, stride and padding -- confirm against the layers module.
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)

        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)

        # Decoder inputs are sized for the skip tensor plus the previous output.
        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)

    def forward(self, x):
        """Return the decoded feature map for ``x``.

        Defined as ``forward`` rather than as an override of ``__call__`` so
        that ``net(x)`` goes through ``nn.Module.__call__`` and registered
        hooks still fire.
        """
        h, e1 = self.enc1(x)
        h, e2 = self.enc2(h)
        h, e3 = self.enc3(h)
        h, e4 = self.enc4(h)

        h = self.aspp(h)

        h = self.dec4(h, e4)
        h = self.dec3(h, e3)
        h = self.dec2(h, e2)
        h = self.dec1(h, e1)

        return h
37
-
38
-
39
class CascadedASPPNet(nn.Module):
    """Three-stage cascade of ``BaseASPPNet`` models that masks its input.

    Stage 1 runs separate networks on the lower and upper halves of axis 2;
    stages 2 and 3 each run one network over the full cropped range, fed
    with the input concatenated with the earlier stage outputs.

    Args:
        n_fft: Sets ``max_bin = n_fft // 2`` (how much of axis 2 the networks
            see) and ``output_bin = n_fft // 2 + 1`` (the size masks are
            padded back to).
    """

    def __init__(self, n_fft):
        super().__init__()

        half_fft = n_fft // 2
        self.max_bin = half_fft
        self.output_bin = half_fft + 1
        # Columns removed from each end of axis 3 by ``predict``.
        self.offset = 128

        # Submodules are created in a fixed order so parameter ordering and
        # random initialisation stay reproducible.
        self.stg1_low_band_net = BaseASPPNet(2, 32)
        self.stg1_high_band_net = BaseASPPNet(2, 32)

        self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
        self.stg2_full_band_net = BaseASPPNet(16, 32)

        self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
        self.stg3_full_band_net = BaseASPPNet(32, 64)

        self.out = nn.Conv2d(64, 2, 1, bias=False)
        self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
        self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)

    def forward(self, x, aggressiveness=None):
        """Run the cascade and apply the resulting mask to ``x``.

        Args:
            x: Input tensor; axis 2 is cropped to ``self.max_bin`` before the
                networks see it (presumably batch, channel, bins, frames --
                confirm against the caller).
            aggressiveness: Optional mapping with ``"split_bin"`` and
                ``"value"``. Only consulted outside training mode; when
                truthy the mask is raised to ``1 + value / 3`` below
                ``split_bin`` and to ``1 + value`` from it upwards.

        Returns:
            In training mode ``(mask * mix, aux1 * mix, aux2 * mix)``;
            otherwise ``mask * mix``, where ``mix`` is the detached input.
        """

        def to_mask(logits):
            # Squash to (0, 1), then grow axis 2 back to ``output_bin`` by
            # repeating the last row.
            probs = torch.sigmoid(logits)
            missing = self.output_bin - probs.size()[2]
            return F.pad(input=probs, pad=(0, 0, 0, missing), mode="replicate")

        mix = x.detach()
        spec = x.clone()[:, :, : self.max_bin]

        half = spec.size()[2] // 2
        low = self.stg1_low_band_net(spec[:, :, :half])
        high = self.stg1_high_band_net(spec[:, :, half:])
        aux1 = torch.cat([low, high], dim=2)

        stage2_in = torch.cat([spec, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(stage2_in))

        stage3_in = torch.cat([spec, aux1, aux2], dim=1)
        feat = self.stg3_full_band_net(self.stg3_bridge(stage3_in))

        mask = to_mask(self.out(feat))

        if self.training:
            aux1_mask = to_mask(self.aux1_out(aux1))
            aux2_mask = to_mask(self.aux2_out(aux2))
            return mask * mix, aux1_mask * mix, aux2_mask * mix

        if aggressiveness:
            split = aggressiveness["split_bin"]
            strength = aggressiveness["value"]
            mask[:, :, :split] = torch.pow(mask[:, :, :split], 1 + strength / 3)
            mask[:, :, split:] = torch.pow(mask[:, :, split:], 1 + strength)

        return mask * mix

    def predict(self, x_mag, aggressiveness=None):
        """Run ``forward`` and strip ``self.offset`` columns from both ends of axis 3."""
        out = self.forward(x_mag, aggressiveness)

        margin = self.offset
        if margin <= 0:
            return out

        trimmed = out[:, :, :, margin:-margin]
        # The input must be wider than both margins together.
        assert trimmed.size()[3] > 0
        return trimmed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/nets_123821KB.py DELETED
@@ -1,122 +0,0 @@
1
- import torch
2
- import torch.nn.functional as F
3
- from torch import nn
4
-
5
- from . import layers_123821KB as layers
6
-
7
-
8
class BaseASPPNet(nn.Module):
    """Encoder/decoder network with an ASPP bottleneck and skip connections.

    Four ``layers.Encoder`` stages are built with channel arguments growing
    from ``ch`` to ``ch * 8``; each returns an ``(output, skip)`` pair. The
    deepest output passes through ``layers.ASPPModule`` and four
    ``layers.Decoder`` stages then merge it with the skips in reverse order.

    Args:
        nin: Number of input channels.
        ch: Base channel width; deeper stages use multiples of it.
        dilations: Dilation rates handed to ``layers.ASPPModule``.
    """

    def __init__(self, nin, ch, dilations=(4, 8, 16)):
        super().__init__()
        # The trailing ``3, 2, 1`` / ``3, 1, 1`` arguments are presumably
        # kernel size, stride and padding -- confirm against the layers module.
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)

        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)

        # Decoder inputs are sized for the skip tensor plus the previous output.
        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)

    def forward(self, x):
        """Return the decoded feature map for ``x``.

        Defined as ``forward`` rather than as an override of ``__call__`` so
        that ``net(x)`` goes through ``nn.Module.__call__`` and registered
        hooks still fire.
        """
        h, e1 = self.enc1(x)
        h, e2 = self.enc2(h)
        h, e3 = self.enc3(h)
        h, e4 = self.enc4(h)

        h = self.aspp(h)

        h = self.dec4(h, e4)
        h = self.dec3(h, e3)
        h = self.dec2(h, e2)
        h = self.dec1(h, e1)

        return h
37
-
38
-
39
class CascadedASPPNet(nn.Module):
    """Three-stage cascade of ``BaseASPPNet`` models that masks its input.

    Stage 1 runs separate networks on the lower and upper halves of axis 2;
    stages 2 and 3 each run one network over the full cropped range, fed
    with the input concatenated with the earlier stage outputs.

    Args:
        n_fft: Sets ``max_bin = n_fft // 2`` (how much of axis 2 the networks
            see) and ``output_bin = n_fft // 2 + 1`` (the size masks are
            padded back to).
    """

    def __init__(self, n_fft):
        super().__init__()

        half_fft = n_fft // 2
        self.max_bin = half_fft
        self.output_bin = half_fft + 1
        # Columns removed from each end of axis 3 by ``predict``.
        self.offset = 128

        # Submodules are created in a fixed order so parameter ordering and
        # random initialisation stay reproducible.
        self.stg1_low_band_net = BaseASPPNet(2, 32)
        self.stg1_high_band_net = BaseASPPNet(2, 32)

        self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
        self.stg2_full_band_net = BaseASPPNet(16, 32)

        self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
        self.stg3_full_band_net = BaseASPPNet(32, 64)

        self.out = nn.Conv2d(64, 2, 1, bias=False)
        self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
        self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)

    def forward(self, x, aggressiveness=None):
        """Run the cascade and apply the resulting mask to ``x``.

        Args:
            x: Input tensor; axis 2 is cropped to ``self.max_bin`` before the
                networks see it (presumably batch, channel, bins, frames --
                confirm against the caller).
            aggressiveness: Optional mapping with ``"split_bin"`` and
                ``"value"``. Only consulted outside training mode; when
                truthy the mask is raised to ``1 + value / 3`` below
                ``split_bin`` and to ``1 + value`` from it upwards.

        Returns:
            In training mode ``(mask * mix, aux1 * mix, aux2 * mix)``;
            otherwise ``mask * mix``, where ``mix`` is the detached input.
        """

        def to_mask(logits):
            # Squash to (0, 1), then grow axis 2 back to ``output_bin`` by
            # repeating the last row.
            probs = torch.sigmoid(logits)
            missing = self.output_bin - probs.size()[2]
            return F.pad(input=probs, pad=(0, 0, 0, missing), mode="replicate")

        mix = x.detach()
        spec = x.clone()[:, :, : self.max_bin]

        half = spec.size()[2] // 2
        low = self.stg1_low_band_net(spec[:, :, :half])
        high = self.stg1_high_band_net(spec[:, :, half:])
        aux1 = torch.cat([low, high], dim=2)

        stage2_in = torch.cat([spec, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(stage2_in))

        stage3_in = torch.cat([spec, aux1, aux2], dim=1)
        feat = self.stg3_full_band_net(self.stg3_bridge(stage3_in))

        mask = to_mask(self.out(feat))

        if self.training:
            aux1_mask = to_mask(self.aux1_out(aux1))
            aux2_mask = to_mask(self.aux2_out(aux2))
            return mask * mix, aux1_mask * mix, aux2_mask * mix

        if aggressiveness:
            split = aggressiveness["split_bin"]
            strength = aggressiveness["value"]
            mask[:, :, :split] = torch.pow(mask[:, :, :split], 1 + strength / 3)
            mask[:, :, split:] = torch.pow(mask[:, :, split:], 1 + strength)

        return mask * mix

    def predict(self, x_mag, aggressiveness=None):
        """Run ``forward`` and strip ``self.offset`` columns from both ends of axis 3."""
        out = self.forward(x_mag, aggressiveness)

        margin = self.offset
        if margin <= 0:
            return out

        trimmed = out[:, :, :, margin:-margin]
        # The input must be wider than both margins together.
        assert trimmed.size()[3] > 0
        return trimmed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/nets_33966KB.py DELETED
@@ -1,122 +0,0 @@
1
- import torch
2
- import torch.nn.functional as F
3
- from torch import nn
4
-
5
- from . import layers_33966KB as layers
6
-
7
-
8
class BaseASPPNet(nn.Module):
    """Encoder/decoder network with an ASPP bottleneck and skip connections.

    Four ``layers.Encoder`` stages are built with channel arguments growing
    from ``ch`` to ``ch * 8``; each returns an ``(output, skip)`` pair. The
    deepest output passes through ``layers.ASPPModule`` and four
    ``layers.Decoder`` stages then merge it with the skips in reverse order.

    Args:
        nin: Number of input channels.
        ch: Base channel width; deeper stages use multiples of it.
        dilations: Dilation rates handed to ``layers.ASPPModule`` (this
            variant defaults to four rates).
    """

    def __init__(self, nin, ch, dilations=(4, 8, 16, 32)):
        super().__init__()
        # The trailing ``3, 2, 1`` / ``3, 1, 1`` arguments are presumably
        # kernel size, stride and padding -- confirm against the layers module.
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)

        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)

        # Decoder inputs are sized for the skip tensor plus the previous output.
        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)

    def forward(self, x):
        """Return the decoded feature map for ``x``.

        Defined as ``forward`` rather than as an override of ``__call__`` so
        that ``net(x)`` goes through ``nn.Module.__call__`` and registered
        hooks still fire.
        """
        h, e1 = self.enc1(x)
        h, e2 = self.enc2(h)
        h, e3 = self.enc3(h)
        h, e4 = self.enc4(h)

        h = self.aspp(h)

        h = self.dec4(h, e4)
        h = self.dec3(h, e3)
        h = self.dec2(h, e2)
        h = self.dec1(h, e1)

        return h
37
-
38
-
39
class CascadedASPPNet(nn.Module):
    """Three-stage cascade of ``BaseASPPNet`` models that masks its input.

    Stage 1 runs separate networks on the lower and upper halves of axis 2;
    stages 2 and 3 each run one network over the full cropped range, fed
    with the input concatenated with the earlier stage outputs.

    Args:
        n_fft: Sets ``max_bin = n_fft // 2`` (how much of axis 2 the networks
            see) and ``output_bin = n_fft // 2 + 1`` (the size masks are
            padded back to).
    """

    def __init__(self, n_fft):
        super().__init__()

        half_fft = n_fft // 2
        self.max_bin = half_fft
        self.output_bin = half_fft + 1
        # Columns removed from each end of axis 3 by ``predict``.
        self.offset = 128

        # Submodules are created in a fixed order so parameter ordering and
        # random initialisation stay reproducible.
        self.stg1_low_band_net = BaseASPPNet(2, 16)
        self.stg1_high_band_net = BaseASPPNet(2, 16)

        self.stg2_bridge = layers.Conv2DBNActiv(18, 8, 1, 1, 0)
        self.stg2_full_band_net = BaseASPPNet(8, 16)

        self.stg3_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
        self.stg3_full_band_net = BaseASPPNet(16, 32)

        self.out = nn.Conv2d(32, 2, 1, bias=False)
        self.aux1_out = nn.Conv2d(16, 2, 1, bias=False)
        self.aux2_out = nn.Conv2d(16, 2, 1, bias=False)

    def forward(self, x, aggressiveness=None):
        """Run the cascade and apply the resulting mask to ``x``.

        Args:
            x: Input tensor; axis 2 is cropped to ``self.max_bin`` before the
                networks see it (presumably batch, channel, bins, frames --
                confirm against the caller).
            aggressiveness: Optional mapping with ``"split_bin"`` and
                ``"value"``. Only consulted outside training mode; when
                truthy the mask is raised to ``1 + value / 3`` below
                ``split_bin`` and to ``1 + value`` from it upwards.

        Returns:
            In training mode ``(mask * mix, aux1 * mix, aux2 * mix)``;
            otherwise ``mask * mix``, where ``mix`` is the detached input.
        """

        def to_mask(logits):
            # Squash to (0, 1), then grow axis 2 back to ``output_bin`` by
            # repeating the last row.
            probs = torch.sigmoid(logits)
            missing = self.output_bin - probs.size()[2]
            return F.pad(input=probs, pad=(0, 0, 0, missing), mode="replicate")

        mix = x.detach()
        spec = x.clone()[:, :, : self.max_bin]

        half = spec.size()[2] // 2
        low = self.stg1_low_band_net(spec[:, :, :half])
        high = self.stg1_high_band_net(spec[:, :, half:])
        aux1 = torch.cat([low, high], dim=2)

        stage2_in = torch.cat([spec, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(stage2_in))

        stage3_in = torch.cat([spec, aux1, aux2], dim=1)
        feat = self.stg3_full_band_net(self.stg3_bridge(stage3_in))

        mask = to_mask(self.out(feat))

        if self.training:
            aux1_mask = to_mask(self.aux1_out(aux1))
            aux2_mask = to_mask(self.aux2_out(aux2))
            return mask * mix, aux1_mask * mix, aux2_mask * mix

        if aggressiveness:
            split = aggressiveness["split_bin"]
            strength = aggressiveness["value"]
            mask[:, :, :split] = torch.pow(mask[:, :, :split], 1 + strength / 3)
            mask[:, :, split:] = torch.pow(mask[:, :, split:], 1 + strength)

        return mask * mix

    def predict(self, x_mag, aggressiveness=None):
        """Run ``forward`` and strip ``self.offset`` columns from both ends of axis 3."""
        out = self.forward(x_mag, aggressiveness)

        margin = self.offset
        if margin <= 0:
            return out

        trimmed = out[:, :, :, margin:-margin]
        # The input must be wider than both margins together.
        assert trimmed.size()[3] > 0
        return trimmed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/nets_537227KB.py DELETED
@@ -1,123 +0,0 @@
1
- import numpy as np
2
- import torch
3
- import torch.nn.functional as F
4
- from torch import nn
5
-
6
- from . import layers_537238KB as layers
7
-
8
-
9
class BaseASPPNet(nn.Module):
    """Encoder/decoder network with an ASPP bottleneck and skip connections.

    Four ``layers.Encoder`` stages are built with channel arguments growing
    from ``ch`` to ``ch * 8``; each returns an ``(output, skip)`` pair. The
    deepest output passes through ``layers.ASPPModule`` and four
    ``layers.Decoder`` stages then merge it with the skips in reverse order.

    Args:
        nin: Number of input channels.
        ch: Base channel width; deeper stages use multiples of it.
        dilations: Dilation rates handed to ``layers.ASPPModule``.
    """

    def __init__(self, nin, ch, dilations=(4, 8, 16)):
        super().__init__()
        # The trailing ``3, 2, 1`` / ``3, 1, 1`` arguments are presumably
        # kernel size, stride and padding -- confirm against the layers module.
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)

        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)

        # Decoder inputs are sized for the skip tensor plus the previous output.
        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)

    def forward(self, x):
        """Return the decoded feature map for ``x``.

        Defined as ``forward`` rather than as an override of ``__call__`` so
        that ``net(x)`` goes through ``nn.Module.__call__`` and registered
        hooks still fire.
        """
        h, e1 = self.enc1(x)
        h, e2 = self.enc2(h)
        h, e3 = self.enc3(h)
        h, e4 = self.enc4(h)

        h = self.aspp(h)

        h = self.dec4(h, e4)
        h = self.dec3(h, e3)
        h = self.dec2(h, e2)
        h = self.dec1(h, e1)

        return h
38
-
39
-
40
class CascadedASPPNet(nn.Module):
    """Three-stage cascade of ``BaseASPPNet`` models that masks its input.

    Stage 1 runs separate networks on the lower and upper halves of axis 2;
    stages 2 and 3 each run one network over the full cropped range, fed
    with the input concatenated with the earlier stage outputs.

    Args:
        n_fft: Sets ``max_bin = n_fft // 2`` (how much of axis 2 the networks
            see) and ``output_bin = n_fft // 2 + 1`` (the size masks are
            padded back to).
    """

    def __init__(self, n_fft):
        super().__init__()

        half_fft = n_fft // 2
        self.max_bin = half_fft
        self.output_bin = half_fft + 1
        # Columns removed from each end of axis 3 by ``predict``.
        self.offset = 128

        # Submodules are created in a fixed order so parameter ordering and
        # random initialisation stay reproducible.
        self.stg1_low_band_net = BaseASPPNet(2, 64)
        self.stg1_high_band_net = BaseASPPNet(2, 64)

        self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
        self.stg2_full_band_net = BaseASPPNet(32, 64)

        self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0)
        self.stg3_full_band_net = BaseASPPNet(64, 128)

        self.out = nn.Conv2d(128, 2, 1, bias=False)
        self.aux1_out = nn.Conv2d(64, 2, 1, bias=False)
        self.aux2_out = nn.Conv2d(64, 2, 1, bias=False)

    def forward(self, x, aggressiveness=None):
        """Run the cascade and apply the resulting mask to ``x``.

        Args:
            x: Input tensor; axis 2 is cropped to ``self.max_bin`` before the
                networks see it (presumably batch, channel, bins, frames --
                confirm against the caller).
            aggressiveness: Optional mapping with ``"split_bin"`` and
                ``"value"``. Only consulted outside training mode; when
                truthy the mask is raised to ``1 + value / 3`` below
                ``split_bin`` and to ``1 + value`` from it upwards.

        Returns:
            In training mode ``(mask * mix, aux1 * mix, aux2 * mix)``;
            otherwise ``mask * mix``, where ``mix`` is the detached input.
        """

        def to_mask(logits):
            # Squash to (0, 1), then grow axis 2 back to ``output_bin`` by
            # repeating the last row.
            probs = torch.sigmoid(logits)
            missing = self.output_bin - probs.size()[2]
            return F.pad(input=probs, pad=(0, 0, 0, missing), mode="replicate")

        mix = x.detach()
        spec = x.clone()[:, :, : self.max_bin]

        half = spec.size()[2] // 2
        low = self.stg1_low_band_net(spec[:, :, :half])
        high = self.stg1_high_band_net(spec[:, :, half:])
        aux1 = torch.cat([low, high], dim=2)

        stage2_in = torch.cat([spec, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(stage2_in))

        stage3_in = torch.cat([spec, aux1, aux2], dim=1)
        feat = self.stg3_full_band_net(self.stg3_bridge(stage3_in))

        mask = to_mask(self.out(feat))

        if self.training:
            aux1_mask = to_mask(self.aux1_out(aux1))
            aux2_mask = to_mask(self.aux2_out(aux2))
            return mask * mix, aux1_mask * mix, aux2_mask * mix

        if aggressiveness:
            split = aggressiveness["split_bin"]
            strength = aggressiveness["value"]
            mask[:, :, :split] = torch.pow(mask[:, :, :split], 1 + strength / 3)
            mask[:, :, split:] = torch.pow(mask[:, :, split:], 1 + strength)

        return mask * mix

    def predict(self, x_mag, aggressiveness=None):
        """Run ``forward`` and strip ``self.offset`` columns from both ends of axis 3."""
        out = self.forward(x_mag, aggressiveness)

        margin = self.offset
        if margin <= 0:
            return out

        trimmed = out[:, :, :, margin:-margin]
        # The input must be wider than both margins together.
        assert trimmed.size()[3] > 0
        return trimmed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/nets_537238KB.py DELETED
@@ -1,123 +0,0 @@
1
- import numpy as np
2
- import torch
3
- import torch.nn.functional as F
4
- from torch import nn
5
-
6
- from . import layers_537238KB as layers
7
-
8
-
9
class BaseASPPNet(nn.Module):
    """Encoder/decoder network with an ASPP bottleneck and skip connections.

    Four ``layers.Encoder`` stages are built with channel arguments growing
    from ``ch`` to ``ch * 8``; each returns an ``(output, skip)`` pair. The
    deepest output passes through ``layers.ASPPModule`` and four
    ``layers.Decoder`` stages then merge it with the skips in reverse order.

    Args:
        nin: Number of input channels.
        ch: Base channel width; deeper stages use multiples of it.
        dilations: Dilation rates handed to ``layers.ASPPModule``.
    """

    def __init__(self, nin, ch, dilations=(4, 8, 16)):
        super().__init__()
        # The trailing ``3, 2, 1`` / ``3, 1, 1`` arguments are presumably
        # kernel size, stride and padding -- confirm against the layers module.
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)

        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)

        # Decoder inputs are sized for the skip tensor plus the previous output.
        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)

    def forward(self, x):
        """Return the decoded feature map for ``x``.

        Defined as ``forward`` rather than as an override of ``__call__`` so
        that ``net(x)`` goes through ``nn.Module.__call__`` and registered
        hooks still fire.
        """
        h, e1 = self.enc1(x)
        h, e2 = self.enc2(h)
        h, e3 = self.enc3(h)
        h, e4 = self.enc4(h)

        h = self.aspp(h)

        h = self.dec4(h, e4)
        h = self.dec3(h, e3)
        h = self.dec2(h, e2)
        h = self.dec1(h, e1)

        return h
38
-
39
-
40
class CascadedASPPNet(nn.Module):
    """Three-stage cascade of ``BaseASPPNet`` models that masks its input.

    Stage 1 runs separate networks on the lower and upper halves of axis 2;
    stages 2 and 3 each run one network over the full cropped range, fed
    with the input concatenated with the earlier stage outputs.

    Args:
        n_fft: Sets ``max_bin = n_fft // 2`` (how much of axis 2 the networks
            see) and ``output_bin = n_fft // 2 + 1`` (the size masks are
            padded back to).
    """

    def __init__(self, n_fft):
        super().__init__()

        half_fft = n_fft // 2
        self.max_bin = half_fft
        self.output_bin = half_fft + 1
        # Columns removed from each end of axis 3 by ``predict``.
        self.offset = 128

        # Submodules are created in a fixed order so parameter ordering and
        # random initialisation stay reproducible.
        self.stg1_low_band_net = BaseASPPNet(2, 64)
        self.stg1_high_band_net = BaseASPPNet(2, 64)

        self.stg2_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
        self.stg2_full_band_net = BaseASPPNet(32, 64)

        self.stg3_bridge = layers.Conv2DBNActiv(130, 64, 1, 1, 0)
        self.stg3_full_band_net = BaseASPPNet(64, 128)

        self.out = nn.Conv2d(128, 2, 1, bias=False)
        self.aux1_out = nn.Conv2d(64, 2, 1, bias=False)
        self.aux2_out = nn.Conv2d(64, 2, 1, bias=False)

    def forward(self, x, aggressiveness=None):
        """Run the cascade and apply the resulting mask to ``x``.

        Args:
            x: Input tensor; axis 2 is cropped to ``self.max_bin`` before the
                networks see it (presumably batch, channel, bins, frames --
                confirm against the caller).
            aggressiveness: Optional mapping with ``"split_bin"`` and
                ``"value"``. Only consulted outside training mode; when
                truthy the mask is raised to ``1 + value / 3`` below
                ``split_bin`` and to ``1 + value`` from it upwards.

        Returns:
            In training mode ``(mask * mix, aux1 * mix, aux2 * mix)``;
            otherwise ``mask * mix``, where ``mix`` is the detached input.
        """

        def to_mask(logits):
            # Squash to (0, 1), then grow axis 2 back to ``output_bin`` by
            # repeating the last row.
            probs = torch.sigmoid(logits)
            missing = self.output_bin - probs.size()[2]
            return F.pad(input=probs, pad=(0, 0, 0, missing), mode="replicate")

        mix = x.detach()
        spec = x.clone()[:, :, : self.max_bin]

        half = spec.size()[2] // 2
        low = self.stg1_low_band_net(spec[:, :, :half])
        high = self.stg1_high_band_net(spec[:, :, half:])
        aux1 = torch.cat([low, high], dim=2)

        stage2_in = torch.cat([spec, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(stage2_in))

        stage3_in = torch.cat([spec, aux1, aux2], dim=1)
        feat = self.stg3_full_band_net(self.stg3_bridge(stage3_in))

        mask = to_mask(self.out(feat))

        if self.training:
            aux1_mask = to_mask(self.aux1_out(aux1))
            aux2_mask = to_mask(self.aux2_out(aux2))
            return mask * mix, aux1_mask * mix, aux2_mask * mix

        if aggressiveness:
            split = aggressiveness["split_bin"]
            strength = aggressiveness["value"]
            mask[:, :, :split] = torch.pow(mask[:, :, :split], 1 + strength / 3)
            mask[:, :, split:] = torch.pow(mask[:, :, split:], 1 + strength)

        return mask * mix

    def predict(self, x_mag, aggressiveness=None):
        """Run ``forward`` and strip ``self.offset`` columns from both ends of axis 3."""
        out = self.forward(x_mag, aggressiveness)

        margin = self.offset
        if margin <= 0:
            return out

        trimmed = out[:, :, :, margin:-margin]
        # The input must be wider than both margins together.
        assert trimmed.size()[3] > 0
        return trimmed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/nets_61968KB.py DELETED
@@ -1,122 +0,0 @@
1
- import torch
2
- import torch.nn.functional as F
3
- from torch import nn
4
-
5
- from . import layers_123821KB as layers
6
-
7
-
8
class BaseASPPNet(nn.Module):
    """Encoder/decoder network with an ASPP bottleneck and skip connections.

    Four ``layers.Encoder`` stages are built with channel arguments growing
    from ``ch`` to ``ch * 8``; each returns an ``(output, skip)`` pair. The
    deepest output passes through ``layers.ASPPModule`` and four
    ``layers.Decoder`` stages then merge it with the skips in reverse order.

    Args:
        nin: Number of input channels.
        ch: Base channel width; deeper stages use multiples of it.
        dilations: Dilation rates handed to ``layers.ASPPModule``.
    """

    def __init__(self, nin, ch, dilations=(4, 8, 16)):
        super().__init__()
        # The trailing ``3, 2, 1`` / ``3, 1, 1`` arguments are presumably
        # kernel size, stride and padding -- confirm against the layers module.
        self.enc1 = layers.Encoder(nin, ch, 3, 2, 1)
        self.enc2 = layers.Encoder(ch, ch * 2, 3, 2, 1)
        self.enc3 = layers.Encoder(ch * 2, ch * 4, 3, 2, 1)
        self.enc4 = layers.Encoder(ch * 4, ch * 8, 3, 2, 1)

        self.aspp = layers.ASPPModule(ch * 8, ch * 16, dilations)

        # Decoder inputs are sized for the skip tensor plus the previous output.
        self.dec4 = layers.Decoder(ch * (8 + 16), ch * 8, 3, 1, 1)
        self.dec3 = layers.Decoder(ch * (4 + 8), ch * 4, 3, 1, 1)
        self.dec2 = layers.Decoder(ch * (2 + 4), ch * 2, 3, 1, 1)
        self.dec1 = layers.Decoder(ch * (1 + 2), ch, 3, 1, 1)

    def forward(self, x):
        """Return the decoded feature map for ``x``.

        Defined as ``forward`` rather than as an override of ``__call__`` so
        that ``net(x)`` goes through ``nn.Module.__call__`` and registered
        hooks still fire.
        """
        h, e1 = self.enc1(x)
        h, e2 = self.enc2(h)
        h, e3 = self.enc3(h)
        h, e4 = self.enc4(h)

        h = self.aspp(h)

        h = self.dec4(h, e4)
        h = self.dec3(h, e3)
        h = self.dec2(h, e2)
        h = self.dec1(h, e1)

        return h
37
-
38
-
39
class CascadedASPPNet(nn.Module):
    """Three-stage cascade of ``BaseASPPNet`` models that masks its input.

    Stage 1 runs separate networks on the lower and upper halves of axis 2;
    stages 2 and 3 each run one network over the full cropped range, fed
    with the input concatenated with the earlier stage outputs.

    Args:
        n_fft: Sets ``max_bin = n_fft // 2`` (how much of axis 2 the networks
            see) and ``output_bin = n_fft // 2 + 1`` (the size masks are
            padded back to).
    """

    def __init__(self, n_fft):
        super().__init__()

        half_fft = n_fft // 2
        self.max_bin = half_fft
        self.output_bin = half_fft + 1
        # Columns removed from each end of axis 3 by ``predict``.
        self.offset = 128

        # Submodules are created in a fixed order so parameter ordering and
        # random initialisation stay reproducible.
        self.stg1_low_band_net = BaseASPPNet(2, 32)
        self.stg1_high_band_net = BaseASPPNet(2, 32)

        self.stg2_bridge = layers.Conv2DBNActiv(34, 16, 1, 1, 0)
        self.stg2_full_band_net = BaseASPPNet(16, 32)

        self.stg3_bridge = layers.Conv2DBNActiv(66, 32, 1, 1, 0)
        self.stg3_full_band_net = BaseASPPNet(32, 64)

        self.out = nn.Conv2d(64, 2, 1, bias=False)
        self.aux1_out = nn.Conv2d(32, 2, 1, bias=False)
        self.aux2_out = nn.Conv2d(32, 2, 1, bias=False)

    def forward(self, x, aggressiveness=None):
        """Run the cascade and apply the resulting mask to ``x``.

        Args:
            x: Input tensor; axis 2 is cropped to ``self.max_bin`` before the
                networks see it (presumably batch, channel, bins, frames --
                confirm against the caller).
            aggressiveness: Optional mapping with ``"split_bin"`` and
                ``"value"``. Only consulted outside training mode; when
                truthy the mask is raised to ``1 + value / 3`` below
                ``split_bin`` and to ``1 + value`` from it upwards.

        Returns:
            In training mode ``(mask * mix, aux1 * mix, aux2 * mix)``;
            otherwise ``mask * mix``, where ``mix`` is the detached input.
        """

        def to_mask(logits):
            # Squash to (0, 1), then grow axis 2 back to ``output_bin`` by
            # repeating the last row.
            probs = torch.sigmoid(logits)
            missing = self.output_bin - probs.size()[2]
            return F.pad(input=probs, pad=(0, 0, 0, missing), mode="replicate")

        mix = x.detach()
        spec = x.clone()[:, :, : self.max_bin]

        half = spec.size()[2] // 2
        low = self.stg1_low_band_net(spec[:, :, :half])
        high = self.stg1_high_band_net(spec[:, :, half:])
        aux1 = torch.cat([low, high], dim=2)

        stage2_in = torch.cat([spec, aux1], dim=1)
        aux2 = self.stg2_full_band_net(self.stg2_bridge(stage2_in))

        stage3_in = torch.cat([spec, aux1, aux2], dim=1)
        feat = self.stg3_full_band_net(self.stg3_bridge(stage3_in))

        mask = to_mask(self.out(feat))

        if self.training:
            aux1_mask = to_mask(self.aux1_out(aux1))
            aux2_mask = to_mask(self.aux2_out(aux2))
            return mask * mix, aux1_mask * mix, aux2_mask * mix

        if aggressiveness:
            split = aggressiveness["split_bin"]
            strength = aggressiveness["value"]
            mask[:, :, :split] = torch.pow(mask[:, :, :split], 1 + strength / 3)
            mask[:, :, split:] = torch.pow(mask[:, :, split:], 1 + strength)

        return mask * mix

    def predict(self, x_mag, aggressiveness=None):
        """Run ``forward`` and strip ``self.offset`` columns from both ends of axis 3."""
        out = self.forward(x_mag, aggressiveness)

        margin = self.offset
        if margin <= 0:
            return out

        trimmed = out[:, :, :, margin:-margin]
        # The input must be wider than both margins together.
        assert trimmed.size()[3] > 0
        return trimmed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/nets_new.py DELETED
@@ -1,133 +0,0 @@
1
- import torch
2
- import torch.nn.functional as F
3
- from torch import nn
4
-
5
- from . import layers_new
6
-
7
-
8
class BaseNet(nn.Module):
    """Encoder/decoder network with an ASPP bottleneck and an LSTM side branch.

    One ``Conv2DBNActiv`` stem and four ``Encoder`` stages feed an
    ``ASPPModule``. Three decoders merge the result with the encoder
    outputs; the output of ``dec2`` is then concatenated with the output of
    an ``LSTMModule`` applied to it before the final decoder.

    Args:
        nin: Number of input channels.
        nout: Base channel width; deeper stages use multiples of it.
        nin_lstm: Forwarded to ``layers_new.LSTMModule``.
        nout_lstm: Forwarded to ``layers_new.LSTMModule``.
        dilations: Dilation pairs handed to ``layers_new.ASPPModule``.
    """

    def __init__(
        self, nin, nout, nin_lstm, nout_lstm, dilations=((4, 2), (8, 4), (12, 6))
    ):
        super().__init__()
        self.enc1 = layers_new.Conv2DBNActiv(nin, nout, 3, 1, 1)
        self.enc2 = layers_new.Encoder(nout, nout * 2, 3, 2, 1)
        self.enc3 = layers_new.Encoder(nout * 2, nout * 4, 3, 2, 1)
        self.enc4 = layers_new.Encoder(nout * 4, nout * 6, 3, 2, 1)
        self.enc5 = layers_new.Encoder(nout * 6, nout * 8, 3, 2, 1)

        self.aspp = layers_new.ASPPModule(nout * 8, nout * 8, dilations, dropout=True)

        self.dec4 = layers_new.Decoder(nout * (6 + 8), nout * 6, 3, 1, 1)
        self.dec3 = layers_new.Decoder(nout * (4 + 6), nout * 4, 3, 1, 1)
        self.dec2 = layers_new.Decoder(nout * (2 + 4), nout * 2, 3, 1, 1)
        self.lstm_dec2 = layers_new.LSTMModule(nout * 2, nin_lstm, nout_lstm)
        # The extra ``+ 1`` presumably accounts for a single channel added by
        # the LSTM branch -- confirm against layers_new.LSTMModule.
        self.dec1 = layers_new.Decoder(nout * (1 + 2) + 1, nout * 1, 3, 1, 1)

    def forward(self, x):
        """Return the decoded feature map for ``x``.

        Defined as ``forward`` rather than as an override of ``__call__`` so
        that ``net(x)`` goes through ``nn.Module.__call__`` and registered
        hooks still fire.
        """
        e1 = self.enc1(x)
        e2 = self.enc2(e1)
        e3 = self.enc3(e2)
        e4 = self.enc4(e3)
        e5 = self.enc5(e4)

        h = self.aspp(e5)

        h = self.dec4(h, e4)
        h = self.dec3(h, e3)
        h = self.dec2(h, e2)
        # Append the LSTM branch output along the channel axis.
        h = torch.cat([h, self.lstm_dec2(h)], dim=1)
        h = self.dec1(h, e1)

        return h
43
-
44
-
45
class CascadedNet(nn.Module):
    """Three-stage cascade of ``BaseNet`` models that predicts a mask.

    Stages 1 and 2 each run separate networks on the lower and upper halves
    of axis 2; stage 3 runs one network over the full cropped range, fed
    with the input concatenated with both earlier stage outputs.

    Args:
        n_fft: Sets ``max_bin = n_fft // 2`` and ``output_bin = n_fft // 2 + 1``.
        nout: Base channel width of the sub-networks.
        nout_lstm: LSTM size argument forwarded to the ``BaseNet`` instances
            (halved for the high-band networks).
    """

    def __init__(self, n_fft, nout=32, nout_lstm=128):
        super().__init__()

        half_fft = n_fft // 2
        self.max_bin = half_fft
        self.output_bin = half_fft + 1
        self.nin_lstm = self.max_bin // 2
        # Columns removed from each end of axis 3 by the predict methods.
        self.offset = 64

        band_lstm_in = self.nin_lstm // 2
        stage2_channels = nout // 4 + 2

        # Submodules are created in a fixed order so parameter ordering and
        # random initialisation stay reproducible.
        self.stg1_low_band_net = nn.Sequential(
            BaseNet(2, nout // 2, band_lstm_in, nout_lstm),
            layers_new.Conv2DBNActiv(nout // 2, nout // 4, 1, 1, 0),
        )
        self.stg1_high_band_net = BaseNet(
            2, nout // 4, band_lstm_in, nout_lstm // 2
        )

        self.stg2_low_band_net = nn.Sequential(
            BaseNet(stage2_channels, nout, band_lstm_in, nout_lstm),
            layers_new.Conv2DBNActiv(nout, nout // 2, 1, 1, 0),
        )
        self.stg2_high_band_net = BaseNet(
            stage2_channels, nout // 2, band_lstm_in, nout_lstm // 2
        )

        self.stg3_full_band_net = BaseNet(
            3 * nout // 4 + 2, nout, self.nin_lstm, nout_lstm
        )

        self.out = nn.Conv2d(nout, 2, 1, bias=False)
        self.aux_out = nn.Conv2d(3 * nout // 4, 2, 1, bias=False)

    def _to_mask(self, logits):
        """Apply a sigmoid and replicate-pad axis 2 up to ``output_bin``."""
        probs = torch.sigmoid(logits)
        missing = self.output_bin - probs.size()[2]
        return F.pad(input=probs, pad=(0, 0, 0, missing), mode="replicate")

    def _trim(self, tensor):
        """Drop ``self.offset`` columns from both ends of axis 3, if positive."""
        margin = self.offset
        if margin <= 0:
            return tensor
        trimmed = tensor[:, :, :, margin:-margin]
        # The input must be wider than both margins together.
        assert trimmed.size()[3] > 0
        return trimmed

    def forward(self, x):
        """Return the predicted mask for ``x``.

        Returns:
            In training mode the tuple ``(mask, aux)``, where ``aux`` is
            built from both earlier stage outputs; otherwise ``mask`` alone.
        """
        spec = x[:, :, : self.max_bin]

        half = spec.size()[2] // 2
        low_in = spec[:, :, :half]
        high_in = spec[:, :, half:]

        # Stage 1 on each half.
        low1 = self.stg1_low_band_net(low_in)
        high1 = self.stg1_high_band_net(high_in)
        aux1 = torch.cat([low1, high1], dim=2)

        # Stage 2 sees each half together with its stage-1 output.
        low2 = self.stg2_low_band_net(torch.cat([low_in, low1], dim=1))
        high2 = self.stg2_high_band_net(torch.cat([high_in, high1], dim=1))
        aux2 = torch.cat([low2, high2], dim=2)

        # Stage 3 over the full range.
        full = self.stg3_full_band_net(torch.cat([spec, aux1, aux2], dim=1))
        mask = self._to_mask(self.out(full))

        if not self.training:
            return mask

        aux = self._to_mask(self.aux_out(torch.cat([aux1, aux2], dim=1)))
        return mask, aux

    def predict_mask(self, x):
        """Return the mask from ``forward`` with the edge columns trimmed."""
        return self._trim(self.forward(x))

    def predict(self, x, aggressiveness=None):
        """Return ``x`` multiplied by its mask, with the edge columns trimmed.

        ``aggressiveness`` is accepted but not used by this implementation.
        """
        return self._trim(x * self.forward(x))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/lib_v5/spec_utils.py DELETED
@@ -1,672 +0,0 @@
1
- import hashlib
2
- import json
3
- import math
4
- import os
5
-
6
- import librosa
7
- import numpy as np
8
- import soundfile as sf
9
- from tqdm import tqdm
10
-
11
-
12
- def crop_center(h1, h2):
13
- h1_shape = h1.size()
14
- h2_shape = h2.size()
15
-
16
- if h1_shape[3] == h2_shape[3]:
17
- return h1
18
- elif h1_shape[3] < h2_shape[3]:
19
- raise ValueError("h1_shape[3] must be greater than h2_shape[3]")
20
-
21
- # s_freq = (h2_shape[2] - h1_shape[2]) // 2
22
- # e_freq = s_freq + h1_shape[2]
23
- s_time = (h1_shape[3] - h2_shape[3]) // 2
24
- e_time = s_time + h2_shape[3]
25
- h1 = h1[:, :, :, s_time:e_time]
26
-
27
- return h1
28
-
29
-
30
- def wave_to_spectrogram(
31
- wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False
32
- ):
33
- if reverse:
34
- wave_left = np.flip(np.asfortranarray(wave[0]))
35
- wave_right = np.flip(np.asfortranarray(wave[1]))
36
- elif mid_side:
37
- wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
38
- wave_right = np.asfortranarray(np.subtract(wave[0], wave[1]))
39
- elif mid_side_b2:
40
- wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5))
41
- wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5))
42
- else:
43
- wave_left = np.asfortranarray(wave[0])
44
- wave_right = np.asfortranarray(wave[1])
45
-
46
- spec_left = librosa.stft(wave_left, n_fft, hop_length=hop_length)
47
- spec_right = librosa.stft(wave_right, n_fft, hop_length=hop_length)
48
-
49
- spec = np.asfortranarray([spec_left, spec_right])
50
-
51
- return spec
52
-
53
-
54
- def wave_to_spectrogram_mt(
55
- wave, hop_length, n_fft, mid_side=False, mid_side_b2=False, reverse=False
56
- ):
57
- import threading
58
-
59
- if reverse:
60
- wave_left = np.flip(np.asfortranarray(wave[0]))
61
- wave_right = np.flip(np.asfortranarray(wave[1]))
62
- elif mid_side:
63
- wave_left = np.asfortranarray(np.add(wave[0], wave[1]) / 2)
64
- wave_right = np.asfortranarray(np.subtract(wave[0], wave[1]))
65
- elif mid_side_b2:
66
- wave_left = np.asfortranarray(np.add(wave[1], wave[0] * 0.5))
67
- wave_right = np.asfortranarray(np.subtract(wave[0], wave[1] * 0.5))
68
- else:
69
- wave_left = np.asfortranarray(wave[0])
70
- wave_right = np.asfortranarray(wave[1])
71
-
72
- def run_thread(**kwargs):
73
- global spec_left
74
- spec_left = librosa.stft(**kwargs)
75
-
76
- thread = threading.Thread(
77
- target=run_thread,
78
- kwargs={"y": wave_left, "n_fft": n_fft, "hop_length": hop_length},
79
- )
80
- thread.start()
81
- spec_right = librosa.stft(wave_right, n_fft, hop_length=hop_length)
82
- thread.join()
83
-
84
- spec = np.asfortranarray([spec_left, spec_right])
85
-
86
- return spec
87
-
88
-
89
- def combine_spectrograms(specs, mp):
90
- l = min([specs[i].shape[2] for i in specs])
91
- spec_c = np.zeros(shape=(2, mp.param["bins"] + 1, l), dtype=np.complex64)
92
- offset = 0
93
- bands_n = len(mp.param["band"])
94
-
95
- for d in range(1, bands_n + 1):
96
- h = mp.param["band"][d]["crop_stop"] - mp.param["band"][d]["crop_start"]
97
- spec_c[:, offset : offset + h, :l] = specs[d][
98
- :, mp.param["band"][d]["crop_start"] : mp.param["band"][d]["crop_stop"], :l
99
- ]
100
- offset += h
101
-
102
- if offset > mp.param["bins"]:
103
- raise ValueError("Too much bins")
104
-
105
- # lowpass fiter
106
- if (
107
- mp.param["pre_filter_start"] > 0
108
- ): # and mp.param['band'][bands_n]['res_type'] in ['scipy', 'polyphase']:
109
- if bands_n == 1:
110
- spec_c = fft_lp_filter(
111
- spec_c, mp.param["pre_filter_start"], mp.param["pre_filter_stop"]
112
- )
113
- else:
114
- gp = 1
115
- for b in range(
116
- mp.param["pre_filter_start"] + 1, mp.param["pre_filter_stop"]
117
- ):
118
- g = math.pow(
119
- 10, -(b - mp.param["pre_filter_start"]) * (3.5 - gp) / 20.0
120
- )
121
- gp = g
122
- spec_c[:, b, :] *= g
123
-
124
- return np.asfortranarray(spec_c)
125
-
126
-
127
- def spectrogram_to_image(spec, mode="magnitude"):
128
- if mode == "magnitude":
129
- if np.iscomplexobj(spec):
130
- y = np.abs(spec)
131
- else:
132
- y = spec
133
- y = np.log10(y**2 + 1e-8)
134
- elif mode == "phase":
135
- if np.iscomplexobj(spec):
136
- y = np.angle(spec)
137
- else:
138
- y = spec
139
-
140
- y -= y.min()
141
- y *= 255 / y.max()
142
- img = np.uint8(y)
143
-
144
- if y.ndim == 3:
145
- img = img.transpose(1, 2, 0)
146
- img = np.concatenate([np.max(img, axis=2, keepdims=True), img], axis=2)
147
-
148
- return img
149
-
150
-
151
- def reduce_vocal_aggressively(X, y, softmask):
152
- v = X - y
153
- y_mag_tmp = np.abs(y)
154
- v_mag_tmp = np.abs(v)
155
-
156
- v_mask = v_mag_tmp > y_mag_tmp
157
- y_mag = np.clip(y_mag_tmp - v_mag_tmp * v_mask * softmask, 0, np.inf)
158
-
159
- return y_mag * np.exp(1.0j * np.angle(y))
160
-
161
-
162
- def mask_silence(mag, ref, thres=0.2, min_range=64, fade_size=32):
163
- if min_range < fade_size * 2:
164
- raise ValueError("min_range must be >= fade_area * 2")
165
-
166
- mag = mag.copy()
167
-
168
- idx = np.where(ref.mean(axis=(0, 1)) < thres)[0]
169
- starts = np.insert(idx[np.where(np.diff(idx) != 1)[0] + 1], 0, idx[0])
170
- ends = np.append(idx[np.where(np.diff(idx) != 1)[0]], idx[-1])
171
- uninformative = np.where(ends - starts > min_range)[0]
172
- if len(uninformative) > 0:
173
- starts = starts[uninformative]
174
- ends = ends[uninformative]
175
- old_e = None
176
- for s, e in zip(starts, ends):
177
- if old_e is not None and s - old_e < fade_size:
178
- s = old_e - fade_size * 2
179
-
180
- if s != 0:
181
- weight = np.linspace(0, 1, fade_size)
182
- mag[:, :, s : s + fade_size] += weight * ref[:, :, s : s + fade_size]
183
- else:
184
- s -= fade_size
185
-
186
- if e != mag.shape[2]:
187
- weight = np.linspace(1, 0, fade_size)
188
- mag[:, :, e - fade_size : e] += weight * ref[:, :, e - fade_size : e]
189
- else:
190
- e += fade_size
191
-
192
- mag[:, :, s + fade_size : e - fade_size] += ref[
193
- :, :, s + fade_size : e - fade_size
194
- ]
195
- old_e = e
196
-
197
- return mag
198
-
199
-
200
- def align_wave_head_and_tail(a, b):
201
- l = min([a[0].size, b[0].size])
202
-
203
- return a[:l, :l], b[:l, :l]
204
-
205
-
206
- def cache_or_load(mix_path, inst_path, mp):
207
- mix_basename = os.path.splitext(os.path.basename(mix_path))[0]
208
- inst_basename = os.path.splitext(os.path.basename(inst_path))[0]
209
-
210
- cache_dir = "mph{}".format(
211
- hashlib.sha1(json.dumps(mp.param, sort_keys=True).encode("utf-8")).hexdigest()
212
- )
213
- mix_cache_dir = os.path.join("cache", cache_dir)
214
- inst_cache_dir = os.path.join("cache", cache_dir)
215
-
216
- os.makedirs(mix_cache_dir, exist_ok=True)
217
- os.makedirs(inst_cache_dir, exist_ok=True)
218
-
219
- mix_cache_path = os.path.join(mix_cache_dir, mix_basename + ".npy")
220
- inst_cache_path = os.path.join(inst_cache_dir, inst_basename + ".npy")
221
-
222
- if os.path.exists(mix_cache_path) and os.path.exists(inst_cache_path):
223
- X_spec_m = np.load(mix_cache_path)
224
- y_spec_m = np.load(inst_cache_path)
225
- else:
226
- X_wave, y_wave, X_spec_s, y_spec_s = {}, {}, {}, {}
227
-
228
- for d in range(len(mp.param["band"]), 0, -1):
229
- bp = mp.param["band"][d]
230
-
231
- if d == len(mp.param["band"]): # high-end band
232
- X_wave[d], _ = librosa.load(
233
- mix_path, bp["sr"], False, dtype=np.float32, res_type=bp["res_type"]
234
- )
235
- y_wave[d], _ = librosa.load(
236
- inst_path,
237
- bp["sr"],
238
- False,
239
- dtype=np.float32,
240
- res_type=bp["res_type"],
241
- )
242
- else: # lower bands
243
- X_wave[d] = librosa.resample(
244
- X_wave[d + 1],
245
- mp.param["band"][d + 1]["sr"],
246
- bp["sr"],
247
- res_type=bp["res_type"],
248
- )
249
- y_wave[d] = librosa.resample(
250
- y_wave[d + 1],
251
- mp.param["band"][d + 1]["sr"],
252
- bp["sr"],
253
- res_type=bp["res_type"],
254
- )
255
-
256
- X_wave[d], y_wave[d] = align_wave_head_and_tail(X_wave[d], y_wave[d])
257
-
258
- X_spec_s[d] = wave_to_spectrogram(
259
- X_wave[d],
260
- bp["hl"],
261
- bp["n_fft"],
262
- mp.param["mid_side"],
263
- mp.param["mid_side_b2"],
264
- mp.param["reverse"],
265
- )
266
- y_spec_s[d] = wave_to_spectrogram(
267
- y_wave[d],
268
- bp["hl"],
269
- bp["n_fft"],
270
- mp.param["mid_side"],
271
- mp.param["mid_side_b2"],
272
- mp.param["reverse"],
273
- )
274
-
275
- del X_wave, y_wave
276
-
277
- X_spec_m = combine_spectrograms(X_spec_s, mp)
278
- y_spec_m = combine_spectrograms(y_spec_s, mp)
279
-
280
- if X_spec_m.shape != y_spec_m.shape:
281
- raise ValueError("The combined spectrograms are different: " + mix_path)
282
-
283
- _, ext = os.path.splitext(mix_path)
284
-
285
- np.save(mix_cache_path, X_spec_m)
286
- np.save(inst_cache_path, y_spec_m)
287
-
288
- return X_spec_m, y_spec_m
289
-
290
-
291
- def spectrogram_to_wave(spec, hop_length, mid_side, mid_side_b2, reverse):
292
- spec_left = np.asfortranarray(spec[0])
293
- spec_right = np.asfortranarray(spec[1])
294
-
295
- wave_left = librosa.istft(spec_left, hop_length=hop_length)
296
- wave_right = librosa.istft(spec_right, hop_length=hop_length)
297
-
298
- if reverse:
299
- return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)])
300
- elif mid_side:
301
- return np.asfortranarray(
302
- [np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]
303
- )
304
- elif mid_side_b2:
305
- return np.asfortranarray(
306
- [
307
- np.add(wave_right / 1.25, 0.4 * wave_left),
308
- np.subtract(wave_left / 1.25, 0.4 * wave_right),
309
- ]
310
- )
311
- else:
312
- return np.asfortranarray([wave_left, wave_right])
313
-
314
-
315
- def spectrogram_to_wave_mt(spec, hop_length, mid_side, reverse, mid_side_b2):
316
- import threading
317
-
318
- spec_left = np.asfortranarray(spec[0])
319
- spec_right = np.asfortranarray(spec[1])
320
-
321
- def run_thread(**kwargs):
322
- global wave_left
323
- wave_left = librosa.istft(**kwargs)
324
-
325
- thread = threading.Thread(
326
- target=run_thread, kwargs={"stft_matrix": spec_left, "hop_length": hop_length}
327
- )
328
- thread.start()
329
- wave_right = librosa.istft(spec_right, hop_length=hop_length)
330
- thread.join()
331
-
332
- if reverse:
333
- return np.asfortranarray([np.flip(wave_left), np.flip(wave_right)])
334
- elif mid_side:
335
- return np.asfortranarray(
336
- [np.add(wave_left, wave_right / 2), np.subtract(wave_left, wave_right / 2)]
337
- )
338
- elif mid_side_b2:
339
- return np.asfortranarray(
340
- [
341
- np.add(wave_right / 1.25, 0.4 * wave_left),
342
- np.subtract(wave_left / 1.25, 0.4 * wave_right),
343
- ]
344
- )
345
- else:
346
- return np.asfortranarray([wave_left, wave_right])
347
-
348
-
349
- def cmb_spectrogram_to_wave(spec_m, mp, extra_bins_h=None, extra_bins=None):
350
- wave_band = {}
351
- bands_n = len(mp.param["band"])
352
- offset = 0
353
-
354
- for d in range(1, bands_n + 1):
355
- bp = mp.param["band"][d]
356
- spec_s = np.ndarray(
357
- shape=(2, bp["n_fft"] // 2 + 1, spec_m.shape[2]), dtype=complex
358
- )
359
- h = bp["crop_stop"] - bp["crop_start"]
360
- spec_s[:, bp["crop_start"] : bp["crop_stop"], :] = spec_m[
361
- :, offset : offset + h, :
362
- ]
363
-
364
- offset += h
365
- if d == bands_n: # higher
366
- if extra_bins_h: # if --high_end_process bypass
367
- max_bin = bp["n_fft"] // 2
368
- spec_s[:, max_bin - extra_bins_h : max_bin, :] = extra_bins[
369
- :, :extra_bins_h, :
370
- ]
371
- if bp["hpf_start"] > 0:
372
- spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1)
373
- if bands_n == 1:
374
- wave = spectrogram_to_wave(
375
- spec_s,
376
- bp["hl"],
377
- mp.param["mid_side"],
378
- mp.param["mid_side_b2"],
379
- mp.param["reverse"],
380
- )
381
- else:
382
- wave = np.add(
383
- wave,
384
- spectrogram_to_wave(
385
- spec_s,
386
- bp["hl"],
387
- mp.param["mid_side"],
388
- mp.param["mid_side_b2"],
389
- mp.param["reverse"],
390
- ),
391
- )
392
- else:
393
- sr = mp.param["band"][d + 1]["sr"]
394
- if d == 1: # lower
395
- spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"])
396
- wave = librosa.resample(
397
- spectrogram_to_wave(
398
- spec_s,
399
- bp["hl"],
400
- mp.param["mid_side"],
401
- mp.param["mid_side_b2"],
402
- mp.param["reverse"],
403
- ),
404
- bp["sr"],
405
- sr,
406
- res_type="sinc_fastest",
407
- )
408
- else: # mid
409
- spec_s = fft_hp_filter(spec_s, bp["hpf_start"], bp["hpf_stop"] - 1)
410
- spec_s = fft_lp_filter(spec_s, bp["lpf_start"], bp["lpf_stop"])
411
- wave2 = np.add(
412
- wave,
413
- spectrogram_to_wave(
414
- spec_s,
415
- bp["hl"],
416
- mp.param["mid_side"],
417
- mp.param["mid_side_b2"],
418
- mp.param["reverse"],
419
- ),
420
- )
421
- # wave = librosa.core.resample(wave2, bp['sr'], sr, res_type="sinc_fastest")
422
- wave = librosa.core.resample(wave2, bp["sr"], sr, res_type="scipy")
423
-
424
- return wave.T
425
-
426
-
427
- def fft_lp_filter(spec, bin_start, bin_stop):
428
- g = 1.0
429
- for b in range(bin_start, bin_stop):
430
- g -= 1 / (bin_stop - bin_start)
431
- spec[:, b, :] = g * spec[:, b, :]
432
-
433
- spec[:, bin_stop:, :] *= 0
434
-
435
- return spec
436
-
437
-
438
- def fft_hp_filter(spec, bin_start, bin_stop):
439
- g = 1.0
440
- for b in range(bin_start, bin_stop, -1):
441
- g -= 1 / (bin_start - bin_stop)
442
- spec[:, b, :] = g * spec[:, b, :]
443
-
444
- spec[:, 0 : bin_stop + 1, :] *= 0
445
-
446
- return spec
447
-
448
-
449
- def mirroring(a, spec_m, input_high_end, mp):
450
- if "mirroring" == a:
451
- mirror = np.flip(
452
- np.abs(
453
- spec_m[
454
- :,
455
- mp.param["pre_filter_start"]
456
- - 10
457
- - input_high_end.shape[1] : mp.param["pre_filter_start"]
458
- - 10,
459
- :,
460
- ]
461
- ),
462
- 1,
463
- )
464
- mirror = mirror * np.exp(1.0j * np.angle(input_high_end))
465
-
466
- return np.where(
467
- np.abs(input_high_end) <= np.abs(mirror), input_high_end, mirror
468
- )
469
-
470
- if "mirroring2" == a:
471
- mirror = np.flip(
472
- np.abs(
473
- spec_m[
474
- :,
475
- mp.param["pre_filter_start"]
476
- - 10
477
- - input_high_end.shape[1] : mp.param["pre_filter_start"]
478
- - 10,
479
- :,
480
- ]
481
- ),
482
- 1,
483
- )
484
- mi = np.multiply(mirror, input_high_end * 1.7)
485
-
486
- return np.where(np.abs(input_high_end) <= np.abs(mi), input_high_end, mi)
487
-
488
-
489
- def ensembling(a, specs):
490
- for i in range(1, len(specs)):
491
- if i == 1:
492
- spec = specs[0]
493
-
494
- ln = min([spec.shape[2], specs[i].shape[2]])
495
- spec = spec[:, :, :ln]
496
- specs[i] = specs[i][:, :, :ln]
497
-
498
- if "min_mag" == a:
499
- spec = np.where(np.abs(specs[i]) <= np.abs(spec), specs[i], spec)
500
- if "max_mag" == a:
501
- spec = np.where(np.abs(specs[i]) >= np.abs(spec), specs[i], spec)
502
-
503
- return spec
504
-
505
-
506
- def stft(wave, nfft, hl):
507
- wave_left = np.asfortranarray(wave[0])
508
- wave_right = np.asfortranarray(wave[1])
509
- spec_left = librosa.stft(wave_left, nfft, hop_length=hl)
510
- spec_right = librosa.stft(wave_right, nfft, hop_length=hl)
511
- spec = np.asfortranarray([spec_left, spec_right])
512
-
513
- return spec
514
-
515
-
516
- def istft(spec, hl):
517
- spec_left = np.asfortranarray(spec[0])
518
- spec_right = np.asfortranarray(spec[1])
519
-
520
- wave_left = librosa.istft(spec_left, hop_length=hl)
521
- wave_right = librosa.istft(spec_right, hop_length=hl)
522
- wave = np.asfortranarray([wave_left, wave_right])
523
-
524
-
525
- if __name__ == "__main__":
526
- import argparse
527
- import sys
528
- import time
529
-
530
- import cv2
531
- from model_param_init import ModelParameters
532
-
533
- p = argparse.ArgumentParser()
534
- p.add_argument(
535
- "--algorithm",
536
- "-a",
537
- type=str,
538
- choices=["invert", "invert_p", "min_mag", "max_mag", "deep", "align"],
539
- default="min_mag",
540
- )
541
- p.add_argument(
542
- "--model_params",
543
- "-m",
544
- type=str,
545
- default=os.path.join("modelparams", "1band_sr44100_hl512.json"),
546
- )
547
- p.add_argument("--output_name", "-o", type=str, default="output")
548
- p.add_argument("--vocals_only", "-v", action="store_true")
549
- p.add_argument("input", nargs="+")
550
- args = p.parse_args()
551
-
552
- start_time = time.time()
553
-
554
- if args.algorithm.startswith("invert") and len(args.input) != 2:
555
- raise ValueError("There should be two input files.")
556
-
557
- if not args.algorithm.startswith("invert") and len(args.input) < 2:
558
- raise ValueError("There must be at least two input files.")
559
-
560
- wave, specs = {}, {}
561
- mp = ModelParameters(args.model_params)
562
-
563
- for i in range(len(args.input)):
564
- spec = {}
565
-
566
- for d in range(len(mp.param["band"]), 0, -1):
567
- bp = mp.param["band"][d]
568
-
569
- if d == len(mp.param["band"]): # high-end band
570
- wave[d], _ = librosa.load(
571
- args.input[i],
572
- bp["sr"],
573
- False,
574
- dtype=np.float32,
575
- res_type=bp["res_type"],
576
- )
577
-
578
- if len(wave[d].shape) == 1: # mono to stereo
579
- wave[d] = np.array([wave[d], wave[d]])
580
- else: # lower bands
581
- wave[d] = librosa.resample(
582
- wave[d + 1],
583
- mp.param["band"][d + 1]["sr"],
584
- bp["sr"],
585
- res_type=bp["res_type"],
586
- )
587
-
588
- spec[d] = wave_to_spectrogram(
589
- wave[d],
590
- bp["hl"],
591
- bp["n_fft"],
592
- mp.param["mid_side"],
593
- mp.param["mid_side_b2"],
594
- mp.param["reverse"],
595
- )
596
-
597
- specs[i] = combine_spectrograms(spec, mp)
598
-
599
- del wave
600
-
601
- if args.algorithm == "deep":
602
- d_spec = np.where(np.abs(specs[0]) <= np.abs(spec[1]), specs[0], spec[1])
603
- v_spec = d_spec - specs[1]
604
- sf.write(
605
- os.path.join("{}.wav".format(args.output_name)),
606
- cmb_spectrogram_to_wave(v_spec, mp),
607
- mp.param["sr"],
608
- )
609
-
610
- if args.algorithm.startswith("invert"):
611
- ln = min([specs[0].shape[2], specs[1].shape[2]])
612
- specs[0] = specs[0][:, :, :ln]
613
- specs[1] = specs[1][:, :, :ln]
614
-
615
- if "invert_p" == args.algorithm:
616
- X_mag = np.abs(specs[0])
617
- y_mag = np.abs(specs[1])
618
- max_mag = np.where(X_mag >= y_mag, X_mag, y_mag)
619
- v_spec = specs[1] - max_mag * np.exp(1.0j * np.angle(specs[0]))
620
- else:
621
- specs[1] = reduce_vocal_aggressively(specs[0], specs[1], 0.2)
622
- v_spec = specs[0] - specs[1]
623
-
624
- if not args.vocals_only:
625
- X_mag = np.abs(specs[0])
626
- y_mag = np.abs(specs[1])
627
- v_mag = np.abs(v_spec)
628
-
629
- X_image = spectrogram_to_image(X_mag)
630
- y_image = spectrogram_to_image(y_mag)
631
- v_image = spectrogram_to_image(v_mag)
632
-
633
- cv2.imwrite("{}_X.png".format(args.output_name), X_image)
634
- cv2.imwrite("{}_y.png".format(args.output_name), y_image)
635
- cv2.imwrite("{}_v.png".format(args.output_name), v_image)
636
-
637
- sf.write(
638
- "{}_X.wav".format(args.output_name),
639
- cmb_spectrogram_to_wave(specs[0], mp),
640
- mp.param["sr"],
641
- )
642
- sf.write(
643
- "{}_y.wav".format(args.output_name),
644
- cmb_spectrogram_to_wave(specs[1], mp),
645
- mp.param["sr"],
646
- )
647
-
648
- sf.write(
649
- "{}_v.wav".format(args.output_name),
650
- cmb_spectrogram_to_wave(v_spec, mp),
651
- mp.param["sr"],
652
- )
653
- else:
654
- if not args.algorithm == "deep":
655
- sf.write(
656
- os.path.join("ensembled", "{}.wav".format(args.output_name)),
657
- cmb_spectrogram_to_wave(ensembling(args.algorithm, specs), mp),
658
- mp.param["sr"],
659
- )
660
-
661
- if args.algorithm == "align":
662
- trackalignment = [
663
- {
664
- "file1": '"{}"'.format(args.input[0]),
665
- "file2": '"{}"'.format(args.input[1]),
666
- }
667
- ]
668
-
669
- for i, e in tqdm(enumerate(trackalignment), desc="Performing Alignment..."):
670
- os.system(f"python lib/align_tracks.py {e['file1']} {e['file2']}")
671
-
672
- # print('Total time: {0:.{1}f}s'.format(time.time() - start_time, 1))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/name_params.json DELETED
@@ -1,263 +0,0 @@
1
- {
2
- "equivalent" : [
3
- {
4
- "model_hash_name" : [
5
- {
6
- "hash_name": "47939caf0cfe52a0e81442b85b971dfd",
7
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json",
8
- "param_name": "4band_44100"
9
- },
10
- {
11
- "hash_name": "4e4ecb9764c50a8c414fee6e10395bbe",
12
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json",
13
- "param_name": "4band_v2"
14
- },
15
- {
16
- "hash_name": "ca106edd563e034bde0bdec4bb7a4b36",
17
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json",
18
- "param_name": "4band_v2"
19
- },
20
- {
21
- "hash_name": "e60a1e84803ce4efc0a6551206cc4b71",
22
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json",
23
- "param_name": "4band_44100"
24
- },
25
- {
26
- "hash_name": "a82f14e75892e55e994376edbf0c8435",
27
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json",
28
- "param_name": "4band_44100"
29
- },
30
- {
31
- "hash_name": "6dd9eaa6f0420af9f1d403aaafa4cc06",
32
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json",
33
- "param_name": "4band_v2_sn"
34
- },
35
- {
36
- "hash_name": "08611fb99bd59eaa79ad27c58d137727",
37
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json",
38
- "param_name": "4band_v2_sn"
39
- },
40
- {
41
- "hash_name": "5c7bbca45a187e81abbbd351606164e5",
42
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json",
43
- "param_name": "3band_44100_msb2"
44
- },
45
- {
46
- "hash_name": "d6b2cb685a058a091e5e7098192d3233",
47
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json",
48
- "param_name": "3band_44100_msb2"
49
- },
50
- {
51
- "hash_name": "c1b9f38170a7c90e96f027992eb7c62b",
52
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json",
53
- "param_name": "4band_44100"
54
- },
55
- {
56
- "hash_name": "c3448ec923fa0edf3d03a19e633faa53",
57
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json",
58
- "param_name": "4band_44100"
59
- },
60
- {
61
- "hash_name": "68aa2c8093d0080704b200d140f59e54",
62
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json",
63
- "param_name": "3band_44100"
64
- },
65
- {
66
- "hash_name": "fdc83be5b798e4bd29fe00fe6600e147",
67
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json",
68
- "param_name": "3band_44100_mid.json"
69
- },
70
- {
71
- "hash_name": "2ce34bc92fd57f55db16b7a4def3d745",
72
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json",
73
- "param_name": "3band_44100_mid.json"
74
- },
75
- {
76
- "hash_name": "52fdca89576f06cf4340b74a4730ee5f",
77
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json",
78
- "param_name": "4band_44100.json"
79
- },
80
- {
81
- "hash_name": "41191165b05d38fc77f072fa9e8e8a30",
82
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json",
83
- "param_name": "4band_44100.json"
84
- },
85
- {
86
- "hash_name": "89e83b511ad474592689e562d5b1f80e",
87
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json",
88
- "param_name": "2band_32000.json"
89
- },
90
- {
91
- "hash_name": "0b954da81d453b716b114d6d7c95177f",
92
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json",
93
- "param_name": "2band_32000.json"
94
- }
95
-
96
- ],
97
- "v4 Models": [
98
- {
99
- "hash_name": "6a00461c51c2920fd68937d4609ed6c8",
100
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json",
101
- "param_name": "1band_sr16000_hl512"
102
- },
103
- {
104
- "hash_name": "0ab504864d20f1bd378fe9c81ef37140",
105
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json",
106
- "param_name": "1band_sr32000_hl512"
107
- },
108
- {
109
- "hash_name": "7dd21065bf91c10f7fccb57d7d83b07f",
110
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json",
111
- "param_name": "1band_sr32000_hl512"
112
- },
113
- {
114
- "hash_name": "80ab74d65e515caa3622728d2de07d23",
115
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json",
116
- "param_name": "1band_sr32000_hl512"
117
- },
118
- {
119
- "hash_name": "edc115e7fc523245062200c00caa847f",
120
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json",
121
- "param_name": "1band_sr33075_hl384"
122
- },
123
- {
124
- "hash_name": "28063e9f6ab5b341c5f6d3c67f2045b7",
125
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json",
126
- "param_name": "1band_sr33075_hl384"
127
- },
128
- {
129
- "hash_name": "b58090534c52cbc3e9b5104bad666ef2",
130
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json",
131
- "param_name": "1band_sr44100_hl512"
132
- },
133
- {
134
- "hash_name": "0cdab9947f1b0928705f518f3c78ea8f",
135
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json",
136
- "param_name": "1band_sr44100_hl512"
137
- },
138
- {
139
- "hash_name": "ae702fed0238afb5346db8356fe25f13",
140
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json",
141
- "param_name": "1band_sr44100_hl1024"
142
- }
143
- ]
144
- }
145
- ],
146
- "User Models" : [
147
- {
148
- "1 Band": [
149
- {
150
- "hash_name": "1band_sr16000_hl512",
151
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr16000_hl512.json",
152
- "param_name": "1band_sr16000_hl512"
153
- },
154
- {
155
- "hash_name": "1band_sr32000_hl512",
156
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr32000_hl512.json",
157
- "param_name": "1band_sr16000_hl512"
158
- },
159
- {
160
- "hash_name": "1band_sr33075_hl384",
161
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr33075_hl384.json",
162
- "param_name": "1band_sr33075_hl384"
163
- },
164
- {
165
- "hash_name": "1band_sr44100_hl256",
166
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl256.json",
167
- "param_name": "1band_sr44100_hl256"
168
- },
169
- {
170
- "hash_name": "1band_sr44100_hl512",
171
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl512.json",
172
- "param_name": "1band_sr44100_hl512"
173
- },
174
- {
175
- "hash_name": "1band_sr44100_hl1024",
176
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/1band_sr44100_hl1024.json",
177
- "param_name": "1band_sr44100_hl1024"
178
- }
179
- ],
180
- "2 Band": [
181
- {
182
- "hash_name": "2band_44100_lofi",
183
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_44100_lofi.json",
184
- "param_name": "2band_44100_lofi"
185
- },
186
- {
187
- "hash_name": "2band_32000",
188
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_32000.json",
189
- "param_name": "2band_32000"
190
- },
191
- {
192
- "hash_name": "2band_48000",
193
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/2band_48000.json",
194
- "param_name": "2band_48000"
195
- }
196
- ],
197
- "3 Band": [
198
- {
199
- "hash_name": "3band_44100",
200
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100.json",
201
- "param_name": "3band_44100"
202
- },
203
- {
204
- "hash_name": "3band_44100_mid",
205
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_mid.json",
206
- "param_name": "3band_44100_mid"
207
- },
208
- {
209
- "hash_name": "3band_44100_msb2",
210
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/3band_44100_msb2.json",
211
- "param_name": "3band_44100_msb2"
212
- }
213
- ],
214
- "4 Band": [
215
- {
216
- "hash_name": "4band_44100",
217
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100.json",
218
- "param_name": "4band_44100"
219
- },
220
- {
221
- "hash_name": "4band_44100_mid",
222
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_mid.json",
223
- "param_name": "4band_44100_mid"
224
- },
225
- {
226
- "hash_name": "4band_44100_msb",
227
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb.json",
228
- "param_name": "4band_44100_msb"
229
- },
230
- {
231
- "hash_name": "4band_44100_msb2",
232
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_msb2.json",
233
- "param_name": "4band_44100_msb2"
234
- },
235
- {
236
- "hash_name": "4band_44100_reverse",
237
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_reverse.json",
238
- "param_name": "4band_44100_reverse"
239
- },
240
- {
241
- "hash_name": "4band_44100_sw",
242
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_44100_sw.json",
243
- "param_name": "4band_44100_sw"
244
- },
245
- {
246
- "hash_name": "4band_v2",
247
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2.json",
248
- "param_name": "4band_v2"
249
- },
250
- {
251
- "hash_name": "4band_v2_sn",
252
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/4band_v2_sn.json",
253
- "param_name": "4band_v2_sn"
254
- },
255
- {
256
- "hash_name": "tmodelparam",
257
- "model_params": "infer/lib/uvr5_pack/lib_v5/modelparams/tmodelparam.json",
258
- "param_name": "User Model Param Set"
259
- }
260
- ]
261
- }
262
- ]
263
- }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
infer/lib/uvr5_pack/utils.py DELETED
@@ -1,121 +0,0 @@
1
- import json
2
-
3
- import numpy as np
4
- import torch
5
- from tqdm import tqdm
6
-
7
-
8
def load_data(file_name: str = "./infer/lib/uvr5_pack/name_params.json") -> dict:
    """Load the model-name/parameter lookup table from a JSON file.

    Args:
        file_name: Path of the JSON file to read.

    Returns:
        The parsed JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read explicitly as UTF-8 so parsing does not depend on the platform's
    # locale encoding (the default used by open() when none is given).
    with open(file_name, "r", encoding="utf-8") as f:
        return json.load(f)
13
-
14
-
15
def make_padding(width, cropsize, offset):
    """Compute the padding amounts and step size used to window a sequence.

    Args:
        width: Length of the axis that will be padded.
        cropsize: Size of one window.
        offset: Margin reserved on each side of a window; it is also returned
            unchanged as the left padding.

    Returns:
        A ``(left, right, roi_size)`` tuple, where ``roi_size`` is
        ``cropsize - 2 * offset`` (or ``cropsize`` when that difference is
        zero) and ``right`` is ``roi_size - width % roi_size + offset``.
    """
    roi_size = cropsize - 2 * offset
    if not roi_size:
        # The margins consume the whole window: fall back to the full window
        # size so the modulo below never divides by zero.
        roi_size = cropsize

    remainder = width % roi_size
    return offset, roi_size - remainder + offset, roi_size
23
-
24
-
25
def inference(X_spec, device, model, aggressiveness, data):
    """Run windowed model prediction over a complex spectrogram.

    The magnitude of ``X_spec`` is scaled by its peak value, zero-padded along
    axis 2, and fed to ``model.predict`` one window of ``data["window_size"]``
    frames at a time.  When ``data["tta"]`` is truthy, a second pass over a
    copy shifted by half a region of interest is averaged with the first.

    Args:
        X_spec: Complex 3-D NumPy array; axis 2 is the axis that is windowed.
        device: Torch device each window is moved to before prediction.
        model: Object providing ``eval()``, ``predict(x, aggressiveness)``,
            ``state_dict()`` and an ``offset`` attribute.
        aggressiveness: Passed through unchanged to ``model.predict``.
        data: Config dict; the keys ``"window_size"`` and ``"tta"`` are read.

    Returns:
        A ``(pred, X_mag, phase)`` tuple: the prediction rescaled by the peak
        magnitude, the magnitude of ``X_spec``, and ``exp(1j * angle(X_spec))``.
    """

    def _execute(
        X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half=True
    ):
        """Predict each window of ``X_mag_pad`` and concatenate along axis 2."""
        window_size = data["window_size"]
        preds = []

        model.eval()
        with torch.no_grad():
            for i in tqdm(range(n_window)):
                start = i * roi_size
                # Leading None adds the batch axis expected by the model.
                window = torch.from_numpy(
                    X_mag_pad[None, :, :, start : start + window_size]
                )
                if is_half:
                    window = window.half()

                pred = model.predict(window.to(device), aggressiveness)
                # Drop the batch axis again before collecting.
                preds.append(pred.detach().cpu().numpy()[0])

        return np.concatenate(preds, axis=2)

    X_mag = np.abs(X_spec)
    phase = np.exp(1.0j * np.angle(X_spec))

    coef = X_mag.max()
    # An all-zero (silent) input has a peak of 0; dividing by it would turn
    # every value into NaN, so leave the zeros untouched in that case.
    X_mag_pre = X_mag if coef == 0 else X_mag / coef

    n_frame = X_mag_pre.shape[2]
    pad_l, pad_r, roi_size = make_padding(n_frame, data["window_size"], model.offset)
    n_window = int(np.ceil(n_frame / roi_size))

    X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")

    # Feed half-precision input when the model's first tensor is float16.
    is_half = next(iter(model.state_dict().values())).dtype == torch.float16

    pred = _execute(
        X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half
    )
    pred = pred[:, :, :n_frame]

    if not data["tta"]:
        return pred * coef, X_mag, phase

    # Second pass: shift the windows by half a region of interest, which needs
    # one extra window to cover the same frames.
    half_roi = roi_size // 2
    pad_l += half_roi
    pad_r += half_roi
    n_window += 1

    X_mag_pad = np.pad(X_mag_pre, ((0, 0), (0, 0), (pad_l, pad_r)), mode="constant")

    pred_tta = _execute(
        X_mag_pad, roi_size, n_window, device, model, aggressiveness, is_half
    )
    # Undo the half-ROI shift, then trim to the original frame count.
    pred_tta = pred_tta[:, :, half_roi:]
    pred_tta = pred_tta[:, :, :n_frame]

    return (pred + pred_tta) * 0.5 * coef, X_mag, phase
100
-
101
-
102
def _get_name_params(model_path, model_hash):
    """Look up the parameter set registered for a model.

    The table returned by ``load_data()`` is scanned for entries whose
    ``"hash_name"`` either equals ``model_hash`` or occurs inside
    ``model_path``.  A match in the ``"equivalent"`` group is returned
    immediately; in any other group the scan continues and the last match
    found is returned.

    Args:
        model_path: Path (or name) of the model; searched for ``hash_name``
            substrings.
        model_hash: Hash string compared for equality with ``hash_name``.

    Returns:
        A ``(param_name, model_params)`` tuple taken from the matching entry.

    Raises:
        ValueError: If no entry matches either the hash or the path.
    """
    data = load_data()
    match = None

    for group_name, groups in data.items():
        # Each group is a list whose first element maps a model key to a
        # list of entries.
        for entries in groups[0].values():
            for entry in entries:
                hash_name = str(entry["hash_name"])
                if hash_name != model_hash and hash_name not in model_path:
                    continue

                match = (entry["param_name"], entry["model_params"])
                if group_name == "equivalent":
                    return match

    if match is None:
        # Fail with an explicit message rather than returning unset locals.
        raise ValueError(
            f"No model parameters found for model {model_path!r} "
            f"(hash {model_hash!r})"
        )
    return match