chamecall opened 9 months ago
I guess it's like that:
```python
class MobileNetV1_DecisionLevelMax(nn.Module):
    def __init__(self, sample_rate, window_size, hop_size, mel_bins, fmin, fmax, classes_num):
        super(MobileNetV1_DecisionLevelMax, self).__init__()

        window = 'hann'
        center = True
        pad_mode = 'reflect'
        ref = 1.0
        amin = 1e-10
        top_db = None
        self.interpolate_ratio = 32  # Downsampled ratio

        # Spectrogram extractor
        self.spectrogram_extractor = Spectrogram(n_fft=window_size, hop_length=hop_size,
            win_length=window_size, window=window, center=center, pad_mode=pad_mode,
            freeze_parameters=True)

        # Logmel feature extractor
        self.logmel_extractor = LogmelFilterBank(sr=sample_rate, n_fft=window_size,
            n_mels=mel_bins, fmin=fmin, fmax=fmax, ref=ref, amin=amin, top_db=top_db,
            freeze_parameters=True)

        # Spec augmenter
        self.spec_augmenter = SpecAugmentation(time_drop_width=64, time_stripes_num=2,
            freq_drop_width=8, freq_stripes_num=2)

        self.bn0 = nn.BatchNorm2d(64)

        def conv_bn(inp, oup, stride):
            _layers = [
                nn.Conv2d(inp, oup, 3, 1, 1, bias=False),
                nn.AvgPool2d(stride),
                nn.BatchNorm2d(oup),
                nn.ReLU(inplace=True)
            ]
            _layers = nn.Sequential(*_layers)
            init_layer(_layers[0])
            init_bn(_layers[2])
            return _layers

        def conv_dw(inp, oup, stride):
            _layers = [
                nn.Conv2d(inp, inp, 3, 1, 1, groups=inp, bias=False),
                nn.AvgPool2d(stride),
                nn.BatchNorm2d(inp),
                nn.ReLU(inplace=True),
                nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
                nn.ReLU(inplace=True)
            ]
            _layers = nn.Sequential(*_layers)
            init_layer(_layers[0])
            init_bn(_layers[2])
            init_layer(_layers[4])
            init_bn(_layers[5])
            return _layers

        self.features = nn.Sequential(
            conv_bn(   1,   32, 2),
            conv_dw(  32,   64, 1),
            conv_dw(  64,  128, 2),
            conv_dw( 128,  128, 1),
            conv_dw( 128,  256, 2),
            conv_dw( 256,  256, 1),
            conv_dw( 256,  512, 2),
            conv_dw( 512,  512, 1),
            conv_dw( 512,  512, 1),
            conv_dw( 512,  512, 1),
            conv_dw( 512,  512, 1),
            conv_dw( 512,  512, 1),
            conv_dw( 512, 1024, 2),
            conv_dw(1024, 1024, 1))

        self.fc1 = nn.Linear(1024, 1024, bias=True)
        self.fc_audioset = nn.Linear(1024, classes_num, bias=True)

        self.init_weights()

    def init_weights(self):
        init_bn(self.bn0)
        init_layer(self.fc1)
        init_layer(self.fc_audioset)

    def forward(self, input, mixup_lambda=None):
        """Input: (batch_size, data_length)"""

        x = self.spectrogram_extractor(input)  # (batch_size, 1, time_steps, freq_bins)
        x = self.logmel_extractor(x)           # (batch_size, 1, time_steps, mel_bins)
        frames_num = x.shape[2]

        x = x.transpose(1, 3)
        x = self.bn0(x)
        x = x.transpose(1, 3)

        if self.training:
            x = self.spec_augmenter(x)

        # Mixup on spectrogram
        if self.training and mixup_lambda is not None:
            x = do_mixup(x, mixup_lambda)

        x = self.features(x)
        x = torch.mean(x, dim=3)

        x1 = F.max_pool1d(x, kernel_size=3, stride=1, padding=1)
        x2 = F.avg_pool1d(x, kernel_size=3, stride=1, padding=1)
        # (x1, _) = torch.max(x, dim=2)
        # x2 = torch.mean(x, dim=2)
        x = x1 + x2
        x = F.dropout(x, p=0.5, training=self.training)
        x = x.transpose(1, 2)
        x = F.relu_(self.fc1(x))
        x = F.dropout(x, p=0.5, training=self.training)
        segmentwise_output = torch.sigmoid(self.fc_audioset(x))
        (clipwise_output, _) = torch.max(segmentwise_output, dim=1)

        # Get framewise output
        framewise_output = interpolate(segmentwise_output, self.interpolate_ratio)
        framewise_output = pad_framewise_output(framewise_output, frames_num)

        output_dict = {'framewise_output': framewise_output,
            'clipwise_output': clipwise_output}

        return output_dict
```
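For reference, a minimal sketch of how this module could be exercised, assuming the usual PANNs helpers (`init_layer`, `init_bn`, `do_mixup`, `interpolate`, `pad_framewise_output`) and the torchlibrosa extractors are in scope; the hyperparameters below (32 kHz, 1024-sample window, 320-sample hop, 64 mel bins, 527 classes) are the common AudioSet settings and are my assumption, not something fixed by the snippet:

```python
import torch

# Hypothetical configuration: the usual PANNs AudioSet settings.
model = MobileNetV1_DecisionLevelMax(sample_rate=32000, window_size=1024,
    hop_size=320, mel_bins=64, fmin=50, fmax=14000, classes_num=527)
model.eval()

# 10 seconds of (here random) audio, shape (batch_size, data_length)
waveform = torch.randn(1, 32000 * 10)

with torch.no_grad():
    out = model(waveform)

print(out['clipwise_output'].shape)   # (1, 527): one score per class for the clip
print(out['framewise_output'].shape)  # (1, frames_num, 527): per-frame scores
```

The frame-wise scores come from interpolating the segment-wise sigmoid outputs by `interpolate_ratio` and padding back to `frames_num`, so their time resolution is one STFT hop.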
but yeah, considering that the model wasn't trained for this exact purpose, it doesn't really do better than splitting the raw waveform into fixed-length chunks and predicting clipwise scores for each of them.
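For comparison, here is a rough sketch of that chunk-based baseline (my own illustration, not code from this repo): split the waveform into fixed-length chunks, run any clipwise PANNs-style model on the batch of chunks, and stack the per-chunk scores. The `chunked_clipwise_scores` helper and the 1-second chunk length are arbitrary choices for the example:

```python
import torch
import torch.nn.functional as F

def chunked_clipwise_scores(model, waveform, sample_rate=32000, chunk_seconds=1.0):
    """Split a 1-D waveform into fixed-length chunks and score each one clipwise.

    Assumes `model(batch)` returns a dict with 'clipwise_output' of shape
    (batch, classes_num), as the clipwise PANNs models do.
    """
    chunk_len = int(chunk_seconds * sample_rate)

    # Zero-pad the tail so the last chunk has full length
    pad = (-len(waveform)) % chunk_len
    waveform = F.pad(waveform, (0, pad))

    chunks = waveform.view(-1, chunk_len)          # (num_chunks, chunk_len)

    model.eval()
    with torch.no_grad():
        scores = model(chunks)['clipwise_output']  # (num_chunks, classes_num)
    return scores
```

The time resolution of this baseline is limited by the chunk length, whereas the frame-wise head interpolates down to the STFT hop, so in practice the comparison comes down to how well the frame-wise outputs behave for a model that wasn't trained with frame-level supervision.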
Hi, do you know in which work these frame-wise architectures are explained?