Commit 4479fab9 authored by Shreyan Chowdhury

Initial commit

__pycache__/*
results/runs*
.idea*
from utils import *
from torch.utils.data import Dataset, DataLoader


class MelSpecDataset(Dataset):
    def __init__(self, phase='train', ann_root=None, spec_root=None, length=MAX_FRAMES):
        assert ann_root is not None, "ann_root (root directory containing annotation files) required"
        assert spec_root is not None, "spec_root (root directory of spectrograms) required"
        assert phase in ['train', 'validation', 'test'], \
            f'phase should be train, validation, or test; given: {phase}'
        self.ann_root = ann_root
        self.spec_root = spec_root
        self.length = length

        xy = pd.read_csv(os.path.join(self.ann_root, f'{phase}_processed.tsv'), sep='\t')
        self.len = len(xy)
        # Extract the path of the mel spectrogram from the audio file name by replacing '.mp3' with '.npy'
        self.x_path = xy.PATH.str.split(pat='.').str[0] + '.npy'
        # Multiple tags for a song are separated by commas. Split them into elements of a list.
        self.y_tags = xy.TAGS.str.split(pat=',')
        self.tagslist = self._get_unique_tags_list_()

    def __getitem__(self, index):
        """
        Returns a framed mel spectrogram and its multi-hot encoded labels.
        """
        x_melspec = np.load(os.path.join(self.spec_root, self.x_path[index]))
        reqd_len = self.length
        spec_len = len(x_melspec)
        # Trim long spectrograms to the required length; wrap-pad short ones.
        x_trimmed = x_melspec[:reqd_len] if spec_len > reqd_len else \
            np.pad(x_melspec, ((0, reqd_len - spec_len), (0, 0), (0, 0)), mode='wrap')
        y = self.y_tags[index]
        # Convert tags to a multi-hot encoded vector over the full tag vocabulary.
        y_multihot = np.array([int(i in y) for i in self.tagslist])
        return x_trimmed, y_multihot

    def __len__(self):
        return self.len

    def _get_unique_tags_list_(self, path='./tagslist.npy'):
        try:
            tagslist = np.load(path)
            assert np.array_equal(tagslist, self._make_unique_tags_list_(self.y_tags)), \
                "cached tag list does not match the tags in the annotation file"
        except FileNotFoundError:
            tagslist = self._make_unique_tags_list_(self.y_tags)
        except Exception as e:
            logger.error(e)
            return np.array([])
        return tagslist

    def _make_unique_tags_list_(self, labels, saveto='./tagslist'):
        labelslist = []
        for label in labels:
            labelslist.extend(label)
        tagslist = np.sort(np.unique(np.array(labelslist)))
        if saveto is not None:
            np.save(saveto, tagslist)
        return tagslist


if __name__ == '__main__':
    # Tests
    torch.manual_seed(6)
    dataset = MelSpecDataset(phase='train', ann_root=PATH_ANNOTATIONS,
                             spec_root=PATH_MELSPEC_DOWNLOADED_FRAMED)
    train_loader = DataLoader(dataset=dataset, batch_size=32, shuffle=True)
    for i, data in enumerate(train_loader, 0):
        spec, labels = data
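
# A minimal numpy sketch of the framing and multi-hot encoding used above
# (shapes and tag names are made up for illustration):
#   spec = np.random.rand(200, 80, 1)                  # 200 frames, shorter than reqd_len
#   padded = np.pad(spec, ((0, 313 - 200), (0, 0), (0, 0)), mode='wrap')
#   padded.shape                                       # -> (313, 80, 1); short clips repeat from the start
#   tagslist = np.array(['ambient', 'happy', 'rock'])
#   y = ['rock', 'happy']
#   np.array([int(t in y) for t in tagslist])          # -> array([0, 1, 1])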
from utils import *
from pytorch_lightning import Trainer
from test_tube import Experiment
from models.vgg_basic import MultiTagger


def run():
    logger.info(CURR_RUN_PATH)
    exp = Experiment(save_dir=CURR_RUN_PATH)
    if USE_GPU:
        trainer = Trainer(gpus=[0], distributed_backend='ddp',
                          experiment=exp, max_nb_epochs=10, train_percent_check=1.0,
                          fast_dev_run=False)
    else:
        # CPU fallback: one short epoch on a fraction of the data for quick debugging
        trainer = Trainer(experiment=exp, max_nb_epochs=1, train_percent_check=0.1,
                          fast_dev_run=True)
    model = MultiTagger(num_tags=56)
    trainer.fit(model)


if __name__ == '__main__':
    run()
# coding: utf-8
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
from librosa.filters import mel as librosa_mel_fn


def initialize_weights(module):
    if isinstance(module, nn.Conv2d):
        nn.init.kaiming_normal_(module.weight.data, mode='fan_in', nonlinearity="relu")
        # nn.init.kaiming_normal_(module.weight.data, mode='fan_out')
    elif isinstance(module, nn.BatchNorm2d):
        module.weight.data.fill_(1)
        module.bias.data.zero_()
    elif isinstance(module, nn.Linear):
        module.bias.data.zero_()


layer_index_total = 0


def initialize_weights_fixup(module):
    if isinstance(module, AttentionAvg):
        print("AttentionAvg init..")
        module.forw_conv[0].weight.data.zero_()
        module.atten[0].bias.data.zero_()
        nn.init.kaiming_normal_(module.atten[0].weight.data, mode='fan_in', nonlinearity="sigmoid")
    if isinstance(module, BasicBlock):
        # He init, rescaled by the Fixup multiplier
        b = module
        n = b.conv1.kernel_size[0] * b.conv1.kernel_size[1] * b.conv1.out_channels
        print(b.layer_index, math.sqrt(2. / n), layer_index_total ** (-0.5))
        b.conv1.weight.data.normal_(0, (layer_index_total ** (-0.5)) * math.sqrt(2. / n))
        b.conv2.weight.data.zero_()
        if b.shortcut._modules.get('conv') is not None:
            convShortcut = b.shortcut._modules.get('conv')
            n = convShortcut.kernel_size[0] * convShortcut.kernel_size[1] * convShortcut.out_channels
            convShortcut.weight.data.normal_(0, math.sqrt(2. / n))
    if isinstance(module, nn.Conv2d):
        pass
        # nn.init.kaiming_normal_(module.weight.data, mode='fan_in', nonlinearity="relu")
        # nn.init.kaiming_normal_(module.weight.data, mode='fan_out')
    elif isinstance(module, nn.BatchNorm2d):
        module.weight.data.fill_(1)
        module.bias.data.zero_()
    elif isinstance(module, nn.Linear):
        module.bias.data.zero_()
first_RUN = True


def calc_padding(kernal):
    try:
        return kernal // 3
    except TypeError:
        return [k // 3 for k in kernal]
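
# Quick sketch of calc_padding on the kernel shapes used below (it accepts an
# int or an iterable of ints):
#   calc_padding(3)       -> 1        # keeps H/W for a 3x3 conv at stride 1
#   calc_padding(1)       -> 0
#   calc_padding((3, 1))  -> [1, 0]   # per-dimension padding for rectangular kernels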
class AttentionAvg(nn.Module):
    def __init__(self, in_channels, out_channels, sum_all=True):
        super(AttentionAvg, self).__init__()
        self.sum_dims = [2, 3]
        if sum_all:
            self.sum_dims = [1, 2, 3]
        self.forw_conv = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(out_channels)
        )
        self.atten = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=True),
            nn.Sigmoid()
        )

    def forward(self, x):
        a1 = self.forw_conv(x)
        atten = self.atten(x)
        num = atten.size(2) * atten.size(3)
        asum = atten.sum(dim=self.sum_dims, keepdim=True) + 1e-8
        return a1 * atten * num / asum
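
# The forward pass above is attention-weighted averaging: per-location sigmoid
# weights scale the 1x1-conv features, and multiplying by num = H*W while
# dividing by asum renormalizes the weights, so the global average pool applied
# afterwards (see Network below) roughly behaves like a weighted spatial mean.
# Shape sketch with arbitrary sizes:
#   pool = AttentionAvg(in_channels=128, out_channels=10, sum_all=True)
#   pool(torch.randn(2, 128, 6, 4)).shape   # -> torch.Size([2, 10, 6, 4])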
class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, in_channels, out_channels, stride, k1=3, k2=3):
        super(BasicBlock, self).__init__()
        global layer_index_total
        self.layer_index = layer_index_total
        layer_index_total = layer_index_total + 1
        self.conv1 = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=k1,
            stride=stride,  # downsample with first conv
            padding=calc_padding(k1),
            bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(
            out_channels,
            out_channels,
            kernel_size=k2,
            stride=1,
            padding=calc_padding(k2),
            bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.shortcut = nn.Sequential()
        if in_channels != out_channels:
            self.shortcut.add_module(
                'conv',
                nn.Conv2d(
                    in_channels,
                    out_channels,
                    kernel_size=1,
                    stride=stride,  # downsample
                    padding=0,
                    bias=False))
            self.shortcut.add_module('bn', nn.BatchNorm2d(out_channels))  # BN

    def forward(self, x):
        y = F.relu(self.bn1(self.conv1(x)), inplace=True)
        y = self.bn2(self.conv2(y))
        y += self.shortcut(x)
        y = F.relu(y, inplace=True)  # apply ReLU after addition
        return y
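
# Usage sketch (sizes arbitrary): with matching channels the shortcut is the
# identity; changing the channel count adds a 1x1 conv + BN projection.
#   blk = BasicBlock(64, 128, stride=1)
#   blk(torch.randn(1, 64, 32, 32)).shape   # -> torch.Size([1, 128, 32, 32])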
class BottleneckBlock(nn.Module):
    expansion = 4

    def __init__(self, in_channels, out_channels, stride):
        super(BottleneckBlock, self).__init__()
        bottleneck_channels = out_channels // self.expansion
        self.conv1 = nn.Conv2d(
            in_channels,
            bottleneck_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=False)
        self.bn1 = nn.BatchNorm2d(bottleneck_channels)
        self.conv2 = nn.Conv2d(
            bottleneck_channels,
            bottleneck_channels,
            kernel_size=3,
            stride=stride,  # downsample with 3x3 conv
            padding=1,
            bias=False)
        self.bn2 = nn.BatchNorm2d(bottleneck_channels)
        self.conv3 = nn.Conv2d(
            bottleneck_channels,
            out_channels,
            kernel_size=1,
            stride=1,
            padding=0,
            bias=False)
        self.bn3 = nn.BatchNorm2d(out_channels)
        self.shortcut = nn.Sequential()  # identity
        if in_channels != out_channels:
            self.shortcut.add_module(
                'conv',
                nn.Conv2d(
                    in_channels,
                    out_channels,
                    kernel_size=1,
                    stride=stride,  # downsample
                    padding=0,
                    bias=False))
            self.shortcut.add_module('bn', nn.BatchNorm2d(out_channels))  # BN

    def forward(self, x):
        y = F.relu(self.bn1(self.conv1(x)), inplace=True)
        y = F.relu(self.bn2(self.conv2(y)), inplace=True)
        y = self.bn3(self.conv3(y))  # no ReLU before the residual addition
        y += self.shortcut(x)
        y = F.relu(y, inplace=True)  # apply ReLU after addition
        return y
class Network(nn.Module):
    def __init__(self, config):
        super(Network, self).__init__()
        input_shape = config['input_shape']
        n_classes = config['n_classes']
        base_channels = config['base_channels']
        block_type = config['block_type']
        depth = config['depth']
        self.pooling_padding = config.get("pooling_padding", 0) or 0
        self.use_raw_spectograms = config.get("use_raw_spectograms") or False
        assert block_type in ['basic', 'bottleneck']
        if self.use_raw_spectograms:
            mel_basis = librosa_mel_fn(22050, 2048, 256)
            mel_basis = torch.from_numpy(mel_basis).float()
            self.register_buffer('mel_basis', mel_basis)
        if block_type == 'basic':
            block = BasicBlock
            n_blocks_per_stage = (depth - 2) // 6
            assert n_blocks_per_stage * 6 + 2 == depth
        else:
            block = BottleneckBlock
            n_blocks_per_stage = (depth - 2) // 9
            assert n_blocks_per_stage * 9 + 2 == depth
        n_blocks_per_stage = [n_blocks_per_stage, n_blocks_per_stage, n_blocks_per_stage]
        if config.get("n_blocks_per_stage") is not None:
            n_blocks_per_stage = config.get("n_blocks_per_stage")
        n_channels = config.get("n_channels")
        if n_channels is None:
            n_channels = [
                base_channels,
                base_channels * 2 * block.expansion,
                base_channels * 4 * block.expansion
            ]
        self.in_c = nn.Sequential(
            nn.Conv2d(
                input_shape[1],
                n_channels[0],
                kernel_size=5,
                stride=2,
                padding=1,
                bias=False),
            nn.BatchNorm2d(n_channels[0]),
            nn.ReLU(True)
        )
        self.stage1 = self._make_stage(
            n_channels[0], n_channels[0], n_blocks_per_stage[0], block, stride=1,
            maxpool=config['stage1']['maxpool'], k1s=config['stage1']['k1s'], k2s=config['stage1']['k2s'])
        self.stage2 = self._make_stage(
            n_channels[0], n_channels[1], n_blocks_per_stage[1], block, stride=1,
            maxpool=config['stage2']['maxpool'], k1s=config['stage2']['k1s'], k2s=config['stage2']['k2s'])
        self.stage3 = self._make_stage(
            n_channels[1], n_channels[2], n_blocks_per_stage[2], block, stride=1,
            maxpool=config['stage3']['maxpool'], k1s=config['stage3']['k1s'], k2s=config['stage3']['k2s'])
        ff_list = []
        if config.get("attention_avg"):
            if config.get("attention_avg") == "sum_all":
                ff_list.append(AttentionAvg(n_channels[2], n_classes, sum_all=True))
            else:
                ff_list.append(AttentionAvg(n_channels[2], n_classes, sum_all=False))
        else:
            ff_list += [
                nn.Conv2d(
                    n_channels[2],
                    n_classes,
                    kernel_size=1,
                    stride=1,
                    padding=0,
                    bias=False),
                nn.BatchNorm2d(n_classes),
            ]
        self.stop_before_global_avg_pooling = False
        if config.get("stop_before_global_avg_pooling"):
            self.stop_before_global_avg_pooling = True
        else:
            ff_list.append(nn.AdaptiveAvgPool2d((1, 1)))
        self.feed_forward = nn.Sequential(*ff_list)
        # # compute conv feature size
        # with torch.no_grad():
        #     self.feature_size = self._forward_conv(
        #         torch.zeros(*input_shape)).view(-1).shape[0]
        #
        # self.fc = nn.Linear(self.feature_size, n_classes)
        # initialize weights
        if config.get("weight_init") == "fixup":
            self.apply(initialize_weights)
            if isinstance(self.feed_forward[0], nn.Conv2d):
                self.feed_forward[0].weight.data.zero_()
            self.apply(initialize_weights_fixup)
        else:
            self.apply(initialize_weights)
    def _make_stage(self, in_channels, out_channels, n_blocks, block, stride, maxpool=set(),
                    k1s=[3, 3, 3, 3, 3, 3], k2s=[3, 3, 3, 3, 3, 3]):
        stage = nn.Sequential()
        if 0 in maxpool:
            stage.add_module("maxpool{}_{}".format(0, 0),
                             nn.MaxPool2d(2, 2, padding=self.pooling_padding))
        for index in range(n_blocks):
            stage.add_module('block{}'.format(index + 1),
                             block(in_channels,
                                   out_channels,
                                   stride=stride, k1=k1s[index], k2=k2s[index]))
            in_channels = out_channels
            stride = 1
            # add a 2x2 max-pool after every block position listed in maxpool
            for m_i, mp_pos in enumerate(maxpool):
                if index + 1 == mp_pos:
                    stage.add_module("maxpool{}_{}".format(index + 1, m_i),
                                     nn.MaxPool2d(2, 2, padding=self.pooling_padding))
        return stage
    def _forward_conv(self, x):
        global first_RUN
        if first_RUN: print("x:", x.size())
        x = self.in_c(x)
        if first_RUN: print("in_c:", x.size())
        x = self.stage1(x)
        if first_RUN: print("stage1:", x.size())
        x = self.stage2(x)
        if first_RUN: print("stage2:", x.size())
        x = self.stage3(x)
        if first_RUN: print("stage3:", x.size())
        return x

    def forward(self, x):
        global first_RUN
        if self.use_raw_spectograms:
            if first_RUN: print("raw_x:", x.size())
            x = torch.log10(torch.sqrt((x * x).sum(dim=3)))
            if first_RUN: print("log10_x:", x.size())
            x = torch.matmul(self.mel_basis, x)
            if first_RUN: print("mel_basis_x:", x.size())
            x = x.unsqueeze(1)
        x = self._forward_conv(x)
        x = self.feed_forward(x)
        if first_RUN: print("feed_forward:", x.size())
        if self.stop_before_global_avg_pooling:
            first_RUN = False
            return x
        logit = x.squeeze(2).squeeze(2)
        if first_RUN: print("logit:", logit.size())
        first_RUN = False
        return logit
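
# A hedged example of the config dict this class expects, inferred from
# __init__ above (the key names are real; the values are illustrative, not
# taken from the repo):
#   config = {
#       'input_shape': (1, 1, 128, 313),   # (batch, channels, n_mels, frames) -- assumed layout
#       'n_classes': 56,
#       'base_channels': 32,
#       'block_type': 'basic',
#       'depth': 26,                       # (26 - 2) // 6 = 4 BasicBlocks per stage
#       'stage1': {'maxpool': [1, 2], 'k1s': [3] * 4, 'k2s': [3] * 4},
#       'stage2': {'maxpool': [], 'k1s': [3] * 4, 'k2s': [3] * 4},
#       'stage3': {'maxpool': [], 'k1s': [3] * 4, 'k2s': [3] * 4},
#   }
#   net = Network(config)
#   net(torch.zeros(2, 1, 128, 313)).shape   # -> torch.Size([2, 56])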
from utils import *
from datasets import MelSpecDataset
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import pytorch_lightning as pl
from sklearn.metrics import roc_auc_score


def initialize_weights(module):
    if isinstance(module, nn.Conv2d):
        nn.init.kaiming_normal_(module.weight.data, mode='fan_in', nonlinearity="relu")
    elif isinstance(module, nn.BatchNorm2d):
        module.weight.data.fill_(1)
        module.bias.data.zero_()
    elif isinstance(module, nn.Linear):
        module.bias.data.zero_()
class ModelMidlevelJointSerial(nn.Module):
    def __init__(self, num_targets, initialize=True):
        super(ModelMidlevelJointSerial, self).__init__()
        self.num_targets = num_targets
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 64, 5, 2, 2),  # (in_channels, out_channels, kernel_size, stride, padding)
            nn.BatchNorm2d(64),
            nn.ReLU()
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(64, 64, 3, 1, 1),
            nn.BatchNorm2d(64),
            nn.ReLU()
        )
        self.mp2x2_dropout = nn.Sequential(
            nn.MaxPool2d(2),
            nn.Dropout2d(0.3)
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(64, 128, 3, 1, 1),
            nn.BatchNorm2d(128),
            nn.ReLU()
        )
        self.conv4 = nn.Sequential(
            nn.Conv2d(128, 128, 3, 1, 1),
            nn.BatchNorm2d(128),
            nn.ReLU()
        )
        self.conv5 = nn.Sequential(
            nn.Conv2d(128, 256, 3, 1, 1),
            nn.BatchNorm2d(256),
            nn.ReLU()
        )
        self.conv6 = nn.Sequential(
            nn.Conv2d(256, 256, 3, 1, 1),
            nn.BatchNorm2d(256),
            nn.ReLU()
        )
        self.conv7 = nn.Sequential(
            nn.Conv2d(256, 384, 3, 1, 1),
            nn.BatchNorm2d(384),
            nn.ReLU()
        )
        self.conv7b = nn.Sequential(
            nn.Conv2d(384, 512, 3, 1, 1),
            nn.BatchNorm2d(512),
            nn.ReLU()
        )
        self.conv11 = nn.Sequential(
            nn.Conv2d(512, 256, 1, 1, 0),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.AdaptiveAvgPool2d((1, 1))
        )
        self.fc_ml = nn.Linear(256, 7)
        self.fc_ml2emo = nn.Linear(7, self.num_targets)
        if initialize:
            self.apply(initialize_weights)

    def forward(self, x):
        # input: 313 * 149 * 1
        x = self.conv1(x)          # 157 * 75 * 64
        x = self.conv2(x)          # 157 * 75 * 64
        x = self.mp2x2_dropout(x)  # 78 * 37 * 64
        x = self.conv3(x)          # 78 * 37 * 128
        x = self.conv4(x)          # 78 * 37 * 128
        x = self.mp2x2_dropout(x)  # 39 * 18 * 128
        x = self.conv5(x)          # 39 * 18 * 256
        x = self.conv6(x)          # 39 * 18 * 256
        x = self.conv7(x)          # 39 * 18 * 384
        x = self.conv7b(x)         # 39 * 18 * 512
        x = self.conv11(x)         # 1 * 1 * 256 after the adaptive average pool
        x = x.view(x.size(0), -1)
        ml = self.fc_ml(x)         # 7 mid-level features
        emo = self.fc_ml2emo(ml)   # num_targets emotion predictions
        return [ml, emo]
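
# Usage sketch (sizes illustrative): the serial head predicts emotions only
# through the 7-dim mid-level bottleneck, so emo is a linear map of ml.
#   model = ModelMidlevelJointSerial(num_targets=8)   # 8 is a made-up target count
#   ml, emo = model(torch.randn(4, 1, 313, 149))      # ml: (4, 7), emo: (4, 8)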
class ModelMidlevelJointParallel(nn.Module):
    def __init__(self, num_targets, initialize=True):
        super(ModelMidlevelJointParallel, self).__init__()
        self.num_targets = num_targets
        self.conv1 = nn.Sequential(
            nn.Conv2d(1, 64, 5, 2, 2),  # (in_channels, out_channels, kernel_size, stride, padding)
            nn.BatchNorm2d(64),
            nn.ReLU()
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(64, 64, 3, 1, 1),
            nn.BatchNorm2d(64),
            nn.ReLU()
        )
        self.mp2x2_dropout = nn.Sequential(
            nn.MaxPool2d(2),
            nn.Dropout2d(0.3)
        )
        self.conv3 = nn.Sequential(
            nn.Conv2d(64, 128, 3, 1, 1),
            nn.BatchNorm2d(128),
            nn.ReLU()