Commit fb6183fc authored by Shreyan Chowdhury's avatar Shreyan Chowdhury
Browse files

implement crnn, make BasePtlModel base class

parent 4def4edd
import os
import getpass
hostname = os.uname()[1]
username = getpass.getuser()
if hostname == '':
path_data_cache = '/media/rk1/shared/kofta_cached_datasets'
# midlevel
path_midlevel_annotations_dir = '/media/rk1/shared/datasets/midlevel/metadata_annotations'
path_midlevel_annotations = '/media/rk1/shared/datasets/midlevel/metadata_annotations/annotations.csv'
path_midlevel_audio_dir = '/media/rk1/shared/datasets/midlevel/audio'
# mtgjamendo
path_mtgjamendo_annotations_dir = '/media/rk1/shared/datasets/MTG-Jamendo/MTG-Jamendo_annotations'
path_mtgjamendo_annotations_train = '/media/rk1/shared/datasets/MTG-Jamendo/MTG-Jamendo_annotations/train_processed.tsv'
path_mtgjamendo_annotations_val = '/media/rk1/shared/datasets/MTG-Jamendo/MTG-Jamendo_annotations/val_processed.tsv'
path_mtgjamendo_annotations_test = '/media/rk1/shared/datasets/MTG-Jamendo/MTG-Jamendo_annotations/test_processed.tsv'
path_mtgjamendo_audio_dir = '/media/rk1/shared/datasets/MTG-Jamendo/MTG-Jamendo_audio'
if hostname == '':
path_data_cache = '/media/rk2/shared/kofta_cached_datasets'
# midlevel
path_midlevel_annotations_dir = '/media/rk2/shared/datasets/midlevel/metadata_annotations'
path_midlevel_annotations = '/media/rk2/shared/datasets/midlevel/metadata_annotations/annotations.csv'
path_midlevel_audio_dir = '/media/rk2/shared/datasets/midlevel/audio'
# mtgjamendo
path_mtgjamendo_annotations_dir = '/media/rk2/shared/datasets/MTG-Jamendo/MTG-Jamendo_annotations'
path_mtgjamendo_annotations_train = '/media/rk2/shared/datasets/MTG-Jamendo/MTG-Jamendo_annotations/train_processed.tsv'
path_mtgjamendo_annotations_val = '/media/rk2/shared/datasets/MTG-Jamendo/MTG-Jamendo_annotations/validation_processed.tsv'
path_mtgjamendo_annotations_test = '/media/rk2/shared/datasets/MTG-Jamendo/MTG-Jamendo_annotations/test_processed.tsv'
path_mtgjamendo_audio_dir = '/media/rk2/shared/datasets/MTG-Jamendo/MTG-Jamendo_audio'
if hostname == '':
path_data_cache = '/media/rk3/shared/kofta_cached_datasets'
# midlevel
path_midlevel_annotations_dir = '/media/rk3/shared/datasets/midlevel/metadata_annotations'
path_midlevel_annotations = '/media/rk3/shared/datasets/midlevel/metadata_annotations/annotations.csv'
path_midlevel_audio_dir = '/media/rk3/shared/datasets/midlevel/audio'
# mtgjamendo
path_mtgjamendo_annotations_dir = '/media/rk3/shared/datasets/MTG-Jamendo/MTG-Jamendo_annotations'
path_mtgjamendo_annotations_train = '/media/rk3/shared/datasets/MTG-Jamendo/MTG-Jamendo_annotations/train_processed.tsv'
path_mtgjamendo_annotations_val = '/media/rk3/shared/datasets/MTG-Jamendo/MTG-Jamendo_annotations/validation_processed.tsv'
path_mtgjamendo_annotations_test = '/media/rk3/shared/datasets/MTG-Jamendo/MTG-Jamendo_annotations/test_processed.tsv'
path_mtgjamendo_audio_dir = '/media/rk3/shared/datasets/MTG-Jamendo/MTG-Jamendo_audio'
if hostname == 'shreyan-All-Series':
path_data_cache = '/mnt/2tb/datasets/data_caches'
# midlevel
path_midlevel_annotations_dir = '/mnt/2tb/datasets/MidlevelFeatures/metadata_annotations'
path_midlevel_annotations = '/mnt/2tb/datasets/MidlevelFeatures/metadata_annotations/annotations.csv'
path_midlevel_audio_dir = '/mnt/2tb/datasets/MidlevelFeatures/audio'
# mtgjamendo
path_mtgjamendo_annotations_dir = '/mnt/2tb/datasets/MTG-Jamendo/MTG-Jamendo_annotations'
path_mtgjamendo_annotations_train = '/mnt/2tb/datasets/MTG-Jamendo/MTG-Jamendo_annotations/train_processed.tsv'
path_mtgjamendo_annotations_val = '/mnt/2tb/datasets/MTG-Jamendo/MTG-Jamendo_annotations/validation_processed.tsv'
path_mtgjamendo_annotations_test = '/mnt/2tb/datasets/MTG-Jamendo/MTG-Jamendo_annotations/test_processed.tsv'
path_mtgjamendo_audio_dir = '/mnt/2tb/datasets/MTG-Jamendo/MTG-Jamendo_audio'
if hostname not in ['', '', '', 'shreyan-All-Series']:
raise Exception(f"Paths not defined for {hostname}")
\ No newline at end of file
...@@ -83,8 +83,9 @@ def run(hparams): ...@@ -83,8 +83,9 @@ def run(hparams):
experiment=exp, max_nb_epochs=config['epochs'], train_percent_check=hparams.train_percent, experiment=exp, max_nb_epochs=config['epochs'], train_percent_check=hparams.train_percent,
fast_dev_run=False, fast_dev_run=False,
early_stop_callback=early_stop, early_stop_callback=early_stop,
checkpoint_callback=checkpoint_callback checkpoint_callback=checkpoint_callback,
) nb_sanity_val_steps=0) # don't run sanity validation run
else: else:
trainer = Trainer(experiment=exp, max_nb_epochs=1, train_percent_check=0.01, trainer = Trainer(experiment=exp, max_nb_epochs=1, train_percent_check=0.01,
fast_dev_run=True) fast_dev_run=True)
from utils import USE_GPU, init_experiment
from pytorch_lightning import Trainer
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from test_tube import Experiment, HyperOptArgumentParser
from models.crnn import CRNN as Network
import os
model_config = {
'validation_metrics':['rocauc', 'prauc'],
'test_metrics':['rocauc', 'prauc']
initialized = False # TODO: Find a better way to do this
def run(hparams):
if not initialized:
from utils import CURR_RUN_PATH, logger # import these after init_experiment"tensorboard --logdir={CURR_RUN_PATH}")
exp = Experiment(save_dir=CURR_RUN_PATH)
# callbacks
early_stop = EarlyStopping(
checkpoint_callback = ModelCheckpoint(
filepath=os.path.join(CURR_RUN_PATH, 'best.ckpt'),
trainer = Trainer(gpus=[0], distributed_backend=None,
experiment=exp, max_nb_epochs=hparams.max_epochs,
fast_dev_run=False, early_stop_callback=early_stop,
nb_sanity_val_steps=0) # don't run sanity validation run
trainer = Trainer(experiment=exp, max_nb_epochs=1, train_percent_check=0.1,
model = Network(num_class=56, config=model_config, hparams=hparams)
if __name__=='__main__':
parent_parser = HyperOptArgumentParser(strategy='grid_search', add_help=False)
parent_parser.add_argument('--experiment_name', type=str,
default='pt_lightning_exp_a', help='test tube exp name')
parent_parser.add_argument('--train_percent', type=float,
default=1.0, help='how much train data to use')
parent_parser.add_argument('--max_epochs', type=int,
default=10, help='maximum number of epochs')
#parent_parser.add_argument('--gpus', type=list, default=[0,1],
# help='how many gpus to use in the node.'
# ' value -1 uses all the gpus on the node')
parser = Network.add_model_specific_args(parent_parser)
hyperparams = parser.parse_args()
# run(hyperparams)
#gpus = ['cuda:0', 'cuda:1']
#hyperparams.optimize_parallel_gpu(run, gpus, 5)
for hparam_trial in hyperparams.trials(20):
...@@ -47,25 +47,9 @@ def pretrain_midlevel(hparams): ...@@ -47,25 +47,9 @@ def pretrain_midlevel(hparams):"tensorboard --logdir={CURR_RUN_PATH}")"tensorboard --logdir={CURR_RUN_PATH}")
exp = Experiment(name='midlevel', save_dir=CURR_RUN_PATH) exp = Experiment(name='midlevel', save_dir=CURR_RUN_PATH)
def setup_config(): midlevel_configs()
def print_config():
global config
st = '---------CONFIG--------\n'
for k in config.keys():
st += k+':'+str(config.get(k))+'\n'
return st
conf = hparams.config
if conf is not None:
conf_func = globals()[conf]
logger.error(f"Config {conf} not defined")
early_stop = EarlyStopping( early_stop = EarlyStopping(
monitor=config['earlystopping_metric'], monitor=config['earlystopping_metric'],
import math
import sys
import time
import numpy as np
import wave
import scipy
import scipy.signal
from pylab import *
import array
import os
from os.path import expanduser
# Author: Brian K. Vogel
def hz_to_mel(f_hz):
"""Convert Hz to mel scale.
This uses the formula from O'Shaugnessy's book.
f_hz (float): The value in Hz.
The value in mels.
return 2595*np.log10(1.0 + f_hz/700.0)
def mel_to_hz(m_mel):
"""Convert mel scale to Hz.
This uses the formula from O'Shaugnessy's book.
m_mel (float): The value in mels
The value in Hz
return 700*(10**(m_mel/2595) - 1.0)
def fft_bin_to_hz(n_bin, sample_rate_hz, fft_size):
"""Convert FFT bin index to frequency in Hz.
n_bin (int or float): The FFT bin index.
sample_rate_hz (int or float): The sample rate in Hz.
fft_size (int or float): The FFT size.
The value in Hz.
n_bin = float(n_bin)
sample_rate_hz = float(sample_rate_hz)
fft_size = float(fft_size)
return n_bin*sample_rate_hz/(2.0*fft_size)
def hz_to_fft_bin(f_hz, sample_rate_hz, fft_size):
"""Convert frequency in Hz to FFT bin index.
f_hz (int or float): The frequency in Hz.
sample_rate_hz (int or float): The sample rate in Hz.
fft_size (int or float): The FFT size.
The FFT bin index as an int.
f_hz = float(f_hz)
sample_rate_hz = float(sample_rate_hz)
fft_size = float(fft_size)
fft_bin = int(np.round((f_hz*2.0*fft_size/sample_rate_hz)))
if fft_bin >= fft_size:
fft_bin = fft_size-1
return fft_bin
def make_mel_filterbank(min_freq_hz, max_freq_hz, mel_bin_count,
linear_bin_count, sample_rate_hz):
"""Create a mel filterbank matrix.
Create and return a mel filterbank matrix `filterbank` of shape (`mel_bin_count`,
`linear_bin_couont`). The `filterbank` matrix can be used to transform a
(linear scale) spectrum or spectrogram into a mel scale spectrum or
spectrogram as follows:
`mel_scale_spectrum` = `filterbank`*'linear_scale_spectrum'
where linear_scale_spectrum' is a shape (`linear_bin_count`, `m`) and
`mel_scale_spectrum` is shape ('mel_bin_count', `m`) where `m` is the number
of spectral time slices.
Likewise, the reverse-direction transform can be performed as:
'linear_scale_spectrum' = filterbank.T`*`mel_scale_spectrum`
Note that the process of converting to mel scale and then back to linear
scale is lossy.
This function computes the mel-spaced filters such that each filter is triangular
(in linear frequency) with response 1 at the center frequency and decreases linearly
to 0 upon reaching an adjacent filter's center frequency. Note that any two adjacent
filters will overlap having a response of 0.5 at the mean frequency of their
respective center frequencies.
min_freq_hz (float): The frequency in Hz corresponding to the lowest
mel scale bin.
max_freq_hz (flloat): The frequency in Hz corresponding to the highest
mel scale bin.
mel_bin_count (int): The number of mel scale bins.
linear_bin_count (int): The number of linear scale (fft) bins.
sample_rate_hz (float): The sample rate in Hz.
The mel filterbank matrix as an 2-dim Numpy array.
min_mels = hz_to_mel(min_freq_hz)
max_mels = hz_to_mel(max_freq_hz)
# Create mel_bin_count linearly spaced values between these extreme mel values.
mel_lin_spaced = np.linspace(min_mels, max_mels, num=mel_bin_count)
# Map each of these mel values back into linear frequency (Hz).
center_frequencies_hz = np.array([mel_to_hz(n) for n in mel_lin_spaced])
mels_per_bin = float(max_mels - min_mels)/float(mel_bin_count - 1)
mels_start = min_mels - mels_per_bin
hz_start = mel_to_hz(mels_start)
fft_bin_start = hz_to_fft_bin(hz_start, sample_rate_hz, linear_bin_count)
#print('fft_bin_start: ', fft_bin_start)
mels_end = max_mels + mels_per_bin
hz_stop = mel_to_hz(mels_end)
fft_bin_stop = hz_to_fft_bin(hz_stop, sample_rate_hz, linear_bin_count)
#print('fft_bin_stop: ', fft_bin_stop)
# Map each center frequency to the closest fft bin index.
linear_bin_indices = np.array([hz_to_fft_bin(f_hz, sample_rate_hz, linear_bin_count) for f_hz in center_frequencies_hz])
# Create filterbank matrix.
filterbank = np.zeros((mel_bin_count, linear_bin_count))
for mel_bin in range(mel_bin_count):
center_freq_linear_bin = linear_bin_indices[mel_bin]
# Create a triangular filter having the current center freq.
# The filter will start with 0 response at left_bin (if it exists)
# and ramp up to 1.0 at center_freq_linear_bin, and then ramp
# back down to 0 response at right_bin (if it exists).
# Create the left side of the triangular filter that ramps up
# from 0 to a response of 1 at the center frequency.
if center_freq_linear_bin > 1:
# It is possible to create the left triangular filter.
if mel_bin == 0:
# Since this is the first center frequency, the left side
# must start ramping up from linear bin 0 or 1 mel bin before the center freq.
left_bin = max(0, fft_bin_start)
# Start ramping up from the previous center frequency bin.
left_bin = linear_bin_indices[mel_bin - 1]
for f_bin in range(left_bin, center_freq_linear_bin+1):
if (center_freq_linear_bin - left_bin) > 0:
response = float(f_bin - left_bin)/float(center_freq_linear_bin - left_bin)
filterbank[mel_bin, f_bin] = response
# Create the right side of the triangular filter that ramps down
# from 1 to 0.
if center_freq_linear_bin < linear_bin_count-2:
# It is possible to create the right triangular filter.
if mel_bin == mel_bin_count - 1:
# Since this is the last mel bin, we must ramp down to response of 0
# at the last linear freq bin.
right_bin = min(linear_bin_count - 1, fft_bin_stop)
right_bin = linear_bin_indices[mel_bin + 1]
for f_bin in range(center_freq_linear_bin, right_bin+1):
if (right_bin - center_freq_linear_bin) > 0:
response = float(right_bin - f_bin)/float(right_bin - center_freq_linear_bin)
filterbank[mel_bin, f_bin] = response
filterbank[mel_bin, center_freq_linear_bin] = 1.0
return filterbank
def stft_for_reconstruction(x, fft_size, hopsamp):
"""Compute and return the STFT of the supplied time domain signal x.
x (1-dim Numpy array): A time domain signal.
fft_size (int): FFT size. Should be a power of 2, otherwise DFT will be used.
hopsamp (int):
The STFT. The rows are the time slices and columns are the frequency bins.
window = np.hanning(fft_size)
fft_size = int(fft_size)
hopsamp = int(hopsamp)
return np.array([np.fft.rfft(window*x[i:i+fft_size])
for i in range(0, len(x)-fft_size, hopsamp)])
def istft_for_reconstruction(X, fft_size, hopsamp):
"""Invert a STFT into a time domain signal.
X (2-dim Numpy array): Input spectrogram. The rows are the time slices and columns are the frequency bins.
fft_size (int):
hopsamp (int): The hop size, in samples.
The inverse STFT.
fft_size = int(fft_size)
hopsamp = int(hopsamp)
window = np.hanning(fft_size)
time_slices = X.shape[0]
len_samples = int(time_slices*hopsamp + fft_size)
x = np.zeros(len_samples)
for n,i in enumerate(range(0, len(x)-fft_size, hopsamp)):
x[i:i+fft_size] += window*np.real(np.fft.irfft(X[n]))
return x
def get_signal(in_file, expected_fs=44100):
"""Load a wav file.
If the file contains more than one channel, return a mono file by taking
the mean of all channels.
If the sample rate differs from the expected sample rate (default is 44100 Hz),
raise an exception.
in_file: The input wav file, which should have a sample rate of `expected_fs`.
expected_fs (int): The expected sample rate of the input wav file.
The audio siganl as a 1-dim Numpy array. The values will be in the range [-1.0, 1.0]. fixme ( not yet)
fs, y =
num_type = y[0].dtype
if num_type == 'int16':
y = y*(1.0/32768)
elif num_type == 'int32':
y = y*(1.0/2147483648)
elif num_type == 'float32':
# Nothing to do
elif num_type == 'uint8':
raise Exception('8-bit PCM is not supported.')
raise Exception('Unknown format.')
if fs != expected_fs:
raise Exception('Invalid sample rate.')
if y.ndim == 1:
return y
return y.mean(axis=1)
def reconstruct_signal_griffin_lim(magnitude_spectrogram, fft_size, hopsamp, iterations):
"""Reconstruct an audio signal from a magnitude spectrogram.
Given a magnitude spectrogram as input, reconstruct
the audio signal and return it using the Griffin-Lim algorithm from the paper:
"Signal estimation from modified short-time fourier transform" by Griffin and Lim,
in IEEE transactions on Acoustics, Speech, and Signal Processing. Vol ASSP-32, No. 2, April 1984.
magnitude_spectrogram (2-dim Numpy array): The magnitude spectrogram. The rows correspond to the time slices
and the columns correspond to frequency bins.
fft_size (int): The FFT size, which should be a power of 2.
hopsamp (int): The hope size in samples.
iterations (int): Number of iterations for the Griffin-Lim algorithm. Typically a few hundred
is sufficient.
The reconstructed time domain signal as a 1-dim Numpy array.
time_slices = magnitude_spectrogram.shape[0]
len_samples = int(time_slices*hopsamp + fft_size)
# Initialize the reconstructed signal to noise.
x_reconstruct = np.random.randn(len_samples)
n = iterations # number of iterations of Griffin-Lim algorithm.
while n > 0:
n -= 1
reconstruction_spectrogram = stft_for_reconstruction(x_reconstruct, fft_size, hopsamp)
reconstruction_angle = np.angle(reconstruction_spectrogram)
# Discard magnitude part of the reconstruction and use the supplied magnitude spectrogram instead.
proposal_spectrogram = magnitude_spectrogram*np.exp(1.0j*reconstruction_angle)
prev_x = x_reconstruct
x_reconstruct = istft_for_reconstruction(proposal_spectrogram, fft_size, hopsamp)
diff = sqrt(sum((x_reconstruct - prev_x)**2)/x_reconstruct.size)
print('Reconstruction iteration: {}/{} RMSE: {} '.format(iterations - n, iterations, diff))
return x_reconstruct
def save_audio_to_file(x, sample_rate, outfile='out.wav'):
"""Save a mono signal to a file.
x (1-dim Numpy array): The audio signal to save. The signal values should be in the range [-1.0, 1.0].
sample_rate (int): The sample rate of the signal, in Hz.
outfile: Name of the file to save.
x_max = np.max(abs(x))
assert x_max <= 1.0, 'Input audio value is out of range. Should be in the range [-1.0, 1.0].'
x = x*32767.0
data = array.array('h')
for i in range(len(x)):
cur_samp = int(round(x[i]))
f =, 'w')
f.setparams((1, 2, sample_rate, 0, "NONE", "Uncompressed"))
\ No newline at end of file
from utils import * from utils import *
import pytorch_lightning as pl import pytorch_lightning as pl
from models.shared_stuff import * from models.shared_stuff import *
from sklearn import metrics from sklearn import metrics
# TODO pr-auc # TODO pr-auc
...@@ -87,7 +85,7 @@ class CNN(pl.LightningModule): ...@@ -87,7 +85,7 @@ class CNN(pl.LightningModule):
def training_step(self, data_batch, batch_nb): def training_step(self, data_batch, batch_nb):
x, _, y = data_batch x, _, y = data_batch
y_hat = self.forward_full_song(x, y) y_hat = self.forward(x)
y = y.float() y = y.float()
y_hat = y_hat.float() y_hat = y_hat.float()
return {'loss':self.my_loss(y_hat, y)} return {'loss':self.my_loss(y_hat, y)}
from test_tube import HyperOptArgumentParser
from utils import *
from models.shared_stuff import BasePtlModel
class CRNN(BasePtlModel):
def __init__(self, config, num_class, hparams):
super(CRNN, self).__init__(config, hparams)
# init bn
self.bn_init = nn.BatchNorm2d(1)
# layer 1
self.conv_1 = nn.Conv2d(1, 64, 3, padding=1)
self.bn_1 = nn.BatchNorm2d(64)
self.mp_1 = nn.MaxPool2d((2, 4))
# layer 2
self.conv_2 = nn.Conv2d(64, 128, 3, padding=1)
self.bn_2 = nn.BatchNorm2d(128)
self.mp_2 = nn.MaxPool2d((2, 4))
# layer 3
self.conv_3 = nn.Conv2d(128, 128, 3, padding=1)
self.bn_3 = nn.BatchNorm2d(128)
self.mp_3 = nn.MaxPool2d((2, 4))
# layer 4
self.conv_4 = nn.Conv2d(128, 128, 3, padding=1)
self.bn_4 = nn.BatchNorm2d(128)
self.mp_4 = nn.MaxPool2d((3, 5))
# layer 5
self.conv_5 = nn.Conv2d(128, 64, 3, padding=1)
self.bn_5 = nn.BatchNorm2d(64)
self.mp_5 = nn.MaxPool2d((4, 4))
# recurrent layer
self.gru1 = nn.GRU(input_size=32,
# classifier
self.dense = nn.Linear(hparams.gru_hidden_size, num_class)
self.dropout = nn.Dropout(self.hparams.drop_prob)
def forward(self, x):
x = x[:, :, :, :512]
# init bn
x = self.bn_init(x)
# layer 1
x = self.mp_1(nn.ELU()(self.bn_1(self.conv_1(x))))
# layer 2
x = nn.ELU()(self.bn_2(self.conv_2(x)))
# x = self.mp_2(nn.ELU()(self.bn_2(self.conv_2(x))))
# layer 3
x = self.mp_3(nn.ELU()(self.bn_3(self.conv_3(x))))
# layer 4
x = self.mp_4(nn.ELU()(self.bn_4(self.conv_4(x))))
# layer 5
x = self.mp_5(nn.ELU()(self.bn_5(self.conv_5(x))))
# classifier
x = x.view(-1, x.size(0), 32)
x = self.gru1(x)[1][1] # TODO: Check if this is correct
x = self.dropout(x)
logit = nn.Sigmoid()(self.dense(x))
return logit
def add_model_specific_args(parent_parser):
"""Parameters defined here will be available to your model through self.hparams
parser = HyperOptArgumentParser(strategy=parent_parser.strategy, parents=[parent_parser])
# network params
parser.add_argument('--gru_hidden_size', default=320, type=int)
parser.add_argument('--gru_num_layers', default=2, type=int)
parser.opt_list('--drop_prob', default=0.2, options=[0.2, 0.5], type=float, tunable=True)
parser.opt_list('--learning_rate', default=0.0001, type=float,
options=[0.00001, 0.0005, 0.001],
# training params (opt)
parser.opt_list('--optimizer_name', default='adam', type=str,
options=['adam'], tunable=False)
# if using 2 nodes with 4 gpus each the batch size here
# (256) will be 256 / (2*8) = 16 per gpu
parser.opt_list('--batch_size', default=32, type=int,
options=[32, 64, 128, 256], tunable=False,
help='batch size will be divided over all gpus being used across all nodes')
return parser
from datasets.midlevel import df_get_midlevel_set
from torch import optim
from import random_split
import torch