spectrogram_processors.py 5.75 KB
Newer Older
Shreyan Chowdhury's avatar
Shreyan Chowdhury committed
1
from utils import *
Shreyan Chowdhury's avatar
Shreyan Chowdhury committed
2
import h5py
Shreyan Chowdhury's avatar
Shreyan Chowdhury committed
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92

def trim_silence(spec, thresh=0.1):
    """
    Trim silence from the beginning and end of a song spectrogram.

    Loudness per time frame is the sum of magnitudes over the frequency
    axis; after subtracting the minimum, frames are kept from the first to
    the last frame whose loudness exceeds ``thresh`` times the median.

    Parameters
    ----------
    spec : np.ndarray
        2D spectrogram of shape (n_freq_bins, n_time_frames).
    thresh : float
        Fraction of the median loudness used as the silence cutoff.

    Returns
    -------
    np.ndarray
        The trimmed spectrogram (a slice of ``spec``); the full input when
        no frame exceeds the cutoff (e.g. constant loudness).
    """
    loudness = np.sum(spec, axis=0)
    loudness = loudness - np.min(loudness)
    cutoff = thresh * np.median(loudness)

    above = np.nonzero(loudness > cutoff)[0]
    if above.size == 0:
        # Nothing exceeds the cutoff; keep everything rather than return
        # an empty spectrogram.
        return spec
    start = above[0]
    # +1 so the last frame above the cutoff is included — the original
    # `end = i` made the exclusive slice end drop that frame.
    end = above[-1] + 1
    return spec[:, start:end]


def make_framed_spec(spec, frame_length, total_frames=None,
                hop=0.5, discard_end=False, filler='wrap'):
    """
    Split a full-song spectrogram into fixed-length frames.

    Parameters
    ----------
    spec : np.ndarray
        2D spectrogram of shape (n_freq_bins, n_time_frames).
    frame_length : int
        Width (in time frames) of each output frame.
    total_frames : int or None
        If given, truncate or extend the frame list to exactly this count.
    hop : float
        Hop size as a fraction of ``frame_length`` (0.5 = 50% overlap).
    discard_end : bool
        If True, drop the trailing partial window; otherwise emit a final
        frame aligned to the end of the song (overlapping the previous one).
    filler : str
        'wrap' repeats frames from the start; 'pad' appends silent frames.

    Returns
    -------
    torch.Tensor
        Stacked frames of shape (n_frames, n_freq_bins, frame_length),
        each directly feedable to a model input of matching size.
    """
    assert filler in ('wrap', 'pad'), f"filler is {filler}, must be either wrap or pad"
    frame_length = int(frame_length)
    hop_length = int(hop * frame_length)
    fstart = 0
    fend = frame_length
    framed_spec = []

    while fend < spec.shape[1]:
        framed_spec.append(spec[:, fstart:fend])
        fstart += hop_length
        fend = fstart + frame_length
    if not discard_end:
        # Final frame aligned to the song's end instead of zero-padding.
        framed_spec.append(spec[:, -frame_length:])

    if total_frames is not None:
        if len(framed_spec) > total_frames:
            framed_spec = framed_spec[:total_frames]
        elif filler == 'wrap':
            # Repeat frames from the beginning until the count is reached.
            while len(framed_spec) < total_frames:
                framed_spec.extend(framed_spec[0:total_frames - len(framed_spec)])
        else:
            # Pad with silent frames. The original used list.extend() on a
            # 2D array, which appended frequency *rows* instead of whole
            # frames and produced a ragged list; append() adds one silent
            # frame per iteration.
            silence = np.zeros((spec.shape[0], frame_length))
            while len(framed_spec) < total_frames:
                framed_spec.append(silence)

    return torch.from_numpy(np.array(framed_spec))


def preprocess_specs(source_root, destination_root, frame_length=256, hop=1.0):
    """
    Read spectrograms (.npy) under source_root, apply trim_silence() and
    make_framed_spec(), and save the framed results under destination_root,
    mirroring the last directory component of each source subdirectory.

    Files whose framed version already exists are skipped, so the function
    is safe to re-run incrementally.
    """
    os.makedirs(destination_root, exist_ok=True)

    for dirpath, _, filenames in os.walk(source_root):
        # Skip the output directory in case it lives inside source_root.
        # The original compared with `is`, an identity check that is almost
        # never true for two distinct path strings, so the guard never fired.
        if os.path.abspath(dirpath) == os.path.abspath(destination_root):
            continue

        destination_subdir = os.path.join(destination_root, os.path.basename(dirpath))
        for filename in tqdm(filenames):
            destination = os.path.join(destination_subdir, filename)
            if os.path.exists(destination):
                # Framed melspec already cached; skip the work.
                continue
            os.makedirs(destination_subdir, exist_ok=True)
            spec = np.load(os.path.join(dirpath, filename))
            spec = trim_silence(spec)
            framed_spec = make_framed_spec(spec, frame_length=frame_length, hop=hop)
            np.save(destination, framed_spec)
Shreyan Chowdhury's avatar
Shreyan Chowdhury committed
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117


def hdf_cache_specs(source_root, destination_root, annotations_file='all', output_filename=None):
    """
    Build an HDF5 cache of silence-trimmed spectrograms plus one-hot labels.

    Parameters
    ----------
    source_root : str
        Root directory holding the .npy mel spectrograms.
    destination_root : str
        Directory where the .h5 file is written (created if missing).
    annotations_file : str
        Path to a tab-separated annotations file, or 'all' to concatenate
        the processed train/validation/test annotation files.
    output_filename : str or None
        Name of the HDF5 file; defaults to 'dataset.h5'.
    """
    if annotations_file == 'all':
        splits = ['train_processed.tsv', 'validation_processed.tsv', 'test_processed.tsv']
        frames = [pd.read_csv(os.path.join(PATH_ANNOTATIONS, s), sep='\t') for s in splits]
        annotations = pd.concat(frames, axis=0, ignore_index=True)
    else:
        annotations = pd.read_csv(annotations_file, sep='\t')

    os.makedirs(destination_root, exist_ok=True)

    if output_filename is None:
        output_filename = 'dataset.h5'

    hdf_filepath = os.path.join(destination_root, output_filename)

    # Fixed tag vocabulary that defines the one-hot label ordering.
    tagslist = np.load(os.path.join(PATH_PROJECT_ROOT, 'tagslist.npy'))

    with h5py.File(hdf_filepath, 'w') as hdf:
        # NOTE(review): the original iterated annotations.index[:10] — a
        # debug leftover that cached only the first 10 songs. Iterate the
        # full annotation set.
        for idx in tqdm(annotations.index):
            filename = annotations.PATH.iloc[idx].split('.')[0]  # discard '.mp3' extension
            labels_str = annotations.TAGS.iloc[idx]  # labels as a single string
            labels_onehot = np.array([int(tag in labels_str) for tag in tagslist])  # one-hot encoding
            filepath = os.path.join(source_root, filename + '.npy')  # melspectrogram stored in .npy format
            spec = np.load(filepath)
            spec = trim_silence(spec)
            # Group name is the track file name without its subdirectory
            # prefix (assumes paths of the form 'subdir/track_id' — TODO
            # confirm against the annotation files).
            song = hdf.create_group(filename.split('/')[1])
            song.create_dataset('data', data=spec)
            song.create_dataset('label', data=labels_onehot)


if __name__ == '__main__':
    # Build one HDF5 cache per dataset split (train / validation / test).
    cache_dir = os.path.join(PATH_DATA_ROOT, 'HDF5Cache_spectrograms')
    splits = [
        ('train_processed.tsv', 'train.h5'),
        ('validation_processed.tsv', 'val.h5'),
        ('test_processed.tsv', 'test.h5'),
    ]
    for annotations_name, h5_name in splits:
        hdf_cache_specs(PATH_MELSPEC_DOWNLOADED, cache_dir,
                        annotations_file=os.path.join(PATH_ANNOTATIONS, annotations_name),
                        output_filename=h5_name)