#!/usr/bin/env python
import numpy as np

import theano

import lasagne
from lasagne.layers.dnn import Conv2DDNNLayer as Conv2DLayer
from lasagne.layers.dnn import batch_norm_dnn as batch_norm
from lasagne.layers.dnn import MaxPool2DDNNLayer as MaxPool2DLayer


from lasagne.layers import DropoutLayer, FlattenLayer, ReshapeLayer, GRULayer, ConcatLayer

from piano_transcription.data.data_pools import BatchIterator
from piano_transcription.layers.gradient import GradientClipLayer

INI_LEARNING_RATE = np.float32(0.001)

BATCH_SIZE = 8
MAX_EPOCHS = 1000
PATIENCE = 6
L2 = None

RNN_GRAD_CLIP = 50
FF_GRAD_CLIP = 50

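# input/output dimensions: SPEC_BINS spectrogram bins per frame,
# OUT_LEN targets (88 piano keys), SPEC_CONTEXT frames of spectrogram
# context around each prediction step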
SPEC_BINS = 168
OUT_LEN = 88

SPEC_CONTEXT = 13
STEP_SIZE = 10
SEQ_LENGTH = 100
CENTRAL_TARGET = True

N_HIDDEN = 60

init_conv = lasagne.init.HeNormal

MAX_PRED_SIZE = 100
USE_BATCHED_PREDICT = True


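# factories for the data pools: train and valid iterators differ only in
# the prepare function applied to each mini-batch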
def get_valid_batch_iterator():
    def batch_iterator(batch_size, k_samples, shuffle):
        return BatchIterator(batch_size=batch_size, prepare=prepare, k_samples=k_samples, shuffle=shuffle)
    return batch_iterator


def get_train_batch_iterator():
    def batch_iterator(batch_size, k_samples, shuffle):
        return BatchIterator(batch_size=batch_size, prepare=prepare_train, k_samples=k_samples, shuffle=shuffle)
    return batch_iterator


def predict(pfun, X, max_seq_len, out_len):
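    """Run the compiled prediction function `pfun` over a full spectrogram
    X of shape (seq_len, feat_len) and return frame-wise note activations
    of shape (seq_len, out_len)."""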
    seq_len, feat_len = X.shape

    n_batches = int(np.ceil(seq_len / float(MAX_PRED_SIZE)))
    if USE_BATCHED_PREDICT:
        padded_len = n_batches * MAX_PRED_SIZE
    else:
        padded_len = max_seq_len

    m_b = np.ones((1, seq_len), dtype=theano.config.floatX)
    x_b_p = np.pad(X, ((0, padded_len - seq_len), (0, 0)), 'constant')
    m_b = np.pad(m_b, ((0, 0), (0, padded_len - seq_len)), 'constant')

    pad_width = (SPEC_CONTEXT - 1) // 2
    x_b_p = np.pad(x_b_p, ((pad_width, pad_width), (0, 0)), 'constant')

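    # slide a SPEC_CONTEXT-frame window over the time-padded spectrogram so
    # that every output frame gets a centered excerpt as conv input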
    step_size = 1
    indices = np.arange(pad_width, x_b_p.shape[0] - pad_width, step_size).astype(int)
    n_seq_pred = len(indices)
    shape = [1, n_seq_pred, 1, SPEC_CONTEXT, feat_len]
    X_pred = np.zeros(shape, dtype=theano.config.floatX)
    for o_idx, x_idx in enumerate(indices):
        X_pred[0, o_idx, 0, :, :] = x_b_p[x_idx - pad_width:x_idx + pad_width + 1, :]

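    # batched predict: run the fixed-size compiled graph over consecutive
    # MAX_PRED_SIZE chunks instead of compiling one graph per sequence length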
    if USE_BATCHED_PREDICT:
        p_b = np.zeros((seq_len, out_len))
        for batch in range(n_batches):
            i0 = batch * MAX_PRED_SIZE
            i1 = i0 + MAX_PRED_SIZE
            i1o = min(i1, seq_len)
            p_b[i0:i1o] = pfun([X_pred[:, i0:i1], m_b[:, i0:i1]])[0, 0:(i1o-i0)]
    else:
        p_b = pfun([X_pred, m_b])[0, :seq_len]

    return p_b


def prepare(x, y):
    x = x[:, :, np.newaxis]
    y = np.squeeze(y)
    m = np.ones((x.shape[0], x.shape[1]), dtype=theano.config.floatX)  # mask over (batch, seq): the RNN comes after the conv stack and reshape, so it needs one entry per frame
    return x, m, y


def prepare_train(x, y):
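    # currently identical to `prepare`; kept as a separate hook so that
    # train-time-only preprocessing (e.g. augmentation) can be slotted in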
    x, m, y = prepare(x, y)
    return x, m, y


def build_eval_model(max_seq_len, feat_len, out_len):
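    """Build the net used at prediction time: a fixed-size graph applied in
    MAX_PRED_SIZE chunks if USE_BATCHED_PREDICT, otherwise a graph sized to
    the longest sequence."""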
    if USE_BATCHED_PREDICT:
        return build_model(batch_size=1, seq_length=MAX_PRED_SIZE, feat_len=feat_len, out_len=out_len)
    else:
        return build_model(batch_size=1, seq_length=max_seq_len, feat_len=feat_len, out_len=out_len)


def build_model(batch_size=BATCH_SIZE, seq_length=SEQ_LENGTH, feat_len=SPEC_BINS, out_len=OUT_LEN):
    """ Compile net architecture """
    nonlin = lasagne.nonlinearities.rectify

    # --- input layers ---
    l_in = lasagne.layers.InputLayer(shape=(batch_size, seq_length, 1, SPEC_CONTEXT, feat_len))
    l_mask = lasagne.layers.InputLayer(shape=(batch_size, seq_length))
    net = l_in

    # --- reshape for convolutions ---
    net = ReshapeLayer(net, shape=(batch_size * seq_length, 1, SPEC_CONTEXT, feat_len))

    # --- conv layers ---
    net = Conv2DLayer(net, num_filters=32, filter_size=3, stride=1, pad=0, W=init_conv(), nonlinearity=nonlin)
    net = batch_norm(net)
    net = Conv2DLayer(net, num_filters=32, filter_size=3, stride=1, pad=0, W=init_conv(), nonlinearity=nonlin)
    net = batch_norm(net)
    net = MaxPool2DLayer(net, pool_size=(1, 3))
    net = DropoutLayer(net, p=0.3)

    net = Conv2DLayer(net, num_filters=64, filter_size=3, stride=1, pad=0, W=init_conv(), nonlinearity=nonlin)
    net = batch_norm(net)
    # net = Conv2DLayer(net, num_filters=64, filter_size=3, stride=1, pad=0, W=init_conv(), nonlinearity=nonlin)
    # net = batch_norm(net)
    net = MaxPool2DLayer(net, pool_size=(1, 3))
    net = DropoutLayer(net, p=0.3)
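
    # with the default sizes (SPEC_CONTEXT=13, SPEC_BINS=168) the conv stack
    # leaves feature maps of shape (batch*seq, 64, 7, 17), i.e. a
    # 7616-dimensional feature vector per frame after flattening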

    # --- reshape to rnn format ---
    net = FlattenLayer(net)
    net = ReshapeLayer(net, (batch_size, seq_length, -1))

    # --- rnn part ---
    l_forward1 = GRULayer(net, N_HIDDEN, mask_input=l_mask, grad_clipping=RNN_GRAD_CLIP)
    l_backward1 = GRULayer(net, N_HIDDEN, mask_input=l_mask, grad_clipping=RNN_GRAD_CLIP, backwards=True)
    l_concat1 = ConcatLayer([l_forward1, l_backward1], axis=2)

    l_forward2 = GRULayer(l_concat1, N_HIDDEN, mask_input=l_mask, grad_clipping=RNN_GRAD_CLIP)
    l_backward2 = GRULayer(l_concat1, N_HIDDEN, mask_input=l_mask, grad_clipping=RNN_GRAD_CLIP, backwards=True)
    l_concat2 = ConcatLayer([l_forward2, l_backward2], axis=2)

    l_forward3 = GRULayer(l_concat2, N_HIDDEN, mask_input=l_mask, grad_clipping=RNN_GRAD_CLIP)
    l_backward3 = GRULayer(l_concat2, N_HIDDEN, mask_input=l_mask, grad_clipping=RNN_GRAD_CLIP, backwards=True)
    l_concat3 = ConcatLayer([l_forward3, l_backward3], axis=2)

    # --- dense layers ---
    net = ReshapeLayer(l_concat3, (batch_size * seq_length, 2 * N_HIDDEN))
    net = DropoutLayer(net, p=0.3)
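    # the dense layer itself is linear; the sigmoid is applied inside
    # GradientClipLayer, which (judging by its grad_clipping argument)
    # also clips the backward gradients at FF_GRAD_CLIP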
    net = lasagne.layers.DenseLayer(net, num_units=out_len, nonlinearity=lasagne.nonlinearities.linear)
    net = GradientClipLayer(net, grad_clipping=FF_GRAD_CLIP, nonlinearity=lasagne.nonlinearities.sigmoid)

    net = ReshapeLayer(net, (batch_size, seq_length, out_len))

    return net
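

# ---------------------------------------------------------------------------
# Usage sketch (an illustration, not part of the original training pipeline):
# compile a prediction function matching the `pfun(inputs)` calling
# convention expected by `predict` above.  The dummy `spec` array and the way
# the two InputLayers are recovered from the graph are assumptions made for
# this example.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    net = build_eval_model(max_seq_len=MAX_PRED_SIZE, feat_len=SPEC_BINS, out_len=OUT_LEN)

    # recover the data and mask input layers (identified by their ndim)
    in_layers = [l for l in lasagne.layers.get_all_layers(net)
                 if isinstance(l, lasagne.layers.InputLayer)]
    l_data = [l for l in in_layers if len(l.shape) == 5][0]
    l_mask = [l for l in in_layers if len(l.shape) == 2][0]

    y_hat = lasagne.layers.get_output(net, deterministic=True)
    pred_fn = theano.function([l_data.input_var, l_mask.input_var], y_hat)
    pfun = lambda inputs: pred_fn(*inputs)

    spec = np.zeros((250, SPEC_BINS), dtype=theano.config.floatX)  # dummy input
    activations = predict(pfun, spec, max_seq_len=spec.shape[0], out_len=OUT_LEN)
    print(activations.shape)  # -> (250, 88)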