#!/usr/bin/env python
import numpy as np
import theano
import lasagne

from lasagne.layers.dnn import Conv2DDNNLayer as Conv2DLayer
from lasagne.layers.dnn import batch_norm_dnn as batch_norm
from lasagne.layers.dnn import MaxPool2DDNNLayer as MaxPool2DLayer
from lasagne.layers import DropoutLayer, FlattenLayer, ReshapeLayer, GRULayer, ConcatLayer

from piano_transcription.data.data_pools import BatchIterator
from piano_transcription.layers.gradient import GradientClipLayer


INI_LEARNING_RATE = np.float32(0.001)

BATCH_SIZE = 8
MAX_EPOCHS = 1000
PATIENCE = 6
L2 = None

RNN_GRAD_CLIP = 50
FF_GRAD_CLIP = 50

SPEC_BINS = 168
OUT_LEN = 88

SPEC_CONTEXT = 13
STEP_SIZE = 10
SEQ_LENGTH = 100
CENTRAL_TARGET = True

N_HIDDEN = 60

init_conv = lasagne.init.HeNormal

MAX_PRED_SIZE = 100
USE_BATCHED_PREDICT = True


def get_valid_batch_iterator():
    def batch_iterator(batch_size, k_samples, shuffle):
        return BatchIterator(batch_size=batch_size, prepare=prepare,
                             k_samples=k_samples, shuffle=shuffle)
    return batch_iterator


def get_train_batch_iterator():
    def batch_iterator(batch_size, k_samples, shuffle):
        return BatchIterator(batch_size=batch_size, prepare=prepare_train,
                             k_samples=k_samples, shuffle=shuffle)
    return batch_iterator


def predict(pfun, X, max_seq_len, out_len):
    """ Predict frame-wise targets for a full spectrogram sequence. """
    seq_len, feat_len = X.shape

    # pad the sequence so it splits into fixed-size prediction chunks
    n_batches = int(np.ceil(seq_len / float(MAX_PRED_SIZE)))
    if USE_BATCHED_PREDICT:
        padded_len = n_batches * MAX_PRED_SIZE
    else:
        padded_len = max_seq_len

    m_b = np.ones((1, seq_len), dtype=theano.config.floatX)
    x_b_p = np.pad(X, ((0, padded_len - seq_len), (0, 0)), 'constant')
    m_b = np.pad(m_b, ((0, 0), (0, padded_len - seq_len)), 'constant')

    # pad along time so every frame gets a full SPEC_CONTEXT window
    pad_width = (SPEC_CONTEXT - 1) / 2
    x_b_p = np.pad(x_b_p, ((pad_width, pad_width), (0, 0)), 'constant')

    # cut out one context window per frame
    step_size = 1
    indices = np.arange(pad_width, x_b_p.shape[0] - pad_width, step_size).astype(np.int)
    n_seq_pred = len(indices)
    shape = [1, n_seq_pred, 1, SPEC_CONTEXT, feat_len]
    X_pred = np.zeros(shape, dtype=theano.config.floatX)
    for o_idx, x_idx in enumerate(indices):
        X_pred[0, o_idx, 0, :, :] = x_b_p[x_idx - pad_width:x_idx + pad_width + 1, :]

    if USE_BATCHED_PREDICT:
        # predict in chunks of MAX_PRED_SIZE frames and stitch the results
        p_b = np.zeros((seq_len, out_len))
        for batch in xrange(n_batches):
            i0 = batch * MAX_PRED_SIZE
            i1 = i0 + MAX_PRED_SIZE
            i1o = min(i1, seq_len)
            p_b[i0:i1o] = pfun([X_pred[:, i0:i1], m_b[:, i0:i1]])[0, 0:(i1o - i0)]
    else:
        p_b = pfun([X_pred, m_b])[0, :seq_len]

    return p_b


def prepare(x, y):
    """ Prepare a batch: add channel axis to x, squeeze y, build the mask. """
    x = x[:, :, np.newaxis]
    y = np.squeeze(y)
    m = np.ones((x.shape[0], x.shape[1]), dtype=theano.config.floatX)  # x batch size, and x feat length -> rnn is after conv and reshape...
    return x, m, y


def prepare_train(x, y):
    x, m, y = prepare(x, y)
    return x, m, y


def build_eval_model(max_seq_len, feat_len, out_len):
    if USE_BATCHED_PREDICT:
        return build_model(batch_size=1, seq_length=MAX_PRED_SIZE, feat_len=feat_len, out_len=out_len)
    else:
        return build_model(batch_size=1, seq_length=max_seq_len, feat_len=feat_len, out_len=out_len)


def build_model(batch_size=BATCH_SIZE, seq_length=SEQ_LENGTH, feat_len=SPEC_BINS, out_len=OUT_LEN):
    """ Compile net architecture """
    nonlin = lasagne.nonlinearities.rectify

    # --- input layers ---
    l_in = lasagne.layers.InputLayer(shape=(batch_size, seq_length, 1, SPEC_CONTEXT, feat_len))
    l_mask = lasagne.layers.InputLayer(shape=(batch_size, seq_length))
    net = l_in

    # --- reshape for convolutions ---
    net = ReshapeLayer(net, shape=(batch_size * seq_length, 1, SPEC_CONTEXT, feat_len))

    # --- conv layers ---
    net = Conv2DLayer(net, num_filters=32, filter_size=3, stride=1, pad=0, W=init_conv(), nonlinearity=nonlin)
    net = batch_norm(net)
    net = Conv2DLayer(net, num_filters=32, filter_size=3, stride=1, pad=0, W=init_conv(), nonlinearity=nonlin)
    net = batch_norm(net)
    net = MaxPool2DLayer(net, pool_size=(1, 3))
    net = DropoutLayer(net, p=0.3)

    net = Conv2DLayer(net, num_filters=64, filter_size=3, stride=1, pad=0, W=init_conv(), nonlinearity=nonlin)
    net = batch_norm(net)
    # net = Conv2DLayer(net, num_filters=64, filter_size=3, stride=1, pad=0, W=init_conv(), nonlinearity=nonlin)
    # net = batch_norm(net)
    net = MaxPool2DLayer(net, pool_size=(1, 3))
    net = DropoutLayer(net, p=0.3)

    # --- reshape to rnn format ---
    net = FlattenLayer(net)
    net = ReshapeLayer(net, (batch_size, seq_length, -1))

    # --- rnn part ---
    l_forward1 = GRULayer(net, N_HIDDEN, mask_input=l_mask, grad_clipping=RNN_GRAD_CLIP)
    l_backward1 = GRULayer(net, N_HIDDEN, mask_input=l_mask, grad_clipping=RNN_GRAD_CLIP, backwards=True)
    l_concat1 = ConcatLayer([l_forward1, l_backward1], axis=2)

    l_forward2 = GRULayer(l_concat1, N_HIDDEN, mask_input=l_mask, grad_clipping=RNN_GRAD_CLIP)
    l_backward2 = GRULayer(l_concat1, N_HIDDEN, mask_input=l_mask, grad_clipping=RNN_GRAD_CLIP, backwards=True)
    l_concat2 = ConcatLayer([l_forward2, l_backward2], axis=2)

    l_forward3 = GRULayer(l_concat2, N_HIDDEN, mask_input=l_mask, grad_clipping=RNN_GRAD_CLIP)
    l_backward3 = GRULayer(l_concat2, N_HIDDEN, mask_input=l_mask, grad_clipping=RNN_GRAD_CLIP, backwards=True)
    l_concat3 = ConcatLayer([l_forward3, l_backward3], axis=2)

    # --- dense layers ---
    net = lasagne.layers.ReshapeLayer(l_concat3, (batch_size * seq_length, 2 * N_HIDDEN))
    net = DropoutLayer(net, p=0.3)
    net = lasagne.layers.DenseLayer(net, num_units=out_len, nonlinearity=lasagne.nonlinearities.linear)
    net = GradientClipLayer(net, grad_clipping=FF_GRAD_CLIP, nonlinearity=lasagne.nonlinearities.sigmoid)
    net = lasagne.layers.ReshapeLayer(net, (batch_size, seq_length, out_len))

    return net
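

# --------------------------------------------------------------------------- #
# Illustrative sketch only, not the project's training code (that lives
# elsewhere in the package). It shows one way the network returned by
# build_model() could be compiled into Theano train/predict functions; the
# masked binary cross-entropy loss and the adam update rule are assumptions
# made for this example.
# --------------------------------------------------------------------------- #
if __name__ == '__main__':
    import theano.tensor as T

    net = build_model()

    # recover the two InputLayers created inside build_model():
    # the 5-d spectrogram input and the 2-d sequence mask
    in_layers = [l for l in lasagne.layers.get_all_layers(net)
                 if isinstance(l, lasagne.layers.InputLayer)]
    l_in = [l for l in in_layers if len(l.shape) == 5][0]
    l_mask = [l for l in in_layers if len(l.shape) == 2][0]

    y = T.tensor3('y')  # targets, shape (batch_size, seq_length, OUT_LEN)

    # frame-wise sigmoid outputs; padded frames are masked out of the loss
    pred = lasagne.layers.get_output(net, deterministic=False)
    loss = lasagne.objectives.binary_crossentropy(pred, y)
    loss = (loss.mean(axis=-1) * l_mask.input_var).sum() / l_mask.input_var.sum()

    params = lasagne.layers.get_all_params(net, trainable=True)
    updates = lasagne.updates.adam(loss, params, learning_rate=INI_LEARNING_RATE)

    train_fn = theano.function([l_in.input_var, l_mask.input_var, y],
                               loss, updates=updates)

    # deterministic forward pass, wrapped to match the list-argument
    # convention expected by predict(pfun, ...) above; for real inference one
    # would compile the graph from build_eval_model(...) (batch_size 1) instead
    proba = lasagne.layers.get_output(net, deterministic=True)
    proba_fn = theano.function([l_in.input_var, l_mask.input_var], proba)
    pfun = lambda inputs: proba_fn(*inputs)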