Commit bea8173f authored by Paul Primus

fix issue with different file lengths: store one array per file plus an index map instead of assuming a fixed file length

parent e71e703c
@@ -50,19 +50,12 @@ class AudioSet(BaseDataSet):
class_names = sorted([class_name for class_name in os.listdir(data_root) if os.path.isdir(os.path.join(data_root, class_name))])
training_sets = []
data_arrays = []
for class_name in class_names:
training_sets.append(AudioSetClassSubset(class_name, **kwargs))
data = training_sets[-1].data
for i, file in enumerate(data):
data_arrays.append(file)
data_arrays = np.concatenate(data_arrays, axis=-1)
self.training_set = torch.utils.data.ConcatDataset(training_sets)
self.validation_set = None
self.mean = data_arrays.mean(axis=1, keepdims=True)
self.std = data_arrays.std(axis=1, keepdims=True)
del data_arrays
@property
def observation_shape(self) -> tuple:
@@ -74,9 +67,6 @@ class AudioSet(BaseDataSet):
def validation_data_set(self):
return self.validation_set
def mean_std(self):
return self.mean, self.std
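The statistics removed in this hunk were per-mel-bin: mean and std taken over the time axis with keepdims=True, so they broadcast against (num_mel, num_frames) spectrograms. A minimal sketch of that pattern, with illustrative shapes (the 128 mel bins and frame counts are assumptions, not values from this repo):

import numpy as np

spectrograms = [np.random.rand(128, 300).astype(np.float32),
                np.random.rand(128, 431).astype(np.float32)]
data = np.concatenate(spectrograms, axis=-1)  # (128, 731): mel bins x all frames
mean = data.mean(axis=1, keepdims=True)       # (128, 1): one value per mel bin
std = data.std(axis=1, keepdims=True)         # (128, 1)
normalized = (spectrograms[0] - mean) / std   # broadcasts across time frames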
class AudioSetClassSubset(torch.utils.data.Dataset):
@@ -93,7 +83,7 @@ class AudioSetClassSubset(torch.utils.data.Dataset):
normalize_spec=False,
fmin=0,
hop_all=False,
max_file_per_class=2,
max_file_per_class=10,
max_file_length=350
):
......
@@ -18,9 +18,3 @@ class BaseDataSet(ABC):
@abstractmethod
def validation_data_set(self):
raise NotImplementedError
@property
@abstractmethod
def mean_std(self):
raise NotImplementedError
@@ -2,7 +2,6 @@ import os
import torch.utils.data
from dcase2020_task2.data_sets import BaseDataSet, CLASS_MAP, INVERSE_CLASS_MAP, TRAINING_ID_MAP, ALL_ID_MAP
from dcase2020_task2.data_sets import MachineDataSet
import numpy as np
VALID_TYPES = {
@@ -84,18 +83,11 @@ class ComplementMCMDataSet(BaseDataSet):
training_sets = []
data = []
for type_ in VALID_TYPES[self.valid_types][machine_type]:
for id_ in ALL_ID_MAP[type_]:
if type_ != machine_type or id_ != machine_id:
t = MachineDataSet(type_, id_, mode='training', **kwargs)
data.append(t.data)
training_sets.append(t)
data = np.concatenate(data, axis=-1)
self.mean = data.mean(axis=1, keepdims=True)
self.std = data.std(axis=1, keepdims=True)
del data
self.training_set = torch.utils.data.ConcatDataset(training_sets)
@@ -109,6 +101,3 @@ class ComplementMCMDataSet(BaseDataSet):
def validation_data_set(self):
raise NotImplementedError
def mean_std(self):
return self.mean, self.std
@@ -51,17 +51,10 @@ class MCMDataSet(BaseDataSet):
if machine_id == -1:
training_sets = []
validation_sets = []
data = []
for id_ in ALL_ID_MAP[machine_type]:
training_sets.append(MachineDataSet(machine_type, id_, mode='training', **kwargs))
validation_sets.append(MachineDataSet(machine_type, id_, mode='validation', **kwargs))
data.append(training_sets[-1].data)
data = np.concatenate(data, axis=-1)
mean = data.mean(axis=1, keepdims=True)
std = data.std(axis=1, keepdims=True)
del data
training_set = torch.utils.data.ConcatDataset(training_sets)
validation_set = torch.utils.data.ConcatDataset(validation_sets)
@@ -69,13 +62,8 @@ class MCMDataSet(BaseDataSet):
training_set = MachineDataSet(machine_type, machine_id, mode='training', **kwargs)
validation_set = MachineDataSet(machine_type, machine_id, mode='validation', **kwargs)
mean = training_set.data.mean(axis=1, keepdims=True)
std = training_set.data.std(axis=1, keepdims=True)
self.training_set = training_set
self.validation_set = validation_set
self.mean = mean
self.std = std
@property
def observation_shape(self) -> tuple:
@@ -87,9 +75,6 @@ class MCMDataSet(BaseDataSet):
def validation_data_set(self):
return self.validation_set
def mean_std(self):
return self.mean, self.std
class MachineDataSet(torch.utils.data.Dataset):
@@ -154,30 +139,35 @@ class MachineDataSet(torch.utils.data.Dataset):
files = sorted(files)
self.files = files
self.file_length = self.__load_preprocess_file__(files[0]).shape[-1]
self.num_samples_per_file = (self.file_length // self.context) if hop_all else (self.file_length - self.context + 1)
files = sorted(files)
self.files = files
self.meta_data = self.__load_meta_data__(files)
self.data = self.__load_data__(files)
self.index_map = {}
ctr = 0
for i, file in enumerate(self.data):
if hop_all:
residual = file.shape[-1] - context
self.index_map[ctr] = (i, residual)
ctr += 1
else:
for j in range(file.shape[-1] + 1 - context):
self.index_map[ctr] = (i, j)
ctr += 1
self.length = ctr
def __getitem__(self, item):
# get offset in audio file
offset = item % self.num_samples_per_file
# get audio file index
item = item // self.num_samples_per_file
# load audio file and extract audio chunk
residual = (self.file_length % self.context) + 1
offset = item * self.file_length + ((offset * self.context + np.random.randint(0, residual)) if self.hop_all else offset)
observation = self.data[:, offset:offset + self.context]
# create data object
meta_data = self.meta_data[item].copy()
file_idx, offset = self.index_map[item]
if self.hop_all:
offset = np.random.randint(0, offset + 1)  # +1: randint's high bound is exclusive, and offset 0 must not crash
observation = self.data[file_idx][:, offset:offset + self.context]
meta_data = self.meta_data[file_idx].copy()
meta_data['observations'] = observation[None]
return meta_data
def __len__(self):
return len(self.files) * self.num_samples_per_file
return self.length
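The core of the fix: instead of assuming every file has the same length and deriving the file index and offset arithmetically from num_samples_per_file, the dataset now builds an index_map from a flat sample index to a (file index, start frame) pair, so files of any length coexist. A minimal sketch of the same scheme outside the class (array shapes and names here are hypothetical):

import numpy as np

context = 5
# two "files" of different lengths, as (mel_bins, frames) arrays
data = [np.random.rand(128, 12), np.random.rand(128, 9)]

index_map = {}
ctr = 0
for i, file in enumerate(data):
    # one entry per valid context window: (file index, start frame)
    for j in range(file.shape[-1] + 1 - context):
        index_map[ctr] = (i, j)
        ctr += 1

def get_observation(item):
    file_idx, offset = index_map[item]
    return data[file_idx][:, offset:offset + context]

assert len(index_map) == (12 + 1 - context) + (9 + 1 - context)
assert get_observation(len(index_map) - 1).shape == (128, context)

With hop_all set, the map instead stores one entry per file together with the residual (file length minus context), and __getitem__ draws a random start frame inside that range at access time.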
def __load_meta_data__(self, files):
data = []
@@ -187,7 +177,7 @@ class MachineDataSet(torch.utils.data.Dataset):
return data
def __load_data__(self, files):
file_name = "{}_{}_{}_{}_{}_{}_{}_{}_{}_{}.npy".format(
file_name = "{}_{}_{}_{}_{}_{}_{}_{}_{}_{}.npz".format(
self.num_mel,
self.n_fft,
self.hop_size,
@@ -201,32 +191,19 @@ class MachineDataSet(torch.utils.data.Dataset):
)
file_path = os.path.join(self.data_root, file_name)
data = []
if os.path.exists(file_path):
print('Loading {} data set for machine type {} id {}...'.format(self.mode, self.machine_type, self.machine_id))
data = np.load(file_path)
container = np.load(file_path)
data = [container[key] for key in container]
else:
print('Loading & saving {} data set for machine type {} id {}...'.format(self.mode, self.machine_type, self.machine_id))
data = np.empty((self.num_mel, self.file_length * len(files)), dtype=np.float32)
print('Loading & Saving {} data set for machine type {} id {}...'.format(self.mode, self.machine_type, self.machine_id))
for i, f in enumerate(files):
file = self.__load_preprocess_file__(f)
if file.shape[1] != self.file_length:
if file.shape[1] < self.file_length:
print(f'Too short: {f}')
file = np.concatenate([
file,
file[:, :self.file_length - file.shape[1]]
], -1)
elif file.shape[1] > self.file_length:
print(f'Too long: {f}')
file = file[:, :self.file_length]
data[:, i * self.file_length:(i + 1) * self.file_length] = file
np.save(file_path, data)
data.append(file)
np.savez(file_path, *data)
return data
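Switching the cache from .npy to .npz is what permits differently-sized files: np.save needs one rectangular array, while np.savez packs an arbitrary list of arrays (stored as arr_0, arr_1, ... when passed positionally) into a single archive. A small sketch of the round trip, with made-up shapes and a hypothetical cache path:

import os
import tempfile
import numpy as np

# two cached spectrograms with different frame counts
data = [np.zeros((64, 300), dtype=np.float32), np.zeros((64, 350), dtype=np.float32)]

file_path = os.path.join(tempfile.gettempdir(), 'example_cache.npz')  # hypothetical path
np.savez(file_path, *data)  # positional arrays are stored as arr_0, arr_1, ...

container = np.load(file_path)
restored = [container[key] for key in container]  # same pattern as __load_data__ above
assert all(a.shape == b.shape for a, b in zip(data, restored))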
def __load_preprocess_file__(self, file):
@@ -283,7 +260,6 @@ class MachineDataSet(torch.utils.data.Dataset):
}
if __name__ == '__main__':
for type_, id_ in enumerate_development_datasets():
......
@@ -4,6 +4,7 @@ import torch
from sacred import Experiment
from dcase2020_task2.utils.logger import Logger
import os
import numpy as np
import torch.utils.data
# workaround...
from sacred import SETTINGS
@@ -42,20 +43,16 @@ class ClassificationExperiment(BaseExperiment, pl.LightningModule):
# **self.objects['fetaure_settings']
# )
if self.objects.get('normalize_dataset') == 'normal':
self.mean = torch.from_numpy(self.normal_data_set.mean)
self.std = torch.from_numpy(self.normal_data_set.std)
elif self.objects.get('normalize_dataset') == 'abnormal':
self.mean = torch.from_numpy(self.abnormal_data_set.mean)
self.std = torch.from_numpy(self.abnormal_data_set.std)
elif self.objects.get('normalize_dataset') == 'average':
self.mean = torch.from_numpy((self.normal_data_set.mean + self.abnormal_data_set.mean) / 2)
# TODO: averaging the per-dataset stds is not the std of the pooled data
self.std = torch.from_numpy((self.normal_data_set.std + self.abnormal_data_set.std) / 2)
elif self.objects.get('normalize_dataset') is None:
if self.objects.get('normalize_dataset') is None:
print('No normalization.')
self.mean = torch.zeros(self.normal_data_set.mean.shape)
self.std = torch.ones(self.normal_data_set.std.shape)
elif self.objects.get('normalize_dataset') == 'min_max':
print('Min/Max normalization.')
self.min, self.max = None, None
raise NotImplementedError
elif self.objects.get('normalize_dataset') == 'mean_std':
print('Mean/Std normalization.')
self.mean, self.std = None, None
raise NotImplementedError
else:
raise AttributeError
@@ -89,8 +86,12 @@ class ClassificationExperiment(BaseExperiment, pl.LightningModule):
return batch
def normalize_batch(self, batch):
device = batch['observations'].device
batch['observations'] = (batch['observations'] - self.mean.to(device)) / self.std.to(device)
if self.objects.get('normalize_dataset') == 'min_max':
assert self.mean is None
batch['observations'] = (((batch['observations'] - self.min) / (self.max - self.min)) - 0.5) * 2
elif self.objects.get('normalize_dataset') == 'mean_std':
assert self.min is None
batch['observations'] = (batch['observations'] - self.mean) / self.std
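For reference, the two branches implement standard scalings: min/max maps observations to [-1, 1], mean/std standardizes to zero mean and unit variance. A standalone sketch with a stand-in tensor (shapes are illustrative; self.min/self.max/self.mean/self.std would be dataset statistics):

import torch

x = torch.randn(8, 1, 128, 64) * 3 + 1  # stand-in for batch['observations']

# min/max variant: scale to [0, 1], then shift and stretch to [-1, 1]
x_min, x_max = x.min(), x.max()
x_minmax = (((x - x_min) / (x_max - x_min)) - 0.5) * 2
assert x_minmax.min() >= -1 and x_minmax.max() <= 1

# mean/std variant: zero mean, unit variance
x_meanstd = (x - x.mean()) / x.std()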
def training_step(self, batch_normal, batch_num, optimizer_idx=0):
@@ -200,7 +201,7 @@ def configuration():
else:
num_workers = 4
loss_class = 'dcase2020_task2.losses.AUC'
loss_class = 'dcase2020_task2.losses.BCE'
batch_size = 32
learning_rate = 1e-4
weight_decay = 0
......
epochs=100
loss_class=BCE
valid_types=loose
for learning_rate in 1e-4
do
    for rf in a_bit_larger normal a_bit_smaller
    do
        for learning_rate_decay in 0.99 0.98
        do
            ./scripts/per_id_run_parallel.sh classification_experiment "id=resnet_gridsearch_2_${rf}_${valid_types}_${learning_rate}_${learning_rate_decay}_${epochs}_${loss_class} learning_rate=$learning_rate learning_rate_decay=$learning_rate_decay epochs=$epochs rf=$rf valid_types=$valid_types loss_class=dcase2020_task2.losses.$loss_class -m student2.cp.jku.at:27017:resnet_gridsearch"
        done
    done
done
epochs=100
loss_class=BCE
valid_types=loose
for learning_rate in 1e-4
do
    for rf in a_bit_larger normal a_bit_smaller
    do
        for learning_rate_decay in 0.97 0.96
        do
            ./scripts/per_id_run_parallel.sh classification_experiment "id=resnet_gridsearch_2_${rf}_${valid_types}_${learning_rate}_${learning_rate_decay}_${epochs}_${loss_class} learning_rate=$learning_rate learning_rate_decay=$learning_rate_decay epochs=$epochs rf=$rf valid_types=$valid_types loss_class=dcase2020_task2.losses.$loss_class -m student2.cp.jku.at:27017:resnet_gridsearch"
        done
    done
done