Commit 2fa436dd authored by Paul Primus

update dataset

parent 5baf14c5
@@ -11,15 +11,16 @@ class BaseDataSet(ABC):
@property
@abstractmethod
def training_data_set(self, type, id):
def training_data_set(self):
raise NotImplementedError
@property
@abstractmethod
def validation_data_set(self, type, id):
def validation_data_set(self):
raise NotImplementedError
@property
@abstractmethod
def complement_data_set(self, type, id):
def mean_std(self):
raise NotImplementedError
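With this change the abstract interface drops the (type, id) arguments: a data set instance is bound to one machine at construction time, and mean_std replaces the complement set accessor. Note that MCMDataSet below overrides these members as plain methods and callers invoke them with parentheses, so the @property decorators on the base class are effectively shadowed. A minimal sketch of a conforming subclass; everything except the three abstract member names is illustrative:

```python
from dcase2020_task2.data_sets import BaseDataSet


class SingleMachineDataSet(BaseDataSet):
    # Illustrative subclass: one machine's data, fixed at construction time.

    def __init__(self, training_set, validation_set, mean, std):
        self.training_set = training_set
        self.validation_set = validation_set
        self.mean, self.std = mean, std

    def training_data_set(self):
        return self.training_set

    def validation_data_set(self):
        return self.validation_set

    def mean_std(self):
        return self.mean, self.std
```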
@@ -3,7 +3,6 @@ import torch.utils.data
import glob
from dcase2020_task2.data_sets import BaseDataSet
import librosa
import sys
import numpy as np
CLASS_MAP = {
@@ -34,11 +33,20 @@ TRAINING_ID_MAP = {
}
def enumerate_development_datasets():
typ_id = []
for i in range(6):
for j in TRAINING_ID_MAP[i]:
typ_id.append((i, j))
return typ_id
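The new enumerate_development_datasets helper yields every (machine_type, machine_id) pair of the development split, which makes it easy to sweep an experiment over all machines. A hedged usage sketch; the loop body is a placeholder:

```python
# Run one experiment per development machine; valid ids come from TRAINING_ID_MAP.
for machine_type, machine_id in enumerate_development_datasets():
    data_set = MCMDataSet(machine_type, machine_id)
    train_set = data_set.training_data_set()
    # ... fit and evaluate a model on train_set here ...
```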
class MCMDataSet(BaseDataSet):
def __init__(
self,
machine_type,
machine_id,
data_root=os.path.join(os.path.expanduser('~'), 'shared', 'dcase2020_task2'),
context=5,
num_mel=128,
@@ -46,9 +54,8 @@ class MCMDataSet(BaseDataSet):
hop_size=512,
power=1.0,
fmin=40,
normalize='all',
normalize_raw=False,
complement='all'
normalize=None
):
self.data_root = data_root
self.context = context
@@ -56,147 +63,54 @@ class MCMDataSet(BaseDataSet):
self.n_fft = n_fft
self.hop_size = hop_size
self.power = power
self.complement = complement
self.fmin = fmin
self.normalize = normalize
assert type(machine_type) == int and type(machine_id) == int
kwargs = {
'data_root': self.data_root,
'context': self.context,
'num_mel': self.num_mel,
'n_fft': self.n_fft,
'hop_size': self.hop_size,
'power': power,
'normalize': normalize_raw,
'fmin': fmin
}
self.data_sets = dict()
for machine_type in range(6):
self.data_sets[machine_type] = dict()
for machine_id in TRAINING_ID_MAP[machine_type]:
self.data_sets[machine_type][machine_id] = (
MachineDataSet(
machine_type,
machine_id,
data_root=self.data_root,
mode='training',
context=self.context,
num_mel=self.num_mel,
n_fft=self.n_fft,
hop_size=self.hop_size,
power=power,
normalize=normalize_raw,
fmin=fmin
),
MachineDataSet(
machine_type,
machine_id,
data_root=self.data_root,
mode='validation',
context=self.context,
num_mel=self.num_mel,
n_fft=self.n_fft,
hop_size=self.hop_size,
power=power,
normalize=normalize_raw
)
)
training_set = MachineDataSet(machine_type, machine_id, mode='training', **kwargs)
validation_set = MachineDataSet(machine_type, machine_id, mode='validation', **kwargs)
if normalize == 'all':
data = []
for machine_type in range(6):
for machine_id in TRAINING_ID_MAP[machine_type]:
train, _ = self.data_sets[machine_type][machine_id]
data.append(train.data)
data = np.concatenate(data, axis=1)
mean = data.mean(axis=1, keepdims=True)
std = data.std(axis=1, keepdims=True)
for machine_type in range(6):
for machine_id in TRAINING_ID_MAP[machine_type]:
train, val = self.data_sets[machine_type][machine_id]
train.data = (train.data - mean) / std
val.data = (val.data - mean) / std
elif normalize == 'per_machine_id':
for machine_type in range(6):
for machine_id in TRAINING_ID_MAP[machine_type]:
train, val = self.data_sets[machine_type][machine_id]
data = train.data
mean = data.mean(axis=1, keepdims=True)
std = data.std(axis=1, keepdims=True)
train.data = (train.data - mean) / std
val.data = (val.data - mean) / std
elif normalize == 'none':
pass
if normalize is None:
mean = training_set.data.mean(axis=1, keepdims=True)
std = training_set.data.std(axis=1, keepdims=True)
training_set.data = (training_set.data - mean) / std
validation_set.data = (validation_set.data - mean) / std
else:
raise AttributeError
assert type(normalize) == tuple
assert len(normalize) == 2
mean, std = normalize
training_set.data = (training_set.data - mean) / std
validation_set.data = (validation_set.data - mean) / std
self.training_set = training_set
self.validation_set = validation_set
self.mean = mean
self.std = std
@property
def observation_shape(self) -> tuple:
return 1, self.num_mel, self.context
def training_data_set(self, type, id):
return self.data_sets[type][id][0]
def validation_data_set(self, type, id):
return self.data_sets[type][id][1]
def complement_data_set(self, type, id):
complement_sets = []
if self.complement == 'all':
for machine_type in range(6):
for machine_id in TRAINING_ID_MAP[machine_type]:
if machine_type != type or machine_id != id:
complement_sets.append(self.data_sets[machine_type][machine_id][0])
elif self.complement == 'same_mic_diff_type':
if type in [3, 4]:
types = [3, 4]
else:
types = [0, 1, 2, 5]
for machine_type in types:
for machine_id in TRAINING_ID_MAP[machine_type]:
if machine_type != type:
complement_sets.append(self.data_sets[machine_type][machine_id][0])
elif self.complement == 'same_mic':
if type in [3, 4]:
types = [3, 4]
else:
types = [0, 1, 2, 5]
for machine_type in types:
for machine_id in TRAINING_ID_MAP[machine_type]:
if machine_type != type or machine_id != id:
complement_sets.append(self.data_sets[machine_type][machine_id][0])
elif self.complement == 'same_type':
for machine_id in TRAINING_ID_MAP[type]:
if machine_id != id:
complement_sets.append(self.data_sets[type][machine_id][0])
elif self.complement == 'different_type':
for machine_type in range(6):
if machine_type != type:
for machine_id in TRAINING_ID_MAP[type]:
complement_sets.append(self.data_sets[type][machine_id][0])
return torch.utils.data.ConcatDataset(complement_sets)
def get_whole_training_data_set(self):
complement_sets = []
for machine_type in range(6):
for machine_id in TRAINING_ID_MAP[machine_type]:
complement_sets.append(self.data_sets[machine_type][machine_id][0])
return torch.utils.data.ConcatDataset(complement_sets)
def get_whole_validation_data_set(self):
complement_sets = []
for machine_type in range(6):
for machine_id in TRAINING_ID_MAP[machine_type]:
complement_sets.append(self.data_sets[machine_type][machine_id][1])
return torch.utils.data.ConcatDataset(complement_sets)
def get_machine_training_data_set(self, machine_type):
complement_sets = []
for machine_id in TRAINING_ID_MAP[machine_type]:
complement_sets.append(self.data_sets[machine_type][machine_id][0])
return torch.utils.data.ConcatDataset(complement_sets)
def get_machine_validation_data_set(self, machine_type):
complement_sets = []
for machine_id in TRAINING_ID_MAP[machine_type]:
complement_sets.append(self.data_sets[machine_type][machine_id][1])
return torch.utils.data.ConcatDataset(complement_sets)
def training_data_set(self):
return self.training_set
def validation_data_set(self):
return self.validation_set
def mean_std(self):
return self.mean, self.std
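The constructor now handles normalization in two mutually exclusive ways: with normalize=None it fits mean and standard deviation on the machine's own training data, otherwise it expects a precomputed (mean, std) tuple. That lets statistics fitted on one machine be reused for another; a sketch, with illustrative machine ids:

```python
# Fit normalization statistics on one machine ...
source = MCMDataSet(0, 2)           # normalize=None: stats fitted on its training data
mean, std = source.mean_std()       # per-mel-bin arrays of shape (num_mel, 1)

# ... and reuse them, unchanged, for a different machine.
target = MCMDataSet(0, 4, normalize=(mean, std))
```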
class MachineDataSet(torch.utils.data.Dataset):
@@ -222,7 +136,7 @@ class MachineDataSet(torch.utils.data.Dataset):
self.n_fft = n_fft
self.hop_size = hop_size
self.power = power
self.normalize=normalize
self.normalize = normalize
self.mode = mode
self.data_root = data_root
self.context = context
@@ -294,10 +208,12 @@ class MachineDataSet(torch.utils.data.Dataset):
file_path = os.path.join(self.data_root, file_name)
if os.path.exists(file_path):
print('Loading {} data set for machine type {} id {}...'.format(self.mode, self.machine_type, self.machine_id))
print('Loading {} data set for machine type {} id {}...'.format(self.mode, self.machine_type,
self.machine_id))
data = np.load(file_path)
else:
print('Loading & saving {} data set for machine type {} id {}...'.format(self.mode, self.machine_type, self.machine_id))
print('Loading & saving {} data set for machine type {} id {}...'.format(self.mode, self.machine_type,
self.machine_id))
data = np.empty((self.num_mel, self.file_length * len(files)), dtype=np.float32)
for i, f in enumerate(files):
data[:, i * self.file_length:(i + 1) * self.file_length] = self.__load_preprocess_file__(f)
......
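The cache packs the spectrogram frames of all files side by side along the time axis, so the array has shape (num_mel, file_length * len(files)) and file i occupies the column block [i * file_length, (i + 1) * file_length). A training example is then a context-sized slice of columns matching observation_shape = (1, num_mel, context). A sketch of that indexing; the actual __getitem__ is not part of this diff and the values are illustrative:

```python
import numpy as np

num_mel, file_length, context = 128, 431, 5
data = np.zeros((num_mel, file_length * 10), dtype=np.float32)

def example(data, index, context):
    # One observation: `context` consecutive frames, shaped (1, num_mel, context).
    return data[:, index:index + context][None]

print(example(data, 0, context).shape)  # (1, 128, 5)
```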
@@ -76,6 +76,7 @@ class BaselineDCASEExperiment(BaseExperiment, pl.LightningModule):
return self.result
def train_dataloader(self):
assert False, 'Need to merge training sets first!'
dl = torch.utils.data.DataLoader(
self.objects['data_set'].get_machine_training_data_set(self.machine_type),
batch_size=self.objects['batch_size'],
@@ -86,6 +87,7 @@ class BaselineDCASEExperiment(BaseExperiment, pl.LightningModule):
return dl
def val_dataloader(self):
assert False, 'Need to merge training sets first!'
dl = torch.utils.data.DataLoader(
self.objects['data_set'].get_machine_validation_data_set(self.machine_type),
batch_size=self.objects['batch_size'],
@@ -154,6 +156,10 @@ def configuration():
data_set = {
'class': 'dcase2020_task2.data_sets.MCMDataSet',
'args': [
machine_type,
machine_id
],
'kwargs': {
'context': context,
'num_mel': num_mel,
......
@@ -12,7 +12,7 @@ SETTINGS['CAPTURE_MODE'] = 'sys'
class BaselineExperiment(BaseExperiment, pl.LightningModule):
'''
DCASE Baseline with AE per machine ID.
DCASE Baseline with AE, MADMOG & MAF per machine ID.
'''
def __init__(self, configuration_dict, _run):
@@ -83,12 +83,14 @@ def configuration():
seed = 1220
deterministic = False
id = datetime.now().strftime("%Y-%m-%d_%H:%M:%S:%f")
log_path = os.path.join('..', 'experiment_logs', id)
log_path = os.path.join('experiment_logs', id)
#####################
# quick configuration, uses default parameters of more detailed configuration
#####################
architecture = 'dcase2020_task2.models.MADE'
machine_type = 0
machine_id = 2
@@ -96,46 +98,49 @@ def configuration():
debug = False
if debug:
epochs = 50
epochs = 1
num_workers = 0
else:
epochs = 100
epochs = 50
num_workers = 4
learning_rate = 1e-3
weight_decay = 1e-5
normalize = 'per_machine_id'
normalize_raw = True
context = 5
descriptor = "BaselineExperiment_{}_{}_{}_{}_{}_{}".format(
descriptor = "BaselineExperiment_{}_{}_{}_{}_{}_{}_{}".format(
architecture,
batch_size,
learning_rate,
weight_decay,
normalize,
normalize_raw,
context
context,
seed
)
########################
# detailed configuration
########################
num_mel = 256
num_mel = 128
n_fft = 1024
hop_size = 512
power = 1.0
power = 2.0
fmin = 0
data_set = {
'class': 'dcase2020_task2.data_sets.MCMDataSet',
'args': [
machine_type,
machine_id
],
'kwargs': {
'context': context,
'num_mel': num_mel,
'n_fft': n_fft,
'hop_size': hop_size,
'normalize': normalize,
'normalize_raw': normalize_raw,
'power': power,
'fmin': fmin
@@ -153,13 +158,13 @@ def configuration():
}
model = {
'class': 'dcase2020_task2.models.MADE',
'class': architecture,
'args': [
'@data_set.observation_shape',
'@reconstruction'
],
'kwargs': {
'hidden_size': 4096,
'hidden_size': 1024,
'num_hidden': 4
}
}
......
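Throughout the configuration, objects are described declaratively as a class path plus args/kwargs, and '@'-prefixed strings such as '@data_set.observation_shape' refer to attributes of previously built objects. A minimal resolver sketch under those assumed semantics; the repo's actual factory may well differ:

```python
import importlib


def resolve(ref, objects):
    # Resolve an '@name.attribute' reference against already-built objects.
    if isinstance(ref, str) and ref.startswith('@'):
        name, attr = ref[1:].split('.', 1)
        return getattr(objects[name], attr)
    return ref


def instantiate(config, objects):
    # Build an object from a {'class': ..., 'args': [...], 'kwargs': {...}} description.
    module_path, class_name = config['class'].rsplit('.', 1)
    cls = getattr(importlib.import_module(module_path), class_name)
    args = [resolve(a, objects) for a in config.get('args', [])]
    kwargs = {k: resolve(v, objects) for k, v in config.get('kwargs', {}).items()}
    return cls(*args, **kwargs)
```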
@@ -64,7 +64,7 @@ class BaseExperiment(ABC, torch.nn.Module):
def train_dataloader(self):
dl = torch.utils.data.DataLoader(
self.objects['data_set'].training_data_set(self.machine_type, self.machine_id),
self.objects['data_set'].training_data_set(),
batch_size=self.objects['batch_size'],
shuffle=True,
num_workers=self.objects['num_workers'],
@@ -74,7 +74,7 @@ class BaseExperiment(ABC, torch.nn.Module):
def val_dataloader(self):
dl = torch.utils.data.DataLoader(
self.objects['data_set'].validation_data_set(self.machine_type, self.machine_id),
self.objects['data_set'].validation_data_set(),
batch_size=self.objects['batch_size'],
shuffle=False,
num_workers=self.objects['num_workers']
@@ -83,7 +83,7 @@ class BaseExperiment(ABC, torch.nn.Module):
def test_dataloader(self):
dl = torch.utils.data.DataLoader(
self.objects['data_set'].get_whole_validation_data_set(),
self.objects['data_set'].validation_data_set(),
batch_size=self.objects['batch_size'],
shuffle=False,
num_workers=self.objects['num_workers']
......
@@ -4,8 +4,9 @@ import torch.nn.functional as F
class MSEReconstruction(BaseReconstruction):
def __init__(self, weight=1.0, size_average=True, **kwargs):
def __init__(self, input_shape, weight=1.0, size_average=True, **kwargs):
super().__init__()
self.input_shape = input_shape
self.weight = weight
self.size_average = size_average
......
from dcase2020_task2.models.base_model import ClassifierBase, VAEBase
from dcase2020_task2.models.made import MADE
\ No newline at end of file
from dcase2020_task2.models.made import MADE
from dcase2020_task2.models.ae import AE
\ No newline at end of file
import torch.nn
from dcase2020_task2.models import VAEBase
from dcase2020_task2.priors import NoPrior
import numpy as np
import torch
from dcase2020_task2.models.custom import activation_dict, init_weights
class BaselineFCAE(torch.nn.Module, VAEBase):
class AE(torch.nn.Module, VAEBase):
def __init__(
self,
input_shape,
prior,
reconstruction_loss,
prior=NoPrior(latent_size=8),
hidden_size=128,
num_hidden=3,
activation='relu',
......
@@ -201,7 +201,7 @@ class Logger:
for i, machine_type in enumerate(unique_machine_types):
machine_type = INVERSE_CLASS_MAP[machine_type]
for machine_id in unique_machine_ids[i]:
result.setdefault(machine_type, dict())[machine_id] = self.__rauc_from_score__(
result.setdefault(machine_type, dict())[int(machine_id)] = self.__rauc_from_score__(
scores_mean,
scores_max,
ground_truth,
@@ -220,10 +220,10 @@ class Logger:
scores_mean_ = scores_mean[np.logical_and(machine_types == machine_type, machine_ids == id)]
scores_max_ = scores_max[np.logical_and(machine_types == machine_type, machine_ids == id)]
return metrics.roc_auc_score(ground_truth_, scores_mean_), \
metrics.roc_auc_score(ground_truth_, scores_mean_, max_fpr=max_fpr), \
metrics.roc_auc_score(ground_truth_, scores_max_), \
metrics.roc_auc_score(ground_truth_, scores_max_, max_fpr=max_fpr)
return float(metrics.roc_auc_score(ground_truth_, scores_mean_)), \
float(metrics.roc_auc_score(ground_truth_, scores_mean_, max_fpr=max_fpr)), \
float(metrics.roc_auc_score(ground_truth_, scores_max_)), \
float(metrics.roc_auc_score(ground_truth_, scores_max_, max_fpr=max_fpr))
@staticmethod
def __batches_to_per_file_scores__(outputs, aggregation_fun=None):
......
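The int() and float() casts in the logger convert numpy scalars to built-in Python types, presumably so the per-machine results serialize cleanly regardless of the log format. A quick illustration:

```python
import numpy as np
from sklearn import metrics

y_true = np.array([0, 0, 1, 1])
y_score = np.array([0.1, 0.4, 0.35, 0.8])

auc = metrics.roc_auc_score(y_true, y_score)
print(type(auc))         # <class 'numpy.float64'>
print(type(float(auc)))  # <class 'float'>, safe for any JSON/YAML logger
```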