# THRESHOLD PARAMETERS DECIDING WHICH TRACKS AND COUNTRIES ARE INCLUDED

# Minimum number of listening events (LEs) per track to have the track included.
# This is the most important parameter:
#   100  results in 1,012,961 unique tracks,
#   1000 results in 122,442 unique tracks.
MIN_LES_TRACK = 1000

# Minimum number of LEs per country to be considered in the clustering
# (other suited values: 350000, 120000).
MIN_LES_COUNTRY = 80000

# Minimum number of unique tracks per country to be considered in the clustering.
MIN_UNIQUE_TRACKS_COUNTRY = 1000

# Minimum number of users per country to be considered in the clustering
# (alternative value noted by the author: 150).
MIN_USERS_COUNTRY = 25
# BOOLEAN PARAMETERS TO CONTROL THE STEPS CARRIED OUT BY THE SCRIPT

# Perform preprocessing. Needs to be set to True when running the script for
# the first time; assumes that the raw data is available in path DATA_DIR.
DO_PREPROCESSING = False

# Conduct clustering experiments (assumes preprocessing has been performed already).
DO_CLUSTERING = True

# Before clustering, reduce the country feature matrix using PCA / Truncated SVD.
DO_PCA = True

# Perform t-SNE clustering.
CLUSTERING_TSNE = True

# Find and visualize clusters using OPTICS after t-SNE has been created.
DO_OPTICS_FOR_TSNE = True

# Perform clustering via Affinity Propagation.
CLUSTERING_AP = False

# Perform clustering via DBSCAN.
CLUSTERING_DBSCAN = False

# Perform k-means and plot silhouettes for different numbers of clusters (k).
DO_KMEANS_SILHOUETTE = False

# Create all plots/visualizations.
CREATE_PLOTS = True
# PARAMETERS FOR PATHS

# Directory containing the raw data
# (alternative location noted by the author: '../data_FiAI2020/').
DATA_DIR = '../data/'

# Directory containing the preprocessed data (country feature vectors).
# The directory name embeds MIN_LES_TRACK, so each track threshold gets its
# own feature directory. The author left a commented-out '_artists' marker
# here, presumably an alternative directory-name suffix -- confirm.
FEATURE_DIR = DATA_DIR + 'LEs_country_vectors_min' + str(MIN_LES_TRACK) + 'les/'

# Output directory to write results/visualizations to
# (alternative location noted by the author: '../visu/').
VISU_DIR = '../visu_FiAI2020/'
# PARAMETERS FOR PREPROCESSING AND CLUSTERING ALGORITHMS

# Number of dimensions the data is reduced to by PCA.
PCA_DIMS = 100

# List of perplexities to investigate for t-SNE.
# Alternative lists noted by the author: [3, 5], [1, 2, 3, 5], [20, 25, 30].
TSNE_PERPLEXITY = [1, 2, 3, 5, 10, 15, 20, 25, 30, 35, 40, 50]

# Minimum number of data points per cluster for OPTICS.
OPTICS_MIN_CLUSTER_SIZE = 3
# Read file fn containing pairs of tracks and listening events
# Print a one-line summary of the active inclusion thresholds and the number of
# countries retained. The %d placeholders are filled, in order, with
# MIN_LES_COUNTRY, MIN_LES_TRACK, MIN_UNIQUE_TRACKS_COUNTRY, MIN_USERS_COUNTRY
# and len(country_ids_include).
# NOTE(review): country_ids_include is defined outside this excerpt; presumably
# the collection of country ids that passed the filters -- confirm.
print('Number of countries with #LE >= %d and #min LEs >=%d and #unique tracks >= %d and #users >= %d: %d'%(MIN_LES_COUNTRY,MIN_LES_TRACK,MIN_UNIQUE_TRACKS_COUNTRY,MIN_USERS_COUNTRY,len(country_ids_include)))
# Build a human-readable label describing the current t-SNE configuration:
# the perplexity `perp`, PCA_DIMS, MIN_LES_TRACK, MIN_LES_COUNTRY and
# MIN_USERS_COUNTRY. MIN_UNIQUE_TRACKS_COUNTRY is deliberately left out (its
# concatenation is commented out at the end of the line).
# NOTE(review): `perp` is defined outside this excerpt; presumably the loop
# variable iterating over TSNE_PERPLEXITY -- confirm.
method='T-SNE (perp='+str(perp)+'); PCA dim='+str(PCA_DIMS)+', LE item min='+str(MIN_LES_TRACK)+', LE country min='+str(MIN_LES_COUNTRY)+', users country min='+str(MIN_USERS_COUNTRY)#', unique items-country min=' + str(MIN_UNIQUE_TRACKS_COUNTRY) +