Commit d28ca3ef authored by Markus Schedl

.

parent 04a9b46f
#
# This is a script to cluster countries according to the music listening preferences and behaviors of their inhabitants,
# using the LFM-1b dataset of listening events (LEs) created by Last.fm users.
#
# This code accompanies the following journal article. Please consider reading it for background information and to
# obtain better insights into the research behind it.
#
# Markus Schedl, Christine Bauer, Wolfgang Reisinger, Dominik Kowald, and Elisabeth Lex
# Listener Modeling and Context-aware Music Recommendation Based on Country Archetypes
# Frontiers in Artificial Intelligence - Machine Learning and Artificial Intelligence, 2020
#
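#
# Overall pipeline of the script (controlled by the boolean flags below):
# 1) Preprocessing: read the track, country, and user distribution files, filter tracks by MIN_LES_TRACK, and store
#    per-country feature vectors of listening events as pickle files.
# 2) Clustering: load and normalize the country feature vectors, filter countries by MIN_LES_COUNTRY,
#    MIN_UNIQUE_TRACKS_COUNTRY, and MIN_USERS_COUNTRY, optionally reduce dimensionality via truncated SVD/PCA,
#    compute t-SNE embeddings for several perplexities, and cluster the embeddings with OPTICS (optionally also
#    Affinity Propagation, DBSCAN, or k-means with silhouette analysis).
#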
__author__ = 'Markus Schedl'
import numpy as np
import os
import glob
import collections
import pickle as pkl
import matplotlib.pyplot as plt
import matplotlib.cm as cm
from matplotlib import gridspec
from sklearn.cluster import AffinityPropagation
from sklearn.cluster import OPTICS
from sklearn.cluster import DBSCAN
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD, PCA
import sklearn.metrics.pairwise as pair_dist
from sklearn import metrics
from scipy import sparse
from itertools import cycle
from sklearn.metrics import silhouette_samples, silhouette_score
# GENERAL PARAMETERS USED FOR DATA FILTERING
MIN_LES_TRACK = 1000 # This is the most important parameter: minimum number of LEs per track to have the track included;
# 100 results in 1,012,961 unique tracks,
# 1000 results in 122,442 unique tracks
MIN_LES_COUNTRY = 80000 # Minimum number of LEs per country to be considered in the clustering (other suitable values: 350000, 120000)
MIN_UNIQUE_TRACKS_COUNTRY = 1000 # Minimum number of unique tracks per country to be considered in the clustering
MIN_USERS_COUNTRY = 25 #150 # Minimum number of users per country to be considered in the clustering
# BOOLEAN PARAMETERS TO CONTROL THE STEPS CARRIED OUT BY THE SCRIPT
DO_PREPROCESSING = False # Perform preprocessing (needs to be set to True when running the script for the first time;
# assumes that the raw data is available in path DATA_DIR)
DO_CLUSTERING = True # Conduct clustering experiments (assumes preprocessing has been performed already)
DO_PCA = True # Before clustering, reduce country feature matrix using PCA / Truncated SVD
CLUSTERING_TSNE = True # Compute a 2-dimensional t-SNE embedding of the country feature vectors
DO_OPTICS_FOR_TSNE = True # Find and visualize clusters using OPTICS after t-SNE has been created
CLUSTERING_AP = False # Perform clustering via Affinity Propagation
CLUSTERING_DBSCAN = False # Perform clustering via DBSCAN
DO_KMEANS_SILHOUETTE = False # Perform k-means and plot silhouettes for different number of clusters (k)
CREATE_PLOTS = True # Create all plots/visualizations
# PARAMETERS FOR PATHS
DATA_DIR = '../data/' #'../data_FiAI2020/' #'../data/' # directory containing the raw data
FEATURE_DIR = DATA_DIR + 'LEs_country_vectors_min' + str(MIN_LES_TRACK) + 'les/' #_artists # directory containing the preprocessed data (country feature vectors)
VISU_DIR = '../visu_FiAI2020/' #'../visu/' # output directory to write results/visualizations to
# PARAMETERS FOR PREPROCESSING AND CLUSTERING ALGORITHMS
PCA_DIMS = 100 # Number of dimensions the data is reduced to by PCA
TSNE_PERPLEXITY = [1, 2, 3, 5, 10, 15, 20, 25, 30, 35, 40, 50] # List of perplexities to investigate for t-SNE #[3, 5] #[1, 2, 3, 5] #[1, 2, 3, 5, 10, 15, 20, 25, 30, 35, 40, 50] #[20, 25, 30]
OPTICS_MIN_CLUSTER_SIZE = 3 # Minimum number of data points per cluster for OPTICS
# Read file fn containing pairs of tracks and listening events
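# Expected line format (tab-separated): track_id \t no_les, i.e. one track identifier and its total number of LEs per line.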
def read_tracks_le(fn):
track_le = {}
# no_lines = sum(1 for line in open(fn, 'r'))
# no_lines = len(open(fn).read().splitlines())
# print(no_lines)
counter = 0
with open(fn, 'r') as f:
for line in f:
content = line.strip().split('\t')
if len(content) == 2:
track_le[np.int32(content[0])] = np.int32(content[1])
else:
print('Problem encountered with ', content)
if np.mod(counter, 100000) == 0:
print('%d lines read from %s' % (counter, fn))
counter += 1
return collections.OrderedDict(track_le)
# Read file fn that contains the number of users for each country
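# Expected line format: country_code \t user_id SPACE no_les ; user_id SPACE no_les ; ...
# Only the number of users per country is returned; the per-user LE counts are parsed but not used further.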
def read_country_number_users(fn):
country_users = {}
counter = 0
with open(fn, 'r') as f:
for line in f:
content = line.strip().split('\t')
if len(content) == 2:
country = content[0]
# Parse lines
uid_les_all = content[1].split(';')
# print('Number of users for country %s: %d' % (content[0], len(uid_les_all)))
country_users[country] = len(uid_les_all)-1
for idx_uid_les in range(0, len(uid_les_all) - 1): # All users in current country
uid_les = uid_les_all[idx_uid_les]
# print(uid_les)
uid_les_split = uid_les.split(' ')
uid = np.int32(uid_les_split[0])
les = np.int32(uid_les_split[1])
# print(uid, les)
else:
print('Problem encountered with ', content)
if np.mod(counter, 100000) == 0:
print('%d lines read from %s' % (counter, fn))
counter += 1
return collections.OrderedDict(country_users)
# Read file fn containing country, tracks, and LEs; and filter data using the list of track-ids in filter_tids
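# Expected line format: country_code \t track_id SPACE no_les ; track_id SPACE no_les ; ...
# Returns a dictionary mapping each country to a feature vector of LE counts over the tracks in filter_tids.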
def read_filter_country_tracks_le(fn, filter_tids):
country_track_le = {}
with open(fn, 'r') as f:
for line in f: # For all countries
# Initialize country-specific feature vector (containing LEs)
country_feature_vec = np.zeros([len(filter_tids)], dtype=np.int32)
# Start parsing
content = line.strip().split('\t')
if len(content) == 2:
cty = content[0]
# Parse lines
tid_les_all = content[1].split(';')
print('Number of tracks for country %s: %d' % (content[0], len(tid_les_all)))
t_counter = 0
for idx_tid_les in range(0, len(tid_les_all)-1):
tid_les = tid_les_all[idx_tid_les]
#print(tid_les)
tid_les_split = tid_les.split(' ')
tid = np.int32(tid_les_split[0])
les = np.int32(tid_les_split[1])
# Filter using track-ids in filter_tids
if tid in filter_tids: # Only keep entries whose track-id is in the filtered track-id list passed as argument
# Output track-id, country-wise, and global LEs
idx_tid = np.where(tid == filter_tids)
#print('t_id: %d \t le_country: %d \t le_global: %d' % (tid, les, t_les_reduced[idx_tid]))
country_feature_vec[idx_tid] = les
if np.mod(t_counter, 10000) == 0:
print('%d tracks processed of country %s' % (t_counter, cty))
t_counter += 1
# Store country feature vector into dictionary
country_track_le[cty] = country_feature_vec
print('Number of tracks for country %s after filtering: %d' % (cty, len(np.where(country_feature_vec > 0)[0])))
else:
print('Problem encountered with ', content)
return country_track_le
# Creates visualization of clustering
# Y: coordinates of data points (as output of t-SNE, for instance)
# method: used as title of the created plot
# labels: labels to show next to each data point (here: country labels)
# datapoint_sizes: list of sizes for each data point (if omitted, a fixed size is used)
# colors: list of colors for each data point (if omitted, all points are plotted in blue)
def plot_mapping(Y, method, labels = [], datapoint_sizes = [], colors = []):
tsne_datapoint_size = 40 # fixed data point size if no information in datapoint_sizes is provided
tsne_datapoint_size_min = 8
tsne_datapoint_size_max = 40
# Create figure
plt.figure(figsize=(11.69, 8.27))
# If data point size is given -> scale values
if len(datapoint_sizes) > 0:
dp_sizes = datapoint_sizes
dp_max = np.max(datapoint_sizes)
dp_sizes = tsne_datapoint_size_min + ((datapoint_sizes / dp_max) * (tsne_datapoint_size_max - tsne_datapoint_size_min))
else: # If not, use fixed size for all data points
dp_sizes = np.repeat(tsne_datapoint_size, len(labels))
# Plot results
if len(labels) > 0: # If we have labels, print data items with same label in same color
# Iterate over all classes and plot respective items with corresponding class color
# Print text / country labels
for y_idx in range(0, len(labels)):
if len(colors) == len(labels): # if color mapping provided, use it
color_cur = colors[y_idx]
else:
color_cur = 'b'
plt.scatter(Y[y_idx, 0], Y[y_idx, 1], color=color_cur, s=dp_sizes[y_idx]) # Add scatter plot
plt.text(Y[y_idx, 0], Y[y_idx, 1], labels[y_idx], fontsize=dp_sizes[y_idx], color=color_cur) # colors_defined[labels[y_idx]])
else: # No labels -> all data points in same color
plt.scatter(Y[:, 0], Y[:, 1], color='b')
#OR: plt.plot(Y[:, 0], Y[:, 1], 'o', color='r')
# Add title and legend
plt.title(method)
#plt.legend(fontsize=8)
# Save as PDF (without axes and grid)
plt.axis('off')
plt.grid(False)
plt.tight_layout()
plt.savefig(VISU_DIR + method + '.pdf', format='pdf', dpi=300)
plt.close()
# plt.show()
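# Example call (as used for the t-SNE embedding in the main script below):
# plot_mapping(Y, method, country_labels, country_le_sums)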
# Plot silhouette to determine the number of clusters for clustering algorithms (k-means)
# X: input data
# method: string used in file name for plot
# range_n_clusters: range of numbers of clusters to evaluate
def plot_silhouette(X, method = '', range_n_clusters = [2, 3, 4, 5, 6]):
# Code from https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html
for n_clusters in range_n_clusters:
# Create a subplot with 1 row and 2 columns
fig, (ax1, ax2) = plt.subplots(1, 2)
fig.set_size_inches(18, 7)
# The 1st subplot is the silhouette plot
# The silhouette coefficient can range from -1, 1 but in this example all
# lie within [-0.1, 1]
ax1.set_xlim([-0.1, 1])
# The (n_clusters+1)*10 is for inserting blank space between silhouette
# plots of individual clusters, to demarcate them clearly.
ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])
# Initialize the clusterer with n_clusters value and a random generator
# seed of 10 for reproducibility.
clusterer = KMeans(n_clusters=n_clusters, random_state=10)
cluster_labels = clusterer.fit_predict(X)
# The silhouette_score gives the average value for all the samples.
# This gives a perspective into the density and separation of the formed
# clusters
silhouette_avg = silhouette_score(X, cluster_labels)
print("For n_clusters =", n_clusters,
"The average silhouette_score is :", silhouette_avg)
# Compute the silhouette scores for each sample
sample_silhouette_values = silhouette_samples(X, cluster_labels)
y_lower = 10
for i in range(n_clusters):
# Aggregate the silhouette scores for samples belonging to
# cluster i, and sort them
ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
ith_cluster_silhouette_values.sort()
size_cluster_i = ith_cluster_silhouette_values.shape[0]
y_upper = y_lower + size_cluster_i
color = cm.nipy_spectral(float(i) / n_clusters)
ax1.fill_betweenx(np.arange(y_lower, y_upper),
0, ith_cluster_silhouette_values,
facecolor=color, edgecolor=color, alpha=0.7)
# Label the silhouette plots with their cluster numbers at the middle
ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
# Compute the new y_lower for next plot
y_lower = y_upper + 10 # 10 for the 0 samples
ax1.set_title("The silhouette plot for the various clusters.")
ax1.set_xlabel("The silhouette coefficient values")
ax1.set_ylabel("Cluster label")
# The vertical line for average silhouette score of all the values
ax1.axvline(x=silhouette_avg, color="red", linestyle="--")
ax1.set_yticks([]) # Clear the yaxis labels / ticks
ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])
# 2nd Plot showing the actual clusters formed
colors = cm.nipy_spectral(cluster_labels.astype(float) / n_clusters)
ax2.scatter(X[:, 0], X[:, 1], marker='.', s=30, lw=0, alpha=0.7,
c=colors, edgecolor='k')
# Labeling the clusters
centers = clusterer.cluster_centers_
# Draw white circles at cluster centers
ax2.scatter(centers[:, 0], centers[:, 1], marker='o',
c="white", alpha=1, s=200, edgecolor='k')
for i, c in enumerate(centers):
ax2.scatter(c[0], c[1], marker='$%d$' % i, alpha=1,
s=50, edgecolor='k')
ax2.set_title("The visualization of the clustered data.")
ax2.set_xlabel("Feature space for the 1st feature")
ax2.set_ylabel("Feature space for the 2nd feature")
plt.suptitle(("Silhouette analysis for KMeans clustering on sample data "
"with n_clusters = %d" % n_clusters),
fontsize=14, fontweight='bold')
plt.savefig(VISU_DIR + 'kmeans-silhouette (k=' + str(n_clusters) +'); ' + method + '.jpg', format='jpg', dpi=300)
plt.close()
#plt.show()
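# Example call (as used in the main script below):
# plot_silhouette(Y, method, [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])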
# Compute neighborhood preservation ratio (NPR)
# P: proximity matrix (input space)
# D: distance matrix (output space)
# nn: number of nearest neighbors to consider
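# The NPR is the average overlap between the nn nearest neighbors of each data point (excluding the point itself)
# in the input space and in the output space: NPR = (1/N) * sum_i |kNN_P(i) ∩ kNN_D(i)| / nn, where N is the number
# of data points. A value of 1 means the neighborhoods are perfectly preserved by the projection, 0 means no overlap.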
def compute_NPR(P, D, nn):
# Return -1 if size of P and D is different
if P.shape != D.shape:
return -1
npr = 0
# For all data points
for i in range(0, P.shape[0]):
idx_sorted_P = np.argsort(P[i])
idx_sorted_D = np.argsort(D[i])
nn_P = idx_sorted_P[1:nn+1] # Get nn nearest items to P_i
nn_D = idx_sorted_D[1:nn+1] # Get nn nearest items to D_i
# Compute NPR for current data point
npr_i = len(np.intersect1d(nn_P, nn_D)) / np.float32(nn)
npr = npr + npr_i / P.shape[0]
return npr
# Main script
if __name__ == '__main__':
np.set_printoptions(precision=3, suppress=True)
# Perform data preprocessing
#
# Assumes that the following files are present in DATA_DIR:
# country_dist.txt: country_code \t no_les
# country_track_dist.txt: country_code \t track_id SPACE no_les ; track_id SPACE no_les ; ...
# country_user_dist.txt: country_code \t user_id SPACE no_les ; user_id SPACE no_les ; ...
#
# The following files are optional:
# user_dist.txt: user_id \t no_les
# track_dist.txt: track_id \t no_les
# user_track_dist.txt: user_id \t track_id SPACE no_les ; track_id SPACE no_les ; ...
if DO_PREPROCESSING:
# Load or create tracks and LEs file
if os.path.exists(DATA_DIR + 't_ids.pkl'): # a_
print('Loading tracks and LEs from pickle files')
with open(DATA_DIR + 't_ids.pkl', 'rb') as fo: # a_
t_ids = pkl.load(fo)
with open(DATA_DIR + 't_les.pkl', 'rb') as fo: # a_
t_les = pkl.load(fo)
else:
track_le = read_tracks_le(DATA_DIR + 'track_dist.txt')
# track_le = read_tracks_le(DATA_DIR + 'artist_dist.txt')
print('Tracks read: %d' % len(track_le))
t_ids = np.array(list(track_le.keys()))
t_les = np.array(list(track_le.values()))
# ...and dump as pkl file
with open(DATA_DIR + 't_ids.pkl', 'wb') as fo: # a_
pkl.dump(t_ids, fo)
with open(DATA_DIR + 't_les.pkl', 'wb') as fo: # a_
pkl.dump(t_les, fo)
# Plot distribution of LEs per track
if CREATE_PLOTS:
print('Creating plot of track LE distribution')
handle_tpc_all, = plt.semilogy(range(len(t_les)), t_les, marker='.', color='b', markersize=1.0, linestyle='None')
plt.setp(handle_tpc_all, linewidth=.5)
plt.title('Distribution of listening events', fontsize=18)
plt.xlabel('Track ID', fontsize=14) # 'Artist ID' when using artist_dist.txt
plt.ylabel('Number of listening events', fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig(VISU_DIR + 'plot_track_les.jpg', format='jpg', dpi=300)
#plt.savefig(VISU_DIR + 'plot_track_les.pdf', format='pdf', dpi=300)
plt.close()
# Create subset of tracks according to threshold MIN_LES_TRACK
print('Creating subset of tracks with LE >= %d' % MIN_LES_TRACK)
idx_min_les = np.where(t_les >= MIN_LES_TRACK)
t_les_reduced = t_les[idx_min_les]
t_ids_reduced = t_ids[idx_min_les]
print('No. tracks after filtering: %d' % len(t_les_reduced))
# Dump reduced ones as pkl files
with open(DATA_DIR + 't_ids_min' + str(MIN_LES_TRACK) + 'les.pkl', 'wb') as fo: # a_
pkl.dump(t_ids_reduced, fo)
with open(DATA_DIR + 't_les_min' + str(MIN_LES_TRACK) + 'les.pkl', 'wb') as fo: # a_
pkl.dump(t_les_reduced, fo)
# print(t_ids_reduced)
# print(t_les_reduced)
# Read country track distribution file and filter with track-ids in subset (MIN_LES_TRACK)
country_track_le = read_filter_country_tracks_le(DATA_DIR + 'country_track_dist.txt', t_ids_reduced) # country_track_dist.txt # return dict (country, feature vector)
# Store feature vector for each country in pkl file
for cty, feat_vec in country_track_le.items():
print('Storing feature vector of country %s; sum of LEs: %d' % (cty, np.sum(feat_vec)))
with open(DATA_DIR + 'les_' + cty + '_min' + str(MIN_LES_TRACK) + 'les.pkl', 'wb') as fo:
pkl.dump(feat_vec, fo)
# exit()
# Read number of users for each country (for filtering)
cty_numusers = read_country_number_users(DATA_DIR + 'country_user_dist.txt')
#
# Perform clustering experiments
#
if DO_CLUSTERING:
# Load country feature vectors
feature_files = [f for f in glob.glob(FEATURE_DIR + "/*.pkl", recursive=True)]
feature_vecs = {}
# print('Loading tracks and LEs from pickle files')
# print(FEATURE_DIR)
country_labels = []
country_features_list = []
for f in feature_files:
print('Loading tracks and LEs from pickle file %s' % f)
with open(f, 'rb') as fi:
cty = os.path.basename(os.path.normpath(f)).split('_')[1] # extract country name from file name
country_labels.append(cty)
country_features_list.append(pkl.load(fi))
print('Feature vector size: %d' % len(country_features_list[0]))
### Iterate through all elements in feature_vecs, normalize, and write to numpy array
# Only include countries that fulfill certain criteria:
# minimum number of LEs
# minimum number of unique tracks
# minimum number of users/listeners
country_ids_include = [] # to hold country ids fulfilling this
overall_le_sum = 0 # total sum of LEs of all countries
overall_le_sum_included_country = 0 # total sum of LEs of included countries
for c in range(0, len(country_features_list)):
le_sum = np.sum(country_features_list[c])
uniq_tracks = np.sum(np.sign(country_features_list[c]))
num_users = cty_numusers[country_labels[c]]
overall_le_sum += le_sum
if le_sum >= MIN_LES_COUNTRY and uniq_tracks >= MIN_UNIQUE_TRACKS_COUNTRY and num_users >= MIN_USERS_COUNTRY:
print('Included country: %s; Users: %d; LE-sum: %d; unique tracks: %d' % (country_labels[c], num_users, le_sum, uniq_tracks))
country_ids_include.append(c)
overall_le_sum_included_country += le_sum
else:
print('Excluded country: %s; Users: %d; LE-sum: %d; unique tracks: %d' % (country_labels[c], num_users, le_sum, uniq_tracks))
print('Number of countries with #LEs >= %d, min #LEs per track >= %d, #unique tracks >= %d, and #users >= %d: %d' % (MIN_LES_COUNTRY, MIN_LES_TRACK, MIN_UNIQUE_TRACKS_COUNTRY, MIN_USERS_COUNTRY, len(country_ids_include)))
print('#LE: %d (all countries); %d, %.6f (included countries)' % (overall_le_sum, overall_le_sum_included_country, np.float32(overall_le_sum_included_country)/np.float32(overall_le_sum) ))
# exit()
# Init feature vector matrix
country_features = np.zeros([len(country_ids_include), len(country_features_list[0])], dtype=np.float32)
country_le_sums = np.zeros([len(country_ids_include)], dtype=np.float32)
#country_features = np.zeros([len(country_features_list), len(country_features_list[0])], dtype=np.float32)
#country_le_sums = np.zeros([len(country_features_list)], dtype=np.float32)
country_labels_full = country_labels
country_labels = []
for c in range(0, len(country_ids_include)):
le_sum = np.sum(country_features_list[country_ids_include[c]])
country_le_sums[c] = le_sum
# Construct features and normalize
country_features[c, :] = country_features_list[country_ids_include[c]] / le_sum
country_labels.append(country_labels_full[country_ids_include[c]])
#print(country_features.shape)
for i in range(0, len(country_labels)):
print(country_labels[i], country_features[i], np.sum(country_features[i]))
# Plot distribution of LEs per country
if CREATE_PLOTS:
print('Creating plot of country LE distribution')
handle_tpc_all, = plt.semilogy(range(len(country_le_sums)), np.sort(country_le_sums)[::-1], marker='.', color='b', markersize=1.0, linestyle='None')
plt.setp(handle_tpc_all, linewidth=.5)
plt.title('Distribution of listening events', fontsize=18)
plt.xlabel('Country ID', fontsize=14)
plt.ylabel('Number of listening events', fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig(VISU_DIR + 'plot_country_les.jpg', format='jpg', dpi=300)
#plt.savefig(VISU_DIR + 'plot_country_les.pdf', format='pdf', dpi=300)
plt.close()
# Plot distribution of users per country
if CREATE_PLOTS:
print('Creating plot of country user distribution')
country_numusers = np.fromiter(cty_numusers.values(), dtype=np.int32)
handle_tpc_all, = plt.semilogy(range(len(country_numusers)), np.sort(country_numusers)[::-1], marker='.', color='b', markersize=1.0, linestyle='None')
print(np.sort(country_numusers)[::-1])
plt.setp(handle_tpc_all, linewidth=.5)
plt.title('Distribution of users', fontsize=18)
plt.xlabel('Country ID', fontsize=14)
plt.ylabel('Number of users', fontsize=14)
plt.grid(True)
plt.tight_layout()
plt.savefig(VISU_DIR + 'plot_country_users.jpg', format='jpg', dpi=300)
#plt.savefig(VISU_DIR + 'plot_country_les.pdf', format='pdf', dpi=300)
plt.close()
# Perform PCA if demanded
if DO_PCA:
print('Performing PCA / Truncated SVD to reduce number of dimensions to %d' % PCA_DIMS)
svd = TruncatedSVD(n_components=PCA_DIMS, n_iter=10, random_state=169)
#svd = PCA(n_components=PCA_DIMS, random_state=169)
X = svd.fit_transform(country_features)
print('Explained variance: ', svd.explained_variance_ratio_)
print('Sum of explained variance: ', svd.explained_variance_ratio_.sum())
else:
X = country_features
### Compute t-SNE embedding of the country features (2-dimensional projection)
if CLUSTERING_TSNE:
for perp in TSNE_PERPLEXITY:
tsne = TSNE(n_components=2, verbose=0, perplexity=perp, early_exaggeration=12.0, learning_rate=200.0,
n_iter=1000, n_iter_without_progress=300, min_grad_norm=1e-07, metric='cosine',
init = 'pca', random_state = 169, method = 'barnes_hut', angle = 0.5)
# init = 'random', random_state = 169, method = 'barnes_hut', angle = 0.5)
Y = tsne.fit_transform(X)
method = 'T-SNE (perp=' + str(perp) + '); PCA dim=' + str(PCA_DIMS) + ', LE item min=' + str(MIN_LES_TRACK) + ', LE country min=' + str(MIN_LES_COUNTRY) + ', users country min=' + str(MIN_USERS_COUNTRY) #', unique items-country min=' + str(MIN_UNIQUE_TRACKS_COUNTRY) +
if CREATE_PLOTS:
plot_mapping(Y, method, country_labels, country_le_sums)
# Compute neighborhood preservation ratio
for nn in [1, 3, 5, 10, 25, 50, perp]: #, 100
print('t-SNE (perplexity=%d), NPR(nn=%d): %.3f%%' % (perp, nn, compute_NPR(pair_dist.cosine_distances(X), pair_dist.cosine_distances(Y), nn)*100.0))
# Plot silhouette for different k's in k-means
if DO_KMEANS_SILHOUETTE:
if CREATE_PLOTS:
plot_silhouette(Y, method, [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15])
# Perform OPTICS on top of t-SNE
if DO_OPTICS_FOR_TSNE:
optics = OPTICS(min_samples=OPTICS_MIN_CLUSTER_SIZE, max_eps=np.inf, metric='euclidean').fit(Y) #, p=2, metric_params=None, cluster_method='xi', eps=None, xi=0.05, predecessor_correction=True, min_cluster_size=None, algorithm='auto', leaf_size=30, n_jobs=None)
#optics = OPTICS(min_samples=OPTICS_MIN_CLUSTER_SIZE, max_eps=np.inf, metric='cosine').fit(Y) #, p=2, metric_params=None, cluster_method='xi', eps=None, xi=0.05, predecessor_correction=True, min_cluster_size=None, algorithm='auto', leaf_size=30, n_jobs=None)
reachability = optics.reachability_[optics.ordering_]
# Output clusters
labels = optics.labels_[optics.ordering_]
ordering = optics.ordering_
print(labels)
print(ordering)
for cl in np.unique(labels):
print('Cluster %d: ' % cl, end="")
idx_cl = np.where(labels == cl)[0]
for i in idx_cl:
print(country_labels[ordering[i]] + ' ', end="")
print('')
# Compute silhouette coefficient
# https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html
# clusterer = KMeans(n_clusters=10, random_state=10)
# cluster_labels = clusterer.fit_predict(X)
silhouette_avg = silhouette_score(Y, labels)
print('Silhouette score average: %.3f' % silhouette_avg)
if CREATE_PLOTS:
# Visualize t-SNE mapping including color-coded OPTICS clusters
datapoint_scaling = 1.0
# datapoint_scaling = 0.5
datapoint_size_min = 8 * datapoint_scaling
datapoint_size_max = 32 * datapoint_scaling
dp_max = np.max(country_le_sums)
dp_sizes = datapoint_size_min + ((country_le_sums / dp_max) * (datapoint_size_max - datapoint_size_min))
plt.figure(figsize=(11.69, 8.27))
colors = iter(cm.rainbow(np.linspace(0, 1, len(np.unique(labels)))))
colors_cl = []
colors_all = []
for cl in range(0, len(np.unique(labels))-1):
Xk = Y[optics.labels_ == cl]
c = next(colors) # Next color for every new class/label
colors_cl.append(c)
plt.scatter(Xk[:, 0], Xk[:, 1], color=c, s=datapoint_size_min * datapoint_scaling * 0.25)
#plt.plot(Xk[:, 0], Xk[:, 1], '.', color=c) #, alpha=0.3)
plt.scatter(Y[optics.labels_ == -1, 0], Y[optics.labels_ == -1, 1], marker='+', color='k', s=datapoint_size_min * datapoint_scaling, alpha=0.1) # noise points (label -1) in black
#plt.plot(Y[optics.labels_ == -1, 0], Y[optics.labels_ == -1, 1], 'k+', alpha=0.1)
# print(colors_cl)
for i in range(0, len(ordering)):
if labels[i] == -1:
color_cur = 'b'
alpha_cur = 0.1
else:
# print(labels[i])
color_cur = colors_cl[labels[i]]
alpha_cur = 1.0
colors_all.append(color_cur)
plt.text(Y[ordering[i], 0], Y[ordering[i], 1], country_labels[ordering[i]], fontsize=dp_sizes[ordering[i]], color=color_cur, alpha=alpha_cur)
# Scale (use 2nd lowest and 2nd highest to drop outliers)
# plt.xticks(np.arange(np.sort(Y[:, 0])[1], np.sort(Y[:, 0])[-2], 1))
# plt.yticks(np.arange(np.sort(Y[:, 1])[1], np.sort(Y[:, 1])[-2], 1))
plt.title('OPTICS (cluster size min=' + str(OPTICS_MIN_CLUSTER_SIZE) + '); ' + method)
plt.axis('off')
plt.grid(False)
plt.tight_layout()
plt.savefig(VISU_DIR + 'OPTICS (cluster size min=' + str(OPTICS_MIN_CLUSTER_SIZE) + '); ' + method + ', datapoint scaling=' + str(datapoint_scaling) + '.pdf', format='pdf', dpi=300)
plt.close()
### Perform AP clustering
if CLUSTERING_AP:
af = AffinityPropagation().fit(X) #convergence_iter=50, max_iter=1000).fit(X)
cluster_centers_indices = af.cluster_centers_indices_
labels = af.labels_
n_clusters_ = len(cluster_centers_indices)
print('Estimated number of clusters: %d' % n_clusters_)
# Visualize clustering results